class TwitterCldr::Segmentation::WordIterator

Constants

DICTIONARY_BREAK_ENGINES

Public Instance Methods

each_boundary(str) { |boundary| ... } click to toggle source
# File lib/twitter_cldr/segmentation/word_iterator.rb, line 22
def each_boundary(str, &block)
  return to_enum(__method__, str) unless block_given?

  # Rather than put a bunch of duplicate logic in
  # each_boundary_helper to make sure we don't yield the same
  # boundary twice, we wrap it in this additional de-duping
  # enumerator and call it a day.
  last_boundary = nil

  each_boundary_helper(str) do |boundary|
    yield boundary if boundary != last_boundary
    last_boundary = boundary
  end
end

Private Instance Methods

codepoint_to_engine_cache() click to toggle source
# File lib/twitter_cldr/segmentation/word_iterator.rb, line 165
def codepoint_to_engine_cache
  @@codepoint_to_engine_cache ||= {}
end
dictionary_break_engine_for(codepoint) click to toggle source
# File lib/twitter_cldr/segmentation/word_iterator.rb, line 145
def dictionary_break_engine_for(codepoint)
  codepoint_to_engine_cache[codepoint] ||= begin
    engine = DICTIONARY_BREAK_ENGINES.find do |break_engine|
      break_engine.word_set.include?(codepoint)
    end

    (engine || UnhandledBreakEngine).instance
  end
end
dictionary_re() click to toggle source
# File lib/twitter_cldr/segmentation/word_iterator.rb, line 155
def dictionary_re
  @@dictionary_re ||= begin
    ranges = TwitterCldr::Utils::RangeSet.from_array(dictionary_set).ranges.map do |r|
      "\\u{#{r.first.to_s(16)}}-\\u{#{r.last.to_s(16)}}"
    end

    /[#{ranges.join}]/
  end
end
dictionary_set() click to toggle source

all dictionary characters, i.e. characters that must be handled by one of the dictionary-based break engines

# File lib/twitter_cldr/segmentation/word_iterator.rb, line 137
def dictionary_set
  @@dictionary_set ||= Set.new.tap do |set|
    DICTIONARY_BREAK_ENGINES.each do |break_engine|
      set.merge(break_engine.word_set)
    end
  end
end
each_boundary_helper(str) { |0| ... } click to toggle source
# File lib/twitter_cldr/segmentation/word_iterator.rb, line 39
def each_boundary_helper(str, &block)
  # Set up two independent cursors so the algorithm can iterate
  # over those portions of the input string that require a
  # dictionary-based break iterator independently from those that
  # only need the normal, rule-based break iterator. Cursors
  # hold references to the input text and a list of all the
  # corresponding Unicode codepoints, meaning they are fairly
  # expensive to create. The duplication below should only
  # produce a shallow copy however. The text and codepoint list
  # are not duplicated, but the cursor's integer position can
  # be varied independently.
  dict_cursor = create_cursor(str)
  rule_cursor = dict_cursor.dup

  # implicit start of text boundary
  yield 0

  until dict_cursor.eos? || rule_cursor.eos?
    # We use a regex to identify the beginnings of potential runs
    # of dictionary characters. This regex was benchmarked and
    # found to be pretty fast, but could become a bottleneck if
    # other parts of the algorithm are improved in the future.
    m = dictionary_re.match(dict_cursor.text, dict_cursor.position)
    break unless m

    dict_cursor.position = m.begin(0)
    dict_break_engine = dictionary_break_engine_for(dict_cursor.codepoint)

    # It's possible to encounter a dictionary character that can't
    # be handled by any of the dictionary-based break engines
    # because it's too short to make up an actual word. The
    # break engine will simply yield no breaks in such a case, which
    # we test for below by peeking for the first boundary value and
    # rescuing a StopIteration error. Since the run of dictionary
    # characters may be arbitrarily long, peeking should be more
    # performant than attempting to calculate all the boundary
    # positions for the run at once.
    #
    # It should be noted that, despite our best efforts here in
    # WordIterator, certain dictionary-based break engines (eg.
    # CjBreakEngine) cannot yield word boundaries without first
    # examining the entire run of dictionary characters. In practice
    # this shouldn't be too big an issue, since Chinese text often
    # contains punctuation that should limit the average run length.
    dict_enum = dict_break_engine.each_boundary(dict_cursor)

    dict_boundary = begin
      dict_enum.peek
    rescue StopIteration
      nil
    end

    # If a dictionary boundary was found, attempt to use the rule-based
    # break iterator to find breaks in the text immediately before it.
    # Otherwise, since none of the dictionary-based break engines could
    # find any boundaries in the current run, we advance the dictionary
    # cursor in an attempt to find the next dictionary boundary. Doing
    # so effectively causes the algorithm to fall back to the rule-based
    # break engine.
    if dict_boundary
      # Only use the rule-based break engine if there are characters to
      # process.
      if rule_cursor.position < m.begin(0)
        rule_set.each_boundary(rule_cursor, m.begin(0), &block)
      end

      # Yield all the dictionary breaks from the enum. We can't use .each
      # here because that will restart the iteration. Ruby's loop
      # construct automatically rescues StopIteration.
      loop do
        yield dict_enum.next
      end

      # We've reached the end of a dictionary character run, so yield
      # the end of text boundary.
      yield dict_cursor.position

      # These should be the same after a successful dictionary run, i.e.
      # they should both be positioned at the end of the current rule-based
      # and dictionary-based portions of the run, ready for the next one.
      rule_cursor.position = dict_cursor.position
    else
      dict_cursor.advance
    end
  end

  # Find boundaries in the straggler, non-dictionary run at the end of
  # the input text.
  unless rule_cursor.eos?
    rule_set.each_boundary(rule_cursor, &block)
  end

  # implicit end of text boundary
  yield rule_cursor.length
end