class TwitterCldr::Segmentation::ThaiBreakEngine

See: github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakEngine.java

Constants

THAI_MAIYAMOK

repeat character

THAI_PAIYANNOI

elision character

Public Class Methods

word_set() click to toggle source
# File lib/twitter_cldr/segmentation/thai_break_engine.rb, line 20
# Returns the memoized set of code points this engine treats as word
# characters: the result of applying the pattern
# '[[:Thai:]&[:Line_Break=SA:]]' to a fresh UnicodeSet and converting it
# with #to_set. The set is built on first use and reused afterwards.
def self.word_set
  @word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |thai_chars|
    thai_chars.apply_pattern('[[:Thai:]&[:Line_Break=SA:]]')
  end.to_set
end

Public Instance Methods

each_boundary(*args, &block) click to toggle source
# File lib/twitter_cldr/segmentation/thai_break_engine.rb, line 34
# Delegates boundary iteration to the underlying break engine.
#
# All positional arguments and the optional block are forwarded unchanged
# to the engine's #each_boundary; whatever that call returns is returned.
def each_boundary(*args, &block)
  delegate = engine
  delegate.each_boundary(*args, &block)
end

Private Instance Methods

advance_past_suffix(cursor, end_pos, state) click to toggle source
# File lib/twitter_cldr/segmentation/thai_break_engine.rb, line 69
# Looks ahead from the cursor for Thai suffix characters (PAIYANNOI and
# MAIYAMOK) that should be attached to the word just found.
#
# cursor  - object exposing #position, #position=, #codepoint, #previous
#           and #advance.
# end_pos - position the cursor must stay below for any lookahead to run.
# state   - object exposing #word_length, #words, #words_found and #current.
#
# Returns the number of suffix characters skipped (0, 1 or 2). When no
# suffix handling applies but the lookahead ran, the cursor is reset to
# state.current + state.word_length.
def advance_past_suffix(cursor, end_pos, state)
  # Nothing to do at the end of the range or when no word was found.
  return 0 unless cursor.position < end_pos && state.word_length > 0

  # Read the current code point before asking the dictionary for candidates.
  uc = cursor.codepoint
  following_words = state.words[state.words_found].candidates(
    cursor, engine.dictionary, end_pos
  )

  # A dictionary word follows, or the current character is not a suffix:
  # put the cursor right after the word that was found.
  if following_words > 0 || !suffix_set.include?(uc)
    cursor.position = state.current + state.word_length
    return 0
  end

  skipped = 0

  # NOTE(review): cursor.previous presumably steps the cursor back one
  # position and returns the code point there; the advance calls below
  # compensate for that step. Confirm against the cursor implementation.
  if uc == THAI_PAIYANNOI
    if suffix_set.include?(cursor.previous)
      # preceded by another suffix character: just restore the position
      cursor.advance
    else
      # move past the previous word end and the PAIYANNOI
      cursor.advance(2)
      skipped += 1
      uc = cursor.codepoint
    end
  end

  if uc == THAI_MAIYAMOK
    if cursor.previous == THAI_MAIYAMOK
      # repeated MAIYAMOK: just restore the position
      cursor.advance
    else
      # move past the previous word end and the MAIYAMOK
      cursor.advance(2)
      skipped += 1
    end
  end

  skipped
end
begin_word_set() click to toggle source
# File lib/twitter_cldr/segmentation/thai_break_engine.rb, line 125
# Returns the memoized UnicodeSet of code points that may start a word.
# It is built once from two code point ranges and reused afterwards.
def begin_word_set
  @begin_word_set ||= begin
    starters = TwitterCldr::Shared::UnicodeSet.new
    [
      0x0E01..0x0E2E, # KO KAI through HO NOKHUK
      0x0E40..0x0E44  # SARA E through SARA AI MAIMALAI
    ].each { |range| starters.add_range(range) }
    starters
  end
end
end_word_set() click to toggle source
# File lib/twitter_cldr/segmentation/thai_break_engine.rb, line 117
# Returns the memoized UnicodeSet of code points that may end a word:
# every member of the class-level word set, minus a few code points.
def end_word_set
  return @end_word_set if @end_word_set

  enders = TwitterCldr::Shared::UnicodeSet.new
  enders.add_list(self.class.word_set)
  enders.subtract(0x0E31)                  # MAI HAN-AKAT
  enders.subtract_range(0x0E40..0x0E44)    # SARA E through SARA AI MAIMALAI
  @end_word_set = enders
end
engine() click to toggle source
# File lib/twitter_cldr/segmentation/thai_break_engine.rb, line 40
# Returns the memoized BrahmicBreakEngine configured for Thai. The engine
# is created on first use with the tuning values, character sets,
# dictionary and suffix callback below.
def engine
  return @engine if @engine

  # Callback handed to the engine; forwards its arguments to
  # #advance_past_suffix on this object.
  suffix_handler = lambda { |*suffix_args| advance_past_suffix(*suffix_args) }

  tuning = {
    # How many words in a row are "good enough"?
    lookahead: 3,

    # Will not combine a non-word with a preceding dictionary word longer than this
    root_combine_threshold: 3,

    # Will not combine a non-word that shares at least this much prefix with a
    # dictionary word with a preceding word
    prefix_combine_threshold: 3,

    # Minimum word size
    min_word: 2,

    # Minimum number of characters for two words (min_word * 2)
    min_word_span: 4
  }

  character_sets = {
    word_set: self.class.word_set,
    mark_set: mark_set,
    end_word_set: end_word_set,
    begin_word_set: begin_word_set
  }

  @engine = BrahmicBreakEngine.new(
    **tuning,
    **character_sets,
    dictionary: Dictionary.thai,
    advance_past_suffix: suffix_handler
  )
end
mark_set() click to toggle source
# File lib/twitter_cldr/segmentation/thai_break_engine.rb, line 110
# Returns the memoized UnicodeSet of mark characters: the pattern
# '[[:Thai:]&[:Line_Break=SA:]&[:M:]]' plus code point 0x0020.
def mark_set
  @mark_set ||= begin
    marks = TwitterCldr::Shared::UnicodeSet.new
    marks.apply_pattern('[[:Thai:]&[:Line_Break=SA:]&[:M:]]')
    marks.add(0x0020)
    marks
  end
end
suffix_set() click to toggle source
# File lib/twitter_cldr/segmentation/thai_break_engine.rb, line 132
# Returns the memoized UnicodeSet holding the two suffix code points,
# THAI_PAIYANNOI and THAI_MAIYAMOK.
def suffix_set
  @suffix_set ||= begin
    suffixes = TwitterCldr::Shared::UnicodeSet.new
    [THAI_PAIYANNOI, THAI_MAIYAMOK].each { |code_point| suffixes.add(code_point) }
    suffixes
  end
end