class TwitterCldr::Segmentation::BurmeseBreakEngine

See the ICU4J implementation: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java

Public Class Methods

word_set() click to toggle source
# File lib/twitter_cldr/segmentation/burmese_break_engine.rb, line 20
# Returns the collection produced by applying the pattern
# [[:Mymr:]&[:Line_Break=SA:]] to a fresh UnicodeSet and calling #to_set
# on it. The result is computed on first call and memoized on the class.
def self.word_set
  @word_set ||= TwitterCldr::Shared::UnicodeSet.new
    .tap { |myanmar_sa| myanmar_sa.apply_pattern('[[:Mymr:]&[:Line_Break=SA:]]') }
    .to_set
end

Private Instance Methods

begin_word_set() click to toggle source
# File lib/twitter_cldr/segmentation/burmese_break_engine.rb, line 74
# Memoized UnicodeSet of code points passed to the engine as
# +begin_word_set+: the range U+1000..U+102A (basic consonants and
# independent vowels).
def begin_word_set
  return @begin_word_set if @begin_word_set

  initial_chars = TwitterCldr::Shared::UnicodeSet.new
  initial_chars.add_range(0x1000..0x102A)
  @begin_word_set = initial_chars
end
end_word_set() click to toggle source
# File lib/twitter_cldr/segmentation/burmese_break_engine.rb, line 68
# Memoized UnicodeSet populated, via #add_list, with everything in the
# class-level word set; passed to the engine as +end_word_set+.
def end_word_set
  return @end_word_set if @end_word_set

  closing_chars = TwitterCldr::Shared::UnicodeSet.new
  closing_chars.add_list(self.class.word_set)
  @end_word_set = closing_chars
end
engine() click to toggle source

All Brahmic scripts (including Burmese) can make use of the same break logic, so we use composition here and defer to the Brahmic break engine.

# File lib/twitter_cldr/segmentation/burmese_break_engine.rb, line 32
# Lazily builds (and then reuses) the BrahmicBreakEngine this class
# delegates to, configured with the Burmese-specific thresholds, character
# sets and dictionary below.
def engine
  return @engine if @engine

  burmese_settings = {
    # Number of consecutive words that counts as "good enough"
    lookahead: 3,

    # A non-word is not combined with a preceding dictionary word longer
    # than this
    root_combine_threshold: 3,

    # A non-word sharing at least this much prefix with a dictionary word
    # is not combined with a preceding word
    prefix_combine_threshold: 3,

    # Smallest allowed word size
    min_word: 2,

    # Smallest character count for two words (for Burmese, equal to min_word)
    min_word_span: 2,

    word_set: self.class.word_set,
    mark_set: mark_set,
    end_word_set: end_word_set,
    begin_word_set: begin_word_set,
    dictionary: Dictionary.burmese,

    # Suffix handling is not applicable to Burmese, so always report 0
    advance_past_suffix: ->(*) { 0 }
  }

  @engine = BrahmicBreakEngine.new(**burmese_settings)
end
mark_set() click to toggle source
# File lib/twitter_cldr/segmentation/burmese_break_engine.rb, line 61
# Memoized UnicodeSet passed to the engine as +mark_set+: the result of
# applying the pattern [[:Mymr:]&[:Line_Break=SA:]&[:M:]], plus U+0020
# (space).
def mark_set
  return @mark_set if @mark_set

  marks = TwitterCldr::Shared::UnicodeSet.new
  marks.apply_pattern('[[:Mymr:]&[:Line_Break=SA:]&[:M:]]')
  marks.add(0x0020)
  @mark_set = marks
end