class TwitterCldr::Segmentation::KhmerBreakEngine
Public Class Methods
word_set()
click to toggle source
# File lib/twitter_cldr/segmentation/khmer_break_engine.rb, line 20
# Returns the set of code points matched by the pattern
# '[[:Khmer:]&[:Line_Break=SA:]]', i.e. characters that are both in the
# Khmer script and have the Line_Break property value SA.
#
# The result is computed once and cached in a class-level instance
# variable, so subsequent calls return the same object.
def self.word_set
  return @word_set if @word_set

  khmer_chars = TwitterCldr::Shared::UnicodeSet.new
  khmer_chars.apply_pattern('[[:Khmer:]&[:Line_Break=SA:]]')
  @word_set = khmer_chars.to_set
end
Private Instance Methods
begin_word_set()
click to toggle source
# File lib/twitter_cldr/segmentation/khmer_break_engine.rb, line 75
# Returns a memoized UnicodeSet holding the code point range
# U+1780..U+17B3; it is handed to the break engine as the set of
# characters that may begin a word.
def begin_word_set
  return @begin_word_set if @begin_word_set

  starters = TwitterCldr::Shared::UnicodeSet.new
  starters.add_range(0x1780..0x17B3)
  @begin_word_set = starters
end
end_word_set()
click to toggle source
# File lib/twitter_cldr/segmentation/khmer_break_engine.rb, line 68
# Returns a memoized UnicodeSet of characters that may end a word:
# everything in the class-level word set except U+17D2.
def end_word_set
  return @end_word_set if @end_word_set

  enders = TwitterCldr::Shared::UnicodeSet.new
  enders.add_list(self.class.word_set)
  # U+17D2 is KHMER SIGN COENG, which combines some characters, so it is
  # removed from the set of word-ending characters.
  enders.subtract(0x17D2)
  @end_word_set = enders
end
engine()
click to toggle source
All Brahmic scripts (including Khmer) can make use of the same break logic, so we use composition here and defer to the Brahmic break engine.
# File lib/twitter_cldr/segmentation/khmer_break_engine.rb, line 32
# Lazily builds and caches the BrahmicBreakEngine that this class delegates
# to, configured with Khmer-specific thresholds, character sets and the
# Khmer dictionary.
def engine
  return @engine if @engine

  # Suffix skipping is not applicable to Khmer, so never advance.
  no_suffix_advance = ->(*) { 0 }

  @engine = BrahmicBreakEngine.new(
    # Number of consecutive words considered "good enough".
    lookahead: 3,

    # A non-word is not combined with a preceding dictionary word longer
    # than this.
    root_combine_threshold: 3,

    # A non-word sharing at least this much prefix with a dictionary word
    # is not combined with a preceding word.
    prefix_combine_threshold: 3,

    # Minimum word size.
    min_word: 4,

    # Minimum number of characters for two words (for Khmer this is the
    # same as min_word).
    min_word_span: 4,

    word_set: self.class.word_set,
    mark_set: mark_set,
    end_word_set: end_word_set,
    begin_word_set: begin_word_set,
    dictionary: Dictionary.khmer,
    advance_past_suffix: no_suffix_advance
  )
end
mark_set()
click to toggle source
# File lib/twitter_cldr/segmentation/khmer_break_engine.rb, line 61
# Returns a memoized UnicodeSet built from the pattern
# '[[:Khmer:]&[:Line_Break=SA:]&[:M:]]' with U+0020 (space) added.
def mark_set
  return @mark_set if @mark_set

  marks = TwitterCldr::Shared::UnicodeSet.new
  marks.apply_pattern('[[:Khmer:]&[:Line_Break=SA:]&[:M:]]')
  marks.add(0x0020)
  @mark_set = marks
end