class ThaiLang::WordBreaker
Constants
- A
- ACTIVATED
- CAP_A
- CAP_Z
- COMPLETED
- DICT
- DIX_PTR_IS_FINAL
- DIX_PTR_ROW_NO
- DIX_PTR_S
- INIT
- LATIN
- LINK_KIND
- LINK_P_IDX
- LINK_UNK
- LINK_W
- PUNC
- RANGE_E
- RANGE_S
- SPACE
- TRANSDUCER_E
- TRANSDUCER_KIND
- TRANSDUCER_S
- TRANSDUCER_STATE
- UNK
- WAITING
- Z
Public Class Methods
new(dix_path = nil)
click to toggle source
# File lib/thailang4r/word_breaker.rb, line 44
# Builds the dictionary used for word breaking.
#
# dix_path - path to a word-list file with one entry per line. When nil
#            (or false) the bundled 'data/tdict-std.txt', resolved relative
#            to this source file, is used.
#
# Every line is chomped and paired with the value 1 before being handed to
# PrefixTree.new; the resulting tree is stored in @dix.
# NOTE(review): the meaning of the payload 1 is defined by PrefixTree, which
# is not visible here.
#
# Raises whatever File.foreach raises when the file cannot be read
# (e.g. Errno::ENOENT).
def initialize(dix_path = nil)
  dix_path ||= File.expand_path('../../../data/tdict-std.txt', __FILE__)
  # File.foreach opens the file itself and closes it once enumeration is
  # done, so no file handle is left open after loading the dictionary.
  entries = File.foreach(dix_path).map { |line| [line.chomp, 1] }
  @dix = PrefixTree.new(entries)
end
Public Instance Methods
better_link?(l, r)
click to toggle source
# File lib/thailang4r/word_breaker.rb, line 64
# Tells whether candidate link +l+ should replace link +r+.
#
# +l+ wins when its LINK_UNK field is smaller than that of +r+, or when its
# LINK_W field is smaller than that of +r+. Either condition alone is
# sufficient; this is not a lexicographic comparison.
#
# Returns true or false.
def better_link?(l, r)
  return true if l[LINK_UNK] < r[LINK_UNK]

  l[LINK_W] < r[LINK_W]
end
break_into_words(text)
click to toggle source
# File lib/thailang4r/word_breaker.rb, line 49
# Splits +text+ into tokens using the dictionary stored in @dix.
#
# text - the String to segment.
#
# Returns whatever #tokenize returns for (@dix, text).
def break_into_words(text)
  tokenize(@dix, text)
end
build_path(dix, s)
click to toggle source
# File lib/thailang4r/word_breaker.rb, line 137
# Builds the segmentation path for the string +s+.
#
# dix - dictionary object answering lookup(row_no, offset, ch); a truthy
#       result is indexed with NODE_PTR_ROW_NO and NODE_PTR_IS_FINAL, a
#       falsy result means "no such transition".
# s   - the String to analyse; it is processed codepoint by codepoint.
#
# Returns an Array with s.codepoints.length + 1 links. Entry 0 is the
# initial link [0, 0, 0, INIT]; entry k (k > 0) describes the best link that
# ends right after character k - 1. A link is laid out as
# [start position, LINK_W value, LINK_UNK value, kind]: LINK_W grows by one
# for every link on the path and LINK_UNK grows by one for every UNK link.
def build_path(dix, s)
  ch_vec = s.codepoints
  path = [[0, 0, 0, INIT]]
  left_boundary = 0
  dix_ptrs = []
  latin_transducer = [0, 0, WAITING, LATIN]
  punc_transducer = [0, 0, WAITING, PUNC]

  ch_vec.each_with_index do |ch, i|
    # A dictionary entry may start at every position.
    dix_ptrs << [i, 0, false]

    # Fallback: an unknown span that starts at the last known boundary.
    unk_link = path[left_boundary]
    link = [left_boundary, unk_link[LINK_W] + 1, unk_link[LINK_UNK] + 1, UNK]

    # Advance every active dictionary pointer by +ch+. Dead pointers are
    # removed by moving the last pointer into their slot; that ordering is
    # kept on purpose because candidate order can influence the link that
    # better_link? ends up selecting.
    j = 0
    while j < dix_ptrs.length
      dix_ptr = dix_ptrs[j]
      child = dix.lookup(dix_ptr[DIX_PTR_ROW_NO], i - dix_ptr[DIX_PTR_S], ch)
      if child
        dix_ptrs[j] = [dix_ptr[DIX_PTR_S], child[NODE_PTR_ROW_NO], child[NODE_PTR_IS_FINAL]]
        j += 1
      else
        moved = dix_ptrs.pop
        dix_ptrs[j] = moved if j < dix_ptrs.length
      end
    end

    update_latin_transducer(latin_transducer, ch, i, ch_vec)
    update_punc_transducer(punc_transducer, ch, i, ch_vec)

    # Collect [start, kind] candidates: dictionary hits first, then the
    # Latin run, then the punctuation run.
    candidates = dix_ptrs.select { |ptr| ptr[DIX_PTR_IS_FINAL] }
                         .map { |ptr| [ptr[DIX_PTR_S], DICT] }
    if latin_transducer[TRANSDUCER_STATE] == COMPLETED
      candidates << [latin_transducer[TRANSDUCER_S], LATIN]
    end
    if punc_transducer[TRANSDUCER_STATE] == COMPLETED
      candidates << [punc_transducer[TRANSDUCER_S], PUNC]
    end

    candidates.each do |start, kind|
      prev_link = path[start]
      new_link = [start, prev_link[LINK_W] + 1, prev_link[LINK_UNK], kind]
      link = new_link if better_link?(new_link, link)
    end

    # The chosen link ends after character i and is stored at path[i + 1],
    # so a following unknown span must start at i + 1. Using i here would
    # make that span swallow the last character of the recognised word.
    left_boundary = i + 1 if link[LINK_KIND] != UNK
    path << link
  end

  path
end
latin?(ch)
click to toggle source
# File lib/thailang4r/word_breaker.rb, line 77
# Tells whether the codepoint +ch+ lies in the inclusive range CAP_A..CAP_Z
# or in the inclusive range A..Z.
#
# Returns true or false.
def latin?(ch)
  ch.between?(CAP_A, CAP_Z) || ch.between?(A, Z)
end
path_to_ranges(path)
click to toggle source
# File lib/thailang4r/word_breaker.rb, line 209
# Converts a path into the list of [start, stop] index pairs it describes.
#
# Starting from the last path entry, each link's LINK_P_IDX field is followed
# back to the previous entry until index 0 is reached. The pairs are returned
# in left-to-right order.
#
# Returns an Array of two-element Arrays; empty when the path has at most one
# entry.
def path_to_ranges(path)
  ranges = []
  stop = path.length - 1
  while stop > 0
    start = path[stop][LINK_P_IDX]
    # Walking backwards, so prepend to keep the result in text order.
    ranges.unshift([start, stop])
    stop = start
  end
  ranges
end
punc?(ch)
click to toggle source
# File lib/thailang4r/word_breaker.rb, line 109
# Tells whether the codepoint +ch+ counts as punctuation. The only value
# accepted is SPACE.
#
# Returns true or false.
def punc?(ch)
  ch == SPACE
end
ranges_to_toks(ranges, str)
click to toggle source
# File lib/thailang4r/word_breaker.rb, line 221
# Cuts +str+ into substrings according to +ranges+.
#
# ranges - Array of [start, stop] pairs; stop is exclusive.
# str    - the String to slice.
#
# Returns an Array with one substring per pair, in the same order.
def ranges_to_toks(ranges, str)
  ranges.map do |start, stop|
    str[start...stop]
  end
end
tokenize(dix, str)
click to toggle source
# File lib/thailang4r/word_breaker.rb, line 225
# Splits +str+ into tokens with the help of the dictionary +dix+.
#
# The work is done in three steps: build_path computes the path,
# path_to_ranges turns it into index pairs, and ranges_to_toks slices +str+
# with those pairs.
#
# Returns the Array produced by ranges_to_toks.
def tokenize(dix, str)
  path = build_path(dix, str)
  ranges = path_to_ranges(path)
  ranges_to_toks(ranges, str)
end
tokenize_with_delim(dix, str, delim)
click to toggle source
# File lib/thailang4r/word_breaker.rb, line 229
# Tokenizes +str+ with the dictionary +dix+ and joins the tokens with
# +delim+.
#
# Returns a single String.
def tokenize_with_delim(dix, str, delim)
  tokens = tokenize(dix, str)
  tokens.join(delim)
end
update_latin_transducer(transducer, ch, i, ch_vec)
click to toggle source
# File lib/thailang4r/word_breaker.rb, line 86
# Feeds the character at index +i+ to the Latin-run transducer, mutating
# +transducer+ in place.
#
# transducer - Array indexed by TRANSDUCER_S, TRANSDUCER_E and
#              TRANSDUCER_STATE.
# ch         - codepoint at index +i+.
# i          - index of +ch+ inside +ch_vec+.
# ch_vec     - all codepoints of the text; used to peek at the next one.
#
# While WAITING, a Latin codepoint records the run start and moves the state
# to ACTIVATED, or straight to COMPLETED (with the end set to i + 1) when no
# Latin codepoint follows. In any other state a Latin codepoint sets the end
# to i + 1 and the state to COMPLETED; a non-Latin one resets the state to
# WAITING.
def update_latin_transducer(transducer, ch, i, ch_vec)
  is_latin = latin?(ch)

  if transducer[TRANSDUCER_STATE] != WAITING
    transducer[TRANSDUCER_E] = i + 1 if is_latin
    transducer[TRANSDUCER_STATE] = is_latin ? COMPLETED : WAITING
  elsif is_latin
    transducer[TRANSDUCER_S] = i
    transducer[TRANSDUCER_STATE] = ACTIVATED
    # The length check comes first so ch_vec[i + 1] is never read past the end.
    run_ends_here = i + 1 == ch_vec.length || !latin?(ch_vec[i + 1])
    if run_ends_here
      transducer[TRANSDUCER_E] = i + 1
      transducer[TRANSDUCER_STATE] = COMPLETED
    end
  end
end
update_punc_transducer(transducer, ch, i, ch_vec)
click to toggle source
# File lib/thailang4r/word_breaker.rb, line 113
# Feeds the character at index +i+ to the punctuation-run transducer,
# mutating +transducer+ in place.
#
# transducer - Array indexed by TRANSDUCER_S, TRANSDUCER_E and
#              TRANSDUCER_STATE.
# ch         - codepoint at index +i+.
# i          - index of +ch+ inside +ch_vec+.
# ch_vec     - all codepoints of the text; used to peek at the next one.
#
# While WAITING, a codepoint accepted by punc? records the run start and
# moves the state to ACTIVATED, or straight to COMPLETED (with the end set to
# i + 1) when the next codepoint is not accepted or does not exist. In any
# other state an accepted codepoint sets the end to i + 1 and the state to
# COMPLETED; any other codepoint resets the state to WAITING.
def update_punc_transducer(transducer, ch, i, ch_vec)
  is_punc = punc?(ch)

  if transducer[TRANSDUCER_STATE] != WAITING
    transducer[TRANSDUCER_E] = i + 1 if is_punc
    transducer[TRANSDUCER_STATE] = is_punc ? COMPLETED : WAITING
  elsif is_punc
    transducer[TRANSDUCER_S] = i
    transducer[TRANSDUCER_STATE] = ACTIVATED
    # The length check comes first so ch_vec[i + 1] is never read past the end.
    run_ends_here = i + 1 == ch_vec.length || !punc?(ch_vec[i + 1])
    if run_ends_here
      transducer[TRANSDUCER_E] = i + 1
      transducer[TRANSDUCER_STATE] = COMPLETED
    end
  end
end