class ThaiLang::WordBreaker

Constants

A
ACTIVATED
CAP_A
CAP_Z
COMPLETED
DICT
DIX_PTR_IS_FINAL
DIX_PTR_ROW_NO
DIX_PTR_S
INIT
LATIN
PUNC
RANGE_E
RANGE_S
SPACE
TRANSDUCER_E
TRANSDUCER_KIND
TRANSDUCER_S
TRANSDUCER_STATE
UNK
WAITING
Z

Public Class Methods

new(dix_path = nil) click to toggle source
# File lib/thailang4r/word_breaker.rb, line 44
# Loads the word list used for segmentation.
#
# dix_path:: path to a dictionary file with one word per line; when nil (or
#            false) the bundled data/tdict-std.txt, resolved relative to this
#            source file, is used.
#
# Each line is chomped and paired with the value 1 before being handed to
# PrefixTree.new.
def initialize(dix_path = nil)
  dix_path ||= File.expand_path('../../../data/tdict-std.txt', __FILE__)
  # File.foreach closes the file once iteration finishes; the previous
  # File.open(...).each_line left the handle open until garbage collection.
  entries = File.foreach(dix_path).map { |line| [line.chomp, 1] }
  @dix = PrefixTree.new(entries)
end

Public Instance Methods

break_into_words(text) click to toggle source
# File lib/thailang4r/word_breaker.rb, line 49
# Splits +text+ into tokens using the dictionary loaded by the constructor.
# Returns whatever #tokenize returns for (@dix, text).
def break_into_words(text)
  dictionary = @dix
  tokenize(dictionary, text)
end
build_path(dix, s) click to toggle source
# File lib/thailang4r/word_breaker.rb, line 137
# Builds the segmentation path for +s+.
#
# dix:: dictionary object responding to lookup(row_no, offset, ch); it must
#       return nil when no child exists, otherwise a node pointer indexed by
#       NODE_PTR_ROW_NO / NODE_PTR_IS_FINAL.
# s::   the string to segment.
#
# Returns an Array with s.codepoints.length + 1 links. path[0] is the INIT
# link; path[k] is the chosen link for the token ending just before
# character index k. A link is a 4-element array
# [start index, LINK_W slot, LINK_UNK slot, kind]; the LINK_W slot grows by
# one per token and the LINK_UNK slot by one per unknown link (presumably
# "word count" and "unknown count" -- confirm against better_link?).
def build_path(dix, s)
  left_boundary = 0
  ch_vec = s.codepoints
  path = [[0, 0, 0, INIT]]
  dix_ptrs = []
  latin_transducer = [0, 0, WAITING, LATIN]
  punc_transducer = [0, 0, WAITING, PUNC]

  # Link of the given kind starting at +start+, extending the path that
  # ends at +start+ by one token without adding to the LINK_UNK slot.
  known_link = lambda do |start, kind|
    prev_link = path[start]
    [start, prev_link[LINK_W] + 1, prev_link[LINK_UNK], kind]
  end

  ch_vec.each_with_index do |ch, i|
    # Every position may start a dictionary word: row 0, not final yet.
    dix_ptrs << [i, 0, false]

    # Fallback candidate: an unknown token running from left_boundary
    # through the current character.
    unk_link = path[left_boundary]
    link = [left_boundary, unk_link[LINK_W] + 1, unk_link[LINK_UNK] + 1, UNK]

    # Advance every live dictionary pointer by +ch+; pointers with no
    # matching child are dropped by swapping in the last element.
    j = 0
    while j < dix_ptrs.length
      dix_ptr = dix_ptrs[j]
      offset = i - dix_ptr[DIX_PTR_S]
      child = dix.lookup(dix_ptr[DIX_PTR_ROW_NO], offset, ch)
      if child
        dix_ptrs[j] = [dix_ptr[DIX_PTR_S], child[NODE_PTR_ROW_NO], child[NODE_PTR_IS_FINAL]]
        j += 1
      else
        last_ptr = dix_ptrs.pop
        # When the dropped pointer was not the last one, the former last
        # element takes its slot and is examined on the next pass.
        dix_ptrs[j] = last_ptr if j < dix_ptrs.length
      end
    end

    update_latin_transducer(latin_transducer, ch, i, ch_vec)
    update_punc_transducer(punc_transducer, ch, i, ch_vec)

    # Dictionary words that end at this character.
    dix_ptrs.each do |dix_ptr|
      next unless dix_ptr[DIX_PTR_IS_FINAL]

      new_link = known_link.call(dix_ptr[DIX_PTR_S], DICT)
      link = new_link if better_link?(new_link, link)
    end

    # Latin / punctuation runs reported as completed at this character.
    [[latin_transducer, LATIN], [punc_transducer, PUNC]].each do |transducer, kind|
      next unless transducer[TRANSDUCER_STATE] == COMPLETED

      new_link = known_link.call(transducer[TRANSDUCER_S], kind)
      link = new_link if better_link?(new_link, link)
    end

    # The chosen link ends at position i + 1 (it is stored at path[i + 1]),
    # so a following unknown run has to start there. Using i here made the
    # unknown run overlap the last character of the known token.
    left_boundary = i + 1 if link[LINK_KIND] != UNK
    path << link
  end
  path
end
latin?(ch) click to toggle source
# File lib/thailang4r/word_breaker.rb, line 77
# True when +ch+ lies in the inclusive range CAP_A..CAP_Z or A..Z.
def latin?(ch)
  ch.between?(CAP_A, CAP_Z) || ch.between?(A, Z)
end
path_to_ranges(path) click to toggle source
# File lib/thailang4r/word_breaker.rb, line 209
# Walks +path+ backwards from its last entry, following each link's
# LINK_P_IDX slot, and returns the visited [start, stop] pairs in
# left-to-right order.
def path_to_ranges(path)
  spans = []
  stop = path.length - 1
  until stop <= 0
    start = path[stop][LINK_P_IDX]
    # Walking right-to-left, so prepend to keep the result in text order.
    spans.unshift([start, stop])
    stop = start
  end
  spans
end
punc?(ch) click to toggle source
# File lib/thailang4r/word_breaker.rb, line 109
# True when +ch+ equals SPACE; no other value is recognised here.
def punc?(ch)
  SPACE == ch
end
ranges_to_toks(ranges, str) click to toggle source
# File lib/thailang4r/word_breaker.rb, line 221
# Cuts +str+ into substrings, one per [start, stop] pair in +ranges+
# (stop is exclusive), and returns them in the same order.
def ranges_to_toks(ranges, str)
  ranges.each_with_object([]) do |(first, last), toks|
    toks << str.slice(first...last)
  end
end
tokenize(dix, str) click to toggle source
# File lib/thailang4r/word_breaker.rb, line 225
# Segments +str+ with +dix+: builds the path, converts it to index ranges,
# then slices +str+ by those ranges.
def tokenize(dix, str)
  path = build_path(dix, str)
  ranges = path_to_ranges(path)
  ranges_to_toks(ranges, str)
end
tokenize_with_delim(dix, str, delim) click to toggle source
# File lib/thailang4r/word_breaker.rb, line 229
# Tokenizes +str+ with +dix+ and joins the tokens with +delim+.
def tokenize_with_delim(dix, str, delim)
  tokens = tokenize(dix, str)
  tokens.join(delim)
end
update_latin_transducer(transducer, ch, i, ch_vec) click to toggle source
# File lib/thailang4r/word_breaker.rb, line 86
# Advances the Latin-run transducer by one character, mutating +transducer+
# in place.
#
# transducer:: array indexed by TRANSDUCER_S / TRANSDUCER_E / TRANSDUCER_STATE.
# ch::         current code point.
# i::          index of +ch+ in +ch_vec+.
# ch_vec::     all code points of the text (used for one-character lookahead).
#
# From WAITING, a Latin character records the run start and activates the
# transducer; it completes immediately when the run cannot continue. From
# any other state, a Latin character moves the end to i + 1 and marks the
# run COMPLETED, while a non-Latin character resets to WAITING.
def update_latin_transducer(transducer, ch, i, ch_vec)
  idle = transducer[TRANSDUCER_STATE] == WAITING
  matched = latin?(ch)

  if !idle && matched
    transducer[TRANSDUCER_E] = i + 1
    transducer[TRANSDUCER_STATE] = COMPLETED
  elsif !idle
    transducer[TRANSDUCER_STATE] = WAITING
  elsif matched
    transducer[TRANSDUCER_S] = i
    transducer[TRANSDUCER_STATE] = ACTIVATED
    # Last character of the text, or the next one is not Latin.
    if i + 1 == ch_vec.length || !latin?(ch_vec[i + 1])
      transducer[TRANSDUCER_E] = i + 1
      transducer[TRANSDUCER_STATE] = COMPLETED
    end
  end
end
update_punc_transducer(transducer, ch, i, ch_vec) click to toggle source
# File lib/thailang4r/word_breaker.rb, line 113
# Advances the punctuation-run transducer by one character, mutating
# +transducer+ in place.
#
# transducer:: array indexed by TRANSDUCER_S / TRANSDUCER_E / TRANSDUCER_STATE.
# ch::         current code point.
# i::          index of +ch+ in +ch_vec+.
# ch_vec::     all code points of the text (used for one-character lookahead).
#
# From WAITING, a punctuation character records the run start and activates
# the transducer; it completes immediately when the run cannot continue.
# From any other state, a punctuation character moves the end to i + 1 and
# marks the run COMPLETED, while any other character resets to WAITING.
def update_punc_transducer(transducer, ch, i, ch_vec)
  idle = transducer[TRANSDUCER_STATE] == WAITING
  matched = punc?(ch)

  if !idle && matched
    transducer[TRANSDUCER_E] = i + 1
    transducer[TRANSDUCER_STATE] = COMPLETED
  elsif !idle
    transducer[TRANSDUCER_STATE] = WAITING
  elsif matched
    transducer[TRANSDUCER_S] = i
    transducer[TRANSDUCER_STATE] = ACTIVATED
    # Last character of the text, or the next one is not punctuation.
    if i + 1 == ch_vec.length || !punc?(ch_vec[i + 1])
      transducer[TRANSDUCER_E] = i + 1
      transducer[TRANSDUCER_STATE] = COMPLETED
    end
  end
end