class Analects::Tokenizer

Constants

ALGO

# Segmenter class that #tokenize instantiates once per input string.
ALGO = RMMSeg::Algorithm

Public Class Methods

new(chars_dic = '/tmp/chars.dic', words_dic = '/tmp/words.dic')
# File lib/analects/tokenizer.rb, line 6
# Points RMMSeg at the character and word dictionaries, generating both
# files from CEDICT first when either one is missing.
#
# @param chars_dic [String] path of the character dictionary file
# @param words_dic [String] path of the word dictionary file
def initialize(chars_dic = '/tmp/chars.dic', words_dic = '/tmp/words.dic')
  dictionaries_present = [chars_dic, words_dic].all? { |path| File.exist?(path) }
  create_dict_from_cedict(chars_dic, words_dic) unless dictionaries_present

  # NOTE(review): the boolean in each pair presumably tells RMMSeg whether the
  # file carries per-entry counts (chars do, words do not) -- confirm against
  # the RMMSeg::Config documentation.
  RMMSeg::Config.dictionaries = [
    [chars_dic, true],
    [words_dic, false]
  ]
end

Public Instance Methods

call( str )
Alias for: tokenize
cedict( fn = '/tmp/cedict.json' )
# File lib/analects/tokenizer.rb, line 18
# Returns the CEDICT entries, loading them at most once per instance.
#
# On first use the entries are parsed from the JSON cache file at +fn+. When
# that file does not exist it is created first: the library's CEDICT source is
# retrieved and its entries are written to +fn+ as JSON. Later calls return the
# memoized result without touching the filesystem or the library again, so
# +fn+ only matters on the first call.
#
# @param fn [String] path of the JSON cache file
# @return [Object] the parsed JSON content of +fn+ (memoized in @cedict)
def cedict( fn = '/tmp/cedict.json' )
  return @cedict if @cedict

  require 'json' # loaded lazily, only when the dictionary is actually needed
  unless File.exist?( fn )
    library.cedict.retrieve
    File.write( fn, library.cedict.to_a.to_json )
  end
  # File.read instead of IO.read: IO.read treats a path that starts with "|"
  # as a command to execute.
  @cedict = JSON.parse File.read( fn )
end
create_dict_from_cedict(chars_dic, words_dic)
# File lib/analects/tokenizer.rb, line 27
# Builds the two dictionary files from the entries returned by #cedict.
#
# The word file receives every distinct value found at index 0 and index 1 of
# each entry, sorted and newline-separated (no trailing newline). The character
# file receives one "<char> <count>" line per character, where count is the
# number of occurrences of that character across both fields of all entries.
# NOTE(review): index 0 / index 1 are presumably the traditional and simplified
# forms of a CEDICT entry -- confirm against Analects::Library.
#
# @param chars_dic [String] path the character/count file is written to
# @param words_dic [String] path the word list is written to
def create_dict_from_cedict(chars_dic, words_dic)
  vocabulary = Set.new
  char_counts = Hash.new(0)

  cedict.each do |entry|
    first = entry[0]
    second = entry[1]
    vocabulary << first << second
    (first + second).each_char { |ch| char_counts[ch] += 1 }
  end

  word_lines = vocabulary.sort.join("\n")
  char_lines = char_counts.map { |ch, count| format("%s %d\n", ch, count) }.join

  File.write(words_dic, word_lines)
  File.write(chars_dic, char_lines)
end
library()
# File lib/analects/tokenizer.rb, line 14
# Returns the Analects::Library instance for this tokenizer, creating it on
# first use and reusing it afterwards.
#
# @return [Analects::Library]
def library
  return @library if @library

  @library = Analects::Library.new
end
tokenize( str )
# File lib/analects/tokenizer.rb, line 43
# Splits +str+ into tokens using ALGO.
#
# A fresh ALGO instance is created for the string and drained by calling
# next_token until it returns nil. Each token's text is re-tagged as UTF-8 in
# place (force_encoding changes the encoding label, not the bytes).
#
# @param str [String] the text to segment
# @return [Array<String>] token texts, in the order ALGO produced them
def tokenize( str )
  segmenter = ALGO.new( str )
  tokens = []

  loop do
    token = segmenter.next_token
    break if token.nil?

    tokens << token.text.force_encoding('UTF-8')
  end

  tokens
end
Also aliased as: call