class StTools::Tokenizer
Attributes
input[R]
result[R]
Public Class Methods
new(opts ={})
click to toggle source
# File lib/st_tools/tokenizer.rb, line 9
# Creates a tokenizer.
#
# Loads the abbreviation dictionary selected by opts[:socr] (nil when the
# key is absent) and then resets the tokenizer state with an empty input.
#
# @param opts [Hash] options; only the :socr key is read here
def initialize(opts = {})
  socr = opts[:socr]
  load_socr(socr)
  init_class('')
end
Private Instance Methods
init_class(input)
click to toggle source
# File lib/st_tools/tokenizer.rb, line 93
# Tokenizes +input+ and stores the tokens in @result.
#
# Steps: normalize the text via StTools::String.normalize, remove the
# whitespace around "-" and "/", split on whitespace and the characters
# , . _ ( ), run the pieces through socr_processing, and finally drop
# empty strings.
#
# @param input [String] raw text; also remembered in @input
# @return [Array] the token list assigned to @result
def init_class(input)
  @input = input
  @tokens = []

  text = StTools::String.normalize(input)
  # Glue hyphen- and slash-separated parts together by removing the
  # whitespace on either side of the separator.
  text = text.gsub(/\-\s{1,100}/, '-')
  text = text.gsub(/\s{1,100}\-/, '-')
  text = text.gsub(/\/\s{1,100}/, '/')
  text = text.gsub(/\s{1,100}\//, '/')

  pieces = socr_processing(text.split(/[\s\,\.\_\(\)]/))
  pieces.delete_if { |piece| piece == '' }
  @result = pieces
end
load_socr(socr)
click to toggle source
# File lib/st_tools/tokenizer.rb, line 23
# Installs the abbreviation dictionary into the class variable @@socr.
#
# * nil      - clears the dictionary and returns nil.
# * a Symbol - loads "<this dir>/../socr/<downcased name>.yml" with YAML.
# * other    - the value itself is used as the dictionary; it must
#              support #map yielding key/value pairs.
#
# Top-level keys are converted to symbols before validation.
#
# @param socr [Symbol, Hash, nil] dictionary name, dictionary, or nil
# @return [Hash, nil] the installed dictionary
# @raise [RuntimeError] when the :startend or :synonyms section is missing
def load_socr(socr)
  return @@socr = nil if socr.nil?

  source =
    if socr.class == Symbol
      path = __dir__ + "/../socr/#{socr.to_s.downcase}.yml"
      ::YAML.load_file(path)
    else
      socr
    end

  dict = Hash[source.map { |key, value| [key.to_sym, value] }]

  # Validate the abbreviation list: both sections are mandatory.
  raise "ОШИБКА: словарь сокращений не имеет секции :startend" if dict[:startend].nil?
  raise "ОШИБКА: словарь сокращений не имеет секции :synonyms" if dict[:synonyms].nil?

  @@socr = dict
end
processing(input)
click to toggle source
# File lib/st_tools/tokenizer.rb, line 15
# Tokenizes +input+ by re-initializing the tokenizer state with it.
#
# @param input [String] text to tokenize
# @return [Array] whatever init_class returns (the token list it stores
#   in @result)
def processing(input)
  init_class(input)
end
socr_one_processing(word)
click to toggle source
# File lib/st_tools/tokenizer.rb, line 59
# Resolves one compound word (containing "-" or "/") against the
# :startend section of the abbreviation dictionary.
#
# The word is split on "-" and "/"; socr_part_processing is then called
# repeatedly on the remaining parts. Each call removes the parts it
# handled from the front of the list and returns either a dictionary
# entry or a single part.
#
# The loop is bounded by the number of parts and stops as soon as all of
# them are consumed, so the result is not padded with the empty strings
# the helper returns for an exhausted list.
#
# @param word [String] a token containing "-" or "/"
# @return [Array<String>] resolved pieces, in order
def socr_one_processing(word)
  parts = word.split(/[\-\/]/)
  voc = @@socr[:startend]
  out = []

  # At most parts.size calls are needed: the helper consumes at least one
  # part whenever the list is non-empty.
  parts.size.times do
    break if parts.empty?

    out << socr_part_processing(voc, parts)
  end

  out
end
socr_part_processing(voc, words)
click to toggle source
# File lib/st_tools/tokenizer.rb, line 43
# Consumes the leading part(s) of +words+ and returns the piece they
# resolve to. +words+ is mutated: handled parts are shifted off the front.
#
# * empty list      - returns '' and leaves the list untouched.
# * exactly one part - that part is removed and returned.
# * two or more     - the first dictionary entry that starts with the
#                     first part and ends with the second part is
#                     returned and both parts are removed; when no entry
#                     fits, only the first part is removed and returned.
#
# Parts are compared literally (start_with?/end_with?), so characters
# such as "[", "+" or "." in the input are never interpreted as regular
# expression syntax. An empty part is a prefix/suffix of every entry and
# therefore matches any entry.
#
# @param voc [Array<String>] candidate full forms (the :startend list)
# @param words [Array<String>] remaining parts of a compound word
# @return [String] a dictionary entry, a single part, or ''
def socr_part_processing(voc, words)
  head = words[0]
  return '' if head.nil?

  tail = words[1]
  return words.shift if tail.nil?

  entry = voc.find { |etalon| etalon.start_with?(head) && etalon.end_with?(tail) }
  return words.shift if entry.nil?

  words.shift(2)
  entry
end
socr_processing(arr)
click to toggle source
# File lib/st_tools/tokenizer.rb, line 76
# Applies the abbreviation dictionary to a list of raw tokens.
#
# Without a dictionary (@@socr is nil) the tokens are joined with spaces,
# every "-" becomes a space, and the text is split on whitespace again.
# With a dictionary, tokens containing "-" or "/" are expanded through
# socr_one_processing (which may yield several pieces), and every other
# token is passed through socr_synonyms_processing.
#
# @param arr [Array<String>] raw tokens
# @return [Array] processed tokens
def socr_processing(arr)
  return arr.join(' ').gsub(/\-/, ' ').split(/\s/) if @@socr.nil?

  arr.flat_map do |word|
    if word.match(/[\-\/]/)
      socr_one_processing(word)
    else
      # Wrapped so that flat_map keeps the looked-up value as one element.
      [socr_synonyms_processing(word)]
    end
  end
end
socr_synonyms_processing(word)
click to toggle source
# File lib/st_tools/tokenizer.rb, line 72
# Looks +word+ up in the :synonyms section of the abbreviation
# dictionary.
#
# @param word [String] token to look up
# @return [Object] the dictionary value when it is truthy, otherwise
#   +word+ itself
def socr_synonyms_processing(word)
  replacement = @@socr[:synonyms][word]
  replacement ? replacement : word
end