class TwitterCldr::Tokenizers::Tokenizer
Attributes
custom_splitter[R]
recognizers[R]
remove_empty_entries[R]
Public Class Methods
new(recognizers, splitter = nil, remove_empty_entries = true)
click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 59
#
# Builds a tokenizer from an ordered list of recognizers.
#
# recognizers          - recognizer objects, stored as given in @recognizers.
# splitter             - optional splitter stored in @custom_splitter
#                        (defaults to nil).
# remove_empty_entries - flag stored in @remove_empty_entries
#                        (defaults to true).
def initialize(recognizers, splitter = nil, remove_empty_entries = true)
  @recognizers, @custom_splitter, @remove_empty_entries =
    recognizers, splitter, remove_empty_entries
end
union(*tokenizers) { |recognizer| ... }
click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 38
#
# Combines several tokenizers into a new one.
#
# The recognizers of every tokenizer are concatenated in argument order.
# When a block is given, only recognizers for which the block returns a
# truthy value are kept; without a block all of them are kept.
#
# A combined splitter is built only when every tokenizer has a custom
# splitter: the sources of those splitters are joined with "|" and compiled
# into a single Regexp. Otherwise nil is passed as the splitter.
#
# Returns the result of new(recognizers, splitter).
def self.union(*tokenizers)
  merged_recognizers = tokenizers.flat_map do |tokenizer|
    tokenizer.recognizers.select do |recognizer|
      !block_given? || yield(recognizer)
    end
  end

  merged_splitter =
    if tokenizers.all?(&:custom_splitter)
      alternation = tokenizers.map { |tokenizer| tokenizer.custom_splitter.source }.join("|")
      Regexp.compile(alternation)
    end

  new(merged_recognizers, merged_splitter)
end
Public Instance Methods
insert_before(token_type, *new_recognizers)
click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 69
#
# Inserts new_recognizers, in order, immediately before the first recognizer
# whose token_type equals the given token_type. The recognizer list is
# modified in place and the memoized splitter is cleared via clear_splitter.
#
# token_type      - token type of the existing recognizer to insert before.
# new_recognizers - zero or more recognizers to insert.
#
# Returns nil.
# Raises ArgumentError if no recognizer has the given token type.
def insert_before(token_type, *new_recognizers)
  idx = recognizers.find_index { |rec| rec.token_type == token_type }

  # Without this guard Array#insert(nil, ...) fails with an opaque
  # "no implicit conversion from nil to integer" TypeError.
  raise ArgumentError, "no recognizer with token type #{token_type.inspect}" if idx.nil?

  recognizers.insert(idx, *new_recognizers)
  clear_splitter
  nil
end
recognizer_at(token_type)
click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 65
#
# Looks up a recognizer by token type.
#
# Returns the first recognizer whose token_type equals the argument, or nil
# when none matches.
def recognizer_at(token_type)
  recognizers.find do |candidate|
    candidate.token_type == token_type
  end
end
tokenize(text)
click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 76
#
# Splits text with the splitter and turns each piece into a token.
#
# For every piece, the first recognizer whose recognizes? returns truthy is
# used; pieces no recognizer accepts are dropped. A recognizer of type
# :composite has its content pattern matched against the piece, and capture
# group 1 is tokenized recursively and wrapped in a CompositeToken. Any other
# recognizer cleans the piece and produces a Token, unless
# remove_empty_entries is set and the cleaned text has size 0.
#
# Returns an Array of Token / CompositeToken objects.
def tokenize(text)
  text.split(splitter).each_with_object([]) do |token_text, tokens|
    matched = recognizers.find { |candidate| candidate.recognizes?(token_text) }
    next unless matched

    if matched.token_type == :composite
      inner_text = token_text.match(matched.content)[1]
      tokens << CompositeToken.new(tokenize(inner_text))
      next
    end

    cleaned_text = matched.clean(token_text)
    keep = !remove_empty_entries || cleaned_text.size > 0
    next unless keep

    tokens << Token.new(value: cleaned_text, type: matched.token_type)
  end
end
Private Instance Methods
clear_splitter()
click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 111
#
# Discards the memoized splitter by resetting @splitter to nil.
#
# Returns nil.
def clear_splitter
  @splitter = nil
end
splitter()
click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 104
#
# Returns the Regexp used to split text, memoized in @splitter.
#
# The custom splitter is used when one is set. Otherwise the regex sources
# of all recognizers are joined with "|" and wrapped in a single capture
# group.
def splitter
  return @splitter if @splitter

  @splitter = @custom_splitter || begin
    alternation = recognizers.map { |rec| rec.regex.source }.join("|")
    Regexp.new("(#{alternation})")
  end
end