class TwitterCldr::Tokenizers::Tokenizer

Attributes

custom_splitter[R]
recognizers[R]
remove_empty_entries[R]

Public Class Methods

new(recognizers, splitter = nil, remove_empty_entries = true) click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 59
# Sets up a tokenizer.
#
# @param recognizers the recognizers this tokenizer consults; the object is
#   stored as given (not copied)
# @param splitter optional custom splitter; kept as +custom_splitter+
# @param remove_empty_entries flag kept as +remove_empty_entries+
#   (defaults to true)
def initialize(recognizers, splitter = nil, remove_empty_entries = true)
  @remove_empty_entries = remove_empty_entries
  @custom_splitter = splitter
  @recognizers = recognizers
end
union(*tokenizers) { |recognizer| ... } click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 38
# Merges several tokenizers into one new tokenizer.
#
# Recognizers are collected from each tokenizer in argument order. When a
# block is given, only recognizers for which the block returns a truthy value
# are kept; without a block every recognizer is kept.
#
# A combined splitter is built only when every tokenizer has a custom
# splitter. It is the "|" alternation of the splitters' pattern sources, so
# only the sources are carried over (flags set on the individual splitters
# are not). Otherwise nil is passed as the splitter.
#
# @param tokenizers the tokenizers to merge
# @yield [recognizer] optional filter deciding which recognizers to keep
# @return the result of +new+ called with the merged recognizers and splitter
def self.union(*tokenizers)
  filtering = block_given?

  merged = tokenizers.flat_map do |tokenizer|
    tokenizer.recognizers.select do |recognizer|
      !filtering || yield(recognizer)
    end
  end

  combined = nil
  if tokenizers.all?(&:custom_splitter)
    sources = tokenizers.map { |tokenizer| tokenizer.custom_splitter.source }
    combined = Regexp.compile(sources.join("|"))
  end

  new(merged, combined)
end

Public Instance Methods

insert_before(token_type, *new_recognizers) click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 69
# Inserts the given recognizers immediately before the first recognizer whose
# token type equals +token_type+, mutating the recognizer list in place.
#
# @param token_type the token type of the recognizer to insert before
# @param new_recognizers the recognizers to insert, in order
# @return [nil]
# @raise [ArgumentError] if no existing recognizer has the given token type
def insert_before(token_type, *new_recognizers)
  idx = recognizers.find_index { |rec| rec.token_type == token_type }

  # Without this guard Array#insert would be handed nil and fail with an
  # unhelpful TypeError; fail early with a message naming the missing type.
  unless idx
    raise ArgumentError, "no recognizer with token type #{token_type.inspect}"
  end

  recognizers.insert(idx, *new_recognizers)
  # The recognizer list changed, so discard the memoized splitter.
  clear_splitter
  nil
end
recognizer_at(token_type) click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 65
# Looks up the first recognizer whose token type equals +token_type+.
#
# @param token_type the token type to search for
# @return the matching recognizer, or nil when none matches
def recognizer_at(token_type)
  recognizers.find do |candidate|
    candidate.token_type == token_type
  end
end
tokenize(text) click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 76
# Splits +text+ with the tokenizer's splitter and converts each piece into a
# token using the first recognizer that recognizes it.
#
# * Pieces that no recognizer recognizes are skipped.
# * For a recognizer of type :composite, the first capture group of the
#   recognizer's +content+ pattern is tokenized recursively and wrapped in a
#   CompositeToken.
# * Otherwise the piece is cleaned by the recognizer and wrapped in a Token;
#   when +remove_empty_entries+ is truthy, pieces whose cleaned text has size
#   zero are dropped.
#
# @param text the text to tokenize
# @return [Array] the resulting tokens, in order of appearance
def tokenize(text)
  tokens = []

  text.split(splitter).each do |piece|
    matcher = recognizers.find { |candidate| candidate.recognizes?(piece) }
    next unless matcher

    if matcher.token_type == :composite
      inner = piece.match(matcher.content)[1]
      tokens << CompositeToken.new(tokenize(inner))
      next
    end

    cleaned = matcher.clean(piece)
    keep = !remove_empty_entries || cleaned.size > 0
    tokens << Token.new(value: cleaned, type: matcher.token_type) if keep
  end

  tokens
end

Private Instance Methods

clear_splitter() click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 111
# Resets the memoized @splitter to nil.
def clear_splitter
  @splitter = nil
end
splitter() click to toggle source
# File lib/twitter_cldr/tokenizers/tokenizer.rb, line 104
# Returns the regex used to split text, memoizing it in @splitter.
#
# The custom splitter is used when one is set. Otherwise a regex is built
# from the "|" alternation of every recognizer's pattern source, wrapped in a
# single capture group.
#
# @return the memoized splitter
def splitter
  return @splitter if @splitter

  @splitter = @custom_splitter || begin
    alternatives = recognizers.map { |rec| rec.regex.source }.join("|")
    Regexp.new("(#{alternatives})")
  end
end