module Greeb::Tokenizer

Greeb's tokenization facilities. Use 'em with love.

Unicode character categories have been obtained from <www.fileformat.info/info/unicode/category/index.htm>.

Constants

BREAKS

Line breaks.

FLOATS

Floating point values.

INTEGERS

Integer values.

LETTERS

English and Russian letters.

PUNCTUATIONS

Punctuation character (i.e.: “.” or “!”).

RESIDUALS

Residuals.

SENTENCE_PUNCTUATIONS

In-sentence punctuation character (i.e.: “,” or “-”).

SEPARATORS

In-subsentence separator (e.g. “*” or “=”).

SPACES

Spaces (e.g. “ ” or `&nbsp;`).

Public Instance Methods

split(token) click to toggle source

Split one line into characters array, but also combine duplicated characters.

For instance, `"a bnnnc"` would be transformed into the following array: `["a", " ", "b", "nnn", "c"]`.

@param token [String] a token to be split.

@return [Array<String>] the characters of the token, with consecutive duplicates combined.

# File lib/greeb/tokenizer.rb, line 81
# Break a string into runs of consecutive identical characters.
#
# Each element of the result is either a single character or a repetition
# of one character, in the order the runs appear in +token+.
#
# @param token [String] the string to break up.
#
# @return [Array<String>] the runs, e.g. "a bnnnc" gives
#   ["a", " ", "b", "nnn", "c"].
def split(token)
  runs = []
  # Group 1 is the whole run; group 2 is the single repeated character
  # (any character, including a newline, which "." alone would not match).
  token.scan(/((.|\n)\2*)/) { |run, _char| runs << run }
  runs
end
tokenize(text) click to toggle source

Perform the tokenization process.

@param text [String] a text to be tokenized.

@return [Array<Greeb::Span>] a set of tokens.

# File lib/greeb/tokenizer.rb, line 51
# Perform the tokenization process.
#
# While input remains, the token categories are tried in a fixed priority
# order (letters, floats, integers, in-sentence punctuation, punctuation,
# separators, spaces, line breaks, residuals). The first one that matches
# consumes input and records its span(s) in the result.
#
# @param text [String] a text to be tokenized.
#
# @return [Array<Greeb::Span>] a set of tokens.
#
# @raise [Greeb::UnknownSpan] if none of the categories matches at the
#   current scanner position.
def tokenize text
  scanner = Greeb::StringScanner.new(text)
  tokens = []
  until scanner.eos?
    # `or` short-circuits: each helper returns false when its pattern does
    # not match, so the next category is tried only after a failure.
    parse! scanner, tokens, LETTERS, :letter or
    parse! scanner, tokens, FLOATS, :float or
    parse! scanner, tokens, INTEGERS, :integer or
    split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
    split_parse! scanner, tokens, PUNCTUATIONS, :punct or
    split_parse! scanner, tokens, SEPARATORS, :separ or
    split_parse! scanner, tokens, SPACES, :space or
    split_parse! scanner, tokens, BREAKS, :break or
    parse! scanner, tokens, RESIDUALS, :residual or
    raise Greeb::UnknownSpan.new(text, scanner.char_pos)
  end
  tokens
ensure
  # `scanner` is still nil if the constructor itself raised; guarding here
  # keeps that original exception from being masked by a NoMethodError.
  scanner.terminate if scanner
end

Protected Instance Methods

parse!(scanner, tokens, pattern, type) click to toggle source

Try to parse one small piece of text that is covered by pattern of necessary type.

@param scanner [Greeb::StringScanner] string scanner.

@param tokens [Array<Greeb::Span>] result array.

@param pattern [Regexp] a regular expression to extract the token.

@param type [Symbol] a symbol that represents the necessary token type.

@return [Array<Greeb::Span>, false] the modified set of extracted tokens, or `false` if the pattern does not match at the current scanner position.

# File lib/greeb/tokenizer.rb, line 97
# Try to consume one token of the given type at the scanner's current
# position and record it as a single span.
#
# @param scanner [Greeb::StringScanner] string scanner.
# @param tokens [Array<Greeb::Span>] result array; the new span is
#   appended to it.
# @param pattern [Regexp] a regular expression to extract the token.
# @param type [Symbol] the type to assign to the extracted span.
#
# @return [Array<Greeb::Span>, false] +tokens+ after the append, or false
#   when +pattern+ does not match (nothing is appended in that case).
def parse! scanner, tokens, pattern, type
  matched = scanner.scan(pattern)
  return false unless matched

  # The scanner now sits just past the match, so the span starts one
  # match-length before the current position.
  finish = scanner.char_pos
  start = finish - matched.length
  tokens << Greeb::Span.new(start, finish, type)
end
split_parse!(scanner, tokens, pattern, type) click to toggle source

Try to parse one small piece of text that is covered by pattern of necessary type. This method performs grouping of the same characters.

@param scanner [Greeb::StringScanner] string scanner.

@param tokens [Array<Greeb::Span>] result array.

@param pattern [Regexp] a regular expression to extract the token.

@param type [Symbol] a symbol that represents the necessary token type.

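@return [Integer, false] the offset at which the last extracted span ends, or `false` if the pattern does not match at the current scanner position. The extracted spans are appended to `tokens`.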

# File lib/greeb/tokenizer.rb, line 117
# Try to consume a match of the given type at the scanner's current
# position and record it as one span per run of identical characters
# (as produced by +split+).
#
# @param scanner [Greeb::StringScanner] string scanner.
# @param tokens [Array<Greeb::Span>] result array; one span per run is
#   appended to it.
# @param pattern [Regexp] a regular expression to extract the token.
# @param type [Symbol] the type to assign to every extracted span.
#
# @return [Integer, false] the offset at which the last span ends, or
#   false when +pattern+ does not match (nothing is appended then).
def split_parse! scanner, tokens, pattern, type
  matched = scanner.scan(pattern)
  return false unless matched

  # Start of the match: the scanner has already advanced past it.
  offset = scanner.char_pos - matched.length
  split(matched).each do |run|
    finish = offset + run.length
    tokens << Greeb::Span.new(offset, finish, type)
    offset = finish
  end
  offset
end