class Opener::Tokenizer

Primary tokenizer class that delegates the work to the various language specific tokenizers.

@!attribute [r] options

@return [Hash]

Constants

DEFAULT_LANGUAGE

The default language to use when no custom one is specified.

@return [String]

DEFAULT_OPTIONS

Hash containing the default options to use.

@return [Hash]

VERSION

Attributes

options[R]

Public Class Methods

new(options = {}) click to toggle source

@param [Hash] options

@option options [Array] :args Collection of arbitrary arguments to pass

to the individual tokenizer commands.

@option options [String] :language The language to use for the

tokenization process.

@option options [TrueClass|FalseClass] :kaf When set to `true` the input

is assumed to be KAF.
# File lib/opener/tokenizer.rb, line 52
# Builds the tokenizer's option set by layering the caller's options on top
# of DEFAULT_OPTIONS; keys given by the caller win over the defaults.
#
# @param [Hash] options
# @option options [Array] :args Collection of arbitrary arguments to pass
#  to the individual tokenizer commands.
# @option options [String] :language The language to use for the
#  tokenization process.
# @option options [TrueClass|FalseClass] :kaf When set to `true` the input
#  is assumed to be KAF.
def initialize(options = {})
  defaults = DEFAULT_OPTIONS

  # Hash#merge returns a new Hash, so the defaults themselves stay untouched.
  @options = defaults.merge(options)
end

Public Instance Methods

kaf_elements(input) click to toggle source

Returns an Array containing the language and input from a KAF document.

@param [String] input The KAF document.

@return [Array]

# File lib/opener/tokenizer.rb, line 93
# Extracts the language and the raw text from a KAF document.
#
# The language is read from the `xml:lang` attribute of the `KAF` node and
# the text from the content of the `raw` node.
#
# @param [String] input The KAF document.
# @return [Array] Two elements: the language followed by the raw text.
def kaf_elements(input)
  kaf = Nokogiri::XML(input)

  [kaf.at('KAF').attr('xml:lang'), kaf.at('raw').text]
end
run(input, params = {}) click to toggle source

Tokenizes the input and returns the results as a KAF document.

@param [String] input

@return [String]

# File lib/opener/tokenizer.rb, line 62
# Tokenizes the input by piping it through the language specific tokenizer
# command and returns whatever that command wrote to STDOUT.
#
# When the `:kaf` option is set the language and text are taken from the KAF
# input, otherwise the `:language` option is used and the input is passed on
# as-is.
#
# @param [String] input
# @param [Hash] params Accepted but not used by this method.
# @return [String]
# @raise [Core::UnsupportedLanguageError] When no tokenizer is available for
#  the language.
# @raise [RuntimeError] With the command's STDERR output as the message when
#  the command exits unsuccessfully.
def run(input, params = {})
  language, input =
    options[:kaf] ? kaf_elements(input) : [options[:language], input]

  raise Core::UnsupportedLanguageError, language unless valid_language?(language)

  tokenizer = language_constant(language).new(:args => options[:args])
  command   = tokenizer.command.split(" ")

  # The input is fed to the command over STDIN.
  output, errors, status = Open3.capture3(*command, :stdin_data => input)

  raise errors unless status.success?

  output
end
Also aliased as: tokenize
tokenize(input, params = {})
Alias for: run

Private Instance Methods

language_constant(language) click to toggle source

@param [String] language

@return [Class]

# File lib/opener/tokenizer.rb, line 107
# Returns the tokenizer class registered for the given language.
#
# The lookup is limited to constants defined on `Tokenizers` itself. With
# the default inherited lookup a module also searches `Object`, so a name
# that happens to match a top-level constant (for example `IO`) would return
# an unrelated class instead of a tokenizer.
#
# @param [String] language
# @return [Class]
# @raise [NameError] When `Tokenizers` has no constant for the language.
def language_constant(language)
  name = Core::LanguageCode.constant_name(language)

  Tokenizers.const_get(name, false)
end
valid_language?(language) click to toggle source

@return [TrueClass|FalseClass]

# File lib/opener/tokenizer.rb, line 116
# Reports whether a tokenizer class exists for the given language.
#
# Only constants defined on `Tokenizers` itself count. With the default
# inherited lookup a module also checks `Object`, which would report any
# name matching a top-level constant (for example `IO`) as a supported
# language.
#
# @param [String] language
# @return [TrueClass|FalseClass]
def valid_language?(language)
  name = Core::LanguageCode.constant_name(language)

  Tokenizers.const_defined?(name, false)
end