class Lurn::Text::WordTokenizer

Constants

STOP_WORDS

Attributes

options[RW]

Public Class Methods

new(options = {})
# File lib/lurn/text/word_tokenizer.rb, line 19
# Builds a tokenizer, filling in a default for every option left unset.
#
# The supplied hash is copied before defaults are applied, so the caller's
# hash is never modified and a frozen hash is accepted.
#
# @param options [Hash] tokenizer settings; the keys given defaults here are
#   :strip_punctuation, :strip_stopwords, :stem_words (each false) and
#   :ngrams (1)
def initialize(options = {})
  # dup so the defaults below are written to our own copy, not the argument
  @options = options.dup
  @options[:strip_punctuation] ||= false
  @options[:strip_stopwords] ||= false
  @options[:stem_words] ||= false
  @options[:ngrams] ||= 1
end

Public Instance Methods

to_h()
# File lib/lurn/text/word_tokenizer.rb, line 43
# Hash representation of this tokenizer.
#
# @return [Object] whatever the +options+ reader returns (the options hash
#   itself, not a copy)
def to_h
  self.options
end
tokenize(document)
# File lib/lurn/text/word_tokenizer.rb, line 27
# Splits a document into whitespace-separated tokens, applying the options
# stored on this tokenizer.
#
# Steps, in order:
# 1. when :strip_punctuation is truthy, remove every [[:punct:]] character;
# 2. split on whitespace;
# 3. when :stem_words is truthy, stem each token with an English
#    Lingua::Stemmer;
# 4. when :ngrams is greater than 1, group consecutive tokens into n-grams.
#
# NOTE(review): :strip_stopwords is not consulted anywhere in this method,
# so setting it currently has no effect here — confirm whether that is
# intentional.
#
# @param document [String] the text to tokenize
# @return [Array<String>, Array<Array<String>>] the tokens, or arrays of
#   consecutive tokens when :ngrams is greater than 1
def tokenize(document)
  # Truthiness check, consistent with how :stem_words is tested below.
  text = @options[:strip_punctuation] ? document.gsub(/[[:punct:]]/, "") : document
  tokens = text.split(" ")

  if @options[:stem_words]
    stemmer = Lingua::Stemmer.new(language: :en)
    tokens = tokens.map { |word| stemmer.stem(word) }
  end

  # The options hash can be replaced through the public writer, so :ngrams
  # may be absent; treat that as plain unigrams instead of raising on nil.
  ngram_size = @options[:ngrams] || 1
  return tokens unless ngram_size > 1

  tokens.each_cons(ngram_size).to_a
end