class Lurn::Text::WordTokenizer
Constants
- STOP_WORDS
Attributes
options[RW]
Public Class Methods
new(options = {})
click to toggle source
# File lib/lurn/text/word_tokenizer.rb, line 19
# Builds a tokenizer from an options hash.
#
# Recognised keys (each falls back to its default when missing, nil or false):
#   :strip_punctuation - default false
#   :strip_stopwords   - default false
#   :stem_words        - default false
#   :ngrams            - default 1
#
# The hash is copied before the defaults are filled in, so the caller's hash
# is never modified and a frozen hash is accepted.
def initialize(options = {})
  @options = options.dup
  @options[:strip_punctuation] ||= false
  @options[:strip_stopwords] ||= false
  @options[:stem_words] ||= false
  @options[:ngrams] ||= 1
end
Public Instance Methods
to_h()
click to toggle source
# File lib/lurn/text/word_tokenizer.rb, line 43
# Hash representation of this tokenizer: returns whatever the +options+
# reader returns (the object itself, not a copy).
def to_h
  options
end
tokenize(document)
click to toggle source
# File lib/lurn/text/word_tokenizer.rb, line 27
# Splits +document+ (a String) into tokens according to @options.
#
# Steps, in order:
# 1. :strip_punctuation - remove every [[:punct:]] character.
# 2. Split on runs of whitespace.
# 3. :strip_stopwords   - drop tokens found in STOP_WORDS.
# 4. :stem_words        - stem each token with the English Lingua stemmer.
# 5. :ngrams > 1        - group tokens into consecutive n-grams.
#
# Returns an Array of Strings, or an Array of n-element Arrays of Strings
# when :ngrams is greater than 1 (empty if there are fewer tokens than n).
def tokenize(document)
  document = document.gsub(/[[:punct:]]/, "") if @options[:strip_punctuation]

  # A single-space pattern makes String#split break on any whitespace run
  # and ignore leading/trailing whitespace.
  document = document.split(" ")

  if @options[:strip_stopwords]
    # NOTE(review): assumes STOP_WORDS is a list of lowercase strings, so
    # tokens are compared case-insensitively - confirm against the constant.
    # Done before stemming because a stemmed token may no longer match.
    document = document.reject { |word| STOP_WORDS.include?(word.downcase) }
  end

  if @options[:stem_words]
    stemmer = Lingua::Stemmer.new(language: :en)
    document = document.map { |word| stemmer.stem(word) }
  end

  document = document.each_cons(@options[:ngrams]).to_a if @options[:ngrams] > 1

  document
end