class String
Constants
- IgnoredWords
Public Instance Methods
normalize()
click to toggle source
# File lib/extensions/string.rb, line 21
#
# Returns a simplified copy of the receiver: lowercase, with quote characters
# dropped and every run of other non-alphanumeric characters replaced by a
# single space, with no leading or trailing whitespace.
def normalize
  # `unaccent` is defined outside this listing; presumably it folds accented
  # letters to their plain ASCII equivalents -- confirm against its definition.
  folded = downcase.unaccent

  # Straight and curly quotes are removed outright (not turned into spaces),
  # so the text on either side of a quote is joined together.
  unquoted = folded.delete(%q{'"‘’“”})

  # Anything that is still not a-z or 0-9 becomes a word separator.
  spaced = unquoted.gsub(/[^a-z0-9]+/, ' ')

  # Trim the ends and collapse any repeated spaces.
  spaced.strip.squeeze(' ')
end
tokenize()
click to toggle source
# File lib/extensions/string.rb, line 29
#
# Splits the normalized form of the receiver into an array of words and
# removes the entries listed in IgnoredWords.
def tokenize
  all_words = normalize.split(/\s+/)
  significant = all_words - IgnoredWords

  # When every word is an ignored word (e.g. 'The The'), keep the unfiltered
  # list rather than returning nothing.
  return all_words if significant.empty?

  significant
end