class String

Constants

IgnoredWords

Public Instance Methods

normalize() click to toggle source
# File lib/extensions/string.rb, line 21
# Returns a simplified, comparison-friendly copy of the string: lowercase
# ASCII letters and digits only, with words separated by single spaces.
#
# Steps, in order:
# 1. lowercase the text;
# 2. pass it through +unaccent+ (defined outside this method; presumably it
#    replaces accented characters with unaccented ones -- confirm there);
# 3. drop straight and curly quote characters outright, so that a word
#    containing an apostrophe stays one word instead of being split;
# 4. turn every remaining run of characters outside a-z / 0-9 into a space;
# 5. trim the ends and collapse any repeated spaces.
def normalize
  lowered  = downcase
  plain    = lowered.unaccent
  unquoted = plain.delete(%q{'"‘’“”})
  spaced   = unquoted.gsub(/[^a-z0-9]+/, ' ')
  spaced.strip.squeeze(' ')
end
tokenize() click to toggle source
# File lib/extensions/string.rb, line 29
# Splits the normalized form of the string into an array of words, leaving
# out any word listed in IgnoredWords.
#
# If removing the ignored words would leave nothing (for example a string
# made up solely of ignored words, such as 'The The'), the full word list is
# returned instead so the result is only empty when the normalized string
# contains no words at all.
def tokenize
  all_words   = normalize.split(/\s+/)
  significant = all_words - IgnoredWords
  return all_words if significant.empty?

  significant
end