module ClassifierReborn::Hasher

Constants

STOPWORDS

Create a lazily-loaded hash of stopword data

STOPWORDS_PATH

Public Instance Methods

add_custom_stopword_path(path) click to toggle source

Add a custom path to a new stopword file created by the user

# File lib/classifier-reborn/extensions/hasher.rb, line 41
# Register a user-supplied stopword location.
#
# +path+ is placed at the front of STOPWORDS_PATH, ahead of every entry
# already present. Returns STOPWORDS_PATH itself (the mutated array).
# NOTE(review): presumably entries are consulted front-to-back when stopword
# data is loaded, so the newest path wins — confirm where STOPWORDS_PATH is read.
def add_custom_stopword_path(path)
  STOPWORDS_PATH.insert(0, path)
end
clean_word_hash(str, language = 'en', enable_stemmer = true) click to toggle source

Return a word hash without extra punctuation or short symbols, just stemmed words

# File lib/classifier-reborn/extensions/hasher.rb, line 23
# Build a word-frequency hash from +str+ after discarding punctuation.
#
# Every character that is neither a word character nor whitespace is removed,
# the remainder is lower-cased and split on whitespace, and the resulting
# tokens are handed to word_hash_for_words together with +language+ and
# +enable_stemmer+. Returns whatever word_hash_for_words returns.
def clean_word_hash(str, language = 'en', enable_stemmer = true)
  without_punctuation = str.gsub(/[^\p{WORD}\s]/, '')
  tokens = without_punctuation.downcase.split
  word_hash_for_words(tokens, language, enable_stemmer)
end
word_hash(str, language = 'en', enable_stemmer = true) click to toggle source

Return a Hash of symbols => ints. Each word in the string is stemmed (when the stemmer is enabled), interned, and mapped to its frequency in the document.

# File lib/classifier-reborn/extensions/hasher.rb, line 16
# Build a frequency hash covering both the words and the punctuation of +str+.
#
# The word counts come from clean_word_hash; the counts for every character
# that is neither whitespace nor a word character come from
# word_hash_for_symbols. The two hashes are merged into a new hash, with the
# symbol counts taking precedence on any key present in both.
def word_hash(str, language = 'en', enable_stemmer = true)
  word_counts = clean_word_hash(str, language, enable_stemmer)
  punctuation = str.scan(/[^\s\p{WORD}]/)
  word_counts.merge(word_hash_for_symbols(punctuation))
end
word_hash_for_symbols(words) click to toggle source
# File lib/classifier-reborn/extensions/hasher.rb, line 45
# Count occurrences of each entry in +words+.
#
# Each entry is interned and used as the key; the value is the number of
# times it appeared. The returned hash has a default value of 0, so looking
# up an absent key yields 0 rather than nil.
def word_hash_for_symbols(words)
  words.each_with_object(Hash.new(0)) do |token, counts|
    counts[token.intern] += 1
  end
end
word_hash_for_words(words, language = 'en', enable_stemmer = true) click to toggle source
# File lib/classifier-reborn/extensions/hasher.rb, line 27
# Count the significant words in +words+.
#
# A word is skipped when it is two characters or shorter, or when it appears
# in STOPWORDS[language]. Each remaining word is stemmed via String#stem when
# +enable_stemmer+ is truthy, interned, and tallied. The returned hash has a
# default value of 0.
# NOTE(review): String#stem is not core Ruby — presumably supplied by a
# stemmer library loaded elsewhere; confirm.
def word_hash_for_words(words, language = 'en', enable_stemmer = true)
  words.each_with_object(Hash.new(0)) do |word, counts|
    # The length check runs first so STOPWORDS is only consulted for
    # words long enough to be counted.
    next if word.length <= 2 || STOPWORDS[language].include?(word)

    token = enable_stemmer ? word.stem : word
    counts[token.intern] += 1
  end
end