module Reclassifier::WordHash

Constants

CORPUS_SKIP_WORDS

Public Instance Methods

clean_word_hash(string) click to toggle source

Returns a word hash containing only stemmed words; punctuation and short symbols are excluded.

# File lib/reclassifier/word_hash.rb, line 14
# Builds a word hash from +string+ with punctuation discarded.
#
# Every character that is neither a word character nor whitespace is replaced
# by a space, the result is split on whitespace, and the resulting tokens are
# handed to word_hash_for_words.
#
# @param string [String] text to tokenize
# @return whatever word_hash_for_words returns for the tokens
def clean_word_hash(string)
  tokens = string.gsub(/[^\w\s]/, " ").split
  word_hash_for_words(tokens)
end
word_hash(string) click to toggle source

Returns a Hash of symbols => ints. Each word in the string is downcased, stemmed, converted to a symbol, and mapped to its frequency in the document.

# File lib/reclassifier/word_hash.rb, line 9
# Builds a word hash from +string+, keeping punctuation as separate tokens.
#
# Two token lists are produced and concatenated, words first:
# * the words left after deleting every character that is neither a word
#   character nor whitespace, and
# * the runs of non-word characters left after blanking out every word
#   character.
# The combined list is handed to word_hash_for_words.
#
# @param string [String] text to tokenize
# @return whatever word_hash_for_words returns for the tokens
def word_hash(string)
  words = string.gsub(/[^\w\s]/, "").split
  symbols = string.gsub(/[\w]/, " ").split
  word_hash_for_words(words + symbols)
end
word_hash_for_words(words) click to toggle source
# File lib/reclassifier/word_hash.rb, line 18
    # Tallies how often each stemmed token occurs in +words+.
    #
    # Each token is downcased and counted when it either contains a non-word
    # character (a punctuation token) or is longer than two characters and not
    # listed in CORPUS_SKIP_WORDS. Counts are keyed by the token's stem
    # converted to a Symbol.
    #
    # The strings in +words+ are never modified, so frozen strings are accepted.
    #
    # @param words [Array<String>] tokens to count
    # @return [Hash{Symbol => Integer}] stem => number of occurrences
    def word_hash_for_words(words)
      words.each_with_object({}) do |word, counts|
        # Non-destructive downcase: leave the caller's strings untouched.
        token = word.downcase

        # Punctuation tokens are always kept; ordinary words must be longer
        # than two characters and absent from the skip list.
        next unless token =~ /[^\w]/ || (!CORPUS_SKIP_WORDS.include?(token) && token.length > 2)

        # Stem only the tokens that are actually counted.
        key = token.stem.to_sym
        counts[key] = counts.fetch(key, 0) + 1
      end
    end