class Analtex::DocumentFrequencyCalculator
Attributes
document_frequencies[R]
Public Class Methods
new(dictionary)
click to toggle source
# File lib/analtex/document_frequency_calculator.rb, line 5
#
# Builds a calculator with one zeroed slot per dictionary word.
#
# dictionary - object whose +words+ responds to +keys+; the number of keys
#              fixes the length of the per-word arrays.
#
# Sets up:
#   @document_frequencies - per-word counters, all starting at 0
#   @num_documents        - number of documents seen so far, starting at 0
#   @idf                  - per-word IDF slots, all starting at 0
#                           (filled in by calculate_idfs)
def initialize(dictionary)
  @dictionary = dictionary
  @num_documents = 0

  word_total = @dictionary.words.keys.size
  # Two separate arrays so that updating one never touches the other.
  @document_frequencies = Array.new(word_total, 0)
  @idf = Array.new(word_total, 0)
end
Public Instance Methods
add_counts_from_bag_of_words(bag_of_words)
click to toggle source
# File lib/analtex/document_frequency_calculator.rb, line 13
#
# Registers one document: every word whose count in +bag_of_words+ is
# positive has its document frequency bumped by one, and the document
# total is incremented.
#
# bag_of_words - sequence of per-word counts; position N is treated as the
#                count for the word at index N of @document_frequencies.
#
# Returns the updated number of documents.
def add_counts_from_bag_of_words(bag_of_words)
  bag_of_words.each_with_index do |occurrences, slot|
    # A word counts once per document, however often it occurs in it.
    next unless occurrences > 0

    seen_so_far = @document_frequencies[slot]
    @document_frequencies[slot] = 1 + seen_so_far
  end

  @num_documents += 1
end
calculate_idfs()
click to toggle source
# File lib/analtex/document_frequency_calculator.rb, line 20
#
# Recomputes the inverse document frequency of every dictionary word from
# the counters accumulated so far:
#
#   idf[i] = log(num_documents / document_frequencies[i])
#
# Exactly one entry is written per dictionary word (indices
# 0...@dictionary.words.count), so @idf keeps the same length as the
# dictionary.
#
# A word that has not been seen in any document (document frequency 0) is
# given an IDF of 0.0 instead of dividing by zero; an infinite or NaN IDF
# would otherwise turn into NaN once multiplied by a zero term frequency.
def calculate_idfs
  # +times+ iterates 0...count, stopping before the one-past-the-end index.
  @dictionary.words.count.times do |word_index|
    document_frequency = @document_frequencies[word_index].to_f

    @idf[word_index] =
      if document_frequency.zero?
        0.0
      else
        Math.log(@num_documents.to_f / document_frequency)
      end
  end
end
normalize_bag_of_words(bag_of_words)
click to toggle source
# File lib/analtex/document_frequency_calculator.rb, line 26
#
# Converts a bag of raw term counts into TF-IDF weights.
#
# For each position the term frequency is the count divided by the sum of
# all counts in the bag, and the weight is that frequency multiplied by the
# matching entry of @idf.
#
# bag_of_words - sequence of per-word counts, indexed like @idf.
#
# Returns a new array of weights, one per entry of +bag_of_words+ (an empty
# bag yields an empty array).
#
# Raises RuntimeError ('This should not happen') when the bag is non-empty
# but its counts add up to zero, and RuntimeError ('Got a NaN!') when a
# computed weight is NaN.
def normalize_bag_of_words(bag_of_words)
  term_total = bag_of_words.inject(0) { |running, count| running + count }

  bag_of_words.each_with_index.map do |count, word_index|
    # Checked per element so that an empty bag passes through untouched.
    raise 'This should not happen' if term_total == 0

    term_frequency = count.to_f / term_total.to_f
    weight = term_frequency * @idf[word_index]
    raise 'Got a NaN!' if weight.nan?

    weight
  end
end