class Analtex::DocumentFrequencyCalculator

Attributes

document_frequencies[R]

Public Class Methods

new(dictionary)
# File lib/analtex/document_frequency_calculator.rb, line 5
# Sets up an empty calculator sized to the given dictionary.
#
# Both per-word arrays get one slot per key of `dictionary.words`, each
# initialised to 0, and the document counter starts at 0.
#
# @param dictionary [#words] object whose `words` responds to `keys`
#   (presumably a Hash mapping word => index; confirm against the
#   dictionary class)
def initialize(dictionary)
  @dictionary = dictionary
  @num_documents = 0

  vocabulary_size = @dictionary.words.keys.length
  # Separate arrays: document counts and IDF weights are updated independently.
  @document_frequencies = Array.new(vocabulary_size) { 0 }
  @idf = Array.new(vocabulary_size) { 0 }
end

Public Instance Methods

add_counts_from_bag_of_words(bag_of_words)
# File lib/analtex/document_frequency_calculator.rb, line 13
# Folds one document into the running document-frequency table.
#
# Every position of `bag_of_words` holding a count greater than zero bumps
# the matching entry of @document_frequencies by one (presence, not the
# count itself, is what is recorded). The document counter is then
# incremented.
#
# @param bag_of_words [Array<Numeric>] per-word counts, indexed the same
#   way as @document_frequencies
# @return [Integer] the updated number of documents seen
def add_counts_from_bag_of_words(bag_of_words)
  bag_of_words.each_with_index do |occurrences, slot|
    next unless occurrences > 0

    @document_frequencies[slot] = 1 + @document_frequencies[slot]
  end

  @num_documents += 1
end
calculate_idfs()
# File lib/analtex/document_frequency_calculator.rb, line 20
# Recomputes the inverse document frequency of every dictionary word.
#
# For each index i below `@dictionary.words.count`:
#
#   @idf[i] = ln(@num_documents / @document_frequencies[i])
#
# The range is exclusive of the word count. Iterating up to and including
# it would read a nil document frequency one slot past the end (nil.to_f is
# 0.0) and append a spurious Infinity/NaN entry to @idf.
#
# NOTE: a word whose document frequency is still 0 yields Float::INFINITY
# (or NaN when no documents have been added), because the division is done
# in floating point. That behaviour is left as-is.
#
# @return [Range] the index range that was processed
def calculate_idfs
  (0...@dictionary.words.count).each do |i|
    @idf[i] = Math.log(@num_documents.to_f / @document_frequencies[i].to_f)
  end
end
normalize_bag_of_words(bag_of_words)
# File lib/analtex/document_frequency_calculator.rb, line 26
# Converts raw per-word counts into TF-IDF weights.
#
# The term frequency of a word is its count divided by the sum of all counts
# in the bag; each term frequency is multiplied by the matching @idf entry.
#
# @param bag_of_words [Array<Numeric>] per-word counts, indexed like @idf
# @return [Array<Float>] TF-IDF weight for each position of the input
#   (an empty input yields an empty array)
# @raise [RuntimeError] 'This should not happen' when the bag is non-empty
#   but its counts sum to zero
# @raise [RuntimeError] 'Got a NaN!' when a weight evaluates to NaN
def normalize_bag_of_words(bag_of_words)
  term_total = bag_of_words.inject(0) { |running, count| running + count }

  bag_of_words.each_with_index.map do |occurrences, slot|
    # Checked per element, so an empty bag never trips it.
    raise 'This should not happen' if term_total == 0

    weight = (occurrences.to_f / term_total.to_f) * @idf[slot]
    raise 'Got a NaN!' if weight.nan?

    weight
  end
end