class TfIdfSimilarity::TermCountModel
Attributes
average_document_size[R]
The average number of tokens in a document.
documents[R]
The documents in the corpus.
terms[R]
The set of terms in the corpus.
Public Class Methods
new(documents, opts = {})
click to toggle source
@param [Array<Document>] documents the documents in the corpus
@param [Hash] opts optional arguments
@option opts [Symbol] :library one of :gsl, :narray, :nmatrix or :matrix (default: :matrix)
# File lib/tf-idf-similarity/term_count_model.rb, line 16
# Builds the term-count matrix for a corpus: one row per distinct term and
# one column per document.
#
# @param [Array<Document>] documents the documents in the corpus
# @param [Hash] opts optional arguments
# @option opts [Symbol] :library the matrix backend; falls back to :matrix
#   when the option is absent
def initialize(documents, opts = {})
  @documents = documents
  # `uniq` keeps first-seen order and de-duplicates by `hash`/`eql?`, which
  # matches a round trip through `Set` without needing `Set` to be loaded.
  # `flatten` (not `flat_map`) is kept so nested term lists still collapse.
  @terms = documents.map(&:terms).flatten.uniq
  @library = (opts[:library] || :matrix).to_sym

  # counts[i][j] is how often terms[i] occurs in documents[j].
  counts = Array.new(terms.size) do |i|
    Array.new(documents.size) { |j| documents[j].term_count(terms[i]) }
  end
  @matrix = initialize_matrix(counts)

  # Guard the empty corpus so the average never divides by a zero column count.
  @average_document_size = documents.empty? ? 0 : sum / column_size.to_f
end
Public Instance Methods
document_count(term)
click to toggle source
@param [String] term a term
@return [Integer] the number of documents the term appears in
# File lib/tf-idf-similarity/term_count_model.rb, line 34
# Counts the documents in which a term occurs, i.e. the number of non-zero
# entries in the term's row of the count matrix.
#
# @param [String] term a term
# @return [Integer] the number of documents the term appears in; 0 when the
#   term is not in the corpus
def document_count(term)
  index = terms.index(term)
  return 0 unless index

  case @library
  when :gsl, :narray
    # NOTE(review): presumably `where` returns the positions of the non-zero
    # entries for these backends — confirm against the GSL/NArray docs.
    row(index).where.size
  when :nmatrix
    row(index).each.count(&:nonzero?)
  else
    vector = row(index)
    # Fall back to an Array when the row object cannot count for itself.
    vector = vector.to_a unless vector.respond_to?(:count)
    vector.count(&:nonzero?)
  end
end
term_count(term)
click to toggle source
@param [String] term a term
@return [Integer] the number of times the term appears in the corpus
# File lib/tf-idf-similarity/term_count_model.rb, line 56
# Totals the occurrences of a term across the corpus by summing the term's
# row of the count matrix.
#
# @param [String] term a term
# @return [Integer] the number of times the term appears in the corpus; 0
#   when the term is not in the corpus
def term_count(term)
  index = terms.index(term)
  return 0 unless index

  case @library
  when :gsl, :narray
    row(index).sum
  when :nmatrix
    row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
  else
    vector = row(index)
    # Fall back to an Array when the row object cannot reduce for itself.
    vector = vector.to_a unless vector.respond_to?(:reduce)
    vector.reduce(0, :+)
  end
end