class TfIdfSimilarity::TermCountModel

Attributes

average_document_size[R]

The average number of tokens in a document.

documents[R]

The documents in the corpus.

terms[R]

The set of terms in the corpus.

Public Class Methods

new(documents, opts = {}) click to toggle source

@param [Array<Document>] documents the documents in the corpus
@param [Hash] opts optional arguments
@option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)

# File lib/tf-idf-similarity/term_count_model.rb, line 16
# Builds the term-by-document count matrix for a corpus.
#
# @param [Array<Document>] documents the documents in the corpus
# @param [Hash] opts optional arguments
# @option opts [Symbol] :library the matrix backend, :matrix when omitted
def initialize(documents, opts = {})
  @documents = documents

  # Gather every document's terms into a single de-duplicated list.
  every_term = documents.map(&:terms).flatten
  @terms = Set.new(every_term).to_a
  @library = (opts[:library] || :matrix).to_sym

  # One row per term, one column per document.
  counts = terms.map do |term|
    documents.map { |document| document.term_count(term) }
  end

  @matrix = initialize_matrix(counts)

  # An empty corpus has no columns to divide by, so it is reported as 0.
  @average_document_size =
    if documents.empty?
      0
    else
      sum / column_size.to_f
    end
end

Public Instance Methods

document_count(term) click to toggle source

@param [String] term a term
@return [Integer] the number of documents the term appears in

# File lib/tf-idf-similarity/term_count_model.rb, line 34
# Counts the documents in which a term occurs, i.e. the nonzero entries of
# the term's row.
#
# @param [String] term a term
# @return [Integer] the number of documents the term appears in, or 0 when
#   the term is not in the corpus
def document_count(term)
  position = terms.index(term)
  return 0 unless position

  occurrences = row(position)
  case @library
  when :gsl, :narray
    occurrences.where.size
  when :nmatrix
    occurrences.each.count { |n| n.nonzero? }
  else
    # Fall back to a plain array when the row object cannot count itself.
    occurrences = occurrences.to_a unless occurrences.respond_to?(:count)
    occurrences.count { |n| n.nonzero? }
  end
end
term_count(term) click to toggle source

@param [String] term a term
@return [Integer] the number of times the term appears in the corpus

# File lib/tf-idf-similarity/term_count_model.rb, line 56
# Totals a term's occurrences across the corpus by summing the term's row.
#
# @param [String] term a term
# @return [Integer] the number of times the term appears in the corpus, or 0
#   when the term is not in the corpus
def term_count(term)
  position = terms.index(term)
  return 0 unless position

  occurrences = row(position)
  case @library
  when :gsl, :narray
    occurrences.sum
  when :nmatrix
    occurrences.each.reduce(0, :+) # NMatrix's `sum` method is slower
  else
    # Fall back to a plain array when the row object cannot fold itself.
    occurrences = occurrences.to_a unless occurrences.respond_to?(:reduce)
    occurrences.reduce(0, :+)
  end
end