class TfIdfSimilarity::Model

Public Class Methods

new(documents, opts = {}) click to toggle source

@param [Array<Document>] documents the documents in the corpus
@param [Hash] opts optional arguments
@option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)

# File lib/tf-idf-similarity/model.rb, line 11
# Build the tf-idf model for a corpus.
#
# Wraps the documents in a TermCountModel, records which matrix library to
# use, and hands a nested array of tf*idf weights (one row per term, one
# column per document) to initialize_matrix.
#
# @param [Array<Document>] documents documents
# @param [Hash] opts optional arguments
# @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
def initialize(documents, opts = {})
  @model = TermCountModel.new(documents, opts)
  @library = (opts[:library] || :matrix).to_sym

  weights = terms.map do |term|
    # The idf depends on the term only, so compute it once per row.
    idf = inverse_document_frequency(term)
    documents.map { |document| term_frequency(document, term) * idf }
  end

  @matrix = initialize_matrix(weights)
end

Public Instance Methods

document_index(document) click to toggle source

Return the index of the document in the corpus.

@param [Document] document a document
@return [Integer,nil] the index of the document

# File lib/tf-idf-similarity/model.rb, line 52
# Find the position of a document within the corpus held by the model.
#
# @param [Document] document a document
# @return [Integer,nil] the index of the document, or nil if it is absent
def document_index(document)
  corpus = @model.documents
  corpus.index(document)
end
similarity_matrix() click to toggle source

Returns a similarity matrix for the documents in the corpus.

@return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
@note Columns are normalized to unit vectors, so we can calculate the cosine similarity of all document vectors.
# File lib/tf-idf-similarity/model.rb, line 40
# Compute the similarity matrix for the documents in the corpus.
#
# The normalized matrix is passed to multiply_self; an empty corpus yields
# an empty Array rather than a matrix object.
#
# @return [GSL::Matrix,NMatrix,Matrix,Array] a similarity matrix
def similarity_matrix
  return [] if documents.empty?

  multiply_self(normalize)
end
term_frequency_inverse_document_frequency(document, term) click to toggle source

Return the term frequency–inverse document frequency.

@param [Document] document a document
@param [String] term a term
@return [Float] the term frequency–inverse document frequency

# File lib/tf-idf-similarity/model.rb, line 30
# Compute the tf-idf weight of a term within a document: the term's inverse
# document frequency multiplied by its frequency in the document.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Float] the term frequency–inverse document frequency
def term_frequency_inverse_document_frequency(document, term)
  idf = inverse_document_frequency(term)
  tf = term_frequency(document, term)
  idf * tf
end
Also aliased as: tfidf
text_index(text) click to toggle source

Return the index of the document with matching text.

@param [String] text a text
@return [Integer,nil] the index of the document

# File lib/tf-idf-similarity/model.rb, line 60
# Find the position of the first document whose text equals the given text.
#
# @param [String] text a text
# @return [Integer,nil] the index of the document, or nil if none matches
def text_index(text)
  corpus = @model.documents
  corpus.index { |candidate| candidate.text == text }
end
tfidf(document, term)