class TfIdfSimilarity::TfIdfModel

Public Instance Methods

augmented_average_term_frequency(document, term) click to toggle source

Chisholm ATFA

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 127
# Chisholm ATFA: term frequency augmented by the document's average term
# count.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Numeric] 0 when the term does not occur in the document,
#   otherwise 0.9 + 0.1 * count / average term count
def augmented_average_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  scaled = 0.1 * occurrences
  0.9 + scaled / document.average_term_count
end
Also aliased as: augmented_average_tf
augmented_average_tf(document, term)
augmented_log_term_frequency(document, term) click to toggle source

Chisholm LOGG

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 173
# Chisholm LOGG: augmented logarithmic term frequency.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Numeric] 0 when the term does not occur in the document,
#   otherwise 0.2 + 0.8 * log(count + 1)
def augmented_log_term_frequency(document, term)
  occurrences = document.term_count(term)
  occurrences > 0 ? 0.2 + 0.8 * log(occurrences + 1) : 0
end
Also aliased as: augmented_log_tf
augmented_log_tf(document, term)
augmented_normalized_term_frequency(document, term) click to toggle source

SMART a, Salton n, Chisholm ATF1

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 121
# SMART a, Salton n, Chisholm ATF1: maps the normalized term frequency
# onto 0.5 + 0.5 * normalized_term_frequency.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Float] the augmented normalized term frequency
def augmented_normalized_term_frequency(document, term)
  normalized = normalized_term_frequency(document, term)
  0.5 + 0.5 * normalized
end
Also aliased as: augmented_normalized_tf
augmented_normalized_tf(document, term)
binary_term_frequency(document, term) click to toggle source

SMART b, Salton b, Chisholm BNRY

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 103
# SMART b, Salton b, Chisholm BNRY: presence indicator for a term.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Integer] 1 when the term's count in the document is positive,
#   0 otherwise
def binary_term_frequency(document, term)
  document.term_count(term) > 0 ? 1 : 0
end
Also aliased as: binary_tf
binary_tf(document, term)
changed_coefficient_augmented_normalized_term_frequency(document, term) click to toggle source

Chisholm ATFC

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 138
# Chisholm ATFC: augmented normalized term frequency using the
# coefficients 0.2 and 0.8.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Numeric] 0 when the term does not occur in the document,
#   otherwise 0.2 + 0.8 * count / maximum term count
def changed_coefficient_augmented_normalized_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  scaled = 0.8 * occurrences
  0.2 + scaled / document.maximum_term_count
end
changed_coefficient_augmented_normalized_tf(document, term)
entropy(term) click to toggle source

Chisholm ENPY

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 62
# Chisholm ENPY: entropy-based collection weight for a term.
#
# Computes 1 + sum over documents of p * log(p) / log(N), where p is the
# document's share of the term's total count in the model and N is the
# number of documents.
#
# Documents in which the term does not occur are skipped: p * log(p)
# tends to 0 as p tends to 0, whereas evaluating it literally yields
# 0 * -Infinity = NaN and poisons the whole sum.
#
# NOTE: with exactly one document log(N) is 0, so any contribution is
# divided by zero; that case is left unchanged here.
#
# @param [String] term a term
# @return [Numeric] the term's entropy weight
def entropy(term)
  total = @model.term_count(term).to_f
  log_n = log(documents.size)

  weighted = documents.inject(0.0) do |sum, document|
    occurrences = document.term_count(term)
    # A zero share contributes nothing (limit of p * log(p) as p -> 0).
    next sum unless occurrences > 0

    share = occurrences / total
    sum + share * log(share) / log_n
  end

  1 + weighted
end
global_frequency_inverse_document_frequency(term) click to toggle source

Chisholm IGFF

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 38
# Chisholm IGFF: the term's total count in the model divided by the
# number of documents the model reports for the term.
#
# @param [String] term a term
# @return [Float] the global frequency / document frequency ratio
def global_frequency_inverse_document_frequency(term)
  occurrences = @model.term_count(term)
  containing = @model.document_count(term).to_f
  occurrences / containing
end
Also aliased as: gfidf
idf(term)
incremented_gfidf(term)
incremented_global_frequency_inverse_document_frequency(term) click to toggle source

Chisholm IGFI

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 50
# Chisholm IGFI: the global frequency / document frequency ratio plus one.
#
# @param [String] term a term
# @return [Numeric] global_frequency_inverse_document_frequency(term) + 1
def incremented_global_frequency_inverse_document_frequency(term)
  ratio = global_frequency_inverse_document_frequency(term)
  ratio + 1
end
Also aliased as: incremented_gfidf
inverse_document_frequency(term) click to toggle source

Return the term's inverse document frequency.

@param [String] term a term
@return [Float] the term's inverse document frequency

# File lib/tf-idf-similarity/tf_idf_model.rb, line 10
# Returns the term's inverse document frequency, computed as
# 1 + log(N / (df + 1.0)) with N the number of documents and df the
# document count the model reports for the term.
#
# @param [String] term a term
# @return [Float] the term's inverse document frequency
def inverse_document_frequency(term)
  smoothed_df = @model.document_count(term) + 1.0
  1 + log(documents.size / smoothed_df)
end
Also aliased as: idf
log_global_frequency_inverse_document_frequency(term) click to toggle source

Chisholm IGFL

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 44
# Chisholm IGFL: logarithm of the global frequency / document frequency
# ratio plus one.
#
# @param [String] term a term
# @return [Float] log(global_frequency_inverse_document_frequency(term) + 1)
def log_global_frequency_inverse_document_frequency(term)
  ratio = global_frequency_inverse_document_frequency(term)
  log(ratio + 1)
end
Also aliased as: log_gfidf
log_term_frequency(document, term) click to toggle source

@see github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12

SMART l, Chisholm LOGA

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 151
# SMART l, Chisholm LOGA: logarithmic term frequency.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Numeric] 0 when the term does not occur in the document,
#   otherwise 1 + log(count)
def log_term_frequency(document, term)
  occurrences = document.term_count(term)
  occurrences > 0 ? 1 + log(occurrences) : 0
end
Also aliased as: log_tf
log_tf(document, term)
Alias for: log_term_frequency
no_collection_frequency(term) click to toggle source

@see github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17

SMART n, Salton x, Chisholm NONE

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 18
# Collection-frequency weight that applies no weighting at all
# (SMART n, Salton x, Chisholm NONE).
#
# @param [String] term a term (not used by the computation)
# @return [Float] always 1.0
def no_collection_frequency(term)
  1.0
end
normalized_log_term_frequency(document, term) click to toggle source

SMART L, Chisholm LOGN

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 162
# SMART L, Chisholm LOGN: logarithmic term frequency normalized by the
# logarithm of the document's average term count.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Numeric] 0 when the term does not occur in the document,
#   otherwise (1 + log(count)) / (1 + log(average term count))
def normalized_log_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  numerator = 1 + log(occurrences)
  denominator = 1 + log(document.average_term_count)
  numerator / denominator
end
Also aliased as: normalized_log_tf
normalized_log_tf(document, term)
normalized_term_frequency(document, term, a = 0) click to toggle source

@see en.wikipedia.org/wiki/Tf*idf
@see nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 115
# Maximum-tf normalized term frequency:
# a + (1 - a) * count / maximum term count.
#
# The divisor is coerced to a Float so that the default a = 0 cannot
# trigger Integer division (which would truncate every ratio to 0 or 1)
# when both counts are Integers. If maximum_term_count already returns a
# Float the coercion is a no-op and results are unchanged.
#
# @param [Document] document a document
# @param [String] term a term
# @param [Numeric] a the smoothing term added to the scaled ratio
# @return [Float] the normalized term frequency
def normalized_term_frequency(document, term, a = 0)
  a + (1 - a) * document.term_count(term) / document.maximum_term_count.to_f
end
Also aliased as: normalized_tf
normalized_tf(document, term, a = 0)
pivoted_unique_normalization(matrix) click to toggle source

@see nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html

SMART u, Chisholm PUQN

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 85
# Placeholder for pivoted unique normalization (SMART u, Chisholm PUQN).
# No computation is implemented; calling it always raises.
#
# @param matrix not used by the current body
# @raise [NotImplementedError] always
def pivoted_unique_normalization(matrix)
  raise NotImplementedError
end
plain_idf(term, numerator = 0, denominator = 0)
plain_inverse_document_frequency(term, numerator = 0, denominator = 0) click to toggle source

@see github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50

SMART t, Salton f, Chisholm IDFB

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 25
# SMART t, Salton f, Chisholm IDFB: plain inverse document frequency,
# log((N + numerator) / (df + denominator)), with N the number of
# documents and df the document count the model reports for the term.
#
# @param [String] term a term
# @param [Numeric] numerator a value added to the number of documents
# @param [Numeric] denominator a value added to the document count
# @return [Float] the plain inverse document frequency
def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
  corpus_size = documents.size + numerator
  matching = @model.document_count(term).to_f + denominator
  log(corpus_size / matching)
end
Also aliased as: plain_idf
probabilistic_idf(term)
probabilistic_inverse_document_frequency(term) click to toggle source

SMART p, Salton p, Chisholm IDFP

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 31
# SMART p, Salton p, Chisholm IDFP: probabilistic inverse document
# frequency, log((N - df) / df), with N the number of documents and df
# the document count the model reports for the term.
#
# @param [String] term a term
# @return [Float] the probabilistic inverse document frequency
def probabilistic_inverse_document_frequency(term)
  matching = @model.document_count(term).to_f
  odds = (documents.size - matching) / matching
  log(odds)
end
Also aliased as: probabilistic_idf
square_root_gfidf(term)
square_root_global_frequency_inverse_document_frequency(term) click to toggle source

Chisholm IGFS

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 56
# Chisholm IGFS: square root of the global frequency / document frequency
# ratio minus 0.9.
#
# @param [String] term a term
# @return [Float] sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
def square_root_global_frequency_inverse_document_frequency(term)
  ratio = global_frequency_inverse_document_frequency(term)
  sqrt(ratio - 0.9)
end
Also aliased as: square_root_gfidf
square_root_term_frequency(document, term) click to toggle source

Chisholm SQRT

# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 184
# Chisholm SQRT: square-root term frequency.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Numeric] 0 when the term does not occur in the document,
#   otherwise sqrt(count - 0.5) + 1
def square_root_term_frequency(document, term)
  occurrences = document.term_count(term)
  occurrences > 0 ? sqrt(occurrences - 0.5) + 1 : 0
end
Also aliased as: square_root_tf
square_root_tf(document, term)
term_frequency(document, term) click to toggle source

Returns the term's frequency in the document.

@param [Document] document a document
@param [String] term a term
@return [Float] the term's frequency in the document

# File lib/tf-idf-similarity/tf_idf_model.rb, line 21
# Returns the term's frequency in the document, computed as the square
# root of the term's count in that document.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Float] the term's frequency in the document
def term_frequency(document, term)
  sqrt(document.term_count(term))
end
Also aliased as: tf
tf(document, term)
Alias for: term_frequency