class TfIdfSimilarity::TfIdfModel
Public Instance Methods
Chisholm ATFA
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 127
# Chisholm ATFA: term frequency augmented by the document's average term count.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Numeric] 0 when the term's count is not positive, otherwise
#   0.9 + 0.1 * count / average term count
def augmented_average_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  0.9 + 0.1 * occurrences / document.average_term_count
end
Chisholm LOGG
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 173
# Chisholm LOGG: augmented logarithmic term frequency.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Numeric] 0 when the term's count is not positive, otherwise
#   0.2 + 0.8 * log(count + 1)
def augmented_log_term_frequency(document, term)
  occurrences = document.term_count(term)
  occurrences > 0 ? 0.2 + 0.8 * log(occurrences + 1) : 0
end
SMART a, Salton n, Chisholm ATF1
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 121
# SMART a, Salton n, Chisholm ATF1: augmented normalized term frequency,
# i.e. 0.5 + 0.5 * count / maximum term count.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Float] the augmented normalized term frequency
def augmented_normalized_term_frequency(document, term)
  # Delegate the smoothing to the helper's `a` parameter. Passing the Float
  # 0.5 keeps the whole expression in floating point; scaling the helper's
  # default (a = 0) result afterwards would inherit its Integer division and
  # collapse every value to either 0.5 or 1.0.
  normalized_term_frequency(document, term, 0.5)
end
SMART b, Salton b, Chisholm BNRY
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 103
# SMART b, Salton b, Chisholm BNRY: binary term frequency.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Integer] 1 when the term's count in the document is positive, else 0
def binary_term_frequency(document, term)
  document.term_count(term) > 0 ? 1 : 0
end
Chisholm ATFC
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 138
# Chisholm ATFC: augmented normalized term frequency with changed
# coefficients (0.2 and 0.8).
#
# @param [Document] document a document
# @param [String] term a term
# @return [Numeric] 0 when the term's count is not positive, otherwise
#   0.2 + 0.8 * count / maximum term count
def changed_coefficient_augmented_normalized_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  0.2 + 0.8 * occurrences / document.maximum_term_count
end
Chisholm ENPY
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 62
# Chisholm ENPY: entropy-based collection weight of a term.
#
# Computes 1 + sum_j(p_j * log(p_j)) / log(N), where p_j is the share of the
# term's corpus-wide count found in document j and N is the number of
# documents.
#
# @param [String] term a term
# @return [Numeric] the term's entropy weight
def entropy(term)
  total = @model.term_count(term).to_f
  log_n = log(documents.size)
  # With exactly one document log(N) is 0 and the ratio below would be 0/0;
  # the sum itself is zero in that case, so the weight is 1.
  return 1.0 if log_n.zero?

  weighted_sum = documents.reduce(0) do |sum, document|
    count = document.term_count(term)
    # Documents without the term contribute nothing (0 * log(0) is taken as
    # 0); evaluating it literally gives 0.0 * -Infinity, i.e. NaN.
    next sum unless count > 0

    share = count / total
    sum + share * log(share) / log_n
  end
  1 + weighted_sum
end
Chisholm IGFF
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 38
# Chisholm IGFF: the model's term count for the term divided by the model's
# document count for the term.
#
# @param [String] term a term
# @return [Float] the ratio of the two counts
def global_frequency_inverse_document_frequency(term)
  occurrences = @model.term_count(term)
  containing_documents = @model.document_count(term).to_f
  occurrences / containing_documents
end
Chisholm IGFI
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 50
# Chisholm IGFI: the global frequency inverse document frequency plus one.
#
# @param [String] term a term
# @return [Numeric] global_frequency_inverse_document_frequency(term) + 1
def incremented_global_frequency_inverse_document_frequency(term)
  ratio = global_frequency_inverse_document_frequency(term)
  ratio + 1
end
Returns the term's inverse document frequency.
@param [String] term a term @return [Float] the term's inverse document frequency
# File lib/tf-idf-similarity/tf_idf_model.rb, line 10
# Returns the term's inverse document frequency, computed as
# 1 + log(N / (df + 1.0)) with N the number of documents and df the model's
# document count for the term.
#
# @param [String] term a term
# @return [Float] the term's inverse document frequency
def inverse_document_frequency(term)
  smoothed_df = @model.document_count(term) + 1.0
  1 + log(documents.size / smoothed_df)
end
Chisholm IGFL
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 44
# Chisholm IGFL: logarithm of the global frequency inverse document
# frequency plus one.
#
# @param [String] term a term
# @return [Numeric] log(global_frequency_inverse_document_frequency(term) + 1)
def log_global_frequency_inverse_document_frequency(term)
  ratio = global_frequency_inverse_document_frequency(term)
  log(ratio + 1)
end
@see github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12
SMART l, Chisholm LOGA
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 151
# SMART l, Chisholm LOGA: logarithmic term frequency.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Numeric] 0 when the term's count is not positive, otherwise
#   1 + log(count)
def log_term_frequency(document, term)
  occurrences = document.term_count(term)
  occurrences > 0 ? 1 + log(occurrences) : 0
end
@see github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
SMART n, Salton x, Chisholm NONE
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 18
# SMART n, Salton x, Chisholm NONE: no collection frequency weighting.
#
# @param [String] term a term (not used)
# @return [Float] always 1.0
def no_collection_frequency(term)
  1.0
end
@see github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb @see github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb @see github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb @see github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb @see github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
SMART n, Salton x, Chisholm NONE
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 78
# SMART n, Salton x, Chisholm NONE: no normalization.
#
# @param matrix the matrix to leave as-is
# @return the very same object that was passed in
def no_normalization(matrix)
  matrix
end
SMART L, Chisholm LOGN
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 162
# SMART L, Chisholm LOGN: logarithmic term frequency normalized by the
# logarithm of the document's average term count.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Numeric] 0 when the term's count is not positive, otherwise
#   (1 + log(count)) / (1 + log(average term count))
def normalized_log_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  numerator = 1 + log(occurrences)
  denominator = 1 + log(document.average_term_count)
  numerator / denominator
end
@see en.wikipedia.org/wiki/Tf*idf @see nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 115
# Maximum-tf normalized term frequency with smoothing term `a`:
# a + (1 - a) * count / maximum term count.
#
# @param [Document] document a document
# @param [String] term a term
# @param [Numeric] a the smoothing term (defaults to 0, i.e. no smoothing)
# @return [Float] the normalized term frequency
def normalized_term_frequency(document, term, a = 0)
  # Force a Float divisor: with the Integer default for `a` and Integer
  # counts, a plain `/` floors the ratio to 0 for every term that is not the
  # document's most frequent one.
  a + (1 - a) * document.term_count(term) / document.maximum_term_count.to_f
end
@see nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
SMART u, Chisholm PUQN
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 85
# SMART u, Chisholm PUQN: pivoted unique normalization. Not implemented.
#
# @param matrix a matrix (not used)
# @raise [NotImplementedError] always
def pivoted_unique_normalization(matrix)
  raise(NotImplementedError)
end
@see github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
SMART t, Salton f, Chisholm IDFB
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 25
# SMART t, Salton f, Chisholm IDFB: plain inverse document frequency,
# log((N + numerator) / (df + denominator)) with N the number of documents
# and df the model's document count for the term.
#
# @param [String] term a term
# @param [Numeric] numerator a value added to the number of documents
# @param [Numeric] denominator a value added to the term's document count
# @return [Float] the plain inverse document frequency
def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
  adjusted_size = documents.size + numerator
  adjusted_df = @model.document_count(term).to_f + denominator
  log(adjusted_size / adjusted_df)
end
SMART p, Salton p, Chisholm IDFP
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 31
# SMART p, Salton p, Chisholm IDFP: probabilistic inverse document
# frequency, log((N - df) / df) with N the number of documents and df the
# model's document count for the term.
#
# @param [String] term a term
# @return [Float] the probabilistic inverse document frequency
def probabilistic_inverse_document_frequency(term)
  df = @model.document_count(term).to_f
  without_term = documents.size - df
  log(without_term / df)
end
Chisholm IGFS
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 56
# Chisholm IGFS: square root of the global frequency inverse document
# frequency minus 0.9.
#
# @param [String] term a term
# @return [Numeric] sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
def square_root_global_frequency_inverse_document_frequency(term)
  ratio = global_frequency_inverse_document_frequency(term)
  sqrt(ratio - 0.9)
end
Chisholm SQRT
# File lib/tf-idf-similarity/extras/tf_idf_model.rb, line 184
# Chisholm SQRT: square-root term frequency.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Numeric] 0 when the term's count is not positive, otherwise
#   sqrt(count - 0.5) + 1
def square_root_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  sqrt(occurrences - 0.5) + 1
end
Returns the term's frequency in the document.
@param [Document] document a document @param [String] term a term @return [Float] the term's frequency in the document
# File lib/tf-idf-similarity/tf_idf_model.rb, line 21
# Returns the term's frequency in the document, computed as the square root
# of the term's count in that document.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Float] the term's frequency in the document
def term_frequency(document, term)
  sqrt(document.term_count(term))
end