class Reclassifier::Bayes

Bayesian classifier for arbitrary text.

Implementation is translated from Introduction to Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze, Cambridge University Press. 2008, ISBN 0521865719.

Derived quantities are cached to improve performance of repeated classify calls.

Public Class Methods

new(classifications = [], options = {}) click to toggle source

Can be created with zero or more classifications, each of which will be initialized and given a training method. The classifications are specified as an array of symbols. Options are specified in a hash.

Options:

  • :clean - If false, punctuation will be included in the classifier. Otherwise, punctuation will be omitted. Default is true.

b = Reclassifier::Bayes.new([:interesting, :uninteresting, :spam], :clean => true)
# File lib/reclassifier/bayes.rb, line 24
# Builds a classifier and registers every entry of +classifications+ through
# add_classification.
#
# classifications - collection of classification names (defaults to none).
# options         - options hash; a nil value is stored as an empty hash so
#                   later option lookups do not fail on nil.
#
# The cache starts out as an empty hash so that it can be read before anything
# has invalidated it (for example when scoring before any training happened).
def initialize(classifications = [], options = {})
  @classifications = {}
  @docs_in_classification_count = {}
  @options = options || {}
  @cache = {}

  classifications.each { |classification| add_classification(classification) }
end

Public Instance Methods

add_classification(classification) click to toggle source

Adds the classification to the classifier. Has no effect if the classification already existed. Returns the classification.

b.add_classification(:not_spam)
=>  :not_spam
# File lib/reclassifier/bayes.rb, line 127
# Registers +classification+ with an empty word-count table and a document
# count of zero. Entries that are already present are left untouched.
#
# Returns the classification that was passed in.
def add_classification(classification)
  @classifications[classification] = {} unless @classifications[classification]

  unless @docs_in_classification_count[classification]
    @docs_in_classification_count[classification] = 0
  end

  classification
end
cache_set?() click to toggle source

Returns true if the cache has been set (i.e. classify has been run). Returns false otherwise.

classifier = Reclassifier::Bayes.new([:one, :other])

classifier.cache_set?
=>  false

classifier.train(:one, 'bbb')
classifier.train(:other, 'aaa')

classifier.classify('aaa')

classifier.cache_set?
=>  true
# File lib/reclassifier/bayes.rb, line 191
# Returns true when the cache holds at least one entry, false when it is
# unset (nil) or empty.
#
# Uses core methods only, so it does not depend on an Object#present?
# extension being loaded.
def cache_set?
  !(@cache.nil? || @cache.empty?)
end
calculate_scores(text) click to toggle source

Returns the scores of the specified text for each classification.

b.calculate_scores("I hate bad words and you")
=>  {:uninteresting=>-12.6997928013932, :interesting=>-18.4206807439524}

The largest of these scores (the one closest to 0) is the one picked out by classify

# File lib/reclassifier/bayes.rb, line 76
# Returns a hash mapping every classification to the score of +text+ for that
# classification.
#
# Each score is the sum of:
# * the prior: log(documents in the classification) - log(all documents)
# * the likelihood: for every word of +text+ that is part of the classified
#   vocabulary, count * (log(word count in the classification + 1) -
#   log(total words in the classification + vocabulary size))
#
# Words that no classification has seen are ignored. The total-document log
# and the vocabulary are memoised in the cache.
def calculate_scores(text)
  # The cache is only ever created by invalidate_cache, so it can still be
  # unset when scoring happens before anything invalidated it.
  @cache ||= {}

  @cache[:total_docs_classified_log] ||= Math.log(@docs_in_classification_count.values.reduce(:+))
  @cache[:words_classified] ||= @classifications.values.reduce(Set.new) { |set, word_counts| set.merge(word_counts.keys) }

  total_docs_log = @cache[:total_docs_classified_log]
  vocabulary = @cache[:words_classified]

  # Tokenising and filtering do not depend on the classification, so do both
  # once instead of once per classification.
  known_word_counts = smart_word_hash(text).select { |word, _count| vocabulary.include?(word) }

  scores = {}

  @classifications.each do |classification, classification_word_counts|
    # prior
    score = Math.log(@docs_in_classification_count[classification])
    score -= total_docs_log

    # likelihood (add-one smoothing over the classified vocabulary)
    classification_word_count = classification_word_counts.values.reduce(:+).to_i
    smoothing_denominator = classification_word_count + vocabulary.count

    known_word_counts.each do |word, count|
      score += count * Math.log((classification_word_counts[word] || 0) + 1)

      score -= count * Math.log(smoothing_denominator)
    end

    scores[classification] = score
  end

  scores
end
classifications() click to toggle source

Provides a list of classification names

b.classifications
=>   [:this, :that, :the_other]
# File lib/reclassifier/bayes.rb, line 116
# Lists the names of all registered classifications as an array, in the
# order of the underlying hash.
def classifications
  @classifications.each_key.to_a
end
classify(text) click to toggle source

Returns the classification of the specified text, which is one of the classifications given in the initializer.

b.classify("I hate bad words and you")
=>  :uninteresting
# File lib/reclassifier/bayes.rb, line 107
# Scores +text+ (converted with to_s) against every classification and
# returns the classification whose score is the highest.
def classify(text)
  scores = calculate_scores(text.to_s)
  best_entry = scores.max_by { |_name, score| score }

  best_entry.first
end
invalidate_cache() click to toggle source

Invalidates the cache.

classifier = Reclassifier::Bayes.new([:one, :other])

classifier.train(:one, 'bbb')
classifier.train(:other, 'aaa')

classifier.classify('aaa')

classifier.cache_set?
=>  true

classifier.invalidate_cache

classifier.cache_set?
=>  false
# File lib/reclassifier/bayes.rb, line 171
# Discards every cached value by replacing the cache with a new, empty hash.
# Returns that new hash.
def invalidate_cache
  @cache = Hash.new
end
remove_classification(classification) click to toggle source

Removes the classification from the classifier. Returns the classification if it existed, else nil.

b.remove_classification(:not_spam)
=>  :not_spam
# File lib/reclassifier/bayes.rb, line 142
# Removes +classification+ together with its word counts and its document
# count, then invalidates the cache, because both cached values are derived
# from the data that was just removed.
#
# Returns the classification if it existed, nil otherwise (in which case
# nothing is changed).
def remove_classification(classification)
  return nil unless @classifications.key?(classification)

  @classifications.delete(classification)

  # Without this the removed classification's documents would still count
  # towards the total, and would come back if the classification were re-added.
  @docs_in_classification_count.delete(classification)

  invalidate_cache

  classification
end
train(classification, text) click to toggle source

Provides a general training method for all classifications specified in Bayes#new

b = Reclassifier::Bayes.new([:this, :that])
b.train(:this, "This text")
b.train(:that, "That text")
# File lib/reclassifier/bayes.rb, line 39
# Trains +classification+ with +text+: the classification's document count
# goes up by one and every word count of +text+ is added to the
# classification's word counts.
#
# The classification is checked with ensure_classification_exists first.
def train(classification, text)
  ensure_classification_exists(classification)
  update_doc_count(classification, 1)

  tallies = @classifications[classification]

  smart_word_hash(text).each do |term, occurrences|
    tallies[term] = (tallies[term] || 0) + occurrences
  end
end
untrain(classification, text) click to toggle source

Untrain a (classification, text) pair. Be very careful with this method.

b = Reclassifier::Bayes.new([:this, :that])
b.train(:this, "This text")
b.untrain(:this, "This text")
# File lib/reclassifier/bayes.rb, line 59
# Reverses a previous train call for the (classification, text) pair: the
# classification's document count goes down by one and the word counts of
# +text+ are subtracted from the classification's word counts.
#
# A word whose count would drop to zero or below is removed instead, so no
# zero or negative counts are left in the classification. Words the
# classification does not know are ignored.
#
# The classification is checked with ensure_classification_exists first.
def untrain(classification, text)
  ensure_classification_exists(classification)

  update_doc_count(classification, -1)

  word_counts = @classifications[classification]

  smart_word_hash(text).each do |word, count|
    next unless word_counts.include?(word)

    remaining = word_counts[word] - count

    if remaining > 0
      word_counts[word] = remaining
    else
      word_counts.delete(word)
    end
  end
end

Private Instance Methods

ensure_classification_exists(classification) click to toggle source
# File lib/reclassifier/bayes.rb, line 203
# Guard used before touching a classification's data: raises
# Reclassifier::UnknownClassificationError when +classification+ has not
# been registered, and returns nil otherwise.
def ensure_classification_exists(classification)
  return if @classifications.key?(classification)

  raise Reclassifier::UnknownClassificationError
end
smart_word_hash(string) click to toggle source
# File lib/reclassifier/bayes.rb, line 207
# Picks the tokeniser according to the :clean option. Only an explicit false
# selects word_hash; any other value, or a missing key, selects
# clean_word_hash.
def smart_word_hash(string)
  return word_hash(string) if @options[:clean] == false

  clean_word_hash(string)
end
update_doc_count(classification, value) click to toggle source
# File lib/reclassifier/bayes.rb, line 197
# Adds +value+ (which may be negative) to the document count of
# +classification+ and then invalidates the cache. Returns whatever
# invalidate_cache returns.
def update_doc_count(classification, value)
  previous = @docs_in_classification_count[classification]
  @docs_in_classification_count[classification] = previous + value

  invalidate_cache
end