class NBayes::Base

Attributes

assume_uniform[RW]
binarized[R]
data[RW]
debug[RW]
k[RW]
vocab[RW]

Public Class Methods

from(yml_file)

Loads a class instance from a YAML data file

# File lib/nbayes.rb, line 321
def self.from(yml_file)
  File.open(yml_file, "rb") do |file|
    self.from_yml(file.read)
  end
end
from_yml(yml_data)

Loads a class instance from a YAML string

# File lib/nbayes.rb, line 314
def self.from_yml(yml_data)
  nbayes = YAML.load(yml_data)
  nbayes.reset_after_import()  # yaml does not properly set the defaults on the Hashes
  nbayes
end
new(options={})

Creates a new classifier. Recognized options are :binarized (collapse duplicate tokens within a document before counting) and :log_vocab (forwarded to Vocab as :log_size)

# File lib/nbayes.rb, line 183
def initialize(options={})
  @debug = false
  @k = 1                           # Laplacian smoothing constant
  @binarized = options[:binarized] || false
  @assume_uniform = false          # when true, use a uniform prior over categories
  @vocab = Vocab.new(:log_size => options[:log_vocab])
  @data = Data.new
end
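
A minimal construction sketch (the option values are illustrative; both options may be omitted):

require 'nbayes'

# Default classifier: multinomial counting with smoothing constant k = 1
nbayes = NBayes::Base.new

# Binarized variant: duplicate tokens within a single document count only once
binarized = NBayes::Base.new(:binarized => true)
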

Public Instance Methods

calculate_probabilities(tokens)

Calculates the probability of each class given the tokens (this is the workhorse of the code)

# File lib/nbayes.rb, line 256
def calculate_probabilities(tokens)
  # P(class|words) = P(w1,...,wn|class) * P(class) / P(w1,...,wn)
  #                ∝ P(w1,...,wn|class) * P(class)   (the denominator is constant across classes)
  #
  # Laplacian-smoothed likelihood, with V = vocabulary size:
  # P(wi|class) = (count(wi, class) + k) / (total token count in class + kV)
  prob_numerator = {}
  v_size = vocab.size

  cat_prob = Math.log(1 / data.categories.count.to_f)
  total_example_count = data.total_examples.to_f

  data.each do |category|
    unless assume_uniform
      cat_prob = Math.log(data.example_count(category) / total_example_count)
    end

    log_probs = 0
    denominator = (data.token_count(category) + @k * v_size).to_f
    tokens.each do |token|
      numerator = data.count_of_token_in_category(category, token) + @k
      log_probs += Math.log( numerator / denominator )
    end
    prob_numerator[category] = log_probs + cat_prob
  end
  normalize(prob_numerator)
end
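
A worked sketch of the smoothed per-token estimate above, using hypothetical counts (k = 1, a vocabulary of 3 tokens, and a token seen twice in a category holding 4 tokens in total):

k      = 1
v_size = 3   # vocab.size
count  = 2   # data.count_of_token_in_category(category, token)
total  = 4   # data.token_count(category)

p_token = (count + k) / (total + k * v_size).to_f
# => 3/7 ≈ 0.43; an unseen token (count 0) would still get 1/7 rather than 0,
# which is the point of the +k smoothing
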
category_stats()

Returns category statistics from the underlying Data object

# File lib/nbayes.rb, line 250
def category_stats
  data.category_stats
end
classify(tokens)

Classifies the tokens and returns a hash of class probabilities, extended with the NBayes::Result module

# File lib/nbayes.rb, line 240
def classify(tokens)
  print "classify: #{tokens.join(', ')}\n" if @debug
  tokens = tokens.uniq if binarized
  probs = calculate_probabilities(tokens)
  print "results: #{probs.to_yaml}\n" if @debug
  probs.extend(NBayes::Result)
  probs
end
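
A typical train-then-classify round trip (the tokens and category names are invented; max_class is provided by the NBayes::Result extension):

require 'nbayes'

nbayes = NBayes::Base.new
nbayes.train(%w[a b c d e f g], 'classA')
nbayes.train(%w[a b c d e f g], 'classB')
nbayes.train(%w[z], 'classB')

result = nbayes.classify(['z'])
result.max_class   # => "classB", the most probable category
result['classB']   # probability assigned to classB
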
delete_category(category)

Deletes an entire category from the classification data

# File lib/nbayes.rb, line 211
def delete_category(category)
  data.delete_category(category)
end
dump(arg)

Dumps the class instance to a YAML file when given a filename string; otherwise serializes the argument itself to a YAML string

# File lib/nbayes.rb, line 340
def dump(arg)
  if arg.instance_of? String
    File.open(arg, "w") {|f| YAML.dump(self, f) }
  else
    YAML.dump(arg)
  end
end
load(yml)

Loads a class instance: from a YAML string if the argument starts with "---", from a file path otherwise, or returns a fresh instance when given nil

# File lib/nbayes.rb, line 328
def load(yml)
  if yml.nil?
    nbayes = NBayes::Base.new
  elsif yml[0..2] == "---"
    nbayes = self.class.from_yml(yml)
  else
    nbayes = self.class.from(yml)
  end
  nbayes
end
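
A persistence round trip, continuing the classify sketch above (the file name is arbitrary):

nbayes.dump('classifier.yml')                   # write trained state to disk
restored = NBayes::Base.from('classifier.yml')  # reload it later
# or rebuild from a YAML string:
restored = NBayes::Base.from_yml(File.read('classifier.yml'))
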
normalize(prob_numerator)

Converts the hash of log-probability numerators into normalized per-class probabilities

# File lib/nbayes.rb, line 283
def normalize(prob_numerator)
  # calculate the denominator, which normalizes this into a probability; it's just the sum of all numerators from above
  normalizer = 0
  prob_numerator.each {|cat, numerator| normalizer += numerator }
  # One more caveat:
  # These are log probabilities, so every value is negative, and the value closest to zero (smallest in magnitude) marks the most probable class.
  # To convert to positive values while preserving that ordering:
  # - divide the normalizer by each log prob: the quotients are positive, and the most probable class gets the largest quotient
  # - re-normalize the quotients so they sum to 1
  # Ex: -1,-1,-2  =>  -4/-1, -4/-1, -4/-2  =  4, 4, 2
  #   - renormalize => 4/10, 4/10, 2/10
  intermed = {}
  renormalizer = 0
  prob_numerator.each do |cat, numerator|
    intermed[cat] = normalizer / numerator.to_f
    renormalizer += intermed[cat]
  end
  # calculate final probs
  final_probs = {}
  intermed.each do |cat, value|
    final_probs[cat] = value / renormalizer.to_f
  end
  final_probs
end
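
Tracing the method with hypothetical log numerators of -1, -1, and -2 reproduces the comment's example:

nbayes = NBayes::Base.new
nbayes.normalize({ 'a' => -1.0, 'b' => -1.0, 'c' => -2.0 })
# normalizer   = -4.0
# intermed     = { 'a' => 4.0, 'b' => 4.0, 'c' => 2.0 }
# renormalizer = 10.0
# => { 'a' => 0.4, 'b' => 0.4, 'c' => 0.2 }

Note that this ratio trick preserves the ordering of the classes but is not identical to exponentiating and renormalizing the log probabilities.
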
purge_less_than(x)

Removes low-frequency tokens that increase processing time and may cause overfitting

  • tokens with a count less than x (measured by summing across all classes) are removed

Ex: nb.purge_less_than(2)

NOTE: this does not decrement the “examples” count, so purging is not always identical to never having added the token in the first place, though it usually is

# File lib/nbayes.rb, line 198
def purge_less_than(x)
  remove_list = {}
  @vocab.each do |token|
    if data.purge_less_than(token, x)
      # print "removing #{token}\n"
      remove_list[token] = 1
    end
  end  # each vocab word
  remove_list.keys.each {|token| @vocab.delete(token) }
  # print "total vocab size is now #{vocab.size}\n"
end
reset_after_import()

Called internally after YAML import to reset Hash defaults

# File lib/nbayes.rb, line 310
def reset_after_import
  data.reset_after_import
end
train(tokens, category)

Trains the classifier: increments the example count for the category and adds each token to the category's counts and to the vocabulary

# File lib/nbayes.rb, line 215
def train(tokens, category)
  tokens = tokens.uniq if binarized
  data.increment_examples(category)
  tokens.each do |token|
    vocab.seen_token(token)
    data.add_token_to_category(category, token)
  end
end
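
A small sketch of how the :binarized option interacts with train (the tokens are invented); duplicates within a single call are collapsed before counting:

plain = NBayes::Base.new
plain.train(%w[spam spam spam], 'spam')     # 'spam' is counted 3 times

binary = NBayes::Base.new(:binarized => true)
binary.train(%w[spam spam spam], 'spam')    # uniq'd first, so counted once
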
untrain(tokens, category)

Be careful with this method:

  • It decrements the number of examples for the category. If the category being untrained has no more examples, it is removed from the category list.

  • It only untrains tokens that were actually trained; nonexistent tokens are ignored.

# File lib/nbayes.rb, line 228
def untrain(tokens, category)
  tokens = tokens.uniq if binarized
  data.decrement_examples(category)
  
  tokens.each do |token|
    if data.token_trained?(token, category)
      vocab.delete(token)
      data.remove_token_from_category(category, token)
    end
  end
end
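
Reversing an earlier training pass, subject to the caveats above (the tokens are illustrative):

nbayes.train(%w[a b c], 'classA')
nbayes.untrain(%w[a b c], 'classA')
# the example count for 'classA' drops back; if it reaches zero,
# the category is removed entirely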