class NBayes::Base
Attributes
Public Class Methods
Loads class instance from a data file (e.g., yaml)
# File lib/nbayes.rb, line 321
# Loads a class instance from a YAML data file: reads the file's raw
# bytes and delegates deserialization to .from_yml.
def self.from(yml_file)
  yml_data = File.read(yml_file, mode: "rb")
  self.from_yml(yml_data)
end
# File lib/nbayes.rb, line 314
# Deserializes a class instance from a YAML string.
#
# Psych 4 (bundled with Ruby 3.1+) aliased YAML.load to safe_load, which
# refuses to instantiate arbitrary classes and so can no longer restore
# instances produced by #dump. Use unsafe_load where it exists, falling
# back to the classic load on older Psych. Only feed this trusted data —
# unsafe YAML deserialization can execute attacker-controlled object graphs.
def self.from_yml(yml_data)
  nbayes = if YAML.respond_to?(:unsafe_load)
             YAML.unsafe_load(yml_data)
           else
             YAML.load(yml_data)
           end
  nbayes.reset_after_import() # yaml does not properly set the defaults on the Hashes
  nbayes
end
# File lib/nbayes.rb, line 183 def initialize(options={}) @debug = false @k = 1 @binarized = options[:binarized] || false @assume_uniform = false @vocab = Vocab.new(:log_size => options[:log_vocab]) @data = Data.new end
Public Instance Methods
Calculates the actual probability of each class given the tokens (this is the workhorse of the code)
# File lib/nbayes.rb, line 256
# Computes the probability of each class given the tokens — the workhorse
# of the classifier.
#
#   P(class|words) = P(w1,...,wn|class) * P(class) / P(w1,...,wn)
#                  = argmax P(w1,...,wn|class) * P(class)
#
#   P(wi|class) = (count(wi, class) + k) / (count(w, class) + k*V)
#
# All math is done in log space to avoid floating-point underflow; the
# per-category log numerators are handed to #normalize to produce the
# final probability distribution.
def calculate_probabilities(tokens)
  vocab_size = vocab.size
  uniform_prior = Math.log(1 / data.categories.count.to_f)
  total_examples = data.total_examples.to_f
  log_numerators = {}
  data.each do |category|
    prior = if assume_uniform
              uniform_prior
            else
              Math.log(data.example_count(category) / total_examples)
            end
    denominator = (data.token_count(category) + @k * vocab_size).to_f
    token_log_sum = tokens.reduce(0) do |sum, token|
      sum + Math.log((data.count_of_token_in_category(category, token) + @k) / denominator)
    end
    log_numerators[category] = token_log_sum + prior
  end
  normalize(log_numerators)
end
# File lib/nbayes.rb, line 250
# Exposes per-category statistics by delegating to the underlying data store.
def category_stats
  data.category_stats
end
# File lib/nbayes.rb, line 240
# Classifies a document (an array of tokens): returns the per-category
# probability hash, extended with the NBayes::Result mixin. When binarized,
# duplicate tokens are collapsed before scoring.
def classify(tokens)
  print "classify: #{tokens.join(', ')}\n" if @debug
  tokens = tokens.uniq if binarized
  results = calculate_probabilities(tokens)
  print "results: #{results.to_yaml}\n" if @debug
  results.extend(NBayes::Result)
  results
end
Delete an entire category from the classification data
# File lib/nbayes.rb, line 211
# Deletes an entire category (and its token counts) from the
# classification data.
def delete_category(category)
  data.delete_category(category)
end
Dumps class instance to a data file (e.g., yaml) or a string
# File lib/nbayes.rb, line 340
# Dumps to YAML. Given a String, it is treated as a file path and the
# classifier instance (self) is serialized into that file. Given anything
# else, the argument itself is YAML-serialized and the string is returned.
# NOTE(review): the non-String branch dumps `arg`, not self — callers
# appear expected to pass the instance (e.g. nb.dump(nb)); confirm before
# changing this.
def dump(arg)
  return YAML.dump(arg) unless arg.instance_of?(String)
  File.open(arg, "w") { |f| YAML.dump(self, f) }
end
Load class instance
# File lib/nbayes.rb, line 328
# Restores a classifier. Accepts nil (returns a fresh NBayes::Base), a
# YAML string (detected by the leading "---" document marker), or a file
# path to load from.
def load(yml)
  return NBayes::Base.new if yml.nil?
  if yml[0..2] == "---"
    self.class.from_yml(yml)
  else
    self.class.from(yml)
  end
end
# File lib/nbayes.rb, line 283
# Converts per-category log-probability numerators into a normalized
# probability distribution (values summing to 1).
#
# Caveat: these are log probabilities, so every value is negative and the
# *least* negative one is actually the most probable class. To recover
# probabilities while preserving relative distances:
#   - divide the (negative) total by each numerator — ratios are kept but
#     the ordering is flipped back the right way
#   - re-normalize those ratios so they sum to 1
# Ex: -1, -1, -2  =>  -4/-1, -4/-1, -4/-2  =>  4/10, 4/10, 2/10
def normalize(prob_numerator)
  # The normalizer is simply the sum of all log numerators.
  total = prob_numerator.values.reduce(0) { |sum, value| sum + value }
  inverted = prob_numerator.each_with_object({}) do |(category, numerator), memo|
    memo[category] = total / numerator.to_f
  end
  scale = inverted.values.reduce(0) { |sum, value| sum + value }
  inverted.each_with_object({}) do |(category, value), memo|
    memo[category] = value / scale.to_f
  end
end
Allows removal of low frequency words that increase processing time and may overfit
-
tokens with a count less than x (measured by summing across all classes) are removed
Ex: nb.purge_less_than(2)
NOTE: this does not decrement the "examples" count, so purging is not always equivalent to never having added the item in the first place — though it usually is
# File lib/nbayes.rb, line 198
# Removes tokens whose total count (summed across all classes) is below x.
# Cuts processing time and may reduce overfitting. Ex: nb.purge_less_than(2)
# NOTE: the "examples" count is not decremented, so purging is not always
# identical to never having added the token in the first place — though it
# usually is. Returns the array of purged tokens.
def purge_less_than(x)
  doomed = {}
  @vocab.each do |token|
    doomed[token] = true if data.purge_less_than(token, x)
  end
  doomed.keys.each { |token| @vocab.delete(token) }
end
called internally after yaml import to reset Hash defaults
# File lib/nbayes.rb, line 310
# Called internally after YAML import; delegates to the data store so it
# can restore the Hash defaults that serialization loses.
def reset_after_import
  data.reset_after_import
end
# File lib/nbayes.rb, line 215
# Trains the classifier on one document: records an additional example for
# the category and counts each token against it. When binarized, duplicate
# tokens in the document are collapsed first.
def train(tokens, category)
  training_tokens = binarized ? tokens.uniq : tokens
  data.increment_examples(category)
  training_tokens.each do |token|
    vocab.seen_token(token)
    data.add_token_to_category(category, token)
  end
end
Be careful with this method:
-
It decrements the number of examples for the category. If the category being untrained has no more examples, it is removed from the category list.
-
It only untrains tokens that were previously trained; tokens that were never seen are ignored.
# File lib/nbayes.rb, line 228
# Reverses a prior #train call. Use with care:
# - decrements the category's example count; per the docs above, a
#   category left with no examples is removed from the category list
# - only tokens actually trained for this category are untrained; unknown
#   tokens are ignored
def untrain(tokens, category)
  untraining_tokens = binarized ? tokens.uniq : tokens
  data.decrement_examples(category)
  untraining_tokens.each do |token|
    next unless data.token_trained?(token, category)
    vocab.delete(token)
    data.remove_token_from_category(category, token)
  end
end