class TextClassifier

Public Class Methods

classify(documents_by_category, test_doc)
# File lib/text_classifier.rb, line 4
# Classifies +test_doc+ with a multinomial naive Bayes model trained on
# +documents_by_category+ and returns the posterior probability of each
# category.
#
# documents_by_category:: Array with one entry per category; each entry is an
#                         Array of document Strings belonging to that category.
# test_doc::              String to classify.
#
# Returns an Array of Floats, one per category and in the same order as
# +documents_by_category+, that sums to 1. Returns an empty Array when there
# are no categories, and a uniform distribution when no training documents
# were supplied at all.
#
# Training documents and the test document are normalised identically:
# lower-cased, every character other than a-z and the apostrophe is treated
# as a separator, and the text is split on whitespace. No stop-word filtering
# is applied. The input strings are not modified.
#
# Word likelihoods use add-one (Laplace) smoothing over the training
# vocabulary. Test words that never occur in any training document are
# ignored, since they carry no information about the category.
def self.classify(documents_by_category, test_doc)
  tokenize = ->(text) { text.downcase.gsub(/[^a-z']/, ' ').split }

  num_categories = documents_by_category.size
  return [] if num_categories.zero?

  num_docs = documents_by_category.inject(0) { |total, docs| total + docs.size }
  # Without any training data there is nothing to base priors on.
  return Array.new(num_categories, 1.0 / num_categories) if num_docs.zero?

  # Per-category word frequencies, built from the normalised documents.
  word_counts = documents_by_category.map do |docs|
    docs.each_with_object(Hash.new(0)) do |doc, counts|
      tokenize.call(doc).each { |word| counts[word] += 1 }
    end
  end

  vocabulary = Set.new
  word_counts.each { |counts| vocabulary.merge(counts.keys) }

  # Out-of-vocabulary words would otherwise zero out every category.
  test_words = tokenize.call(test_doc).select { |word| vocabulary.include?(word) }

  # Work in log space so long documents do not underflow to 0.0.
  log_scores = documents_by_category.each_with_index.map do |docs, i|
    counts = word_counts[i]
    denom = counts.values.inject(0, :+) + vocabulary.size
    log_prior = Math.log(docs.size.to_f / num_docs) # -Infinity for an empty category
    test_words.inject(log_prior) do |score, word|
      score + Math.log((1.0 + counts[word]) / denom)
    end
  end

  # Scores are only proportional to the posteriors; shift by the maximum
  # before exponentiating (log-sum-exp) and scale so they sum to 1.
  max_score = log_scores.max
  weights = log_scores.map { |score| Math.exp(score - max_score) }
  total = weights.inject(:+)
  weights.map { |weight| weight / total }
end