class Birdwatcher::WordList

Attributes

corpus[R]
options[R]
word_list[R]

Public Class Methods

new(options)
# File lib/birdwatcher/word_list.rb, line 5
# Sets up a word list with an empty corpus and an empty result.
#
# @param options [Hash] settings object, stored as given and exposed
#   through the +options+ reader; other methods in this class look up
#   keys such as :min_word_count and :word_cap on it
def initialize(options)
  @options, @corpus = options, []
  @word_list = {}
end

Public Instance Methods

add_to_corpus(text)
# File lib/birdwatcher/word_list.rb, line 11
# Appends one piece of text to the corpus.
#
# The argument is converted with +to_s+ before it is stored, so +nil+ is
# kept as an empty string.
#
# @param text [#to_s] the text to store
# @return [Array<String>] the corpus array after the append
def add_to_corpus(text)
  entry = text.to_s
  @corpus.push(entry)
end
process()
# File lib/birdwatcher/word_list.rb, line 15
# Counts the words found in every corpus text and stores the ranked result
# in @word_list.
#
# Each text is split with +normalize_and_split+; words for which
# +exclude_word?+ is true are not counted. When options[:min_word_count] is
# set, words counted fewer times than that (after +to_i+) are dropped. The
# remaining words are ordered by count, highest first, and when
# options[:word_cap] is set only that many leading entries (after +to_i+)
# are kept.
#
# @return [Array<Array(String, Integer)>] [word, count] pairs, highest
#   count first; the relative order of words with equal counts is whatever
#   +sort_by+ followed by +reverse+ produces
def process
  counts = Hash.new(0)
  corpus.each do |text|
    normalize_and_split(text).each do |token|
      counts[token] += 1 unless exclude_word?(token)
    end
  end

  minimum = options[:min_word_count]
  counts.reject! { |_token, total| total < minimum.to_i } if minimum

  ranked = counts.sort_by { |_token, total| total }.reverse

  cap = options[:word_cap]
  ranked = ranked.take(cap.to_i) if cap

  @word_list = ranked
end

Private Instance Methods

exclude_word?(word)
# File lib/birdwatcher/word_list.rb, line 57
# Decides whether a word should be left out of the word count.
#
# A word is excluded when any of the following holds:
# * it is empty;
# * options[:min_word_length] is set and the word is shorter than it;
# * options[:exclude_hashtags] is set and the word starts with "#";
# * options[:exclude_mentions] is set and the word starts with "@";
# * it appears in +exclusion_list+.
#
# @param word [String] the candidate word
# @return [Boolean] true when the word must be skipped
def exclude_word?(word)
  return true if word.empty?

  # Coerce with to_i, as process does for :min_word_count and :word_cap,
  # so a string-valued option (e.g. "3") does not raise on comparison.
  min_length = options[:min_word_length]
  return true if min_length && word.length < min_length.to_i
  return true if options[:exclude_hashtags] && word.start_with?("#")
  return true if options[:exclude_mentions] && word.start_with?("@")

  exclusion_list.include?(word)
end
exclusion_list()
# File lib/birdwatcher/word_list.rb, line 35
# Returns the list of words that must never be counted, building it on the
# first call and reusing it afterwards.
#
# The list is made of:
# * options[:exclude_words] (nil, a single word or an array of words);
# * every line of options[:stopwords_file], when that option and
#   options[:exclude_stopwords] are both set;
# * every line of options[:common_words_file], when that option and
#   options[:exclude_common_words] are both set.
#
# Every entry is stripped and lowercased. Candidate words are lowercased by
# +normalize_and_split+ before they are checked, so an entry that kept
# upper-case letters or surrounding whitespace could never match.
#
# @return [Array<String>] the normalized exclusion words
# @raise [SystemCallError] when a configured word file cannot be read
#   (raised by File.read)
def exclusion_list
  return @exclusion_list if @exclusion_list

  list = Array(options[:exclude_words]).map { |entry| entry.to_s.strip.downcase }

  file_sources = [
    [:stopwords_file, :exclude_stopwords],
    [:common_words_file, :exclude_common_words]
  ]
  file_sources.each do |file_key, flag_key|
    next unless options[file_key] && options[flag_key]

    list.concat(File.read(options[file_key]).split("\n").map { |line| line.strip.downcase })
  end

  # Memoize only once the list is complete, so a failed file read does not
  # leave a partial list cached for later calls.
  @exclusion_list = list
end
normalize_and_split(text)
# File lib/birdwatcher/word_list.rb, line 52
# Turns a text into a list of lowercase word tokens.
#
# Steps, in order: lowercase and strip the text, remove http:// and
# https:// URLs, replace every character that the second pattern does not
# allow (digits, a-z matched case-insensitively, "@", "#", "_" and space)
# with a space, then split on whitespace.
#
# @param text [String] the raw text
# @return [Array<String>] the tokens, in their original order
def normalize_and_split(text)
  lowered = text.downcase.strip
  without_urls = lowered.gsub(/https?:\/\/[\S]+/, "")
  cleaned = without_urls.gsub(/[^0-9a-z@#_ ]/i, " ")
  cleaned.split(" ").map { |token| token.strip }
end