class Yawc
Attributes
to_h[R]
Public Class Methods
new(s, level: 2)
click to toggle source
level:
2 strips out ignore_words and stop_words; 3 additionally strips out dictionary words
# File lib/yawc.rb, line 27 def initialize(s, level: 2) @stopwords = WordsDotDat.stopwords a = case level when 2 words(s).split when 3 list = words(s).split list - WordsDotDat.words end h = a.group_by(&:to_s).\ inject({}){|r, x| r.merge(String.new(x[0]).\ force_encoding("utf-8") => x[-1].length)} @to_h = h.sort_by(&:last).reverse.to_h end
Public Instance Methods
words(s)
click to toggle source
# File lib/yawc.rb, line 46
# Normalises the text +s+ into a lower-case string from which common words
# and most punctuation have been removed. Callers split the result on
# whitespace to obtain the individual words.
#
# s - the text to clean up.
#
# Reads @stopwords; its entries are removed together with the built-in list
# of common English words. A nil @stopwords is treated as an empty list.
#
# Returns a new String.
def words(s)
  # words source: http://norvig.com/mayzner.html
  ignorewords = %w[the of and to in a is that for it as was with be by on
                   not he i this are or his from at which but have an had
                   they you were their one all we can her has there been
                   if more when will would who so no]

  # Regexp.union escapes every entry, so a stop word containing a regexp
  # metacharacter cannot corrupt the pattern.
  common = Regexp.union(ignorewords + Array(@stopwords))

  s.downcase
   .gsub(/\w+'\w+/, '')               # remove words containing an apostrophe
   .gsub(/["']/, '')                  # remove quotation marks
   .gsub(/\W[^a-z|#]+(\w+)/, ' \1 ')  # replace a non-word character and the non-alphabetical run after it with spaces around the following word
   .gsub(/\s.\s/, ' ')                # remove any single character standing alone between whitespace
   .gsub(/\b(?:#{common})\b/, '')     # ignore common words
   .gsub(/\B[^\w#]\B/, '')            # remove remaining characters that are neither word characters nor '#' and do not touch a word; the original quantified the trailing \B, which cannot change what matches
end