class Yawc

Attributes

to_h[R]

Public Class Methods

new(s, level: 2) click to toggle source

level:

2 strips out ignore_words and stop_words
3 strips out ignore_words, stop_words and dictionary words
# File lib/yawc.rb, line 27
# Builds the word-frequency table for +s+ and stores it in @to_h, ordered
# from the highest count to the lowest.
#
# s     - text to analyse; it is tokenised by #words and split on whitespace.
# level - filtering level:
#         2 keeps every token returned by #words;
#         3 additionally drops tokens listed in WordsDotDat.words.
#
# Raises ArgumentError when level is neither 2 nor 3.
def initialize(s, level: 2)

  # #words reads @stopwords, so it has to be assigned before tokenising.
  @stopwords = WordsDotDat.stopwords

  tokens = case level
  when 2
    words(s).split
  when 3
    words(s).split - WordsDotDat.words
  else
    raise ArgumentError,
          "unsupported level: #{level.inspect} (expected 2 or 3)"
  end

  # Single counting pass; the counter hash (with its default value) is kept
  # local so @to_h stays a plain Hash.
  counts = Hash.new(0)
  tokens.each { |word| counts[word] += 1 }

  # Keys are copied and tagged as UTF-8, as before.
  pairs = counts.map do |word, total|
    [String.new(word).force_encoding("utf-8"), total]
  end

  @to_h = pairs.sort_by(&:last).reverse.to_h

end

Public Instance Methods

words(s) click to toggle source
# File lib/yawc.rb, line 46
# Normalises +s+ for word counting: lower-cases it, then strips
# apostrophe words, quotation marks, stray punctuation, isolated single
# characters and common words. Hashtags (#word) are left in place.
#
# s - the text to clean.
#
# Reads @stopwords (nil is treated as an empty list) in addition to the
# built-in list of common words.
#
# Returns the cleaned String; callers are expected to split it on whitespace.
def words(s)

  # words source:  http://norvig.com/mayzner.html
  ignorewords = %w[the of and to in a is that for
                   it as was with be by on not he
                   i this are or his from at which
                   but have an had they you were
                   their one all we can her has there
                   been if more when will would who
                   so no]

  # Regexp.union escapes each entry, so a stop word containing regex
  # metacharacters (e.g. "e.g") is matched literally.
  common = Regexp.union(ignorewords + Array(@stopwords))

  text = s.downcase

  # remove words containing an apostrophe
  text = text.gsub(/\w+'\w+/, '')

  # remove quotation marks
  text = text.gsub(/["']/, '')

  # a non-word character followed by a run of characters other than a-z,
  # "|" or "#" and then a word: keep only the word, padded with spaces
  text = text.gsub(/\W[^a-z|#]+(\w+)/, ' \1 ')

  # remove any single character standing alone between whitespace
  text = text.gsub(/\s.\s/, ' ')

  # ignore common words
  text = text.gsub(/\b(?:#{common.source})\b/, '')

  # remove any other items which are not words or hashtags; \B is
  # zero-width, so it carries no quantifier
  text.gsub(/\B[^\w#]\B/, '')

end