module CloudText
Constants
- MAJOR
- MINOR
- TINY
- VERSION
PRE = nil
Public Class Methods
clean_text(input, options = {})
click to toggle source
# File lib/cloud_text.rb, line 8
# Cleans +input+ and returns its word frequencies.
#
# The text is handed to process_text and the result is passed to
# count_words, whose return value is returned here.
#
# @param input the text to clean
# @param options [Hash] optional settings:
#   :remove_digits - passed to process_text as remove_digits (default: false)
#   :stemming      - passed to process_text as stemming_enabled (default: false)
#   :language      - language code (default: "en")
#   :stopwords     - additional stopwords (default: [])
# @return whatever count_words returns for the processed input
def self.clean_text(input, options = {})
  @input = input
  @options = options

  # Get variables
  @language = @options.fetch(:language, "en")
  @custom_stopwords = @options.fetch(:stopwords, [])

  # Get feature on/off switches
  use_stemming = @options.fetch(:stemming, false)
  strip_digits = @options.fetch(:remove_digits, false)

  @input = process_text(@input, use_stemming, strip_digits, @language, @custom_stopwords)
  count_words(@input)
end
Private Class Methods
count_words(input)
click to toggle source
Counts the words and returns an array of [word, frequency] pairs, sorted from most to least frequent.
# File lib/cloud_text.rb, line 66
# Counts how often each token occurs.
#
# @param input an enumerable of tokens
# @return [Array<Array>] [token, frequency] pairs, highest frequency first
def self.count_words(input)
  frequencies = Hash.new(0)
  input.each { |word| frequencies[word] += 1 }
  frequencies.sort_by { |_word, count| count }.reverse
end
filter_stopwords(input, language, custom_stopwords = nil)
click to toggle source
Removes the stopwords of the given language, plus any custom stopwords supplied by the caller.
# File lib/cloud_text.rb, line 49
# Splits +input+ on whitespace and removes stopwords from the tokens.
#
# For language "tr" the filter is built from tr_stopwords; for any other
# language a Stopwords::Snowball::Filter for that language is used.
#
# @param input [String] text to split and filter
# @param language [String] language code
# @param custom_stopwords [Array<String>, String, nil] extra words to drop
# @return the result of the filter applied to the whitespace-split tokens
def self.filter_stopwords(input, language, custom_stopwords = nil)
  stopword_filter =
    if language == "tr"
      Stopwords::Filter.new(tr_stopwords)
    else
      Stopwords::Snowball::Filter.new(language)
    end

  # Here we intentionally do not downcase custom_stopwords
  # since we want to filter only capitalized version of a word.
  #
  # The words are appended one by one: pushing the whole array with `<<`
  # would add a single nested element that never matches any token.
  # Array() also covers nil and a lone string.
  stopword_filter.stopwords.concat(Array(custom_stopwords))
  stopword_filter.filter(input.split)
end
lowercase_words(input, language)
click to toggle source
# File lib/cloud_text.rb, line 44
# Downcases +input+ with UnicodeUtils, passing the language as a symbol.
#
# @param input the text to downcase
# @param language the language code; converted with #to_sym
# @return the value returned by UnicodeUtils.downcase
def self.lowercase_words(input, language)
  locale = language.to_sym
  UnicodeUtils.downcase(input, locale)
end
process_text(input, stemming_enabled, remove_digits, language, custom_stopwords)
click to toggle source
# File lib/cloud_text.rb, line 26
# Runs the cleaning pipeline on +input+: punctuation removal, whitespace
# reduction, lowercasing, stopword filtering and, optionally, stemming.
#
# @param input the raw text
# @param stemming_enabled when truthy, the filtered tokens are stemmed
# @param remove_digits forwarded to remove_punctuation
# @param language language code forwarded to the language-aware steps
# @param custom_stopwords forwarded to filter_stopwords
# @return the filtered tokens, stemmed when stemming_enabled is truthy
def self.process_text(input, stemming_enabled, remove_digits, language, custom_stopwords)
  text = remove_punctuation(input, remove_digits)
  text = reduce_whitespaces(text)
  text = lowercase_words(text, language)
  # Get custom_stopwords from user and filter words
  tokens = filter_stopwords(text, language, custom_stopwords)

  # Return the tokens explicitly in both cases. Ending the method with
  # `x = ... if stemming_enabled` would make it return nil whenever
  # stemming is disabled.
  stemming_enabled ? stemming(tokens, language) : tokens
end
reduce_whitespaces(input)
click to toggle source
Collapses each run of whitespace characters into a single space.
# File lib/cloud_text.rb, line 40
# Replaces every run of whitespace in +input+ with one space character.
#
# @param input [String] the text to normalize
# @return [String] a new string; +input+ itself is not modified
def self.reduce_whitespaces(input)
  whitespace_run = /\s+/
  input.gsub(whitespace_run, ' ')
end
remove_punctuation(input, remove_digits)
click to toggle source
# File lib/cloud_text.rb, line 34
# Replaces every character that is not kept with a single space.
#
# Kept characters are ASCII letters, the Turkish letters listed in the
# pattern, whitespace and, unless +remove_digits+ is truthy, ASCII digits.
# Everything else, including "_" and "^", becomes a space.
#
# @param input [String] the text to clean
# @param remove_digits when truthy, digits are replaced as well
# @return [String] a new string of the same length as +input+
def self.remove_punctuation(input, remove_digits)
  # A "^" anywhere but the first position of a character class is a literal
  # caret, so it must not appear in the list of kept characters.
  regex =
    if remove_digits
      /[^A-Za-zşŞıİçÇöÖüÜĞğ\s]/
    else
      /[^A-Za-z0-9şŞıİçÇöÖüÜĞğ\s]/
    end
  input.gsub(regex, ' ')
end
stemming(input, language)
click to toggle source
# File lib/cloud_text.rb, line 58
# Stems every word in +input+ with a Lingua::Stemmer built for +language+.
#
# @param input a collection of words responding to #map
# @param language the language passed to Lingua::Stemmer.new
# @return [Array] the stemmed words, in the same order as +input+
def self.stemming(input, language)
  stemmer = Lingua::Stemmer.new(language: language)
  input.map(&stemmer.method(:stem))
end
tr_stopwords()
click to toggle source
# File lib/cloud_text.rb, line 70
# Reads the Turkish stopword list, one entry per line, with the trailing
# line terminators stripped.
#
# NOTE(review): the path is relative, so it is resolved against the
# process's current working directory — confirm this is intended.
#
# @return [Array<String>] the lines of the 'tr_stopwords_dict' file
def self.tr_stopwords
  lines = File.readlines('tr_stopwords_dict')
  lines.map(&:chomp)
end