class TextUtils::Classifier
Public Class Methods
new()
click to toggle source
# File lib/textutils/classifier.rb, line 9
#
# Sets up the empty training store: a hash that maps a key (e.g. a language
# or topic code) to its list of words.
def initialize
  # Use a default *block* so that every unknown key gets its own fresh array.
  # Hash.new( [] ) would hand out one shared array object for all missing keys,
  # so an in-place append on one key would silently show up under every other key.
  @h = Hash.new { |hash, key| hash[key] = [] }
end
Public Instance Methods
classify( text_with_comments )
click to toggle source
# File lib/textutils/classifier.rb, line 35
#
# Returns the trained key whose word list occurs most often in the text.
#
# Comment lines are removed (via strip_comments) before counting, then the
# occurrences of every key's words are summed (via count_words_in_text).
#
# text_with_comments - the text to classify (a String)
#
# Returns the key with the highest word count, or nil when no key has been
# trained yet.
def classify( text_with_comments )
  ## check encoding
  logger.debug " classify - text.encoding: #{text_with_comments.encoding.name}"

  # nb: strip comments first
  text = strip_comments( text_with_comments )

  ## e.g. [[ 'en', 20],   # 20 words
  ##       [ 'de',  2]]   #  2 words
  counts = []
  @h.each_with_index do |(key, words), i|
    logger.debug "key #{key} (#{i+1}/#{@h.size}) - #{words.size} words"
    counts << [key, count_words_in_text( words, text )]
  end

  # nothing trained => nothing to rank; counts[0][0] below would fail on nil
  if counts.empty?
    logger.debug "classifier - no keys trained"
    return nil
  end

  # sort by word count (reverse sort e.g. highest count goes first)
  counts = counts.sort { |l, r| r[1] <=> l[1] }

  # dump stats
  ## e.g. 1. en: 20 words
  ##      2. de: 2 words
  logger.debug "results:"
  counts.each_with_index do |(key, count), i|
    logger.debug " #{i+1}. #{key}: #{count}"
  end

  best_key = counts[0][0]
  logger.debug "classifier - using key >>#{best_key}<<"

  ## return key/lang code w/ highest count
  best_key
end
classify_file( path )
click to toggle source
# File lib/textutils/classifier.rb, line 31
#
# Reads the file at +path+ and classifies its contents.
#
# path - location of the text file to classify
#
# Returns whatever classify returns for the file's text.
def classify_file( path )
  # NOTE(review): File.read_utf8 is not core Ruby - presumably a project
  # extension that reads the file as UTF-8; confirm where it is defined.
  contents = File.read_utf8( path )
  classify( contents )
end
dump()
click to toggle source
# File lib/textutils/classifier.rb, line 71
#
# Debugging aid: logs every trained key together with its word list and the
# string encodings found in that list.
def dump
  total = @h.size

  @h.each_with_index do |(key, words), idx|
    logger.debug "key #{key} (#{idx+1}/#{total}) - #{words.size} words:"
    logger.debug words.inspect

    ## report the encoding of the words (trouble w/ windows cp850),
    ## but only when it differs from the encoding of the preceding word
    previous_name = ''
    words.each do |word|
      current_name = word.encoding.name
      next if current_name == previous_name

      logger.debug " encoding: #{current_name}"
      previous_name = current_name
    end
  end
end
train( key, ary_or_hash_or_str )
click to toggle source
# File lib/textutils/classifier.rb, line 13
#
# Adds words to the word list of a lang/topic key.
#
# key                - the key (e.g. 'en') the words belong to
# ary_or_hash_or_str - the words, given as one of:
#                        Array  - used as the list of words
#                        Hash   - keys are ignored; every value is a String of
#                                 words separated by |
#                        String - words separated by |
#
# Blank entries (e.g. from "a||b" or a leading |) are dropped: an empty word
# matches at every position of a text and carries no information.
#
# Returns the updated word list for key.
def train( key, ary_or_hash_or_str )
  words = case ary_or_hash_or_str
          when Array
            ary_or_hash_or_str
          when Hash
            ary_or_hash_or_str.values.flat_map { |values| values.strip.split( '|' ) }
          else
            # assume string (allow list separated by |)
            ary_or_hash_or_str.strip.split( '|' )
          end

  # += builds a new array, so a caller's array is never modified
  @h[ key ] += words.reject { |word| word.to_s.empty? }
end
Private Instance Methods
count_word_in_text( word, text )
click to toggle source
# File lib/textutils/classifier.rb, line 123
#
# Counts the non-overlapping occurrences of +word+ in +text+.
#
# word - the String to search for
# text - the String to search in
#
# Returns the number of hits (0 for an empty word).
def count_word_in_text( word, text )
  # An empty word is found at every offset and the search position would
  # never advance (pos + 0), so the loop below would never terminate.
  return 0 if word.empty?

  count = 0
  pos   = text.index( word )
  while pos
    count += 1
    logger.debug "bingo - found >>#{word}<< on pos #{pos}, count: #{count}"
    # pos is the offset of the hit's first character, so pos + word.length is
    # the first character after the hit - no extra +1 needed
    pos = text.index( word, pos + word.length )
  end
  count
end
count_words_in_text( words, text )
click to toggle source
# File lib/textutils/classifier.rb, line 135
#
# Adds up the occurrences of every word of +words+ in +text+.
#
# words - list of words to look for
# text  - the String to search in
#
# Returns the summed count over all words (0 for an empty list).
def count_words_in_text( words, text )
  words.inject( 0 ) do |total, word|
    total + count_word_in_text( word, text )
  end
end
strip_comments( text )
click to toggle source
# File lib/textutils/classifier.rb, line 91
#
# Removes comments from +text+.
#
# Whole lines are dropped when they start (after optional whitespace) with:
#   1) #   (shell/ruby style)
#   2) --  (haskell style)
#   3) %   (tex/latex style)
#
# On all other lines a trailing end-of-line comment (whitespace followed by #
# and at least one more character) is cut off.
#
# text - the String to clean up
#
# Returns a new String; every kept line ends with exactly one "\n".
def strip_comments( text )
  new_text = ''.dup   # unfrozen buffer, keeps working with frozen string literals

  text.each_line do |line|
    if line =~ /^\s*#/ || line =~ /^\s*--/ || line =~ /^\s*%/
      # skip comments and do NOT copy to result (keep comments secret!)
      logger.debug 'skipping comment line'
      next
    end

    # each_line keeps the line terminator; drop it first, otherwise the "\n"
    # appended below would leave a blank line after every kept line
    line = line.chomp

    # remove possible trailing eol comment
    ##  e.g    -> nyc, New York   # Sample EOL Comment Here (with or without commas,,,,)
    ## becomes -> nyc, New York
    line = line.sub( /\s+#.+$/, '' )

    new_text << line << "\n"
  end

  new_text
end