class GibberishDetector
Constants
- ACCEPTED_CHARACTERS
- DATA_FILE
- VERSION
Public Class Methods
gibberish?(text, opts = {})
click to toggle source
# File lib/gibberish_detector.rb, line 12
#
# Scores +text+ against the trained data file and reports whether it looks
# like gibberish.
#
# @param text [String] the text to score
# @param opts [Hash] optional settings; the hash is read, never modified
# @option opts [String] :lib_path path to the trained YAML file
#   (falls back to DATA_FILE)
# @option opts [Boolean] :raw when exactly +true+, return the numeric score
#   instead of a boolean
# @return [Boolean, Float] +true+ when the score is at or below the trained
#   threshold, +false+ otherwise; the score itself when +:raw+ is +true+
# @raise [GibberishDetectorException] when the data file is missing or does
#   not contain a trained library
def gibberish?(text, opts = {})
  # Copy the options into locals so the caller's hash is left untouched.
  lib_path = opts[:lib_path] || DATA_FILE
  raw = opts[:raw] == true

  unless File.exist?(lib_path)
    raise GibberishDetectorException, "Please run Gibberish.train! to build your trained data file."
  end

  # load_file opens the file in a block, so the handle is closed afterwards.
  # NOTE(review): the data file is expected to be one written by train!;
  # do not point :lib_path at YAML from an untrusted source.
  trained_library = YAML.load_file(lib_path)

  # An empty file may parse to nil or to false, so check for the shape we
  # actually need rather than for nil alone.
  unless trained_library.is_a?(Hash)
    raise GibberishDetectorException, 'Please run Gibberish.train! to build your trained data file.'
  end

  value = _averageTransitionProbability(text, trained_library[:matrix])
  return value if raw

  value <= trained_library[:threshold]
end
train!(opts={})
click to toggle source
# File lib/gibberish_detector.rb, line 29
#
# Builds the character-transition matrix from a large text sample, derives a
# gibberish threshold from known-good and known-bad sample lines, and writes
# both to a YAML data file.
#
# @param opts [Hash] optional settings; the hash is read, never modified
# @option opts [String] :big_text_file corpus used to count transitions
#   (default 'big.txt')
# @option opts [String] :good_text_file one acceptable sample per line
#   (default 'good.txt')
# @option opts [String] :bad_text_file one gibberish sample per line
#   (default 'bad.txt')
# @option opts [String] :lib_path where the trained data is written
#   (default DATA_FILE)
# @return [File] the (closed) file object the data was written to
# @raise [GibberishDetectorException] when an input file is missing, or when
#   the good and bad samples cannot be separated by a threshold
def train!(opts = {})
  # Honour caller-supplied paths and only fall back to the defaults.
  big_text_file  = opts[:big_text_file]  || 'big.txt'
  good_text_file = opts[:good_text_file] || 'good.txt'
  bad_text_file  = opts[:bad_text_file]  || 'bad.txt'
  lib_path       = opts[:lib_path]       || DATA_FILE

  unless [big_text_file, good_text_file, bad_text_file].all? { |path| File.exist?(path) }
    raise GibberishDetectorException, "We couldn't find one of #{big_text_file}, #{good_text_file} or #{bad_text_file}. Please ensure all three files exist before training."
  end

  # Maps each accepted character to its row/column index in the matrix.
  pos = ACCEPTED_CHARACTERS.each_char.with_index.to_h
  char_count = ACCEPTED_CHARACTERS.length

  # Every transition starts with a count of 10, so pairs never seen in the
  # corpus still end up with a finite log-probability.
  log_prob_matrix = {}
  char_count.times do |from|
    log_prob_matrix[from] = {}
    char_count.times { |to| log_prob_matrix[from][to] = 10 }
  end

  # Count each adjacent character pair in the normalized corpus.
  File.foreach(big_text_file) do |line|
    normalize(line).each_char.each_cons(2) do |a, b|
      row = (log_prob_matrix[pos[a]] ||= {})
      row[pos[b]] = (row[pos[b]] || 0) + 1
    end
  end

  # Turn the counts of each row into log-probabilities.
  log_prob_matrix.each_value do |row|
    total = row.values.sum.to_f
    row.each { |to, count| row[to] = Math.log(count / total) }
  end

  score_lines = lambda do |path|
    File.foreach(path).map { |line| _averageTransitionProbability(line.chomp, log_prob_matrix) }
  end
  min_good_probs = score_lines.call(good_text_file).min
  max_bad_probs = score_lines.call(bad_text_file).max

  # An empty sample file yields nil here; treat it like any other unusable
  # sample set instead of failing on a nil comparison.
  if min_good_probs.nil? || max_bad_probs.nil? || min_good_probs <= max_bad_probs
    raise GibberishDetectorException, "The prob counts are invalid."
  end

  # The threshold sits halfway between the worst good and the best bad score.
  threshold = (min_good_probs + max_bad_probs) / 2

  File.open(lib_path, 'w+') do |file|
    file << { :matrix => log_prob_matrix, :threshold => threshold }.to_yaml
  end
end
Private Class Methods
_averageTransitionProbability(line, log_prob_matrix)
click to toggle source
# File lib/gibberish_detector.rb, line 114
#
# Scores +line+ by averaging the log-probabilities of its adjacent character
# pairs and exponentiating the result.
#
# @param line [String] text to score; it is passed through normalize first
# @param log_prob_matrix [Hash] nested hash indexed as
#   matrix[from_index][to_index], holding one log-probability per pair
# @return [Float] exp(accumulated log-probability / number of transitions),
#   with the divisor clamped to at least 1
def _averageTransitionProbability(line, log_prob_matrix)
  # Maps each accepted character to its row/column index in the matrix.
  pos = ACCEPTED_CHARACTERS.each_char.with_index.to_h

  transitions = normalize(line).each_char.each_cons(2).to_a

  # NOTE(review): the accumulator starts at 1.0, not 0.0. It is kept as-is
  # because train! computes the stored threshold through this same method,
  # so changing the offset would invalidate existing trained data files.
  log_prob = transitions.inject(1.0) do |sum, (a, b)|
    sum + log_prob_matrix[pos[a]][pos[b]]
  end

  # Clamp the divisor so text with fewer than two usable characters does not
  # divide by zero.
  Math.exp(log_prob / [transitions.length, 1].max)
end
normalize(text)
click to toggle source
# File lib/gibberish_detector.rb, line 110 def normalize(text) text.downcase.gsub(/[^a-z\ ]/, '') end