class ChatCorrect::Correct
Constants
- TYPES_OF_MISTAKES
Attributes
corrected_sentence[R]
original_sentence[R]
tgr[R]
Public Class Methods
new(original_sentence:, corrected_sentence:)
click to toggle source
# File lib/chat_correct/correct.rb, line 5 def initialize(original_sentence:, corrected_sentence:) @original_sentence = original_sentence @corrected_sentence = corrected_sentence @tgr = EngTagger.new Linguistics.use(:en) end
Public Instance Methods
correct()
click to toggle source
# File lib/chat_correct/correct.rb, line 12 def correct raise "You must include an Original Sentence" if original_sentence.nil? || original_sentence.eql?('') raise "You must include a Corrected Sentence" if corrected_sentence.nil? || corrected_sentence.eql?('') analyze end
mistake_report()
click to toggle source
# File lib/chat_correct/correct.rb, line 29 def mistake_report raise "You must include an Original Sentence" if original_sentence.nil? || original_sentence.eql?('') raise "You must include a Corrected Sentence" if corrected_sentence.nil? || corrected_sentence.eql?('') mistake_report_hash = {} TYPES_OF_MISTAKES.each do |mistake| counter = 0 mistakes.each do |key, value| counter += 1 if value['error_type'].eql?(mistake) || value['error_type'].split('_')[1].eql?(mistake) end mistake_report_hash[mistake] = counter end mistake_report_hash end
mistakes()
click to toggle source
# File lib/chat_correct/correct.rb, line 18 def mistakes raise "You must include an Original Sentence" if original_sentence.nil? || original_sentence.eql?('') raise "You must include a Corrected Sentence" if corrected_sentence.nil? || corrected_sentence.eql?('') mistakes_hash = {} analyze.each do |key, value| next if (!value['type'].split('_')[-1].eql?('mistake') && !value['type'].split('_')[0].eql?('missing')) || value['type'].split('_')[0].eql?('no') mistakes_hash = build_mistakes_hash(mistakes_hash, key, value) end mistakes_hash end
number_of_mistakes()
click to toggle source
# File lib/chat_correct/correct.rb, line 43 def number_of_mistakes raise "You must include an Original Sentence" if original_sentence.nil? || original_sentence.eql?('') raise "You must include a Corrected Sentence" if corrected_sentence.nil? || corrected_sentence.eql?('') mistakes.length end
Private Instance Methods
analyze()
click to toggle source
# File lib/chat_correct/correct.rb, line 51 def analyze @analyze ||= iterate_stages end
assign_next_token(hash, index, lookup, tokenized_array)
click to toggle source
# File lib/chat_correct/correct.rb, line 220 def assign_next_token(hash, index, lookup, tokenized_array) if (index + lookup) > (tokenized_array.length - 1) hash["next_word#{lookup}"] = 'ȹ' else hash["next_word#{lookup}"] = tokenized_array[index + lookup] end end
assign_previous_token(hash, index, lookup, tokenized_array)
click to toggle source
# File lib/chat_correct/correct.rb, line 212 def assign_previous_token(hash, index, lookup, tokenized_array) if index - lookup < 0 hash["prev_word#{lookup}"] = 'ȸ' else hash["prev_word#{lookup}"] = tokenized_array[index - lookup] end end
build_corrections_hash(correction_hash)
click to toggle source
# File lib/chat_correct/correct.rb, line 128 def build_corrections_hash(correction_hash) final_hash = {} correction_hash.each do |k, v| interim_hash = {} interim_hash['token'] = reverse_symbols(v.keys[0]) interim_hash['type'] = v.values[0] final_hash[k] = interim_hash end final_hash end
build_mistakes_hash(mistakes_hash, key, value)
click to toggle source
# File lib/chat_correct/correct.rb, line 104 def build_mistakes_hash(mistakes_hash, key, value) interim_hash = {} interim_hash['position'] = key interim_hash = update_interim_hash_with_error(interim_hash, value) if value['type'].split('_')[1].eql?('order') if mistakes_hash.length.eql?(0) || mistakes_hash[0]['error_type'].eql?('unnecessary_word') || mistakes_hash[0]['error_type'].eql?('word_order') interim_hash['mistake'] = reverse_symbols(original_sentence_info_hash[key]['token']) else interim_hash['mistake'] = reverse_symbols(original_sentence_info_hash[key - 1]['token']) end elsif value['type'].split('_').length > 2 && value['type'].split('_')[1].eql?('punctuation') interim_hash['mistake'] = '' else interim_hash['mistake'] = value['token'] end if correct[key + 1].blank? && value['type'].split('_')[1].eql?('order') interim_hash['correction'] = 'N/A' else interim_hash = update_interim_hash_with_correction(interim_hash, key) unless correct[key + 1].blank? end mistakes_hash[mistakes_hash.length] = interim_hash mistakes_hash end
corrected_sentence_info_hash()
click to toggle source
# File lib/chat_correct/correct.rb, line 178 def corrected_sentence_info_hash @corrected_sentence_info_hash ||= create_sentence_info_hash(corrected_sentence_tokenized, corrected_sentence_tokenized_downcased, corrected_sentence_tagged) end
corrected_sentence_tagged()
click to toggle source
# File lib/chat_correct/correct.rb, line 162 def corrected_sentence_tagged @corrected_sentence_tagged ||= tgr.add_tags(corrected_sentence).split end
corrected_sentence_tokenized()
click to toggle source
# File lib/chat_correct/correct.rb, line 154 def corrected_sentence_tokenized @corrected_sentence_tokenized ||= ChatCorrect::CombineMultiWordVerbs.new(text: corrected_sentence, tgr: tgr).combine end
corrected_sentence_tokenized_downcased()
click to toggle source
# File lib/chat_correct/correct.rb, line 170 def corrected_sentence_tokenized_downcased @corrected_sentence_tokenized_downcased ||= corrected_sentence_tokenized.map { |token| token.downcase } end
create_sentence_info_hash(sentence_tokenized, sentence_tokenized_downcased, sentence_tagged)
click to toggle source
# File lib/chat_correct/correct.rb, line 182 def create_sentence_info_hash(sentence_tokenized, sentence_tokenized_downcased, sentence_tagged) sentence_hash = {} sentence_tokenized.each_with_index do |token, index| sentence_info = {} sentence_info['token'] = token assign_previous_token(sentence_info, index, 1, sentence_tokenized) assign_previous_token(sentence_info, index, 2, sentence_tokenized) assign_next_token(sentence_info, index, 1, sentence_tokenized) assign_next_token(sentence_info, index, 2, sentence_tokenized) sentence_info['num_char'] = token.length sentence_info['position'] = index sentence_info['multiple_words'] = token.include?(' ') ? true : false sentence_info['lowercase'] = token.downcase sentence_info['match_id'] = 'c' + index.to_s if sentence_tokenized.eql?(corrected_sentence_tokenized) sentence_info['pos_tag'] = sentence_tagged[index].to_s.partition('>').first[1..-1] sentence_info['punctuation'] = ChatCorrect::Punctuation.new(text: token).is_punctuation? sentence_info['duplicates'] = (sentence_tokenized_downcased.count(token.downcase) > 1 ? true : false) sentence_info['uid'] = sentence_tokenized.eql?(corrected_sentence_tokenized) ? 'corrected' + index.to_s : 'original' + index.to_s sentence_info['matched'] = false sentence_info['is_time'] = ChatCorrect::Time.new(text: token).is_time? sentence_hash[index] = sentence_info end sentence_hash end
debug()
click to toggle source
# File lib/chat_correct/correct.rb, line 261 def debug # puts "++++++++++++++++++++" # original_sentence_info_hash.each do |k, v| # puts 'Key: ' + k.to_s + '; Word: ' + v['token'].to_s + '; Match ID: ' + v['match_id'].to_s # end end
iterate_sentences(inner_method)
click to toggle source
# File lib/chat_correct/correct.rb, line 321 def iterate_sentences(inner_method) corrected_sentence_info_hash.each do |kc, vc| next if vc['matched'] original_sentence_info_hash.each do |ks, vs| next if !vs['match_id'].to_s.strip.empty? send("#{inner_method}", kc, vc, ks, vs) end end end
iterate_stages()
click to toggle source
# File lib/chat_correct/correct.rb, line 55 def iterate_stages stage_1 # debug stage_2 # debug iterate_sentences('stage_3') # debug iterate_sentences('stage_4') # debug iterate_sentences('stage_5') # debug iterate_sentences('stage_6') # debug iterate_sentences('stage_7') # debug stage_8 # debug prev_next_match_check # debug stage_9 # debug correction_hash = ChatCorrect::CorrectionsHash.new(original_sentence_info_hash: original_sentence_info_hash, corrected_sentence_info_hash: corrected_sentence_info_hash).create build_corrections_hash(correction_hash) end
original_sentence_info_hash()
click to toggle source
# File lib/chat_correct/correct.rb, line 174 def original_sentence_info_hash @original_sentence_info_hash ||= create_sentence_info_hash(original_sentence_tokenized, original_sentence_tokenized_downcased, original_sentence_tagged) end
original_sentence_tagged()
click to toggle source
# File lib/chat_correct/correct.rb, line 158 def original_sentence_tagged @original_sentence_tagged ||= tgr.add_tags(original_sentence).split end
original_sentence_tokenized()
click to toggle source
# File lib/chat_correct/correct.rb, line 150 def original_sentence_tokenized @original_sentence_tokenized ||= ChatCorrect::CombineMultiWordVerbs.new(text: original_sentence, tgr: tgr).combine end
original_sentence_tokenized_downcased()
click to toggle source
# File lib/chat_correct/correct.rb, line 166 def original_sentence_tokenized_downcased @original_sentence_tokenized_downcased ||= original_sentence_tokenized.map { |token| token.downcase } end
prev_next_match_check()
click to toggle source
# File lib/chat_correct/correct.rb, line 228 def prev_next_match_check corrected_sentence_info_hash.each do |kc, vc| if !vc['matched'] prev_match_vc = set_previous_match(kc, corrected_sentence_info_hash) next_match_vc = set_next_match(kc, corrected_sentence_info_hash) original_sentence_info_hash.each do |ks, vs| prev_match_vs = set_previous_match(ks, original_sentence_info_hash) next_match_vs = set_next_match(ks, original_sentence_info_hash) next if vs['match_id'] next unless prev_match_vc.eql?(prev_match_vs) && next_match_vc.eql?(next_match_vs) original_sentence_info_hash[ks]['match_id'] = vc['match_id'] corrected_sentence_info_hash[kc]['matched'] = true end end end end
reverse_symbols(txt)
click to toggle source
# File lib/chat_correct/correct.rb, line 139 def reverse_symbols(txt) txt.gsub('∬', '"') .gsub('∯', '"') .gsub('ƪ', "'") .gsub('∫', "'") .gsub('∮', "'") .gsub('☍', ". ") .gsub('☊', ".") .gsub('☌', ",") end
set_next_match(key, hash)
click to toggle source
# File lib/chat_correct/correct.rb, line 253 def set_next_match(key, hash) if key.eql?(hash.length - 1) 'ȹ' else hash[key + 1]['match_id'] end end
set_previous_match(key, hash)
click to toggle source
# File lib/chat_correct/correct.rb, line 245 def set_previous_match(key, hash) if key.eql?(0) 'ȸ' else hash[key - 1]['match_id'] end end
stage_1()
click to toggle source
# File lib/chat_correct/correct.rb, line 268 def stage_1 matched_id_array = [] corrected_sentence_info_hash.each do |kc, vc| original_sentence_info_hash.each do |ko, vo| if (vc['lowercase'].eql?(vo['lowercase']) || (vc['prev_word1'].eql?(vo['prev_word1']) && vc['next_word1'].eql?(vo['next_word1']) && !vc['is_time'] && !vo['is_time'] && (!ChatCorrect::Punctuation.new(text: vc['prev_word1']).is_punctuation? && !ChatCorrect::Punctuation.new(text: vc['next_word1']).is_punctuation?) && vc['punctuation'].eql?(vo['punctuation']))) && !matched_id_array.include?(vc['match_id'].to_s) && !vo['duplicates'] && !vc['duplicates'] original_sentence_info_hash[ko]['match_id'] = vc['match_id'] corrected_sentence_info_hash[kc]['matched'] = true matched_id_array << vc['match_id'].to_s end end end end
stage_2()
click to toggle source
# File lib/chat_correct/correct.rb, line 291 def stage_2 corrected_sentence_info_hash.each do |kc, vc| if !vc['matched'] prev_match_vc = set_previous_match(kc, corrected_sentence_info_hash) next_match_vc = set_next_match(kc, corrected_sentence_info_hash) if kc.eql?(corrected_sentence_info_hash.length - 1) next_word_vc = 'ȹ' else next_word_vc = corrected_sentence_info_hash[kc + 1]['token'] end original_sentence_info_hash.each do |ks, vs| prev_match_vs = set_previous_match(ks, original_sentence_info_hash) next_match_vs = set_next_match(ks, original_sentence_info_hash) if ks.eql?(original_sentence_info_hash.length - 1) next_word_vs = 'ȹ' else next_word_vs = original_sentence_info_hash[ks + 1]['token'] end next if vs['match_id'] if prev_match_vc.eql?(prev_match_vs) && next_match_vc.eql?(next_match_vs) original_sentence_info_hash[ks]['match_id'] = vc['match_id'] corrected_sentence_info_hash[kc]['matched'] = true end next unless vs['token'].eql?(next_word_vs) && vs['token'] != next_word_vc original_sentence_info_hash[ks]['match_id'] = 'd' + ks.to_s end end end end
stage_3(kc, vc, ks, vs)
click to toggle source
# File lib/chat_correct/correct.rb, line 331 def stage_3(kc, vc, ks, vs) return if vc['token'] != vs['token'] || (vc['prev_word1'] != vs['prev_word1'] && vc['next_word1'] != vs['next_word1']) || vc['matched'] || vs['prev_word1'].eql?('ȸ') write_match_to_info_hash(ks, kc, vc) end
stage_4(kc, vc, ks, vs)
click to toggle source
# File lib/chat_correct/correct.rb, line 338 def stage_4(kc, vc, ks, vs) return if vc['token'].length < 4 || vs['token'].length < 4 || Text::Levenshtein.distance(vc['token'], vs['token']) > 2 || vc['matched'] write_match_to_info_hash(ks, kc, vc) end
stage_5(kc, vc, ks, vs)
click to toggle source
# File lib/chat_correct/correct.rb, line 344 def stage_5(kc, vc, ks, vs) return if !ChatCorrect::Pluralization.new(token_a: vc['token'], token_b: vs['token']).pluralization_error? || vc['matched'] write_match_to_info_hash(ks, kc, vc) end
stage_6(kc, vc, ks, vs)
click to toggle source
# File lib/chat_correct/correct.rb, line 350 def stage_6(kc, vc, ks, vs) return if !ChatCorrect::Verb.new(word: vs['token'], pos: vc['pos_tag'], text: vc['token']).verb_error? || (vc['prev_word1'] != vs['prev_word1'] && vc['next_word1'] != vs['next_word1']) || vc['matched'] || vs['next_word1'].include?(' ') write_match_to_info_hash(ks, kc, vc) end
stage_7(kc, vc, ks, vs)
click to toggle source
# File lib/chat_correct/correct.rb, line 357 def stage_7(kc, vc, ks, vs) # Distance between position of words is currently hardcoded to 5, # but this is a SWAG and can be adjusted based on testing. # The idea is to stop the algorithm from matching words like 'to' # and 'the' that appear very far apart in the sentence and should not be matched. return if vc['token'].length < 2 || vs['token'].length < 2 || Text::Levenshtein.distance(vc['token'], vs['token']) > 2 || vs['token'].to_s[0] != vc['token'].to_s[0] || (vs['position'].to_i - vc['position'].to_i).abs > 4 || vc['matched'] write_match_to_info_hash(ks, kc, vc) end
stage_8()
click to toggle source
# File lib/chat_correct/correct.rb, line 371 def stage_8 corrected_sentence_info_hash.each do |kc, vc| if !vc['matched'] next_match_vc = set_next_match(kc, corrected_sentence_info_hash) original_sentence_info_hash.each do |ks, vs| next_match_vs = set_next_match(ks, original_sentence_info_hash) next if vs['match_id'] write_match_to_info_hash(ks, kc, vc) if vs['multiple_words'] && vc['multiple_words'] && !vc['matched'] write_match_to_info_hash(ks, kc, vc) if next_match_vc.eql?('ȹ') && next_match_vs.eql?('ȹ') && vs['token'].gsub(/[[:punct:]]/, '').eql?('') && vc['token'].gsub(/[[:punct:]]/, '').eql?('') && !vc['matched'] end end end end
stage_9()
click to toggle source
# File lib/chat_correct/correct.rb, line 385 def stage_9 original_sentence_info_hash.each do |k, v| next if v['match_id'] original_sentence_info_hash[k]['match_id'] = 's' + k.to_s end end
update_interim_hash_with_correction(interim_hash, key)
click to toggle source
# File lib/chat_correct/correct.rb, line 93 def update_interim_hash_with_correction(interim_hash, key) if correct[key]['type'].split('_')[1].eql?('order') interim_hash['correction'] = 'N/A' elsif correct[key + 1]['type'].split('_')[0].eql?(correct[key]['type'].split('_')[0]) interim_hash['correction'] = correct[key + 1]['token'] else interim_hash['correction'] = '' end interim_hash end
update_interim_hash_with_error(interim_hash, value)
click to toggle source
# File lib/chat_correct/correct.rb, line 80 def update_interim_hash_with_error(interim_hash, value) if value['type'].split('_').length > 2 if value['type'].split('_')[1].eql?('punctuation') interim_hash['error_type'] = 'punctuation' else interim_hash['error_type'] = value['type'].split('_')[0] + '_' + value['type'].split('_')[1] end else interim_hash['error_type'] = value['type'].split('_')[0] end interim_hash end
write_match_to_info_hash(ks, kc, vc)
click to toggle source
# File lib/chat_correct/correct.rb, line 207 def write_match_to_info_hash(ks, kc, vc) original_sentence_info_hash[ks]['match_id'] = vc['match_id'] corrected_sentence_info_hash[kc]['matched'] = true end