class ChatCorrect::Tokenize
Constants
- ABBREVIATIONS
- PUNCTUATION
Attributes
text[R]
Public Class Methods
new(text:)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 6
#
# Stores the text that the tokenizing methods will operate on.
#
# @param text [String, nil] the raw text; kept as-is in @text
def initialize(text:)
  @text = text
end
Public Instance Methods
tokenize()
click to toggle source
# File lib/chat_correct/tokenize.rb, line 10
#
# Splits the stored text into tokens.
#
# The text is run through the conversion helpers in a fixed order, split on
# whitespace, and then passed through the two punctuation-separating helpers.
#
# @return [Array<String>, nil] the tokens; nil when the text is nil; a
#   one-element array when the text consists only of word characters
def tokenize
  return if text.nil?
  return [text] if /\A\w+\z/ =~ text

  # Order matters: each step consumes the output of the previous one.
  conversions = %i[
    convert_quotes
    shift_all_punct
    convert_contractions
    convert_numbers_with_commas
    convert_numbers_with_periods
  ]
  prepared = conversions.reduce(text) { |acc, step| send(step, acc) }
  pieces = prepared.split(' ')

  separate_other_ending_punc(separate_full_stop(pieces)).map do |piece|
    piece.delete("\n\r").strip
  end
end
tokenize_no_punct()
click to toggle source
# File lib/chat_correct/tokenize.rb, line 24
#
# Tokenizes the stored text and removes every token listed in PUNCTUATION.
#
# The token list is computed once and reused for both the nil check and the
# result; tokenize already returns nil for nil text, so no separate text
# check is needed.
#
# @return [Array<String>, nil] tokens without PUNCTUATION entries, or nil
#   when tokenize returns nil
def tokenize_no_punct
  tokens = tokenize
  return if tokens.nil?

  tokens - PUNCTUATION
end
Private Instance Methods
convert_contractions(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 105
#
# Replaces the apostrophe inside common English contractions with the
# placeholder character 'ƪ'.
#
# Three passes are applied in order: letter + 'd / 'm / 's, then n't, then
# 've / 'll / 're.
#
# @param txt [String] text to convert
# @return [String] a new string with contraction apostrophes replaced
def convert_contractions(txt)
  short_forms_done = txt.gsub(/([A-Za-z])'([dms])\b/o, '\1ƪ\2')
  negations_done = short_forms_done.gsub(/n't\b/o, 'nƪt')
  negations_done.gsub(/'(ve|ll|re)\b/o, 'ƪ\1')
end
convert_numbers_with_commas(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 111
#
# Replaces every comma that sits directly between two digits with the
# placeholder character '☌'.
#
# @param txt [String] text to convert
# @return [String] a new string with digit-separating commas replaced
def convert_numbers_with_commas(txt)
  comma_between_digits = /(?<=\d),(?=\d)/
  txt.gsub(comma_between_digits) { '☌' }
end
convert_numbers_with_periods(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 115
#
# Replaces every period that sits directly between two digits with the
# placeholder character '☊'.
#
# @param txt [String] text to convert
# @return [String] a new string with digit-separating periods replaced
def convert_numbers_with_periods(txt)
  period_between_digits = /(?<=\d)\.(?=\d)/
  txt.gsub(period_between_digits) { '☊' }
end
convert_quotes(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 42
#
# Replaces quote characters with spaced placeholder symbols, then collapses
# runs of spaces and trims the ends.
#
# Passes, in order:
# 1. a backtick not followed by another backtick, with a word character
#    later on the line          -> ' ∫ '
# 2. a double quote with a word character later on the line -> ' ∬ '
# 3. an apostrophe at line start or after a non-word character, with a word
#    character later on the line and not followed by "twas"/"Twas"
#                               -> preceding character + ' ∫ '
# 4. any apostrophe still matching the pass-3 position rule (those pass 3
#    skipped)                   -> preceding character + 'ƪ'
# 5. every remaining double quote -> ' ∯ '
# 6. an apostrophe after any character, not followed by another apostrophe,
#    and followed by a non-word character or line end
#                               -> preceding character + ' ∮ '
#
# Pass 4 keeps the captured preceding character. Previously it was thrown
# away, which glued the word onto the one before it ("and 'twas" became
# "andƪtwas").
#
# @param txt [String] text to convert
# @return [String] a new string with quotes replaced by placeholders
def convert_quotes(txt)
  txt.gsub(/`(?!`)(?=.*\w)/, ' ∫ ')
     .gsub(/"(?=.*\w)/, ' ∬ ')
     # Group 1 always participates (it matches empty at line start), so it
     # can be interpolated without a nil check.
     .gsub(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/) { "#{Regexp.last_match(1)} ∫ " }
     .gsub(/(\W|^)'(?=.*\w)/) { "#{Regexp.last_match(1)}ƪ" }
     .gsub(/"/, ' ∯ ')
     .gsub(/(\w|\D)'(?!')(?=\W|$)/) { "#{Regexp.last_match(1)} ∮ " }
     .squeeze(' ')
     .strip
end
separate_full_stop(tokens)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 143
#
# Splits a trailing full stop off tokens, leaving abbreviation-like tokens
# intact.
#
# For every token except the last, a trailing '.' is separated unless the
# part before it is listed in ABBREVIATIONS (compared lower-cased), is a
# single letter, or ends in a letter.letter sequence. The last entry of the
# result is then split when it ends in a word character followed by '.'.
#
# @param tokens [Array<String>] whitespace-separated tokens
# @return [Array<String>] a new array with full stops separated
def separate_full_stop(tokens)
  words = tokens.each_with_index.flat_map do |token, position|
    # Only tokens that have a successor are candidates in this pass.
    stem = Regexp.last_match(1) if tokens[position + 1] && token =~ /\A(.+)\.\z/

    keep_whole = stem.nil? ||
                 ABBREVIATIONS.include?(stem.downcase) ||
                 stem =~ /\A[a-z]\z/i ||
                 stem =~ /[a-z](?:\.[a-z])+\z/i

    keep_whole ? [token] : [stem, '.']
  end

  # The final entry is handled separately and without the abbreviation checks.
  tail = words.last
  if tail && tail =~ /\A(.*\w)\.\z/
    words[-1] = Regexp.last_match(1)
    words << '.'
  end

  words
end
separate_other_ending_punc(array)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 119
#
# Separates a single trailing punctuation mark from each token.
#
# A token longer than one character whose last character is one of
# '。', '.', '!' or '?' is replaced by two entries: the token without its
# last character, and that last character. All other tokens are passed
# through unchanged.
#
# Differences from the previous implementation:
# * each token is examined once; the mark list used to contain '!' and '?'
#   twice, so "hi!" was emitted as "hi", "!", "hi", "!";
# * only the final character is removed; String#split used to drop every
#   interior occurrence of the mark as well ("a.b." lost its inner '.');
# * a token made only of marks (e.g. "!!") no longer yields both the mark
#   and the untouched original token.
#
# Note that '.' is in the list, so any multi-character token that still
# ends in '.' when it reaches this method is split here too.
#
# @param array [Array<String>] tokens to process
# @return [Array<String>] a new array with trailing marks separated
def separate_other_ending_punc(array)
  ending_marks = ['。', '.', '!', '?']

  array.flat_map do |token|
    if token.length > 1 && ending_marks.include?(token[-1])
      [token[0...-1], token[-1]]
    else
      [token]
    end
  end
end
shift_all_punct(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 31
#
# Runs the text through every punctuation-shifting helper in a fixed order
# and returns the final result.
#
# @param txt [String] text to process
# @return [String] the text after all shifting helpers have been applied
def shift_all_punct(txt)
  # Each helper receives the output of the one before it.
  steps = %i[
    shift_multiple_dash
    shift_comma
    shift_ellipse
    shift_bracket
    shift_other_punct
    shift_upsidedown_question_mark
    shift_upsidedown_exclamation
    shift_special_quotes
  ]
  steps.reduce(txt) { |acc, step| send(step, acc) }
end
shift_bracket(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 77
#
# Surrounds every round, square and curly bracket with spaces, then
# collapses repeated spaces and trims the ends.
#
# @param txt [String] text to process
# @return [String] a new string with brackets set apart by single spaces
def shift_bracket(txt)
  padded = txt.gsub(/([\(\[\{\}\]\)])/o, ' \1 ')
  padded.squeeze(' ').strip
end
shift_comma(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 56
#
# Surrounds every comma that is not directly followed by a digit with
# spaces, then collapses repeated spaces. The result is not trimmed.
#
# @param txt [String] text to process
# @return [String] a new string with such commas set apart by spaces
def shift_comma(txt)
  padded = txt.gsub(/,(?!\d)/o) { ' , ' }
  padded.squeeze(' ')
end
shift_ellipse(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 68
#
# Surrounds every run of three or more periods with spaces, then collapses
# repeated spaces and trims the ends.
#
# @param txt [String] text to process
# @return [String] a new string with ellipses set apart by single spaces
def shift_ellipse(txt)
  padded = txt.gsub(/(\.\.\.+)/o, ' \1 ')
  padded.squeeze(' ').strip
end
shift_multiple_dash(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 52
#
# Replaces every run of two or more hyphens with a single hyphen surrounded
# by spaces, then collapses repeated spaces. The result is not trimmed.
#
# @param txt [String] text to process
# @return [String] a new string with hyphen runs normalized
def shift_multiple_dash(txt)
  normalized = txt.gsub(/--+/o) { ' - ' }
  normalized.squeeze(' ')
end
shift_off_double_exclamation(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 93
#
# When the text contains "!!", surrounds every exclamation mark that is
# followed by whitespace with single spaces (the whitespace run after the
# mark is consumed). Text without "!!" is returned untouched.
#
# The pattern previously used the character class [!!], which lists the
# same character twice; Ruby reports that as a duplicated range in verbose
# mode. It now matches the single character directly; matching behavior is
# unchanged.
#
# @param txt [String] text to process
# @return [String] txt itself when it has no "!!", otherwise a new string
def shift_off_double_exclamation(txt)
  return txt unless txt.include?('!!')

  txt.gsub(/!\s+/, ' ! ')
end
shift_off_double_mixed_1(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 97
#
# Replaces every "?!" pair with ' ? ! '. Text that does not contain the
# pair is returned untouched.
#
# @param txt [String] text to process
# @return [String] txt itself when it has no "?!", otherwise a new string
def shift_off_double_mixed_1(txt)
  return txt unless txt.include?('?!')

  txt.gsub(/\?\!/o, ' ? ! ')
end
shift_off_double_mixed_2(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 101
#
# Replaces every "!?" pair with ' ! ? '. Text that does not contain the
# pair is returned untouched.
#
# @param txt [String] text to process
# @return [String] txt itself when it has no "!?", otherwise a new string
def shift_off_double_mixed_2(txt)
  return txt unless txt.include?('!?')

  txt.gsub(/\!\?/o, ' ! ? ')
end
shift_off_double_quotation_mark(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 89
#
# When the text contains "??", surrounds every question mark that is
# followed by whitespace with single spaces (the whitespace run after the
# mark is consumed). Text without "??" is returned untouched.
#
# Despite the method name, the character handled here is the question
# mark. The pattern previously used the character class [\?\?], which lists
# the same character twice; Ruby reports that as a duplicated range in
# verbose mode. It now matches the single character directly; matching
# behavior is unchanged.
#
# @param txt [String] text to process
# @return [String] txt itself when it has no "??", otherwise a new string
def shift_off_double_quotation_mark(txt)
  return txt unless txt.include?('??')

  txt.gsub(/\?\s+/, ' ? ')
end
shift_other_punct(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 81
#
# Applies the four doubled-punctuation helpers in order, then surrounds
# every '!', '?', '%', ';' or '|' that is followed by whitespace with single
# spaces, collapses repeated spaces and trims the ends.
#
# @param txt [String] text to process
# @return [String] a new string with those marks set apart by spaces
def shift_other_punct(txt)
  doubled_mark_steps = %i[
    shift_off_double_quotation_mark
    shift_off_double_exclamation
    shift_off_double_mixed_1
    shift_off_double_mixed_2
  ]
  prepared = doubled_mark_steps.reduce(txt) { |acc, step| send(step, acc) }

  prepared.gsub(/([\!\?\%;|])\s+/o, ' \1 ').squeeze(' ').strip
end
shift_special_quotes(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 72
#
# Surrounds each of the quote characters «, », „ and “ with spaces. Spaces
# are not collapsed and the result is not trimmed.
#
# @param txt [String] text to process
# @return [String] a new string with those quote characters padded
def shift_special_quotes(txt)
  %w[« » „ “].reduce(txt) do |acc, quote|
    acc.gsub(quote, " #{quote} ")
  end
end
shift_upsidedown_exclamation(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 64
#
# Surrounds every inverted exclamation mark (¡) with spaces. Spaces are not
# collapsed and the result is not trimmed.
#
# @param txt [String] text to process
# @return [String] a new string with each ¡ padded by spaces
def shift_upsidedown_exclamation(txt)
  inverted_mark = /¡/
  txt.gsub(inverted_mark) { ' ¡ ' }
end
shift_upsidedown_question_mark(txt)
click to toggle source
# File lib/chat_correct/tokenize.rb, line 60
#
# Surrounds every inverted question mark (¿) with spaces. Spaces are not
# collapsed and the result is not trimmed.
#
# @param txt [String] text to process
# @return [String] a new string with each ¿ padded by spaces
def shift_upsidedown_question_mark(txt)
  inverted_mark = /¿/
  txt.gsub(inverted_mark) { ' ¿ ' }
end