class ChatCorrect::Tokenize

Constants

ABBREVIATIONS
PUNCTUATION

Attributes

text[R]

Public Class Methods

new(text:) click to toggle source
# File lib/chat_correct/tokenize.rb, line 6
# Stores the raw input that the tokenizing methods operate on.
#
# text - the input to tokenize (presumably a String; nil is accepted and
#        makes #tokenize return nil). The value is kept as-is, no copy or
#        normalisation happens here.
def initialize(text:)
  @text = text
end

Public Instance Methods

tokenize() click to toggle source
# File lib/chat_correct/tokenize.rb, line 10
# Splits +text+ into an array of word and punctuation tokens.
#
# Returns nil when +text+ is nil, and a one-element array when the whole
# input consists only of word characters. Otherwise the text is run through
# the quote, punctuation, contraction and number conversions (which insert
# placeholder symbols such as ∫, ƪ, ☌ and ☊), split on whitespace, and the
# sentence-ending punctuation is peeled off into tokens of its own. The
# placeholder symbols are not converted back by this method.
def tokenize
  return if text.nil?
  # Fast path: a single bare word needs none of the conversions below.
  return [text] if /\A\w+\z/ =~ text

  converted_text = convert_quotes(text)
  converted_text = shift_all_punct(converted_text)
  converted_text = convert_contractions(converted_text)
  converted_text = convert_numbers_with_commas(converted_text)
  converted_text = convert_numbers_with_periods(converted_text)

  tokens = separate_full_stop(converted_text.split(' '))
  # Drop any stray line-break characters and surrounding whitespace that the
  # separation helpers may have left on a token.
  separate_other_ending_punc(tokens).map { |token| token.delete("\n\r").strip }
end
tokenize_no_punct() click to toggle source
# File lib/chat_correct/tokenize.rb, line 24
# Tokenizes +text+ and removes every token listed in PUNCTUATION.
#
# Returns nil when #tokenize returns nil (which it does for nil text),
# otherwise a new array without the punctuation tokens.
def tokenize_no_punct
  # Run the tokenizing pipeline once and reuse the result for both the nil
  # check and the subtraction.
  tokens = tokenize
  return if tokens.nil?

  tokens - PUNCTUATION
end

Private Instance Methods

convert_contractions(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 105
# Replaces the apostrophe inside common English contractions with the
# placeholder 'ƪ' so it is not treated as a quote mark later on.
#
# Three passes, in this order:
#   letter + 'd / 'm / 's   ->  letter ƪ d/m/s
#   n't                      ->  nƪt
#   've / 'll / 're          ->  ƪve / ƪll / ƪre
# Each pattern requires a word boundary after the suffix.
def convert_contractions(txt)
  short_suffixes_done = txt.gsub(/([A-Za-z])'([dms])\b/o) do
    "#{Regexp.last_match(1)}ƪ#{Regexp.last_match(2)}"
  end
  negations_done = short_suffixes_done.gsub(/n't\b/o, 'nƪt')
  negations_done.gsub(/'(ve|ll|re)\b/o) { "ƪ#{Regexp.last_match(1)}" }
end
convert_numbers_with_commas(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 111
# Swaps a comma that sits directly between two digits for the placeholder
# '☌'. Commas that are not flanked by digits on both sides are left alone.
def convert_numbers_with_commas(txt)
  comma_between_digits = /(?<=\d),(?=\d)/
  txt.gsub(comma_between_digits, '☌')
end
convert_numbers_with_periods(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 115
# Swaps a period that sits directly between two digits for the placeholder
# '☊'. Periods that are not flanked by digits on both sides are left alone.
def convert_numbers_with_periods(txt)
  period_between_digits = /(?<=\d)\.(?=\d)/
  txt.gsub(period_between_digits, '☊')
end
convert_quotes(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 42
# Replaces quote characters with space-padded placeholder symbols so that
# they end up as separate tokens once the text is split on whitespace.
#
# Passes, in order:
#   1. a backtick not followed by another backtick, with a word character
#      somewhere later on the line                      -> ' ∫ '
#   2. a double quote with a word character later       -> ' ∬ '
#   3. a single quote at line start or after a non-word character, with a
#      word character later, not introducing "twas"/"Twas"
#                                                       -> prefix + ' ∫ '
#   4. any remaining such single quote (i.e. the "'twas" case)
#                                                       -> prefix + 'ƪ'
#   5. every remaining double quote                     -> ' ∯ '
#   6. a single quote followed by a non-word character or line end, and not
#      by another single quote                          -> prefix + ' ∮ '
# Finally runs of spaces are squeezed and the result is stripped.
#
# In passes 3, 4 and 6 the character captured in front of the quote is
# written back. Pass 4 must do so as well: replacing the whole match with
# only 'ƪ' would swallow the preceding character and glue two words
# together ("it 'twas" -> "itƪtwas").
def convert_quotes(txt)
  txt.gsub(/`(?!`)(?=.*\w)/, ' ∫ ')
     .gsub(/"(?=.*\w)/, ' ∬ ')
     # Group 1 is never nil on a match (it is '' when ^ matched), so it can
     # always be written back unconditionally.
     .gsub(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/) { "#{Regexp.last_match(1)} ∫ " }
     .gsub(/(\W|^)'(?=.*\w)/) { "#{Regexp.last_match(1)}ƪ" }
     .gsub(/"/, ' ∯ ')
     .gsub(/(\w|\D)'(?!')(?=\W|$)/) { "#{Regexp.last_match(1)} ∮ " }
     .squeeze(' ').strip
end
separate_full_stop(tokens) click to toggle source
# File lib/chat_correct/tokenize.rb, line 143
# Splits a trailing period off tokens and returns the resulting new array.
#
# For every token that has a successor and ends in a period, the period
# becomes a token of its own unless the part in front of it
#   * is listed in ABBREVIATIONS (compared in lower case),
#   * is a single letter, or
#   * ends in dot-separated single letters (e.g. "U.S").
# Afterwards the last element of the result is handled separately: if it is
# a period preceded by a word character, that period is split off without
# consulting the abbreviation rules.
def separate_full_stop(tokens)
  words = []

  tokens.each_index do |idx|
    token = tokens[idx]
    # Only tokens that are followed by another token are candidates here.
    stem = (tokens[idx + 1] && token =~ /\A(.+)\.\z/) ? Regexp.last_match(1) : nil

    keeps_period = stem.nil? ||
                   ABBREVIATIONS.include?(stem.downcase) ||
                   stem =~ /\A[a-z]\z/i ||
                   stem =~ /[a-z](?:\.[a-z])+\z/i

    if keeps_period
      words << token
    else
      words.push(stem, '.')
    end
  end

  tail = words.last
  if tail && tail =~ /\A(.*\w)\.\z/
    words[-1] = Regexp.last_match(1)
    words << '.'
  end

  words
end
separate_other_ending_punc(array) click to toggle source
# File lib/chat_correct/tokenize.rb, line 119
# Splits trailing sentence-ending marks other than the ASCII full stop off
# each token and returns a new array; the input array and its strings are
# not modified.
#
# A token longer than one character that ends in one of the marks below is
# emitted as the remaining text followed by each trailing mark as its own
# token ("wow!!" -> "wow", "!", "!"). Marks inside a token are kept where
# they are, and a token that is nothing but a single mark is passed through.
#
# The ASCII '.' is deliberately not in the list: separate_full_stop handles
# it, including the abbreviation exceptions, and splitting it again here
# would undo them.
#
# NOTE(review): each mark must appear exactly once, since a repeated entry
# makes the matching token be emitted twice. The full-width forms are
# written as escapes so they cannot be confused with, or folded into, their
# ASCII look-alikes; confirm this is the intended set.
def separate_other_ending_punc(array)
  ending_marks = [
    '。',      # ideographic full stop
    "\uFF0E", # full-width full stop
    "\uFF01", # full-width exclamation mark
    '!',
    '?',
    "\uFF1F"  # full-width question mark
  ]

  array.flat_map do |token|
    trailing = []
    # Peel marks off the end one at a time so nothing is lost; stop while at
    # least one character of the token remains.
    while token.length > 1 && ending_marks.include?(token[-1])
      trailing.unshift(token[-1])
      token = token[0...-1]
    end
    [token, *trailing]
  end
end
shift_all_punct(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 31
# Feeds +txt+ through every punctuation-spacing helper in a fixed order and
# returns the final string. Each helper receives the previous helper's
# output.
def shift_all_punct(txt)
  steps = %i[
    shift_multiple_dash
    shift_comma
    shift_ellipse
    shift_bracket
    shift_other_punct
    shift_upsidedown_question_mark
    shift_upsidedown_exclamation
    shift_special_quotes
  ]
  steps.reduce(txt) { |current, step| send(step, current) }
end
shift_bracket(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 77
# Surrounds every round, square and curly bracket with spaces, then
# collapses runs of spaces and strips the ends.
def shift_bracket(txt)
  padded = txt.gsub(/([\(\[\{\}\]\)])/o) { " #{Regexp.last_match(1)} " }
  padded.squeeze(' ').strip
end
shift_comma(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 56
# Puts spaces around every comma that is not immediately followed by a
# digit, then collapses runs of spaces. The result is not stripped.
def shift_comma(txt)
  spaced = txt.gsub(/,(?!\d)/o, ' , ')
  spaced.squeeze(' ')
end
shift_ellipse(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 68
# Surrounds every run of three or more periods with spaces, then collapses
# runs of spaces and strips the ends. Runs of one or two periods are left
# untouched.
def shift_ellipse(txt)
  padded = txt.gsub(/(\.\.\.+)/o) { " #{Regexp.last_match(1)} " }
  padded.squeeze(' ').strip
end
shift_multiple_dash(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 52
# Replaces every run of two or more hyphens with a single space-padded
# hyphen, then collapses runs of spaces. Single hyphens are left alone and
# the result is not stripped.
def shift_multiple_dash(txt)
  dashed = txt.gsub(/--+/o, ' - ')
  dashed.squeeze(' ')
end
shift_off_double_exclamation(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 93
# When +txt+ contains a doubled exclamation mark ('!!'), puts spaces around
# every exclamation mark that is followed by whitespace (the whitespace run
# is replaced by a single space). Text without '!!' is returned unchanged.
#
# NOTE(review): the character class lists the ASCII mark and the full-width
# mark (U+FF01) once each. Listing the same character twice only triggers
# Ruby's "duplicated range" warning; confirm the full-width form is the
# intended second member.
def shift_off_double_exclamation(txt)
  return txt unless txt.include?('!!')

  txt.gsub(/([!\uFF01])\s+/) { " #{Regexp.last_match(1)} " }
end
shift_off_double_mixed_1(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 97
# Expands every "?!" pair into ' ? ! ' (each mark padded with spaces).
# Text that contains no such pair is returned as the same object.
def shift_off_double_mixed_1(txt)
  return txt unless txt.include?('?!')

  txt.gsub(/\?\!/o, ' ? ! ')
end
shift_off_double_mixed_2(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 101
# Expands every "!?" pair into ' ! ? ' (each mark padded with spaces).
# Text that contains no such pair is returned as the same object.
def shift_off_double_mixed_2(txt)
  return txt unless txt.include?('!?')

  txt.gsub(/\!\?/o, ' ! ? ')
end
shift_off_double_quotation_mark(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 89
# Despite its name this helper deals with question marks: when +txt+
# contains a doubled question mark ('??'), it puts spaces around every
# question mark that is followed by whitespace (the whitespace run is
# replaced by a single space). Text without '??' is returned unchanged.
#
# NOTE(review): the character class lists the ASCII mark and the full-width
# mark (U+FF1F) once each. Listing the same character twice only triggers
# Ruby's "duplicated range" warning; confirm the full-width form is the
# intended second member.
def shift_off_double_quotation_mark(txt)
  return txt unless txt.include?('??')

  txt.gsub(/([?\uFF1F])\s+/) { " #{Regexp.last_match(1)} " }
end
shift_other_punct(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 81
# Runs the four doubled-mark helpers in order ("??", "!!", "?!", "!?") and
# then puts spaces around every '!', '?', '%', ';' or '|' that is followed
# by whitespace. Runs of spaces are collapsed and the ends are stripped.
def shift_other_punct(txt)
  doubled_mark_steps = %i[
    shift_off_double_quotation_mark
    shift_off_double_exclamation
    shift_off_double_mixed_1
    shift_off_double_mixed_2
  ]
  prepared = doubled_mark_steps.reduce(txt) { |current, step| send(step, current) }

  spaced = prepared.gsub(/([\!\?\%;|])\s+/o) { " #{Regexp.last_match(1)} " }
  spaced.squeeze(' ').strip
end
shift_special_quotes(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 72
# Surrounds the guillemets « » and the quotation marks „ “ with spaces.
# The replacements are applied one after another in the listed order; the
# result is neither squeezed nor stripped.
def shift_special_quotes(txt)
  padded_forms = {
    /«/ => ' « ',
    /»/ => ' » ',
    /„/ => ' „ ',
    /“/ => ' “ '
  }
  padded_forms.reduce(txt) { |current, (mark, padded)| current.gsub(mark, padded) }
end
shift_upsidedown_exclamation(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 64
# Surrounds every inverted exclamation mark with spaces. The result is
# neither squeezed nor stripped.
def shift_upsidedown_exclamation(txt)
  inverted_mark = /¡/
  txt.gsub(inverted_mark, ' ¡ ')
end
shift_upsidedown_question_mark(txt) click to toggle source
# File lib/chat_correct/tokenize.rb, line 60
# Surrounds every inverted question mark with spaces. The result is
# neither squeezed nor stripped.
def shift_upsidedown_question_mark(txt)
  inverted_mark = /¿/
  txt.gsub(inverted_mark, ' ¿ ')
end