class Splitta::Frag

Attributes

last_word[R]
next_word[RW]
orig[R]
pred[RW]

Public Class Methods

new(orig, previous_frag: nil) click to toggle source
# File lib/splitta/frag.rb, line 12
# Builds a fragment from a piece of raw text.
#
# The text is passed through +clean+ and split on whitespace. The last
# resulting word is remembered as +last_word+, and the untouched input is
# kept as +orig+. When a preceding fragment is supplied, its +next_word+ is
# set to the first cleaned word of this fragment.
#
# @param orig [String] raw text of this fragment
# @param previous_frag [#next_word=, nil] fragment that precedes this one, if any
def initialize(orig, previous_frag: nil)
  tokens = clean(orig).split

  if previous_frag
    # Link the previous fragment forward to the word that opens this one.
    previous_frag.next_word = tokens.first
  end

  @last_word = tokens.last
  @orig = orig
end

Public Instance Methods

features(model) click to toggle source

… w1. (sb?) w2 … Features, listed roughly in order of importance:

(1) w1: the word that includes a period; (2) w2: the next word, if it exists; (3) w1length: number of alphabetic characters in w1; (4) w2cap: true if w2 is capitalized; (5) both: w1 and w2 together; (6) w1abbr: log count of w1 in training without a final period; (7) w2lower: log count of w2 in training as lowercased; (8) w1w2upper: w1 together with whether w2 is capitalized.

# File lib/splitta/frag.rb, line 30
# Lazily enumerates the classifier features for the boundary between this
# fragment's last word (w1) and the following word (w2).
#
# Each element is an Array whose first entry is the feature name (a Symbol),
# followed by the feature value(s). :w1, :w2 and :both are always produced.
# :w1length and :w1abbr are produced only when +alphabetic?+ accepts w1;
# :w2cap, :w2lower and :w1w2upper only when it accepts w2.
#
# @param model [Object] forwarded to +w1abbr+ and +w2lower+
# @return [Enumerator] yields the feature arrays in the order described above
def features(model)
  Enumerator.new do |yielder|
    yielder << [:w1, w1] << [:w2, w2] << [:both, w1, w2]

    yielder << [:w1length, w1length] << [:w1abbr, w1abbr(model)] if alphabetic?(w1)

    if alphabetic?(w2)
      yielder << [:w2cap, w2cap] << [:w2lower, w2lower(model)] << [:w1w2upper, w1, w2cap]
    end
  end
end
over?(threshold) click to toggle source
# File lib/splitta/frag.rb, line 49
# Tells whether this fragment's prediction exceeds a cut-off.
#
# @param threshold [Numeric] value the prediction must strictly exceed
# @return [Boolean] false when no prediction has been set; otherwise the
#   result of +pred > threshold+
def over?(threshold)
  score = pred
  return false unless score

  score > threshold
end

Private Instance Methods

alphabetic?(str) click to toggle source
# File lib/splitta/frag.rb, line 92
# Reports whether +str+ contains at least one ASCII letter, period or space.
#
# NOTE(review): the pattern is not anchored, so a string such as "a1" or
# "<NUM>" is accepted as long as any single character matches. Confirm this
# is intended before tightening it, since it decides which features are emitted.
#
# @param str [String, nil] word to inspect; nil yields false
# @return [Boolean]
def alphabetic?(str)
  hit = /[a-zA-Z. ]+/u.match(str)
  hit ? true : false
end
clean(text) click to toggle source

normalize numbers, discard some punctuation that can be ambiguous

# File lib/splitta/frag.rb, line 60
# Normalizes text before it is split into words.
#
# The string returned by +tokenize+ (defined outside this section) is
# modified in place and returned. Three steps are applied, in order:
# 1. every run of digits (optionally preceded by digits, '.' or ',') becomes
#    the placeholder '<NUM>';
# 2. every character outside the allowed set of letters, digits, space and
#    a few punctuation marks is removed;
# 3. each double hyphen is replaced by a single space.
#
# @param text [String] raw fragment text
# @return [String] the tokenized string after normalization
def clean(text)
  tokenize(text).tap do |buffer|
    buffer.gsub!(/[.,\d]*\d/, '<NUM>')
    buffer.gsub!(%r{[^a-zA-Z0-9,.;:<>\-'/?!$% ]}, '')
    # Must run after the filter above, which keeps single hyphens intact.
    buffer.gsub!('--', ' ')
  end
end
upcase?(str) click to toggle source
# File lib/splitta/frag.rb, line 96
# Checks whether upcasing +str+ leaves it unchanged.
#
# Strings without any cased characters (for example "1" or ".") are
# therefore reported as upcase too.
#
# @param str [String] text to check
# @return [Boolean]
def upcase?(str)
  str == str.upcase
end
w1() click to toggle source
# File lib/splitta/frag.rb, line 68
# The word before the candidate boundary: +last_word+ with everything up to
# and including its first hyphen removed (the hyphen must be preceded by at
# least one character). The result is memoized once it is non-nil.
#
# @return [String, nil] nil when there is no +last_word+
def w1
  return @w1 if @w1

  @w1 = last_word&.sub(/(^.+?-)/, '')
end
w1abbr(model) click to toggle source
# File lib/splitta/frag.rb, line 80
# Integer-truncated natural log of (1 + count), where count is looked up in
# +model.non_abbrs+ under w1 with its final character chopped off. A missing
# entry counts as 0.0, which gives 0.
#
# @param model [#non_abbrs] object exposing a Hash-like +non_abbrs+
# @return [Integer]
def w1abbr(model)
  stem = w1.chop
  count = model.non_abbrs.fetch(stem, 0.0)
  Math.log(1 + count).to_i
end
w1length() click to toggle source
# File lib/splitta/frag.rb, line 76
# Number of word characters in w1, capped at 10.
#
# Every non-word character is stripped before counting, so interior periods
# do not inflate the length (for example "U.S." counts as 2). A single
# +sub+ would drop only the first such character.
#
# @return [Integer] a value between 0 and 10
def w1length
  [10, w1.gsub(/\W/, '').length].min
end
w2() click to toggle source
# File lib/splitta/frag.rb, line 72
# The word after the candidate boundary: +next_word+ with everything from
# its first hyphen (when at least one character follows it) to the end
# removed. The result is memoized once it is non-nil.
#
# @return [String, nil] nil when there is no +next_word+
def w2
  return @w2 if @w2

  @w2 = next_word&.sub(/(-.+?)$/, '')
end
w2cap() click to toggle source
# File lib/splitta/frag.rb, line 84
# Capitalization flag for w2, expressed as the string 'True' or 'False'.
#
# Only the first character of w2 is examined, using +upcase?+.
#
# @return [String] 'True' or 'False'
def w2cap
  initial = w2.chars.first

  if upcase?(initial)
    'True'
  else
    'False'
  end
end
w2lower(model) click to toggle source
# File lib/splitta/frag.rb, line 88
# Integer-truncated natural log of (1 + count), where count is looked up in
# +model.lower_words+ under the downcased w2. A missing entry counts as 0.0,
# which gives 0.
#
# @param model [#lower_words] object exposing a Hash-like +lower_words+
# @return [Integer]
def w2lower(model)
  key = w2.downcase
  count = model.lower_words.fetch(key, 0.0)
  Math.log(1 + count).to_i
end