class Splitta::Frag
Attributes
last_word[R]
next_word[RW]
orig[R]
pred[RW]
Public Class Methods
new(orig, previous_frag: nil)
click to toggle source
# File lib/splitta/frag.rb, line 12
# Builds a fragment from the raw text +orig+.
#
# The text is run through #clean and split on whitespace. The last token is
# stored as #last_word and the untouched input as #orig. When +previous_frag+
# is given, its #next_word is set to this fragment's first token.
def initialize(orig, previous_frag: nil)
  tokens = clean(orig).split

  @orig = orig
  @last_word = tokens.last
  previous_frag.next_word = tokens.first if previous_frag
end
Public Instance Methods
features(model)
click to toggle source
… w1. (sb?) w2 … Features, listed roughly in order of importance:
(1) w1: word that includes a period (2) w2: the next word, if it exists (3) w1length: number of alphabetic characters in w1, capped at 10 (4) w2cap: true if w2 is capitalized (5) both: w1 and w2 (6) w1abbr: log count of w1 in training without a final period (7) w2lower: log count of w2 in training as lowercased (8) w1w2upper: w1 combined with whether w2 is capitalized
# File lib/splitta/frag.rb, line 30
# Returns an Enumerator that lazily produces this fragment's features.
#
# Each item is an Array whose first element is the feature name (a Symbol)
# followed by its value(s). +model+ is passed on to #w1abbr and #w2lower.
# The w1-specific features are produced only when w1 passes #alphabetic?,
# and likewise the w2-specific features for w2.
def features(model)
  Enumerator.new do |yielder|
    yielder.yield [:w1, w1]
    yielder.yield [:w2, w2]
    yielder.yield [:both, w1, w2]

    if alphabetic?(w1)
      yielder.yield [:w1length, w1length]
      yielder.yield [:w1abbr, w1abbr(model)]
    end

    if alphabetic?(w2)
      yielder.yield [:w2cap, w2cap]
      yielder.yield [:w2lower, w2lower(model)]
      yielder.yield [:w1w2upper, w1, w2cap]
    end
  end
end
over?(threshold)
click to toggle source
# File lib/splitta/frag.rb, line 49
# Tells whether the stored prediction exceeds +threshold+.
#
# Returns false when no prediction (#pred) has been set; otherwise returns the
# result of <tt>pred > threshold</tt>.
def over?(threshold)
  return false unless pred

  pred > threshold
end
Private Instance Methods
alphabetic?(str)
click to toggle source
# File lib/splitta/frag.rb, line 92
# Returns true when +str+ contains at least one ASCII letter, period or
# space, and false otherwise (including when +str+ is nil).
#
# NOTE(review): the pattern is not anchored, so a string that also contains
# other characters (e.g. "12.") still qualifies -- confirm this is intended.
def alphabetic?(str)
  /[a-zA-Z. ]+/u.match?(str)
end
clean(text)
click to toggle source
Normalizes numbers and discards some punctuation that can be ambiguous.
# File lib/splitta/frag.rb, line 60
# Prepares +text+ for feature extraction and returns the cleaned string.
#
# The text first goes through #tokenize. The resulting string is then edited
# in place: numbers become the placeholder <NUM>, every character outside the
# allowed set is removed, and each "--" is replaced by a single space.
def clean(text)
  tokenize(text).tap do |cleaned|
    # Digits, optionally preceded by periods/commas/digits, collapse to <NUM>.
    cleaned.gsub!(/[.,\d]*\d/, '<NUM>')
    # Keep only letters, digits, spaces and the listed punctuation.
    cleaned.gsub!(%r{[^a-zA-Z0-9,.;:<>\-'/?!$% ]}, '')
    cleaned.gsub!('--', ' ')
  end
end
upcase?(str)
click to toggle source
# File lib/splitta/frag.rb, line 96
# Returns true when upcasing +str+ leaves it unchanged. Strings without any
# lowercase letters (digits, punctuation, the empty string) therefore count
# as upcase.
def upcase?(str)
  str == str.upcase
end
w1()
click to toggle source
# File lib/splitta/frag.rb, line 68
# The fragment's last word with its leading hyphenated prefix removed
# (everything up to and including the first hyphen that has at least one
# character before it). Returns nil when there is no last word.
#
# A non-nil result is memoized in @w1.
def w1
  return @w1 if @w1

  @w1 = last_word&.sub(/(^.+?-)/, '')
end
w1abbr(model)
click to toggle source
# File lib/splitta/frag.rb, line 80
# Integer-truncated natural log of (1 + count), where count is looked up in
# +model.non_abbrs+ under w1 minus its final character. A missing entry
# counts as 0.0, which yields 0.
def w1abbr(model)
  count = model.non_abbrs.fetch(w1.chop, 0.0)
  Math.log(1 + count).to_i
end
w1length()
click to toggle source
# File lib/splitta/frag.rb, line 76
# Number of word characters in w1, capped at 10.
#
# Every non-word character (periods, hyphens, ...) is stripped before
# counting. +gsub+ is required here: +sub+ would drop only the first such
# character, so "U.S." would be counted as 3 instead of 2.
def w1length
  [10, w1.gsub(/\W/, '').length].min
end
w2()
click to toggle source
# File lib/splitta/frag.rb, line 72
# The next word with its hyphenated tail removed (everything from the first
# hyphen that is followed by at least one character, through the end).
# Returns nil when no next word has been set.
#
# A non-nil result is memoized in @w2.
def w2
  return @w2 if @w2

  @w2 = next_word&.sub(/(-.+?)$/, '')
end
w2cap()
click to toggle source
# File lib/splitta/frag.rb, line 84
# Returns the string 'True' when the first character of w2 passes #upcase?,
# and 'False' otherwise.
def w2cap
  initial = w2.chars.first

  if upcase?(initial)
    'True'
  else
    'False'
  end
end
w2lower(model)
click to toggle source
# File lib/splitta/frag.rb, line 88
# Integer-truncated natural log of (1 + count), where count is looked up in
# +model.lower_words+ under the downcased w2. A missing entry counts as 0.0,
# which yields 0.
def w2lower(model)
  count = model.lower_words.fetch(w2.downcase, 0.0)
  Math.log(1 + count).to_i
end