class TextSentencer
Constants
- DEFAULT_RULES
default rules
Public Class Methods
new(rules = nil)
click to toggle source
# File lib/text_sentencer/text_sentencer.rb, line 46
#
# Builds a sentencer from a rule set.
#
# rules:: a collection of key/value pairs (e.g. a Hash); keys are converted
#         with +to_sym+. When nil (or false), DEFAULT_RULES is used instead.
#
# After construction @rules always holds :break_pattern and
# :candidate_pattern (defaulting to "") and :positive_rules and
# :negative_rules (defaulting to []).
def initialize(rules = nil)
  source = rules || DEFAULT_RULES

  # Copy into a fresh Hash with symbol keys so the defaults filled in below
  # are never written into the object the caller passed in.
  @rules = source.each_with_object({}) do |(key, value), normalized|
    normalized[key.to_sym] = value
  end

  [:break_pattern, :candidate_pattern].each { |key| @rules[key] ||= "" }
  [:positive_rules, :negative_rules].each { |key| @rules[key] ||= [] }
end
Public Instance Methods
annotate(text)
click to toggle source
# File lib/text_sentencer/text_sentencer.rb, line 55
#
# Segments +text+ and wraps the resulting spans in an annotation Hash.
#
# text:: the text to annotate; nil is passed through as nil.
#
# Returns nil when +text+ is nil. Otherwise returns a Hash with two keys:
# :text (the input itself) and :denotations (one entry per span returned by
# #segment, shaped as {:span => {:begin => b, :end => e}, :obj => 'Sentence'}).
def annotate(text)
  return if text.nil?

  denotations = segment(text).map do |span|
    { :span => { :begin => span[0], :end => span[1] }, :obj => 'Sentence' }
  end

  { :text => text, :denotations => denotations }
end
segment(text)
click to toggle source
# File lib/text_sentencer/text_sentencer.rb, line 63
#
# Splits +text+ into sentence spans.
#
# Every match of @rules[:break_pattern] separates sentences unconditionally.
# A match of @rules[:candidate_pattern] separates sentences when it touches
# the start or end of the text, or when at least one positive rule matches
# its context and no negative rule does. A rule is a pair: element 0 is a
# pattern that must match at the end of the text preceding the candidate,
# element 1 a pattern that must match at the start of the text following it.
# An empty pattern string yields no spans.
#
# text:: the String to segment. It must respond to +scan_offset+, which is not
#        defined in this block (presumably it yields MatchData objects, since
#        +offset(0)+ is called on each result -- confirm against its definition).
#
# Returns an Array of [begin, end] offset pairs, in ascending order, covering
# the text between separators; the separator spans themselves are excluded.
def segment(text)
  spans_for = lambda do |pattern|
    pattern.empty? ? [] : text.scan_offset(/#{pattern}/).map { |m| m.offset(0) }
  end

  breaks = spans_for.call(@rules[:break_pattern])
  # Breaks take precedence: a candidate identical to a break needs no evaluation.
  candidates = spans_for.call(@rules[:candidate_pattern]) - breaks

  # Compile each rule once per call instead of once per candidate and rule.
  compile = lambda do |rules|
    rules.map { |rule| [/#{rule[0]}\Z/, /\A#{rule[1]}/] }
  end
  positive = compile.call(@rules[:positive_rules])
  negative = compile.call(@rules[:negative_rules])

  context_matches = lambda do |rules, before_text, after_text|
    rules.any? { |before, after| before_text =~ before && after_text =~ after }
  end

  candidates.each do |span|
    last_end, next_begin = span

    # A candidate touching either edge of the text is always a break.
    if last_end == 0 || next_begin == text.length
      breaks << span
      next
    end

    last_text = text[0...last_end]
    next_text = text[next_begin..-1]

    if context_matches.call(positive, last_text, next_text) &&
       !context_matches.call(negative, last_text, next_text)
      breaks << span
    end
  end

  breaks.sort!

  sentences = []
  cursor = 0
  breaks.each do |break_begin, break_end|
    sentences << [cursor, break_begin] if break_begin > cursor
    # Only ever advance: a break nested inside an earlier, longer one must not
    # pull the cursor back into text that the longer break already consumed.
    cursor = break_end if break_end > cursor
  end
  sentences << [cursor, text.length] if cursor < text.length

  sentences
end