class TextSentencer

Constants

DEFAULT_RULES

default rules

Public Class Methods

new(rules = nil) click to toggle source
# File lib/text_sentencer/text_sentencer.rb, line 46
# Builds a sentencer from a rule set.
#
# rules - a collection of (name, value) pairs, e.g. a Hash. Names are
#         normalised with to_sym. When nil (or false), DEFAULT_RULES is used.
#
# After normalisation, any of the four rule entries that is missing is
# filled in with an empty value: "" for the two patterns and [] for the
# two rule lists.
def initialize(rules = nil)
        source = rules || DEFAULT_RULES

        # Copy into a fresh hash keyed by symbols so lookups are uniform.
        @rules = source.each_with_object({}) do |(name, value), table|
                table[name.to_sym] = value
        end

        [:break_pattern, :candidate_pattern].each { |key| @rules[key] ||= "" }
        [:positive_rules, :negative_rules].each { |key| @rules[key] ||= [] }
end

Public Instance Methods

annotate(text) click to toggle source
# File lib/text_sentencer/text_sentencer.rb, line 55
# Wraps the sentence spans of +text+ in an annotation hash.
#
# Returns nil when +text+ is nil. Otherwise returns a hash holding the
# original text under :text and, under :denotations, one entry per span
# returned by #segment, each shaped as
#   {:span => {:begin => b, :end => e}, :obj => 'Sentence'}
def annotate(text)
        return if text.nil?

        denotations = segment(text).map do |span|
                {:span => {:begin => span[0], :end => span[1]}, :obj => 'Sentence'}
        end

        {text: text, denotations: denotations}
end
segment(text) click to toggle source
# File lib/text_sentencer/text_sentencer.rb, line 63
# Splits +text+ into sentence spans.
#
# Returns an array of [begin, end] character offsets (end exclusive), one
# per sentence, in ascending order. Text covered by a break is not part of
# any sentence, and empty sentences are never emitted.
#
# Breaks come from two sources:
# * every match of @rules[:break_pattern] is an unconditional break;
# * a match of @rules[:candidate_pattern] becomes a break when it sits at
#   the very start or end of the text, or when some positive rule matches
#   the text around it and no negative rule does. Each rule is a pair of
#   pattern sources: the first must match at the end of the text before
#   the candidate, the second at the start of the text after it.
#
# An empty pattern string disables the corresponding source.
#
# NOTE(review): relies on String#scan_offset, which is defined outside this
# method; it is expected to yield objects responding to offset(0) — confirm.
def segment(text)
        breaks = if @rules[:break_pattern].empty?
                []
        else
                text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
        end

        candidates = if @rules[:candidate_pattern].empty?
                []
        else
                text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
        end

        # breaks take precedence: a candidate identical to a break needs no check
        candidates -= breaks

        candidates.each do |c|
                last_end, next_begin = c

                # A candidate touching either edge of the text is always a break.
                if (last_end == 0) || (next_begin == text.length)
                        breaks << c
                        next
                end

                last_text = text[0...last_end]
                next_text = text[next_begin..-1]

                accepted = @rules[:positive_rules].any? do |p|
                        (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
                end
                next unless accepted

                # A matching negative rule vetoes the positive match.
                vetoed = @rules[:negative_rules].any? do |n|
                        (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
                end
                breaks << c unless vetoed
        end

        breaks.sort!

        sentences = []
        lastbreak = 0
        breaks.each do |b|
                sentences << [lastbreak, b[0]] if b[0] > lastbreak
                # Only ever advance the cursor: a break nested inside (or
                # overlapping) an earlier one must not pull the start of the
                # next sentence back into text already consumed by a break.
                lastbreak = b[1] if b[1] > lastbreak
        end
        sentences << [lastbreak, text.length] if lastbreak < text.length

        sentences
end