module NlpPure::Segmenting::DefaultSentence

SEE ALSO: Kiss and Strunk (2006), "Unsupervised Multilingual Sentence Boundary Detection".

NOTE: this segmenter fails on some proper nouns that contain abbreviations (e.g. business names), and it also fails on headings that are separated from the following text by only a single line break.

Constants

DEFAULT_OPTIONS

Public Instance Methods

clean_input(text = nil) click to toggle source
# File lib/nlp_pure/segmenting/default_sentence.rb, line 40
# Normalizes raw input before it is split into segments.
#
# text - the object to clean; it is converted with #to_s (nil becomes "").
#
# Every pair in options[:gsub] is applied in order as
# gsub(pair[0], pair[1]) to work around limitations of the splitting regexp.
# The substitutions are non-destructive, so the caller's string is never
# modified and frozen input is accepted.
#
# Returns a new String with surrounding whitespace removed.
def clean_input(text = nil)
  replaced = options.fetch(:gsub, []).reduce(text.to_s) do |input, gsub_pair|
    input.gsub(gsub_pair[0], gsub_pair[1])
  end
  # NOTE: leading whitespace is problematic; ref #12
  replaced.strip
end
cleanup_segmenting(segments) click to toggle source
# File lib/nlp_pure/segmenting/default_sentence.rb, line 89
# Returns a copy of the given list of segments with nil entries removed.
def cleanup_segmenting(segments)
  segments.reject(&:nil?)
end
handle_special_fragments(segments, segment) click to toggle source

Rejoins onto the current segment any following fragments that begin with punctuation, an abbreviation, or a number.

# File lib/nlp_pure/segmenting/default_sentence.rb, line 62
# Appends to +segment+ every following fragment that
# next_segment_appears_included? accepts.
#
# segments - the remaining fragments; accepted ones are shifted off the
#            front, so the caller's array shrinks as fragments are absorbed.
# segment  - the fragment currently being assembled.
#
# Returns the combined text with surrounding whitespace stripped.
def handle_special_fragments(segments, segment)
  joined = segment
  # Always inspect the head of the queue: each accepted fragment is removed.
  while next_segment_appears_included?(segments.first)
    fragment = segments.shift
    STDERR << "\t\t<< #{fragment.inspect}\n" if ENV['DEBUG']
    joined = "#{joined}#{fragment}"
  end
  joined.strip
end
next_segment_appears_included?(segment) click to toggle source
# File lib/nlp_pure/segmenting/default_sentence.rb, line 71
# Decides whether +segment+ looks like a continuation of the previous
# segment rather than the start of a new one.
#
# segment - the upcoming fragment (may be nil when none remain).
#
# A fragment is treated as a continuation when its first character matches
# the options[:split] pattern, or when the fragment starts with a digit,
# with another word character, or with one whitespace character followed by
# a lowercase letter or digit.
#
# All checks are anchored with \A, so only the very start of the fragment
# is examined; ^ would also match after an embedded newline and wrongly
# accept multi-line fragments. The numeral check comes before the
# word-character check because \w also matches digits and would otherwise
# hide it.
#
# Returns true or false. When ENV['DEBUG'] is set, the reason is written to
# STDERR.
def next_segment_appears_included?(segment)
  return false unless segment
  split_pattern = options.fetch(:split, nil)
  # NOTE: the logic is expanded for logging reasons (despite style violation)
  reason =
    if split_pattern && segment[0] =~ split_pattern
      'leading punctuation detected'
    elsif segment =~ /\A\d/
      'leading numeral detected'
    elsif segment =~ /\A\w/
      'assuming abbreviation'
    elsif segment =~ /\A\s[a-z0-9]/
      'greedily grabbing lowercase'
    end
  STDERR << (reason ? "\t! #{reason}\n" : "\t\tx\n") if ENV['DEBUG']
  !reason.nil?
end
options() click to toggle source

NOTE: exposed as a method for easy mock/stub

# File lib/nlp_pure/segmenting/default_sentence.rb, line 25
# Returns the DEFAULT_OPTIONS constant.
# NOTE: exposed as a method (rather than read directly) for easy mock/stub.
def options
  DEFAULT_OPTIONS
end
parse(*args) click to toggle source
# File lib/nlp_pure/segmenting/default_sentence.rb, line 29
# Splits the first argument into an array of segments.
#
# args - only the first element is used; it is passed through clean_input.
#
# The cleaned text is split naively on options[:split]; when that yields
# anything other than exactly one piece, the pieces are passed through
# rejoin_segment_fragments and nil entries are dropped.
#
# Returns nil when called without arguments, otherwise an Array.
def parse(*args)
  return if args.empty?

  # naive split
  pieces = clean_input(args.first).split(options.fetch(:split, nil))
  # a lone piece has nothing to be rejoined with
  return pieces if pieces.size == 1

  rejoin_segment_fragments(pieces).compact.tap do |sentences|
    STDERR << "#{sentences.inspect}\n" if ENV['DEBUG']
  end
end
rejoin_segment_fragments(segments) click to toggle source
# File lib/nlp_pure/segmenting/default_sentence.rb, line 50
# Consumes +segments+ from the front, merging each fragment with any
# following fragments that handle_special_fragments absorbs into it.
#
# segments - the fragments to process; the array is emptied as it is
#            consumed (iteration stops at the first nil or false entry).
#
# Returns a new Array of the rejoined segments.
def rejoin_segment_fragments(segments)
  [].tap do |rejoined|
    while (current = segments.shift)
      STDERR << "#{current.inspect}\n" if ENV['DEBUG']
      # join segments if needed
      rejoined << handle_special_fragments(segments, current)
    end
  end
end