class Ve::Parse::FreelingEn

Constants

INTERNAL_INFO_FOR_PARSED_POS

Attributes

text[R]
tokens[R]

Public Class Methods

new(text, output) click to toggle source
# File lib/providers/freeling_en.rb, line 81
# Builds the token list for +text+ from the lines Freeling printed for it.
#
# text   - String, the original text that was analysed.
# output - Array of Strings, one per output line. A non-empty line is split on
#          whitespace into literal, lemma, POS tag and accuracy; an empty line
#          marks a sentence split. The given strings are not modified.
#
# Each token is a Hash with :type (:parsed, :unparsed or :sentence_split) and
# :literal. Parsed and unparsed tokens also carry :characters, the Range of
# character offsets they are taken to cover in +text+.
def initialize(text, output)
  @tokens = []
  @text = text
  position = 0
  last_index = output.length - 1

  # Appends whatever is left of the text from +position+ on as one :unparsed
  # token. Does nothing when nothing is left or +position+ is past the end.
  append_tail = lambda do
    rest = text[position..-1]
    if rest && !rest.empty?
      @tokens << {:type => :unparsed,
                  :literal => rest,
                  :raw => '',
                  :characters => (position..(position + rest.length - 1))}
    end
  end

  output.each_with_index do |raw_line, index|
    # Work on a stripped copy so the caller's (possibly frozen) lines stay untouched.
    line = raw_line.rstrip
    token = {:raw => line}
    final_line = output.length > 1 && index == last_index

    # Sentence splits are just empty lines in Freeling
    if line.empty?
      # Trailing text goes in front of the closing split so it stays part of
      # the last sentence.
      append_tail.call if final_line
      token[:type] = :sentence_split
      token[:literal] = ''
      @tokens << token
      next
    end

    # The parsed token
    info = line.split(/\s+/)
    token[:type] = :parsed
    [:literal, :lemma, :pos, :accuracy].each_with_index do |attr, i|
      token[attr] = info[i]
    end

    # Freeling replaces spaces inside a token with _, so the original text may
    # hold either a whitespace character or a real _ at those positions.
    # Default to spaces; the literal is corrected below once it has been found.
    freeling_literal = token[:literal]
    token[:literal] = freeling_literal.tr('_', ' ')
    token[:lemma] = token[:lemma].tr('_', ' ') if token[:lemma]

    # Locate the token at or after the current position, accepting whitespace
    # or _ wherever Freeling printed _. Group 1 is anything unparsed preceding
    # the token, group 2 is the token exactly as written in the text.
    pattern = freeling_literal.split('_', -1).map { |part| Regexp.quote(part) }.join('[\s_]')
    located = /(.*?)(#{pattern})/m.match(text, position)

    if located
      skipped = located[1]
      unless skipped.empty?
        @tokens << {:type => :unparsed,
                    :literal => skipped,
                    :characters => (position..(position + skipped.length - 1))}
        position += skipped.length
      end
      token[:literal] = located[2]
    end

    token[:characters] = (position..(position + token[:literal].length - 1))
    position += token[:literal].length
    @tokens << token

    # The output did not end with a sentence split: pick up trailing text only
    # after the last token has been placed, so it is not emitted twice.
    append_tail.call if final_line
  end
end

Public Instance Methods

sentences() click to toggle source
# File lib/providers/freeling_en.rb, line 207
# Returns the text of each sentence as an Array of Strings.
#
# Token literals are concatenated in order; every :sentence_split token closes
# the sentence collected so far (even an empty one). Text left over after the
# last split becomes a final sentence. Surrounding whitespace is stripped from
# every sentence.
def sentences
  collected = []
  buffer = ''

  @tokens.each do |tok|
    unless tok[:type] == :sentence_split
      buffer << tok[:literal]
      next
    end

    collected << buffer
    buffer = ''
  end

  # Covers input that does not end with a :sentence_split
  collected << buffer unless buffer.empty?

  collected.map(&:strip)
end
words() click to toggle source
# File lib/providers/freeling_en.rb, line 181
# Builds the list of Ve::Word objects from the :parsed tokens.
#
# A token tagged 'POS' (possessive ending) is merged into the preceding word:
# its literal is appended to that word's text and the token is added to that
# word's token list. If there is no preceding word it is kept as a word of its
# own instead of raising.
#
# For every other token the part of speech and grammar come from
# INTERNAL_INFO_FOR_PARSED_POS. Tags missing from the table become
# Ve::PartOfSpeech::Symbol when they consist of "F" followed by word
# characters, and Ve::PartOfSpeech::TBD otherwise.
#
# Returns an Array of Ve::Word.
def words
  words = []

  @tokens.select { |t| t[:type] == :parsed }.each do |token|
    previous = words.last

    if token[:pos] == 'POS' && previous
      # Possessive ending, add to previous token
      previous.word << token[:literal]
      previous.tokens << token
      next
    end

    # All other tokens (and a possessive ending with nothing to attach to)
    pos, grammar = INTERNAL_INFO_FOR_PARSED_POS[token[:pos]]
    pos = Ve::PartOfSpeech::Symbol if pos.nil? && token[:pos] =~ /^F\w+$/
    pos = Ve::PartOfSpeech::TBD if pos.nil?

    words << Ve::Word.new(token[:literal], token[:lemma], pos, [token], {:grammar => grammar})
  end

  words
end