class Ve::Parse::FreelingEn
Constants
- INTERNAL_INFO_FOR_PARSED_POS
Attributes
text[R]
tokens[R]
Public Class Methods
new(text, output)
click to toggle source
# File lib/providers/freeling_en.rb, line 81
#
# Builds the token list for +text+ from Freeling's line-oriented +output+.
#
# text   - the original String that was handed to Freeling.
# output - an Array of Strings, one per Freeling output line. Each non-empty
#          line is split on whitespace into literal, lemma, pos and accuracy;
#          an empty line marks a sentence split.
#
# Populates @text and @tokens. Every token is a Hash with a :type of
# :parsed, :unparsed or :sentence_split. Parsed and unparsed tokens carry a
# :characters Range locating them in +text+.
def initialize(text, output)
  @tokens = []
  @text = text
  position = 0

  output.each_with_index do |line, index|
    # Strip a copy: the caller's strings are left untouched, and frozen
    # input lines no longer raise.
    line = line.rstrip
    token = {:raw => line}

    # Anything unparsed at the end of the text
    # This must happen before sentence splits are detected to avoid funny ordering
    if output.length > 1 && output.length == index + 1
      unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
      # match returns nil when position has run past the end of the text
      # (position advances by the token length even when a token could not
      # be located), so guard before dereferencing.
      if unparsed_md && unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed, :literal => unparsed_md[1], :raw => ''}
        unparsed_token[:characters] = (position..(position + unparsed_token[:literal].length - 1))
        @tokens << unparsed_token
      end
    end

    # Sentence splits are just empty lines in Freeling
    if line.length == 0
      token[:type] = :sentence_split
      token[:literal] = ''
      @tokens << token
      next
    end

    # The parsed token
    info = line.split(/\s+/)
    token[:type] = :parsed
    [:literal, :lemma, :pos, :accuracy].each_with_index do |attr, i|
      token[attr] = info[i]
    end

    token[:literal].gsub!('_', ' ')
    token[:lemma].gsub!('_', ' ')

    # Anything unparsed preceding this token.
    # We need to do this complicated dance with _ since Freeling replaces spaces with it.
    # And so we need to be able to find the token with both spaces and _ in it since
    # we don't know what the original in the text actually is.
    # Once we have the location in the text we can figure out if it should be with spaces or _.
    #
    # NOTE(review): the literal has already had every '_' replaced by ' '
    # above, so the gsub('_', ...) on the pattern below finds nothing to
    # replace; a token written with '_' in the original text is therefore
    # not located. Left as-is to preserve current matching behaviour.
    unparsed_re = %r{(.*?) #{Regexp.quote(token[:literal])}}mx
    unparsed_re = %r{#{unparsed_re.to_s.gsub('_', '[\s_]')}}
    unparsed_md = unparsed_re.match(text, position)
    if unparsed_md && unparsed_md[1].length > 0
      unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
      unparsed_token[:characters] = (position..(position + unparsed_token[:literal].length - 1))
      @tokens << unparsed_token
      position += unparsed_token[:literal].length
    end

    # The parsed token starts where the preceding unparsed text (if any) ended.
    token[:characters] = (position..(position + token[:literal].length - 1))
    position += token[:literal].length
    @tokens << token
  end
end
Public Instance Methods
sentences()
click to toggle source
# File lib/providers/freeling_en.rb, line 207
#
# Reassembles the token literals into sentence strings.
#
# Literals are concatenated in token order; each :sentence_split token closes
# the sentence being built. Leading and trailing whitespace is stripped from
# every sentence.
#
# Returns an Array of Strings.
def sentences
  sentences = []
  current = ''

  @tokens.each do |token|
    if token[:type] == :sentence_split
      sentences << current
      current = ''
    else
      current << token[:literal]
    end
  end

  # In case there is no :sentence_split at the end
  sentences << current unless current.empty?

  # strip! works in place (and returns nil when nothing changed), so iterate
  # for the side effect; each hands back the array itself.
  sentences.each(&:strip!)
end
words()
click to toggle source
# File lib/providers/freeling_en.rb, line 181
#
# Turns the parsed tokens into Ve::Word objects.
#
# Only tokens whose :type is :parsed are considered. A token tagged 'POS'
# (possessive ending) is appended to the word before it rather than becoming
# a word of its own. Every other token's tag is looked up in
# INTERNAL_INFO_FOR_PARSED_POS; unknown tags matching /^F\w+$/ become
# Ve::PartOfSpeech::Symbol and anything else unknown becomes
# Ve::PartOfSpeech::TBD.
#
# Returns an Array of Ve::Word.
def words
  words = []

  @tokens.each do |token|
    next unless token[:type] == :parsed

    # Possessive ending, add to previous token. When there is no previous
    # word to attach to, fall through and treat it like any other token
    # instead of calling a method on nil.
    if token[:pos] == 'POS' && !words.empty?
      words[-1].word << token[:literal]
      words[-1].tokens << token
      next
    end

    # All other tokens
    pos, grammar = INTERNAL_INFO_FOR_PARSED_POS[token[:pos]]
    pos = Ve::PartOfSpeech::Symbol if pos.nil? && token[:pos] =~ /^F\w+$/
    pos = Ve::PartOfSpeech::TBD if pos.nil?

    # Pass a copy of the literal: the possessive branch above appends to the
    # word's string in place, which would otherwise also alter the token's
    # own :literal if Ve::Word keeps a reference to the string it is given.
    words << Ve::Word.new(token[:literal].dup, token[:lemma], pos, [token], {:grammar => grammar})
  end

  words
end