class PROIEL::Converter::CoNLLU::Sentence
Attributes
tokens[RW]
Public Class Methods
new(sentence)
click to toggle source
Initializes a PROIEL::Converter::CoNLLU::Sentence from a PROIEL::PROIELXML::Sentence.
# File lib/proiel/cli/converters/conll-u.rb, line 48
# Builds the converter sentence from a source sentence.
#
# Source tokens whose empty_token_sort is 'P' are skipped. A token whose form
# contains whitespace is split into one PROIEL::Token per whitespace-separated
# piece: the first piece keeps the original id, head, relation and the other
# per-token annotations, while each following piece is attached to the
# original token's id with the relation "flat". Finally every token is
# renumbered by its position in the sentence (starting at 1) and wrapped in a
# converter Token.
#
# @param sentence the source sentence; must respond to #tokens
def initialize(sentence)
  id_to_number = Hash.new(0) # will return id 0 (i.e. root) for nil

  # Ids handed to split-off subtokens. They only serve as unique keys for the
  # renumbering below, so they start above every id present in the source
  # sentence; a fixed base (such as 1000) could coincide with a real token id
  # and corrupt the id -> position mapping.
  next_synthetic_id = sentence.tokens.map { |t| t.id.to_i }.max.to_i + 1

  # array holding the (possibly split) sentence tokens
  tks = []

  sentence.tokens.reject { |t| t.empty_token_sort == 'P' }.each do |tk|
    unless tk.form =~ /[[:space:]]/
      tks << tk
      next
    end

    subtoks = tk.form.split(/[[:space:]]/)
    # hope the lemmas split the same way as the tokens; a missing lemma
    # yields no pieces, so every subtoken falls back to its form
    sublemmas = tk.lemma.to_s.split(/[[:space:]]/)
    last_index = subtoks.size - 1

    subtoks.each_with_index do |subtok, i|
      first = (i == 0)
      unless first
        synthetic_id = next_synthetic_id
        next_synthetic_id += 1
      end

      tks << PROIEL::Token.new(sentence,
                               (first ? tk.id : synthetic_id), # id
                               (first ? tk.head_id : tk.id), # head_id
                               subtok,
                               # grab the form if you don't find a lemma
                               (sublemmas[i] || subtok),
                               tk.part_of_speech, # copy the postag
                               tk.morphology,
                               (first ? tk.relation : "flat"),
                               nil, # empty_token_sort
                               tk.citation_part,
                               (first ? tk.presentation_before : nil),
                               (i == last_index ? tk.presentation_after : nil),
                               (first ? tk.antecedent_id : nil),
                               (first ? tk.information_status : nil),
                               (first ? tk.contrast_group : nil),
                               (first ? tk.foreign_ids : nil),
                               # This needs to be given a real slash object for the
                               # initialization, although it throws away the info
                               (first ? tk.slashes.map { |rel, target| PROIEL::PROIELXML::Reader::Slash.new({:'target_id' => target, :relation => rel} ) } : []),
                               # keyed on position, not on text: a later piece
                               # spelled like the first one must not inherit
                               # the alignment
                               (first ? tk.alignment_id : nil)
                              )
    end
  end

  # number the tokens by their position in the sentence, starting at 1
  tks.each_with_index { |t, i| id_to_number[t.id] = i + 1 }

  @tokens = tks.map do |t|
    Token.new(id_to_number[t.id],
              id_to_number[t.head_id],
              # insert dots in any whitespace inside words and lemmata
              t.form.to_s.gsub(/[[:space:]]/, '.'),
              t.lemma.to_s.gsub(/[[:space:]]/, '.'),
              t.part_of_speech,
              t.language,
              t.morphology,
              t.relation,
              t.empty_token_sort,
              t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
              t.citation_part,
              self
             )
  end
end
Public Instance Methods
convert()
click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 114
# Runs restructure_graph!, relabel_graph! and map_part_of_speech! in that
# order and returns the sentence itself.
def convert
  tap do
    restructure_graph!
    relabel_graph!
    map_part_of_speech!
  end
end
count_tokens()
click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 133
# Adds up count_subgraph over all root tokens; 0 when there are no roots.
def count_tokens
  roots.inject(0) { |total, root| total + root.count_subgraph }
end
demote_parentheticals_and_vocatives!()
click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 168
# Re-attaches root tokens with relation 'voc' or 'parpred' to the single
# remaining root.
#
# Raises a RuntimeError (with the graph dump in the message) when such roots
# exist but there is not exactly one other root to attach them to.
def demote_parentheticals_and_vocatives!
  demotable = ['voc', 'parpred']
  # returns [other roots, voc/parpred roots] for the current state of the graph
  split_roots = lambda { roots.partition { |node| !demotable.include?(node.relation) } }

  regular, loose = split_roots.call

  if loose.any? && regular.none?
    # promote the first vocative/parenthetical to head in case there's nothing else
    loose.first.relation = 'pred'
    regular, loose = split_roots.call
  end

  raise "No unique root in this tree:\n#{to_graph}" if loose.any? && !regular.one?

  loose.each { |node| node.head_id = regular.first.id }
end
demote_subjunctions!()
click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 164
# Calls process_subjunction! on every token whose part_of_speech is 'G-'.
# The tokens are collected before any of them is processed.
def demote_subjunctions!
  subjunctions = @tokens.select { |token| token.part_of_speech == 'G-' }
  subjunctions.each { |token| token.process_subjunction! }
end
find_token(identifier)
click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 121
# Returns the first token whose id equals +identifier+, or nil when no token
# matches. The search stops at the first match.
def find_token(identifier)
  @tokens.find { |t| t.id == identifier }
end
map_part_of_speech!()
click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 183
# Calls map_part_of_speech! on each root token.
def map_part_of_speech!
  roots.each { |root| root.map_part_of_speech! }
end
prune_empty_rootnodes!()
click to toggle source
TODO: this will leave several root nodes in many cases. For now, raise an error
# File lib/proiel/cli/converters/conll-u.rb, line 150
# Removes root tokens whose empty_token_sort is 'V'.
#
# For each such root, its first dependent takes over as root (head_id 0) and
# inherits the removed root's relation; the remaining dependents are
# re-attached to that new root. A root without any dependents is simply
# removed. The method then calls itself again, so a promoted dependent that
# is itself an empty root gets the same treatment.
def prune_empty_rootnodes!
  empty_roots = roots.select { |r| r.empty_token_sort == 'V' }
  return if empty_roots.empty?

  empty_roots.each do |r|
    # promote the first dependent to root
    new_root = r.dependents.first

    # nothing to promote when the empty root has no dependents; without this
    # guard the assignments below would be sent to nil
    if new_root
      new_root.head_id = 0
      new_root.relation = r.relation
      r.dependents.each { |d| d.head_id = new_root.id }
    end

    remove_token! r
  end

  prune_empty_rootnodes!
end
relabel_graph!()
click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 179
# Calls relabel_graph! on each root token.
def relabel_graph!
  roots.each { |root| root.relabel_graph! }
end
remove_token!(token)
click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 125
# Deletes +token+ from the sentence's token list.
#
# Returns whatever Array#delete returns: the deleted token, or nil when the
# list did not contain it.
def remove_token!(token)
  @tokens.delete(token)
end
restructure_graph!()
click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 187 def restructure_graph! @tokens.delete_if { |n| n.empty_token_sort == 'P' } @tokens.select(&:preposition?).each(&:process_preposition!) roots.each(&:change_coordinations!) @tokens.select(&:copula?).each(&:process_copula!) prune_empty_rootnodes! # do ellipses from left to right for proper remnant treatment @tokens.select(&:ellipsis?).sort_by { |e| e.left_corner.id }.each(&:process_ellipsis!) demote_subjunctions! # DIRTY: remove the rest of the empty nodes by attaching them # to their grandmother with remnant. This is the best way to # do it given the current state of the UDEP scheme, but # revisions will come. roots.each(&:remove_empties!) demote_parentheticals_and_vocatives! end
roots()
click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 137
# Returns the tokens whose head_id equals 0, ordered by ascending id.
def roots
  top_level = @tokens.select { |token| token.head_id == 0 }
  top_level.sort_by { |token| token.id }
end
to_conll()
click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 145
# Joins the to_conll output of every token, in token order, with newlines.
def to_conll
  lines = @tokens.map { |token| token.to_conll }
  lines.join("\n")
end
to_graph()
click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 141
# Joins the to_graph output of every root token with newlines.
def to_graph
  rendered = roots.map { |root| root.to_graph }
  rendered.join("\n")
end
to_s()
click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 129
# Joins the to_s output of every token, in token order, with newlines.
def to_s
  lines = @tokens.map { |token| token.to_s }
  lines.join("\n")
end