class PROIEL::Converter::CoNLLU::Token
Constants
- MORPHOLOGY_POSITIONAL_TAG_SEQUENCE
Attributes
Public Class Methods
# File lib/proiel/cli/converters/conll-u.rb, line 218
# Builds a token from its raw annotation values.
#
# The feature string is derived from +morphology+ via map_morphology (empty
# when no morphology is given); the citation is stored as "ref=<citation>"
# with every whitespace character replaced by an underscore. The universal
# POS tag (@upos) starts out unset.
def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence)
  # plain annotation values
  @id = id
  @head_id = head_id
  @form = form
  @lemma = lemma
  @part_of_speech = part_of_speech
  @language = language
  @morphology = morphology
  @relation = relation
  @empty_token_sort = empty_token_sort
  @slashes = slashes
  @sentence = sentence

  # derived values
  @features = morphology ? map_morphology(morphology) : ''
  @citation_part = "ref=" + (citation_part || "").gsub(/\s/, '_')
  @upos = nil
end
Public Instance Methods
# File lib/proiel/cli/converters/conll-u.rb, line 349
# True when the token bears the 'aux' relation and its
# "lemma,part_of_speech,language" key is listed in TAM_PARTICLE_LEMMATA.
def TAM_particle?
  return false unless @relation == 'aux'

  key = [lemma, part_of_speech, language].join(',')
  TAM_PARTICLE_LEMMATA.include?(key)
end
# File lib/proiel/cli/converters/conll-u.rb, line 626
# Appends +slash+ to this token's list of slashes and returns that list.
def add_slash!(slash)
  @slashes.push(slash)
end
Returns true if the node is an adjective or an ordinal.
# File lib/proiel/cli/converters/conll-u.rb, line 253
# True when the part-of-speech tag is 'A-' or 'Mo'.
def adjectival?
  ['A-', 'Mo'].include?(@part_of_speech)
end
# File lib/proiel/cli/converters/conll-u.rb, line 261
# Truthy (match index 0) when the part-of-speech tag starts with 'D';
# nil otherwise.
def adverb?
  starts_with_d = /\AD/
  @part_of_speech =~ starts_with_d
end
# File lib/proiel/cli/converters/conll-u.rb, line 295
# True when the token's "lemma,part_of_speech,language" key is listed in
# AUXILIARIES.
def auxiliary?
  key = [lemma, part_of_speech, language].join(',')
  AUXILIARIES.include?(key)
end
# File lib/proiel/cli/converters/conll-u.rb, line 265
# True when the part-of-speech tag is exactly 'Ma'.
def cardinal?
  tag = @part_of_speech
  tag == 'Ma'
end
Changes coordinations recursively from the bottom of the graph
# File lib/proiel/cli/converters/conll-u.rb, line 602
# Depth-first rewrite of coordinations: all dependents are handled before
# this token itself, and only conjunction tokens are processed.
def change_coordinations!
  dependents.each { |child| child.change_coordinations! }
  process_coordination! if conjunction?
end
A node is clausal if it is a verb and not nominalized; or it has a copula dependent; or it has a subject (e.g. in an absolute construction without a verb); or it is the root (e.g. in a nominal clause).
# File lib/proiel/cli/converters/conll-u.rb, line 270
# True when any of the following holds, checked in this order:
# the token is a non-nominalized 'V-'; one of its dependents is a copula;
# one of its dependents bears a subject relation; the token is the root.
def clausal?
  return true if @part_of_speech == 'V-' && !nominalized?
  return true if dependents.any? { |d| d.copula? }

  subject_relations = ['sub', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass']
  return true if dependents.any? { |d| subject_relations.include?(d.relation) }

  root?
end
# File lib/proiel/cli/converters/conll-u.rb, line 394
# Follows a chain of 'conj' relations upwards and returns the first head
# whose own relation is not 'conj'.
#
# Raises RuntimeError ("Not a conjunct") unless this token's relation is 'conj'.
def conj_head
  raise "Not a conjunct" unless @relation == 'conj'

  parent = head
  parent.relation == 'conj' ? parent.conj_head : parent
end
# File lib/proiel/cli/converters/conll-u.rb, line 277
# True for overt conjunctions (part of speech 'C-') and for empty tokens
# whose empty-token sort is 'C'.
def conjunction?
  return true if part_of_speech == 'C-'

  @empty_token_sort == 'C'
end
# File lib/proiel/cli/converters/conll-u.rb, line 281
# Truthy when the token has a head, that head is a conjunction, and the
# head carries the same relation as this token. Returns nil when there is
# no head.
def coordinated?
  parent = head
  parent && parent.conjunction? && parent.relation == @relation
end
Returns true if the node already bears the cop relation, or if it has an xobj dependent and either 1) the lemma is copular or 2) the node is an empty verb and has no pid slash, a pid slash to an empty node, or a pid slash to a node with a copular lemma.
# File lib/proiel/cli/converters/conll-u.rb, line 288
# True when the token already bears the 'cop' relation, or when it has an
# 'xobj' dependent and is either lexically copular (listed in
# COPULAR_LEMMATA) or an empty verb whose pid target is missing, empty, or
# itself lexically copular.
def copula?
  return true if @relation == 'cop'

  lexically_copular = COPULAR_LEMMATA.include?([lemma, part_of_speech, language].join(','))
  # NB: in the original `a or b and c`, `and`/`or` associate left to right,
  # i.e. (a or b) and c -- the xobj test applies to both alternatives.
  copular_like = lexically_copular || (@empty_token_sort == 'V' && copular_antecedent?(pid))
  copular_like && dependents.any? { |d| d.relation == 'xobj' }
end

# True when +antecedent+ (the pid target) is absent, empty, or listed in
# COPULAR_LEMMATA.
def copular_antecedent?(antecedent)
  antecedent.nil? ||
    antecedent.is_empty? ||
    COPULAR_LEMMATA.include?([antecedent.lemma, antecedent.part_of_speech, antecedent.language].join(','))
end
private :copular_antecedent?
# File lib/proiel/cli/converters/conll-u.rb, line 382
# Number of non-empty tokens in the subgraph rooted at this token
# (this token included).
def count_subgraph
  below = dependents.inject(0) { |total, child| total + child.count_subgraph }
  own = is_empty? ? 0 : 1
  below + own
end
# File lib/proiel/cli/converters/conll-u.rb, line 452
# All tokens of the sentence whose head is this token, ordered by id.
def dependents
  children = @sentence.tokens.select { |token| token.head_id == @id }
  children.sort_by { |token| token.id }
end
# File lib/proiel/cli/converters/conll-u.rb, line 323
# Matches the lemma against the DEPONENTS pattern registered for the
# token's language. Falsy when the language has no pattern or the lemma
# does not match; otherwise returns the match result.
def deponent?
  pattern = DEPONENTS[@language]
  pattern && pattern.match(@lemma)
end
# File lib/proiel/cli/converters/conll-u.rb, line 299
# True when the part-of-speech tag is listed in DETERMINERS.
def determiner?
  DETERMINERS.include?(@part_of_speech)
end
# File lib/proiel/cli/converters/conll-u.rb, line 303
# True when the token's empty-token sort is 'V'.
def ellipsis?
  sort = @empty_token_sort
  sort == 'V'
end
# File lib/proiel/cli/converters/conll-u.rb, line 456
# Returns the head this apposition attaches to, climbing past any
# conjunction heads that themselves bear the 'apos' relation.
#
# Raises RuntimeError ("Not an apposition") unless this token's relation
# is 'apos'.
def find_appositive_head
  raise "Not an apposition" unless @relation == 'apos'

  parent = head
  return parent.find_appositive_head if parent.conjunction? && parent.relation == 'apos'

  parent
end
# File lib/proiel/cli/converters/conll-u.rb, line 571
# Picks the dependent whose mapped relation (ignoring any ':subtype'
# suffix) comes earliest in OBLIQUENESS_HIERARCHY. Relations not in the
# hierarchy rank last (1000). Returns nil when there are no dependents.
def find_highest_daughter
  dependents.min_by do |daughter|
    base_relation = daughter.map_relation[/[^:]*/]
    OBLIQUENESS_HIERARCHY.find_index(base_relation) || 1000
  end
end
# File lib/proiel/cli/converters/conll-u.rb, line 465
# Walks +possible_postags+ -- entries of the form [tag, criterion, features] --
# and applies the first entry whose criterion (called with this token)
# holds: @upos is set to the tag and the optional features are appended to
# @features, '|'-separated.
#
# The list is consumed destructively (entries are shifted off), so callers
# pass a copy. Returns nil when no entry applies (deliberately no error).
def find_postag possible_postags
  loop do
    tag, criterion, extra_features = possible_postags.shift
    # raise "Found no postag"
    return nil if tag.nil?
    next unless criterion.call(self)

    @upos = tag
    return nil unless extra_features

    separator = @features.empty? ? '' : '|'
    return @features += (separator + extra_features)
  end
end
# File lib/proiel/cli/converters/conll-u.rb, line 477
# Walks +possible_relations+ -- entries of the form [relation, criterion] --
# and returns the relation of the first entry whose criterion (called with
# this token) holds.
#
# The list is consumed destructively (entries are shifted off), so callers
# pass a copy. Returns nil when no entry applies (deliberately no error).
def find_relation possible_relations
  loop do
    candidate, criterion = possible_relations.shift
    # raise "Found no relation"
    return nil if candidate.nil?
    return candidate if criterion.call(self)
  end
end
# File lib/proiel/cli/converters/conll-u.rb, line 563
# Follows the chain of 'remnant' dependents downwards and returns the last
# token in that chain (this token itself when it has no remnant dependent).
def find_remnant
  remnant = dependents.find { |d| d.relation == 'remnant' }
  remnant ? remnant.find_remnant : self
end
# File lib/proiel/cli/converters/conll-u.rb, line 307
# True when the part-of-speech tag is exactly 'F-'.
def foreign?
  tag = @part_of_speech
  tag == 'F-'
end
# File lib/proiel/cli/converters/conll-u.rb, line 411
# Formats a '|'-separated feature string for output: an empty string
# becomes '_', anything else has its features sorted.
def format_features(features)
  return '_' if features == ''

  parts = features.split("|")
  parts.sort.join("|")
end
# File lib/proiel/cli/converters/conll-u.rb, line 248
# Truthy (0) when the positional morphology tag has 'g' in the slot at
# index 6; nil otherwise, including when there is no morphology.
#
# The pattern is anchored at the start of the tag: the previous unanchored
# form (/......g.*/) also matched a 'g' in any later slot.
# NOTE(review): index 6 is presumably the case slot of the positional tag --
# confirm against MORPHOLOGY_POSITIONAL_TAG_SEQUENCE.
def genitive?
  @morphology =~ /\A.{6}g/
end
# File lib/proiel/cli/converters/conll-u.rb, line 311
# True when the token is not an empty token, i.e. its empty-token sort is
# nil or the empty string.
def has_content?
  [nil, ''].include?(@empty_token_sort)
end
# File lib/proiel/cli/converters/conll-u.rb, line 581
# True when at least one dependent is a preposition attached with the
# "case" relation.
def has_preposition?
  dependents.any? do |child|
    child.preposition? && child.relation == "case"
  end
end
# File lib/proiel/cli/converters/conll-u.rb, line 448
# The first token of the sentence whose id equals this token's head id,
# or nil when there is none.
def head
  @sentence.tokens.find { |token| token.id == @head_id }
end
# File lib/proiel/cli/converters/conll-u.rb, line 315
# True when the part-of-speech tag is exactly 'I-'.
def interjection?
  tag = @part_of_speech
  tag == 'I-'
end
Inverts the direction of a dependency relation. By default the labels are also swapped, but new relations can be specified for both the new dependent and the new head.
# File lib/proiel/cli/converters/conll-u.rb, line 633
# Swaps this token with its head: the former head becomes a dependent of
# this token, and this token attaches where the head used to be.
#
# new_dependent_relation - relation given to the former head
#                          (defaults to this token's current relation)
# new_head_relation      - relation given to this token
#                          (defaults to the former head's relation)
#
# Raises RuntimeError when the token hangs directly under the root.
# Returns the relation assigned to this token.
def invert!(new_dependent_relation = nil, new_head_relation = nil)
  raise "Cannot promote a token under root!" if @head_id == 0

  # @head_id is untouched until the former head has been re-attached, so
  # the head can be looked up once and reused.
  former_head = head
  new_dependent_relation ||= @relation
  new_head_relation ||= former_head.relation
  grandparent_id = former_head.head_id

  former_head.head_id = @id
  former_head.relation = new_dependent_relation

  @head_id = grandparent_id
  self.relation = new_head_relation
end
# File lib/proiel/cli/converters/conll-u.rb, line 319
# True when the token is an empty token (the negation of has_content?).
def is_empty?
  has_content? ? false : true
end
# File lib/proiel/cli/converters/conll-u.rb, line 390
# The token with the lowest id among this token and its direct dependents.
def left_corner
  candidates = [self] + dependents
  candidates.min_by { |token| token.id }
end
# File lib/proiel/cli/converters/conll-u.rb, line 240
# Translates a positional morphology tag into a '|'-separated feature
# string. The character at position i is looked up in MORPHOLOGY_MAP under
# the field named by MORPHOLOGY_POSITIONAL_TAG_SEQUENCE[i]; values without
# a mapping are dropped.
def map_morphology morph
  mapped = (0...morph.length).map do |position|
    field = MORPHOLOGY_POSITIONAL_TAG_SEQUENCE[position]
    MORPHOLOGY_MAP[field][morph[position]]
  end
  mapped.compact.join('|')
end
# File lib/proiel/cli/converters/conll-u.rb, line 488
# Assigns universal POS tags bottom-up: dependents first, then this token,
# using the candidate list POS_MAP holds for its part of speech.
def map_part_of_speech!
  dependents.each { |child| child.map_part_of_speech! }

  # find_postag consumes its argument, hence the copy
  candidates = POS_MAP[@part_of_speech].dup
  find_postag candidates

  # ugly, but the ugliness comes from UDEP
  @upos = 'ADJ' if @upos == 'DET' && @relation != 'det'
end
# File lib/proiel/cli/converters/conll-u.rb, line 503
# Looks up the replacement for this token's relation in RELATION_MAPPING.
#
# * String entry - returned as is.
# * Array entry  - a list of candidates resolved by find_relation
#                  (a copy is passed because find_relation consumes it).
# * no entry     - the current relation is returned unchanged.
#
# Raises RuntimeError for any other kind of entry.
def map_relation
  possible_relations = RELATION_MAPPING[@relation]
  case possible_relations
  when String
    possible_relations
  when Array
    find_relation possible_relations.dup
  when nil
    # do nothing: the token has already changed its relation
    @relation
  else
    raise "Unknown value #{possible_relations.inspect} for #{@relation}"
  end
end
# File lib/proiel/cli/converters/conll-u.rb, line 327
# False for deponents and for tokens without morphology; otherwise truthy
# (0) when the character at index 4 of the morphology tag is 'm', 'p' or
# 'e', and nil when it is not.
def mediopassive?
  return false if deponent?
  return false unless @morphology

  @morphology[4] =~ /[mpe]/
end
# File lib/proiel/cli/converters/conll-u.rb, line 335
# True when the token's "lemma,part_of_speech,language" key is listed in
# NEGATION_LEMMATA.
def negation?
  key = [lemma, part_of_speech, language].join(',')
  NEGATION_LEMMATA.include?(key)
end
# File lib/proiel/cli/converters/conll-u.rb, line 339
# Truthy when the part-of-speech tag starts with 'N', 'P' or 'M', or when
# the token is nominalized.
def nominal?
  starts_nominal = @part_of_speech =~ /\A[NPM]/
  starts_nominal || nominalized?
end
# File lib/proiel/cli/converters/conll-u.rb, line 343
# True when the token has a determiner dependent attached as 'atr', 'aux'
# or 'det'.
def nominalized?
  nominalizing_relations = ['atr', 'aux', 'det']
  dependents.any? { |child| child.determiner? && nominalizing_relations.include?(child.relation) }
end
# File lib/proiel/cli/converters/conll-u.rb, line 353
# True when the token bears the 'aux' relation and its
# "lemma,part_of_speech,language" key is listed in PARTICLE_LEMMATA.
def particle?
  return false unless @relation == 'aux'

  key = [lemma, part_of_speech, language].join(',')
  PARTICLE_LEMMATA.include?(key)
end
# File lib/proiel/cli/converters/conll-u.rb, line 331
# False for deponents and for tokens without morphology; otherwise true
# exactly when the character at index 4 of the morphology tag is 'p'.
def passive?
  return false if deponent?
  return false unless @morphology

  @morphology[4] == 'p'
end
# File lib/proiel/cli/converters/conll-u.rb, line 403
# The token targeted by this token's first 'pid' slash, or nil when it has
# no such slash. Slashes are [target_id, label] pairs.
def pid
  pid_slash = @slashes.find { |_target, label| label == 'pid' }
  return nil unless pid_slash

  @sentence.tokens.find { |token| pid_slash.first == token.id }
end
# File lib/proiel/cli/converters/conll-u.rb, line 361
# True when the part-of-speech tag is exactly 'R-'.
def preposition?
  tag = @part_of_speech
  tag == 'R-'
end
# File lib/proiel/cli/converters/conll-u.rb, line 607
# Restructures a coordination: after shared modifiers have been
# distributed, the non-'aux' dependent with the leftmost left corner is
# promoted to the conjunction's place; its former siblings become "conj"
# and the conjunction itself "cc". Does nothing when the conjunction has no
# dependents other than 'aux' ones.
#
# Raises RuntimeError when called on a token that is not a conjunction.
def process_coordination!
  raise "Only coordinations can be processed this way!" unless conjunction?
  return if dependents.all? { |d| d.relation == 'aux' }

  distribute_shared_modifiers!

  # recomputed on purpose: this runs after the shared modifiers were distributed
  conjuncts = dependents.reject { |d| d.relation == 'aux' }
  first_conjunct = conjuncts.min_by { |d| d.left_corner.id }
  first_conjunct.promote!("conj", "cc")
end
# File lib/proiel/cli/converters/conll-u.rb, line 575
# Promotes the single 'xobj' dependent to this token's place; this token
# is re-attached beneath it as 'cop'.
#
# Raises RuntimeError unless there is exactly one 'xobj' dependent.
def process_copula!
  predicates = dependents.select { |d| d.relation == 'xobj' }
  unless predicates.size == 1
    raise "#{predicates.size} predicates under #{to_n}\n#{to_graph}"
  end

  predicates.first.promote!(nil, 'cop')
end
# File lib/proiel/cli/converters/conll-u.rb, line 538
# Resolves an elided token. If it has an auxiliary dependent, that
# auxiliary is promoted and nothing else happens. Otherwise the highest
# daughter (see find_highest_daughter) is promoted with its former
# siblings relabelled 'orphan', and this token is removed from the sentence.
def process_ellipsis!
  auxiliary = dependents.find { |d| d.auxiliary? }
  if auxiliary
    auxiliary.promote!
    return
  end

  find_highest_daughter.promote!('orphan')

  # dependents.each do |d|
  #   check if there's a partner with the same relation under the overt node.
  #   TODO: this isn't really very convincing when it comes to ADVs
  #   if partner = overt.dependents.select { |p| p != self and p.relation == d.relation }.first #inserted p != self
  #     partner = partner.find_remnant
  #     d.head_id = partner.id
  #     d.relation = 'remnant'
  #   if there's no partner, just attach under the overt node, preserving the relation
  #   else
  #     d.head_id = overt.id
  #   end
  # end
  @sentence.remove_token!(self)
end
# File lib/proiel/cli/converters/conll-u.rb, line 585
# Turns a preposition into a "case" dependent of its single 'obl'
# dependent by inverting that dependency. Does nothing when there is no
# 'obl' dependent.
#
# Raises RuntimeError when the token is not a preposition or has more
# than one 'obl' dependent.
def process_preposition!
  raise "Only prepositions can be processed this way!" unless part_of_speech == 'R-'

  obliques = dependents.select { |d| d.relation == 'obl' }
  if obliques.size > 1
    raise "#{obliques.size} oblique dependents under #{to_n}\n#{to_graph}"
  end
  # shouldn't really happen, but in practice
  return if obliques.empty?

  obliques.first.invert!("case") # , "adv")
end
Attaches subjunctions with 'mark' under their verbs and promotes the verb to take over the subjunction's relation. If the verb is empty, the subjunction stays as head.
# File lib/proiel/cli/converters/conll-u.rb, line 521
# Re-attaches a subjunction relative to its single 'pred' dependent.
# An empty pred is removed and its dependents move up to the subjunction;
# an overt pred is inverted with the subjunction, which becomes its 'mark'.
#
# Raises RuntimeError unless there is exactly one 'pred' dependent.
def process_subjunction!
  # ignore if the subjunction has no dependents or only conj dependents.
  # NB: this requires that the function is called *after* processing conjunctions
  return if dependents.all? { |d| ['conj', 'cc'].include?(d.relation) }

  preds = dependents.select { |d| d.relation == 'pred' }
  raise "#{preds.size} PREDs under the subjunction #{to_n}:\n#{@sentence.to_graph}" unless preds.one?

  verb = preds.first
  if verb.is_empty?
    # promote the subjunction if the verb is empty
    verb.dependents.each { |d| d.head_id = id }
    @sentence.remove_token! verb
  else
    # else demote the subjunction
    verb.invert!('mark')
  end
end
Promotes a node to its head's place. The node takes over its former head's relation and all dependents. The new relation for these dependents can be specified; if it is not, they will keep their former relation. The former head is made a dependent of the node (with a specified relation) or, if it is an empty node, destroyed.
# File lib/proiel/cli/converters/conll-u.rb, line 652
# Moves this token into its head's position in the graph.
#
# new_sibling_relation   - relation given to the former siblings, which
#                          all become dependents of this token; when nil
#                          they keep their relation ('aux' is never overwritten)
# new_dependent_relation - relation given to the former head once it hangs
#                          under this token (default 'aux'); an empty
#                          former head is removed from the sentence instead
#
# Raises RuntimeError when the token hangs directly under the root.
def promote!(new_sibling_relation = nil, new_dependent_relation = 'aux')
  raise "Cannot promote a token under root!" if @head_id == 0

  # @head_id only changes at the very end, so one lookup serves throughout
  former_head = head
  inherited_relation = former_head.relation
  inherited_head_id = former_head.head_id

  # move all dependents of the former head to the new one
  siblings.each do |sibling|
    sibling.head_id = @id
    # ugly hack to avoid overwriting the aux relation here (aux siblings aren't really siblings)
    next unless new_sibling_relation && sibling.relation != 'aux'

    sibling.relation = new_sibling_relation
  end

  if former_head.is_empty?
    # remove the former head if it was empty
    @sentence.remove_token!(former_head)
  else
    # else make it a dependent of the new head
    former_head.head_id = @id
    former_head.relation = new_dependent_relation
  end

  @head_id = inherited_head_id
  # don't use relation=, as we don't want this relation to be
  # copied down a tree of conjunctions
  @relation = inherited_relation
end
# File lib/proiel/cli/converters/conll-u.rb, line 357
# Truthy (0) when the part-of-speech tag starts with 'P' followed by any
# character other than 's' or 't'; nil otherwise.
def pronominal?
  # no evidence that possessives are pronoun/determiner-like
  non_possessive_p = /\AP[^st]/
  @part_of_speech =~ non_possessive_p
end
# File lib/proiel/cli/converters/conll-u.rb, line 365
# True when the part-of-speech tag is exactly 'Ne'.
def proper_noun?
  tag = @part_of_speech
  tag == 'Ne'
end
# File lib/proiel/cli/converters/conll-u.rb, line 496
# Relabels the subgraph bottom-up: every dependent first, then this token,
# whose relation is replaced by the result of map_relation.
#
# Raises RuntimeError when no relation could be determined.
def relabel_graph!
  dependents.each { |child| child.relabel_graph! }
  # TODO: if there are iobjs without an obj among the dependents, one of them should be promoted to obj
  @relation = map_relation
  raise "No relation for #{form}" unless @relation
end
# File lib/proiel/cli/converters/conll-u.rb, line 373
# Sets the relation. On a conjunction, every dependent that currently
# shares the conjunction's relation receives the new relation first.
def relation=(rel)
  if conjunction?
    sharing = dependents.select { |d| d.relation == @relation }
    sharing.each { |conjunct| conjunct.relation = rel }
  end
  @relation = rel
end
# File lib/proiel/cli/converters/conll-u.rb, line 593
# Removes empty tokens bottom-up. The dependents of a removed token are
# re-attached to that token's head with the relation 'remnant'.
def remove_empties!
  dependents.each { |child| child.remove_empties! }
  return unless is_empty?

  dependents.each do |orphan|
    orphan.head_id = head_id
    orphan.relation = 'remnant'
  end
  @sentence.remove_token! self
end
# File lib/proiel/cli/converters/conll-u.rb, line 369
# True when the token's head id is 0.
def root?
  parent_id = @head_id
  parent_id == 0
end
# File lib/proiel/cli/converters/conll-u.rb, line 444
# All other tokens of the sentence that share this token's head.
def siblings
  same_head = @sentence.tokens.select { |token| token.head_id == @head_id }
  same_head - [self]
end
# File lib/proiel/cli/converters/conll-u.rb, line 386
# This token followed by every token in the subgraphs of its dependents,
# as one flat array.
def subgraph_set
  descendants = dependents.flat_map { |child| child.subgraph_set }
  [self] + descendants
end
# File lib/proiel/cli/converters/conll-u.rb, line 257
# True when the part-of-speech tag is exactly 'G-'.
def subjunction?
  tag = @part_of_speech
  tag == 'G-'
end
# File lib/proiel/cli/converters/conll-u.rb, line 419
# Serializes the token as one tab-separated line of ten fields: id, form,
# lemma, universal POS, original POS, features, head id, relation, a '_'
# placeholder, and the citation.
def to_conll
  # override non-root relations on root until we've found out how to handle unembedded reports etc
  output_relation = @head_id == 0 ? 'root' : @relation

  columns = [
    @id,
    @form,
    @lemma,
    @upos,
    @part_of_speech,
    format_features(@features),
    @head_id,
    output_relation,
    '_', # slashes here
    @citation_part
  ]
  columns.join("\t")
end
# File lib/proiel/cli/converters/conll-u.rb, line 440
# Renders the subgraph as text: one line per token (see to_n), each
# dependent indented one tab deeper than its head.
def to_graph(indents = 0)
  own_line = ("\t" * indents) + to_n
  child_lines = dependents.map { |child| child.to_graph(indents + 1) }
  ([own_line] + child_lines).join("\n")
end
# File lib/proiel/cli/converters/conll-u.rb, line 436
# Short '-'-joined label: relation, id, form (or the empty-token sort when
# there is no form) and universal POS (or the original POS when unset).
def to_n
  surface = @form || @empty_token_sort
  tag = @upos || @part_of_speech
  [@relation, @id, surface, tag].join('-')
end
# File lib/proiel/cli/converters/conll-u.rb, line 432
# Tab-separated summary of the token: id, form, head id and relation.
def to_s
  fields = [@id, @form, @head_id, @relation]
  fields.join("\t")
end