class PROIEL::Converter::CoNLLU::Sentence

Attributes

tokens[RW]

Public Class Methods

new(sentence) click to toggle source

Initializes a PROIEL::Converter::CoNLLU::Sentence from a PROIEL::PROIELXML::Sentence.

# File lib/proiel/cli/converters/conll-u.rb, line 48
# Builds the converter's representation of a sentence from a
# PROIEL::PROIELXML::Sentence.
#
# Tokens whose empty_token_sort is 'P' are skipped. A token whose form
# contains whitespace is split into one token per whitespace-separated part:
# the first part keeps the original id, head and relation, every further part
# is attached to the first part with the relation "flat". All tokens are then
# renumbered 1..n in order of appearance and wrapped in converter Token
# objects, which are stored in @tokens.
#
# sentence - the source sentence; must respond to #tokens.
def initialize(sentence)
  whitespace = /[[:space:]]/

  id_to_number = Hash.new(0) # will return id 0 (i.e. root) for nil

  source_tokens = sentence.tokens.to_a

  # Tokens created by splitting need ids of their own. Count upwards from the
  # highest id already present in the sentence so that a synthetic id can
  # never clash with a real one (a clash would overwrite an entry in
  # id_to_number and corrupt the head references below).
  next_id = source_tokens.map(&:id).compact.max.to_i

  # the sentence tokens, with multi-word forms expanded
  tks = []

  source_tokens.reject { |t| t.empty_token_sort == 'P' }.each do |tk|
    unless tk.form =~ whitespace
      tks << tk
      next
    end

    subtoks = tk.form.split(whitespace)
    # Hope the lemma splits the same way as the form. A missing lemma yields
    # no parts at all, so every subtoken falls back to its own form.
    sublemmas = tk.lemma.to_s.split(whitespace)
    last_index = subtoks.size - 1

    subtoks.each_with_index do |subtok, i|
      first = i.zero?

      tks << PROIEL::Token.new(sentence,
                               (first ? tk.id : (next_id += 1)), # id
                               (first ? tk.head_id : tk.id), # head_id
                               subtok,
                               (sublemmas[i] || subtok), # lemma, or the form if there is none
                               tk.part_of_speech, # copy the postag
                               tk.morphology,
                               (first ? tk.relation : "flat"),
                               nil, # empty_token_sort
                               tk.citation_part,
                               (first ? tk.presentation_before : nil),
                               (i == last_index ? tk.presentation_after : nil),
                               (first ? tk.antecedent_id : nil),
                               (first ? tk.information_status : nil),
                               (first ? tk.contrast_group : nil),
                               (first ? tk.foreign_ids : nil),
                               # This needs to be given a real slash object for the
                               # initialization, although it throws away the info
                               (first ? tk.slashes.map { |rel, target| PROIEL::PROIELXML::Reader::Slash.new({:'target_id' => target, :relation => rel}) } : []),
                               # decided by position, not by string equality, so that a
                               # repeated part ("a a") is not mistaken for the first one
                               (first ? tk.alignment_id : nil)
                              )
    end
  end

  # sequential 1-based numbering in order of appearance
  tks.each_with_index do |t, i|
    id_to_number[t.id] = i + 1
  end

  @tokens = tks.map do |t|
    Token.new(id_to_number[t.id],
              id_to_number[t.head_id],
              # insert dots in any whitespace inside words and lemmata
              t.form.to_s.gsub(whitespace, '.'),
              t.lemma.to_s.gsub(whitespace, '.'),
              t.part_of_speech,
              t.language,
              t.morphology,
              t.relation,
              t.empty_token_sort,
              t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
              t.citation_part,
              self
             )
  end
end

Public Instance Methods

convert() click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 114
# Runs the conversion steps on this sentence in a fixed order:
# restructure_graph!, relabel_graph!, then map_part_of_speech!.
#
# Returns the receiver.
def convert
  %i[restructure_graph! relabel_graph! map_part_of_speech!].each { |step| send(step) }
  self
end
count_tokens() click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 133
# Adds up count_subgraph over every root of the sentence, starting from 0.
def count_tokens
  roots.inject(0) { |total, root| total + root.count_subgraph }
end
demote_parentheticals_and_vocatives!() click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 168
# Re-attaches root nodes labelled 'voc' or 'parpred' to the remaining root.
#
# If every root is a vocative/parenthetical, the first of them is relabelled
# 'pred' so that it can serve as the head for the others.
#
# Raises a RuntimeError when there are vocative/parenthetical roots but not
# exactly one other root to attach them to.
def demote_parentheticals_and_vocatives!
  # splits the current roots into [ordinary roots, vocative/parenthetical roots]
  split_roots = lambda do
    roots.partition { |node| !['voc', 'parpred'].include?(node.relation) }
  end

  regular, demotable = split_roots.call

  if demotable.any? && regular.none?
    # promote the first vocative/parenthetical to head in case there's nothing else
    demotable.first.relation = 'pred'
    regular, demotable = split_roots.call
  end

  if demotable.any? && !regular.one?
    raise "No unique root in this tree:\n#{to_graph}"
  end

  demotable.each { |node| node.head_id = regular.first.id }
end
demote_subjunctions!() click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 164
# Calls process_subjunction! on every token whose part of speech is 'G-'.
# The tokens are collected first and processed afterwards.
def demote_subjunctions!
  subjunctions = @tokens.select { |token| token.part_of_speech == 'G-' }
  subjunctions.each { |subjunction| subjunction.process_subjunction! }
end
find_token(identifier) click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 121
# Looks up a token by id.
#
# identifier - the id to look for.
#
# Returns the first token in @tokens whose id equals identifier, or nil if
# there is none. The search stops at the first match.
def find_token(identifier)
  @tokens.find { |t| t.id == identifier }
end
map_part_of_speech!() click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 183
# Invokes map_part_of_speech! on each root node of the sentence.
def map_part_of_speech!
  roots.each { |root| root.map_part_of_speech! }
end
prune_empty_rootnodes!() click to toggle source

TODO: this will leave several root nodes in many cases. For now, raise an error

# File lib/proiel/cli/converters/conll-u.rb, line 150
# Removes root nodes whose empty_token_sort is 'V', repeating until no such
# root is left.
#
# For each empty root, its first dependent takes over as root (head_id 0) and
# inherits the empty root's relation; the other dependents are re-attached to
# that new root and the empty root is removed from the sentence.
#
# NOTE(review): an empty root without dependents makes `dependents.first`
# nil, so the next line would raise NoMethodError; confirm whether that can
# occur.
def prune_empty_rootnodes!
  find_empty_roots = lambda do
    roots.select { |root| root.empty_token_sort == 'V' }
  end

  empty_roots = find_empty_roots.call

  until empty_roots.empty?
    empty_roots.each do |empty_root|
      # promote the first dependent to root
      promoted = empty_root.dependents.first
      promoted.head_id = 0
      promoted.relation = empty_root.relation
      empty_root.dependents.each { |dependent| dependent.head_id = promoted.id }
      remove_token! empty_root
    end

    # promotion may have exposed further empty roots
    empty_roots = find_empty_roots.call
  end
end
relabel_graph!() click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 179
# Invokes relabel_graph! on each root node of the sentence.
def relabel_graph!
  roots.each { |root| root.relabel_graph! }
end
remove_token!(token) click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 125
# Deletes token from the sentence's token list.
#
# Returns whatever Array#delete returns: the deleted item, or nil when no
# element of @tokens equals token.
def remove_token!(token)
  deleted = @tokens.delete(token)
  deleted
end
restructure_graph!() click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 187
# Rewrites the dependency graph of this sentence in place by running a fixed
# sequence of transformations over @tokens and the root nodes.
#
# NOTE(review): the steps presumably depend on running in exactly this order;
# confirm against the individual helpers before reordering anything.
def restructure_graph!
  # drop every token whose empty_token_sort is 'P'
  @tokens.delete_if { |n| n.empty_token_sort == 'P' }
  # the token list is filtered first, then each selected token is processed
  @tokens.select(&:preposition?).each(&:process_preposition!)
  roots.each(&:change_coordinations!)
  @tokens.select(&:copula?).each(&:process_copula!)
  # remove root nodes whose empty_token_sort is 'V'
  prune_empty_rootnodes!
  # do ellipses from left to right for proper remnant treatment
  @tokens.select(&:ellipsis?).sort_by { |e| e.left_corner.id }.each(&:process_ellipsis!)
  # process every token whose part of speech is 'G-'
  demote_subjunctions!
  # DIRTY: remove the rest of the empty nodes by attaching them
  # to their grandmother with remnant. This is the best way to
  # do it given the current state of the UDEP scheme, but
  # revisions will come.
  roots.each(&:remove_empties!)
  # attach 'voc'/'parpred' roots to the remaining root; raises if there is no
  # unique root to attach them to
  demote_parentheticals_and_vocatives!
end
roots() click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 137
# Returns the tokens whose head_id is 0, ordered by id.
def roots
  top_level = @tokens.select { |token| token.head_id == 0 }
  top_level.sort_by { |token| token.id }
end
to_conll() click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 145
# Joins the to_conll output of every token with newlines, in @tokens order.
def to_conll
  lines = @tokens.map { |token| token.to_conll }
  lines.join("\n")
end
to_graph() click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 141
# Joins the to_graph output of every root node with newlines.
def to_graph
  graphs = roots.map { |root| root.to_graph }
  graphs.join("\n")
end
to_s() click to toggle source
# File lib/proiel/cli/converters/conll-u.rb, line 129
# Joins the to_s output of every token with newlines, in @tokens order.
def to_s
  lines = @tokens.map { |token| token.to_s }
  lines.join("\n")
end