class PROIEL::Converter::CoNLLX

This converts to the CoNLL-X format as described on ilk.uvt.nl/conll/#dataformat.

Public Class Methods

find_lexical_head_and_relation(id_to_number, id_to_token, t, rel = '') click to toggle source
# File lib/proiel/cli/converters/conll-x.rb, line 54
def find_lexical_head_and_relation(id_to_number, id_to_token, t, rel = '')
  if t.is_root?
    [0, rel + t.relation] # FIXME: may be empty token anyway
  elsif id_to_token[t.head_id].has_content?
    [id_to_number[t.head_id], rel + t.relation]
  else
    find_lexical_head_and_relation(id_to_number, id_to_token, id_to_token[t.head_id], rel + "#{t.relation}(#{id_to_number[t.head_id]})")
  end
end
format_morphology(token) click to toggle source
# File lib/proiel/cli/converters/conll-x.rb, line 43
def format_morphology(token)
  token.morphology_hash.map do |k, v|
    # Remove inflection tag unless when set to inflecting
    if k == :inflection and v =='i'
      nil
    else
      "#{k.upcase[0..3]}#{v}"
    end
  end.compact.join('|')
end
process(tb, options) click to toggle source
# File lib/proiel/cli/converters/conll-x.rb, line 6
def process(tb, options)
  tb.sources.each do |source|
    source.divs.each do |div|
      div.sentences.each do |sentence|
        id_to_number = {}

        # Do not care about prodrop tokens
        tk = sentence.tokens.reject { |t| t.empty_token_sort == 'P' }
        
        # Renumber to make the sequence continguous after prodrop tokens where left out
        tk.map(&:id).each_with_index.each do |id, i|
          id_to_number[id] = i + 1
        end

        id_to_token = tk.inject({}) { |h, t| h.merge({t.id => t}) }

        tk.each do |token|
          unless token.is_empty?
            this_number = id_to_number[token.id]
            head_number, relation = find_lexical_head_and_relation(id_to_number, id_to_token, token)
            form = token.form.gsub(/[[:space:]]/, '.')
            lemma = token.lemma.gsub(/[[:space:]]/, '.')
            pos_major = token.part_of_speech_hash[:major]
            pos_full = token.part_of_speech
            morphology = format_morphology(token)

            puts [this_number, form, lemma, pos_major, pos_full,
                  morphology, head_number, relation, "_", "_"].join("\t")
          end
        end

        puts
      end
    end
  end
end