class Treebank::Transform

Constants

VERSION

Public Class Methods

new(doc) click to toggle source
# File lib/treebank/transform.rb, line 12
# Parses the given XML source with Nokogiri and keeps the parsed document
# in @doc, where the other methods of this class read and replace it.
#
# doc - the XML input handed straight to Nokogiri::XML.
def initialize(doc)
  @doc = Nokogiri::XML(doc)
end

Public Instance Methods

extract_cts_name(extension = '') click to toggle source
# File lib/treebank/transform.rb, line 23
# Derives a name from the document_id attribute of the first
# //treebank/sentence element.
#
# The part of the id that follows "urn:cts:...Lit:" is returned with
# +extension+ appended.
#
# extension - String appended to the extracted name (default: '').
#
# Returns the name String, or nil when the document has no sentence, the
# sentence carries no document_id, or the id does not match the pattern.
def extract_cts_name(extension = '')
  sentence = @doc.xpath('//treebank/sentence').first
  # Both the sentence and its attribute may be missing; treat that like a
  # non-matching id instead of raising NoMethodError on nil.
  document_id = sentence && sentence['document_id']
  return unless document_id

  match = document_id.match(/urn:cts:.*Lit:(.*)/)
  "#{match[1]}#{extension}" if match
end
transform() click to toggle source
# File lib/treebank/transform.rb, line 16
# Runs the sentence-level pass followed by the document-level pass and
# returns the resulting document serialized by to_xml with an indent of 2.
def transform
  transform_sentence_level
  transform_document_level

  # @doc is read only here, after both passes: the document-level pass
  # replaces @doc with the XSLT output.
  @doc.to_xml(indent: 2)
end

Private Instance Methods

beta2unicode() click to toggle source
# File lib/treebank/transform.rb, line 35
# Applies the XSLT stylesheet stored in the BETA_2_UNICODE file to @doc and
# replaces @doc with the transformation result. The compiled stylesheet is
# kept in @xslt.
#
# Everything happens inside Dir.chdir(RESOURCES), so BETA_2_UNICODE is read
# relative to that directory.
# NOTE(review): presumably the chdir also lets the stylesheet resolve
# relative imports - confirm before replacing it with an absolute path.
#
# Returns the transformed document (the value of the chdir block).
def beta2unicode
  Dir.chdir(RESOURCES) do
    stylesheet_source = File.read(BETA_2_UNICODE)
    @xslt = Nokogiri::XSLT(stylesheet_source)
    @doc = @xslt.transform(@doc)
  end
end
has_elliptic_head(label) click to toggle source
# File lib/treebank/transform.rb, line 60
# Checks whether a relation label contains an "ExD" marker followed by at
# least one digit.
#
# label - the relation String to inspect; may be nil (e.g. when the
#         attribute it was read from is absent).
#
# Returns the MatchData for the marker, or nil when the label is nil or
# does not contain one.
def has_elliptic_head(label)
  # Guard against a missing label so a word without a relation is simply
  # treated as non-elliptic instead of raising NoMethodError.
  label && label.match(/ExD\d+/)
end
transform_document_level() click to toggle source
# File lib/treebank/transform.rb, line 31
# Runs the transformations that operate on the document as a whole.
# At present this is only the beta2unicode pass.
def transform_document_level
  beta2unicode
end
transform_elliptic_nodes(sentence, word_node) click to toggle source
# File lib/treebank/transform.rb, line 53
# Hands a word node over to EllipticWord when its relation attribute
# carries an elliptic-head marker (as decided by has_elliptic_head).
#
# sentence  - the sentence object passed on to EllipticWord.new.
# word_node - the word node whose 'relation' attribute is inspected.
#
# Returns whatever EllipticWord#parse_elliptic_head returns, or nil when the
# relation has no elliptic-head marker.
def transform_elliptic_nodes(sentence, word_node)
  return unless has_elliptic_head(word_node['relation'])

  EllipticWord.new(word_node, sentence).parse_elliptic_head
end
transform_participles(node) click to toggle source
# File lib/treebank/transform.rb, line 64
# Rewrites the 'postag' attribute of a node when it starts with 't',
# replacing that leading 't' with 'v'. Other postags are left untouched.
# NOTE(review): going by the method name, 't' marks participles and 'v'
# verbs in the tagset - confirm against the postag specification.
#
# node - an object supporting [] and []= for the 'postag' attribute.
#
# Returns the new postag String when it was rewritten, otherwise nil.
def transform_participles(node)
  postag = node['postag']
  # A word without a postag attribute yields nil here; skip it rather than
  # raising NoMethodError on nil.
  return unless postag && postag.start_with?('t')

  node['postag'] = postag.sub('t', 'v')
end
transform_sentence_level() click to toggle source
# File lib/treebank/transform.rb, line 42
# Walks every //treebank/sentence element of @doc. Each sentence is wrapped
# in a Sentence and ctsify is called on it; afterwards every 'word' child
# goes through the elliptic-node pass and then the participle pass.
#
# Returns the result of iterating the sentence node set.
def transform_sentence_level
  @doc.xpath('//treebank/sentence').each do |node|
    # ctsify runs before the words are looked up, as in the original order.
    wrapped = Sentence.new(node).tap { |s| s.ctsify }

    node.xpath('word').each do |word|
      transform_elliptic_nodes(wrapped, word)
      transform_participles(word)
    end
  end
end