class PROIEL::Converter::Tiger
Converter for the TigerXML format (www.ims.uni-stuttgart.de/projekte/TIGER/TIGERSearch/doc/html/TigerXML.html) in the variant used by VISL under the name 'TIGER dependency format' (beta.visl.sdu.dk/treebanks.html#TIGER_dependency_format).
Constants
- MORPHOLOGICAL_FEATURES
- OTHER_FEATURES
- SCHEMA_FILE
Public Class Methods
declare_annotation(builder, features, annotation_schema)
click to toggle source
# File lib/proiel/cli/converters/tiger.rb, line 51
#
# Emits the +annotation+ element: one +feature+ declaration per entry in
# +features+, followed by the +edgelabel+ and +secedgelabel+ value lists
# taken from the annotation schema.
#
# @param builder the XML builder that receives the elements
# @param features an enumerable of (name, domain) pairs
# @param annotation_schema an object answering +primary_relations+ and
#   +secondary_relations+; each is iterated as (tag, relation) pairs where
#   the relation answers +summary+
# @return whatever +builder.annotation+ returns
def declare_annotation(builder, features, annotation_schema)
  # Writes one <value> per relation, using the relation summary as content.
  emit_relation_values = lambda do |relations|
    relations.each do |tag, relation|
      builder.value({ name: tag }, relation.summary)
    end
  end

  builder.annotation do
    features.each do |feature_name, feature_domain|
      # FIXME: we may want to list possible values for some of these
      builder.feature(name: feature_name, domain: feature_domain)
    end

    builder.edgelabel do
      # '--' is declared first as an explicit edge label value.
      builder.value(name: '--')
      emit_relation_values.call(annotation_schema.primary_relations)
    end

    builder.secedgelabel do
      emit_relation_values.call(annotation_schema.secondary_relations)
    end
  end
end
process(tb, options)
click to toggle source
# File lib/proiel/cli/converters/tiger.rb, line 16
#
# Writes every source in +tb+ to STDOUT as XML.
#
# Stores the feature table in +@features+ (every morphological and other
# feature mapped to the domain 'FREC') and the annotation schema in +@hack+,
# both of which +write_source+ reads.
#
# @param tb an object answering +sources+ and +annotation_schema+
# @param options not used by this method
# @return the result of +tb.sources.each+
def process(tb, options)
  feature_names = MORPHOLOGICAL_FEATURES + OTHER_FEATURES
  @features = feature_names.each_with_object({}) do |feature, table|
    table[feature] = 'FREC'
  end

  builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
  builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'

  tb.sources.each do |source|
    # write_source hands @hack to declare_annotation, so set it first.
    @hack = tb.annotation_schema

    write_source(builder, source) do
      source.divs.each do |div|
        div.sentences.each { |sentence| write_sentence(builder, sentence) }
      end
    end
  end
end
token_attrs(s, t, type)
click to toggle source
# File lib/proiel/cli/converters/tiger.rb, line 74
#
# Builds the attribute hash for one token, covering every feature in
# +@features+ whose domain is 'FREC' or equal to +type+.
#
# @param s the sentence the token belongs to (not used by this method)
# @param t the token; must answer the accessors required by the selected
#   features (+pro?+, +relation+, +form+, +lemma+, +pos+,
#   +empty_token_sort+, +morphology_hash+, +sem_tags_to_hash+, ...)
# @param type [String] the node domain being written ('T' or 'NT' at the
#   call sites in this file)
# @return [Hash] feature name => value; missing values are written as "--"
# @raise [RuntimeError] if a feature has no dedicated rule and the token
#   does not respond to a method of that name
def token_attrs(s, t, type)
  attrs = {}

  @features.each do |name, domain|
    next unless domain == 'FREC' || domain == type

    value =
      case name
      when :word, :cat
        t.pro? ? "PRO-#{t.relation.upcase}" : t.form
      when *@semantic_features
        # NOTE(review): @semantic_features is not assigned anywhere in the
        # visible source; while it is nil this branch never matches.
        # The semantic tag is looked up under the feature name itself.
        t.sem_tags_to_hash[name]
      when :lemma
        t.lemma
      when :pos
        # Empty tokens have no part of speech; use their sort plus "-".
        t.empty_token_sort ? t.empty_token_sort + "-" : t.pos
      when *MORPHOLOGICAL_FEATURES
        # A compound feature name such as :a_b concatenates the values of
        # morphology fields :a and :b, with '-' for each missing field.
        name.to_s.split("_").map { |a| t.morphology_hash[a.to_sym] || '-' }.join
      else
        unless t.respond_to?(name)
          raise "Do not know how to get required attribute #{name}"
        end

        t.send(name)
      end

    # A nil or false value is replaced by the "--" placeholder.
    attrs[name] = value || "--"
  end

  attrs
end
write_nonterminals(builder, s)
click to toggle source
# File lib/proiel/cli/converters/tiger.rb, line 116
#
# Writes the +nonterminals+ element for sentence +s+: an empty root node
# followed by one +nt+ node per token.
#
# @param builder the XML builder that receives the elements
# @param s the sentence; must answer +id+ and +tokens+
# @return whatever +builder.nonterminals+ returns
def write_nonterminals(builder, s)
  builder.nonterminals do
    # Empty root node: every FREC/NT feature gets the placeholder '--'.
    root_attrs = @features
                 .select { |_, domain| %w[FREC NT].include?(domain) }
                 .each_with_object({}) { |(name, _), acc| acc[name] = '--' }
    root_attrs[:id] = "s#{s.id}_root"

    builder.nt(root_attrs) do
      # The root points at every token that has no head and is not a PRO token.
      top_level = s.tokens.reject { |token| token.head || token.pro? }
      top_level.each do |token|
        builder.edge(idref: "p#{token.id}", label: token.relation)
      end
    end

    # One NT node per token.
    s.tokens.each do |token|
      nt_attrs = token_attrs(s, token, 'NT').merge(id: "p#{token.id}")

      builder.nt(nt_attrs) do
        # Edge to the corresponding terminal node
        builder.edge(idref: "w#{token.id}", label: '--')

        # Primary dependency edges
        token.children.each do |dependent|
          builder.edge(idref: "p#{dependent.id}", label: dependent.relation)
        end

        # Secondary dependency edges
        token.slashes.each do |relation, target_id|
          builder.secedge(idref: "p#{target_id}", label: relation)
        end
      end
    end
  end
end
write_sentence(builder, s)
click to toggle source
# File lib/proiel/cli/converters/tiger.rb, line 146
#
# Writes one +s+ element containing a +graph+ whose root attribute refers
# to the root node id "s<id>_root", then fills it with the terminals and
# nonterminals of the sentence.
#
# @param builder the XML builder that receives the elements
# @param s the sentence; must answer +id+
# @return whatever +builder.s+ returns
def write_sentence(builder, s)
  sentence_id = "s#{s.id}"

  builder.s(id: sentence_id) do
    builder.graph(root: "#{sentence_id}_root") do
      write_terminals(builder, s)
      write_nonterminals(builder, s)
    end
  end
end
write_source(builder, s) { || ... }
click to toggle source
# File lib/proiel/cli/converters/tiger.rb, line 35
#
# Writes one +corpus+ element for source +s+: a +head+ holding the source
# title and the annotation declaration, then a +body+ whose content is
# produced by the caller's block.
#
# Reads +@features+ and +@hack+ (the annotation schema), which +process+
# assigns before calling this method.
#
# @param builder the XML builder that receives the elements
# @param s the source; must answer +id+ and +title+
# @yield called with no arguments inside the +body+ element
# @return whatever +builder.corpus+ returns
def write_source(builder, s)
  builder.corpus(id: s.id) do
    builder.head do
      builder.meta { builder.name(s.title) }
      declare_annotation(builder, @features, @hack)
    end

    builder.body { yield }
  end
end
write_terminals(builder, s)
click to toggle source
# File lib/proiel/cli/converters/tiger.rb, line 108
#
# Writes the +terminals+ element for sentence +s+ with one +t+ element per
# token, carrying the token's 'T'-domain attributes and the id "w<token id>".
#
# @param builder the XML builder that receives the elements
# @param s the sentence; must answer +tokens+
# @return whatever +builder.terminals+ returns
def write_terminals(builder, s)
  builder.terminals do
    s.tokens.each do |token|
      terminal_attrs = token_attrs(s, token, 'T')
      builder.t(terminal_attrs.merge(id: "w#{token.id}"))
    end
  end
end