class PROIEL::Converter::PROIELXML
Public Class Methods
grab_features(obj, mandatory_features, optional_features = [])
click to toggle source
# File lib/proiel/cli/converters/proielxml.rb, line 147
#
# Builds an attribute hash by reading the named features from +obj+.
#
# Each feature is read with +obj.send(feature)+ and stored under the
# feature name converted to a string with underscores replaced by hyphens
# (e.g. +:alignment_id+ becomes <tt>'alignment-id'</tt>).
#
# obj::                the object the features are read from
# mandatory_features:: feature names that are always included, whatever
#                      their value (including +nil+)
# optional_features::  feature names that are included only when the value
#                      is truthy and its string form is not empty
#
# Returns the hash, with mandatory features first and the surviving
# optional features after them, each group in the order given.
def grab_features(obj, mandatory_features, optional_features = [])
  attrs = mandatory_features.each_with_object({}) do |feature, acc|
    acc[feature.to_s.tr('_', '-')] = obj.send(feature)
  end

  optional_features.each do |feature|
    value = obj.send(feature)
    # Blank optional values are left out entirely rather than stored as
    # empty entries.
    attrs[feature.to_s.tr('_', '-')] = value if value && value.to_s != ''
  end

  attrs
end
include_div?(div, options)
click to toggle source
# File lib/proiel/cli/converters/proielxml.rb, line 128
#
# Decides whether +div+ should be included.
#
# When the <tt>'remove-empty-divs'</tt> option is not set, every div is
# included. When it is set, a div is included only if at least one of its
# sentences passes +include_sentence?+ under the same options.
def include_div?(div, options)
  return true unless options['remove-empty-divs']

  div.sentences.any? { |candidate| include_sentence?(candidate, options) }
end
include_sentence?(sentence, options)
click to toggle source
# File lib/proiel/cli/converters/proielxml.rb, line 136
#
# Decides whether +sentence+ should be included, based on its status.
#
# * +:reviewed+ sentences are always included.
# * +:annotated+ sentences are included unless the
#   <tt>'remove-not-reviewed'</tt> option is set.
# * Sentences with any other status are included only when neither
#   <tt>'remove-not-reviewed'</tt> nor <tt>'remove-not-annotated'</tt> is
#   set.
#
# Returns +true+ or +false+.
def include_sentence?(sentence, options)
  case sentence.status
  when :reviewed
    true
  when :annotated
    !options['remove-not-reviewed']
  else
    !(options['remove-not-reviewed'] || options['remove-not-annotated'])
  end
end
process(tb, options)
click to toggle source
# File lib/proiel/cli/converters/proielxml.rb, line 5
#
# Writes the treebank +tb+ to STDOUT as an XML document with a +proiel+
# root element (<tt>schema-version</tt> 2.1, stamped with the export time).
#
# The document contains the annotation schema followed by one +source+
# element per entry in +tb.sources+, each holding its divs, sentences and
# tokens.
#
# +options+ is a hash with string keys. The keys read while exporting are:
# <tt>'remove-alignments'</tt>, <tt>'remove-status'</tt>,
# <tt>'remove-annotator'</tt>, <tt>'remove-reviewer'</tt>,
# <tt>'remove-morphology'</tt>, <tt>'remove-syntax'</tt> and
# <tt>'remove-information-structure'</tt>; each one, when set, suppresses
# the corresponding attributes and/or elements. Divs and sentences are
# additionally filtered through +include_div?+ and +include_sentence?+.
def process(tb, options)
  builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
  builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
  builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.1') do
    emit_annotation(builder, tb.annotation_schema)
    tb.sources.each { |source| emit_source(builder, source, options) }
  end
end

# Internal helper for +process+: writes the +annotation+ element listing
# the relation, part-of-speech, morphology and information-status tags of
# +schema+.
def emit_annotation(builder, schema)
  builder.annotation do
    builder.relations do
      emit_tag_values(builder, schema.relation_tags, %i(summary primary secondary))
    end

    builder.tag! 'parts-of-speech' do
      emit_tag_values(builder, schema.part_of_speech_tags, %i(summary))
    end

    builder.morphology do
      schema.morphology_tags.each do |cat_tag, cat_values|
        builder.field(tag: cat_tag) do
          emit_tag_values(builder, cat_values, %i(summary))
        end
      end
    end

    builder.tag! 'information-statuses' do
      emit_tag_values(builder, schema.information_status_tags, %i(summary))
    end
  end
end

# Internal helper for +process+: writes one +value+ element per tag/value
# pair in +tags+, with a +tag+ attribute plus the given +features+ read
# from the value.
def emit_tag_values(builder, tags, features)
  tags.each do |tag, value|
    builder.value({ tag: tag }.merge(grab_features(value, features)))
  end
end

# Internal helper for +process+: writes one +source+ element, its metadata
# elements and every div accepted by +include_div?+.
def emit_source(builder, source, options)
  optional_features = []
  optional_features += %i(alignment_id) unless options['remove-alignments']

  builder.source(grab_features(source, %i(id language), optional_features)) do
    PROIEL::Treebank::METADATA_ELEMENTS.each do |field|
      value = source.send(field)
      # Metadata fields without a value produce no element at all.
      builder.tag!(field.to_s.tr('_', '-'), value) if value
    end

    source.divs.each do |div|
      emit_div(builder, div, options) if include_div?(div, options)
    end
  end
end

# Internal helper for +process+: writes one +div+ element, its optional
# title and every sentence accepted by +include_sentence?+.
def emit_div(builder, div, options)
  optional_features = %i(presentation_before presentation_after)
  optional_features += %i(alignment_id) unless options['remove-alignments']

  builder.div(grab_features(div, [], optional_features)) do
    builder.title div.title if div.title

    div.sentences.each do |sentence|
      emit_sentence(builder, sentence, options) if include_sentence?(sentence, options)
    end
  end
end

# Internal helper for +process+: writes one +sentence+ element and its
# tokens, skipping empty tokens whose layer has been removed by +options+.
def emit_sentence(builder, sentence, options)
  optional_features = []
  # we do it this way to preserve the order of status and presentation_*
  # so that diffing files is easier
  optional_features += %i(status) unless options['remove-status']
  optional_features += %i(presentation_before presentation_after)
  optional_features += %i(alignment_id) unless options['remove-alignments']
  optional_features += %i(annotated_at) unless options['remove-annotator']
  optional_features += %i(reviewed_at) unless options['remove-reviewer']
  optional_features += %i(annotated_by) unless options['remove-annotator']
  optional_features += %i(reviewed_by) unless options['remove-reviewer']

  builder.sentence(grab_features(sentence, %i(id), optional_features)) do
    sentence.tokens.each do |token|
      sort = token.empty_token_sort
      next if sort == 'P' && options['remove-information-structure']
      next if sort == 'C' && options['remove-syntax']
      next if sort == 'V' && options['remove-syntax']

      emit_token(builder, token, options)
    end
  end
end

# Internal helper for +process+: writes one +token+ element, with nested
# +slash+ elements when the token has slashes and syntax is kept.
def emit_token(builder, token, options)
  mandatory_features = %i(id)
  optional_features = %i(citation_part)
  optional_features += %i(lemma part_of_speech morphology) unless options['remove-morphology']
  optional_features += %i(head_id relation) unless options['remove-syntax']
  optional_features += %i(antecedent_id information_status contrast_group) unless options['remove-information-structure']

  if token.is_empty?
    mandatory_features << :empty_token_sort
  else
    mandatory_features << :form
    optional_features += %i(presentation_before presentation_after foreign_ids)
  end

  optional_features += %i(alignment_id) unless options['remove-alignments']

  attrs = grab_features(token, mandatory_features, optional_features)
  with_slashes = !token.slashes.empty? && !options['remove-syntax']

  # this extra test avoids <token></token> style XML
  if with_slashes
    builder.token(attrs) do
      token.slashes.each do |relation, target_id|
        builder.slash(:"target-id" => target_id, relation: relation)
      end
    end
  elsif !(options['remove-syntax'] && token.is_empty?)
    # Any remaining empty token is dropped when syntax is removed.
    builder.token(attrs)
  end
end

# The emit_* methods exist only to structure +process+.
private :emit_annotation, :emit_tag_values, :emit_source, :emit_div,
        :emit_sentence, :emit_token