class Opener::PropertyTagger::Processor

Class that applies property tagging to a given input KAF file.

Constants

FILE_ASPECTS_CACHE

Global cache used for storing loaded aspects.

@return [Opener::PropertyTagger::FileAspectsCache]

MAX_NGRAM

Uses n-grams to determine whether a unigram (1 lemma) or bigram (2 lemmas) belongs to a property.

REMOTE_ASPECTS_CACHE

Attributes

aspects[RW]
aspects_path[RW]
aspects_url[RW]
document[RW]
lexicons[RW]
pretty[RW]
timestamp[RW]

Public Class Methods

new(file, params: {}) click to toggle source

@param [String|IO] file The KAF file/input to process.
@param [String] path Path to the aspects (stored as aspects_path).
@param [TrueClass|FalseClass] timestamp Add timestamps to the KAF.
@param [TrueClass|FalseClass] pretty Enable pretty formatting, disabled by default due to the performance overhead.
# File lib/opener/property_tagger/processor.rb, line 28
# Parses the input as XML, verifies it is KAF and selects the lexicons to use.
#
# @param [String|IO] file The KAF file/input to process.
# @param [Hash] params Extra options; only `:cache_keys` is read here.
# @param url When non-nil, lexicons are taken from REMOTE_ASPECTS_CACHE.
# @param path Directory of the aspect files, used when no url is given.
# @param [TrueClass|FalseClass] timestamp Add timestamps to the KAF.
# @param [TrueClass|FalseClass] pretty Enable pretty formatting.
# @raise [RuntimeError] when the parsed input contains no KAF element.
def initialize file, params: {}, url: nil, path: nil, timestamp: true, pretty: false
  @document     = Nokogiri.XML file
  raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
  @timestamp    = timestamp
  @pretty       = pretty

  @params       = params
  @remote       = !url.nil?
  @aspects_path = path
  @aspects_url  = url
  # Non-destructive merge: build a new hash so the caller's :cache_keys hash
  # is left untouched (merge! would write :lang into the caller's object).
  @cache_keys   = (params[:cache_keys] || {}).merge(lang: @document.root.attr('xml:lang'))

  @lexicons = if @remote
    REMOTE_ASPECTS_CACHE[**@cache_keys].aspects
  else
    FILE_ASPECTS_CACHE[aspects_file]
  end
end

Public Instance Methods

add_features_layer() click to toggle source

Remove the features layer from the KAF file if it exists and add a new one.

# File lib/opener/property_tagger/processor.rb, line 127
# Drops the current KAF/features element, if there is one, and appends a new
# empty features element under KAF.
#
# @return the node returned by new_node for the new features element
def add_features_layer
  if (stale_layer = document.at_xpath('KAF/features'))
    stale_layer.remove
  end

  new_node('features', 'KAF')
end
add_linguistic_processor() click to toggle source
# File lib/opener/property_tagger/processor.rb, line 164
# Appends a linguisticProcessors element for the features layer to
# KAF/kafHeader, containing one lp element describing this tagger.
# The lp timestamp is the current time when `timestamp` is truthy, '*' otherwise.
def add_linguistic_processor
  release_date = '16jan2015'
  release      = '2.0'

  processors = new_node('linguisticProcessors', 'KAF/kafHeader')
  processors['layer'] = 'features'

  entry = new_node('lp', processors)
  entry['version'] = "#{release_date}-#{release}"
  entry['name']    = 'VUA property tagger'

  entry['timestamp'] = timestamp ? Time.now.strftime('%Y-%m-%dT%H:%M:%S%Z') : '*'
end
add_properties_layer() click to toggle source

Add the properties layer as a child to the features layer.

# File lib/opener/property_tagger/processor.rb, line 137
# Appends an empty properties element beneath KAF/features.
#
# @return the node returned by new_node
def add_properties_layer
  new_node('properties', 'KAF/features')
end
add_property(lemma, values, index) click to toggle source
# File lib/opener/property_tagger/processor.rb, line 141
# Appends one property element to KAF/features/properties. Each value adds,
# under a shared references element, an XML comment with its n-gram followed
# by a span holding one target per term id.
#
# @param lemma  written (via to_s) to the "lemma" attribute
# @param values items responding to ngram, term_ids and lexicon
# @param index  appended to "p" to form the "pid" attribute
def add_property lemma, values, index
  property = new_node('property', 'KAF/features/properties')
  property['lemma'] = lemma.to_s
  property['pid']   = "p#{index}"

  references = new_node('references', property)

  values.each do |match|
    # The matched n-gram is kept as a comment just ahead of its span
    references.add_child Nokogiri::XML::Comment.new(references, " #{match.ngram} ")

    span = new_node('span', references)

    match.term_ids.each do |term_id|
      target = new_node('target', span)
      target['id'] = term_id.to_s

      # The lexicon id is optional; the attribute is omitted when it is absent
      lexicon_id = match.lexicon.id
      target['lexicon-id'] = lexicon_id if lexicon_id
    end
  end
end
extract_aspects() click to toggle source

Check which terms belong to an aspect (property). Text has priority over lemmas, overriding if there is a conflict. @return [Hash]

# File lib/opener/property_tagger/processor.rb, line 88
# Looks up every n-gram of consecutive terms (1 to MAX_NGRAM + 1 terms long)
# in the loaded lexicons and groups the matches by aspect (property).
#
# The lemma values are scanned first, then the text values. A match is skipped
# when the same aspect already holds an entry covering the same term ids.
#
# @return [Hash] aspect symbol => Array of Hashie::Mash entries
#   (term_ids, ngram, lexicon); keys are in sorted order
def extract_aspects
  all_term_ids = terms.keys
  term_values  = terms.values
  term_count   = term_values.size
  uniq_aspects = Hash.new { |hash, aspect| hash[aspect] = [] }

  %i[lemma text].each do |field|
    term_count.times do |start|
      (0..MAX_NGRAM).each do |extra|
        last = start + extra
        # Strict bound: an end index equal to the term count would be truncated
        # by Array#[] into a repeat of the preceding, shorter n-gram.
        break if last >= term_count

        ngram    = term_values[start..last].map { |term| term[field] }.join(' ').downcase
        term_ids = all_term_ids[start..last]

        @lexicons[ngram.to_sym]&.each do |lexicon|
          # A lexicon entry carries either a list of aspects or a single aspect
          properties = lexicon.aspects.present? ? lexicon.aspects : [lexicon.aspect]

          properties.each do |property|
            next if property.blank?

            matches = uniq_aspects[property.to_sym]
            next if matches.any? { |match| match.term_ids == term_ids }

            matches << Hashie::Mash.new(
              term_ids: term_ids,
              ngram:    ngram,
              lexicon:  lexicon,
            )
          end
        end
      end
    end
  end

  uniq_aspects.sort.to_h
end
language() click to toggle source
# File lib/opener/property_tagger/processor.rb, line 63
# Value of the xml:lang attribute on the KAF element. Once a truthy value has
# been read it is kept in @language and returned without another lookup.
def language
  return @language if @language

  @language = document.at_xpath('KAF').attr('xml:lang')
end
pretty_print(document) click to toggle source

Format the output document properly.

TODO: this should be handled by Oga in a nice way.

@return [String]

# File lib/opener/property_tagger/processor.rb, line 193
# Re-parses the document's XML with REXML and renders it with the compact
# pretty formatter, using double quotes around attribute values.
#
# @param document object responding to to_xml
# @return [String] formatted XML without leading/trailing whitespace
def pretty_print(document)
  xml = REXML::Document.new(document.to_xml)
  xml.context[:attribute_quote] = :quote

  printer = REXML::Formatters::Pretty.new
  printer.compact = true

  buffer = +''
  printer.write(xml, buffer)
  buffer.strip
end
process() click to toggle source

Processes the input and returns the new KAF output. @return [String]

# File lib/opener/property_tagger/processor.rb, line 48
# Rebuilds the features/properties layers, writes one property per extracted
# aspect (numbered from 1), registers the linguistic processor and serialises
# the document.
#
# @return [String] pretty-printed output when `pretty` is truthy, otherwise
#   the result of document.to_xml
def process
  add_features_layer
  add_properties_layer

  extract_aspects.each.with_index(1) do |(lemma, values), pid|
    add_property(lemma, values, pid)
  end

  add_linguistic_processor

  return pretty_print(document) if pretty

  document.to_xml
end
terms() click to toggle source
# File lib/opener/property_tagger/processor.rb, line 67
# Index of the document's terms, built on first use from KAF/terms/term and
# kept in @terms afterwards.
#
# @return [Hash] tid (as Symbol) => { lemma: ..., text: ... } attribute values
def terms
  return @terms if @terms

  @terms = {}
  document.xpath('KAF/terms/term').each { |node|
    tid = node.attr('tid').to_sym
    @terms[tid] = { lemma: node.attr('lemma'), text: node.attr('text') }
  }

  @terms
end

Protected Instance Methods

aspects_file() click to toggle source

@return [String]

# File lib/opener/property_tagger/processor.rb, line 231
# Location of the aspects file for the document language:
# "<aspects_path>/<language>.txt", expanded against this source file's path.
# The result is kept in @aspects_file after the first call.
#
# @return [String]
def aspects_file
  return @aspects_file if @aspects_file

  relative = "#{aspects_path}/#{language}.txt"
  @aspects_file = File.expand_path(relative, __FILE__)
end
is_kaf?() click to toggle source

Check if input is a KAF file. @return [Boolean]

# File lib/opener/property_tagger/processor.rb, line 224
# Tells whether the parsed document yields a node for the 'KAF' XPath.
#
# @return [Boolean]
def is_kaf?
  document.at_xpath('KAF') ? true : false
end
new_node(tag, parent) click to toggle source
# File lib/opener/property_tagger/processor.rb, line 206
# Creates an element named `tag` in the document and appends it to `parent`.
#
# @param [String] tag name of the new element
# @param parent either an XPath String, resolved with document.at_xpath, or
#   the parent node itself
# @return the newly created element
def new_node(tag, parent)
  target = parent.is_a?(String) ? document.at_xpath(parent) : parent

  Nokogiri::XML::Element.new(tag, document).tap do |element|
    target.add_child element
  end
end