class PROIEL::Commands::CountWords

Public Class Methods

init_with_program(prog) click to toggle source
# File lib/proiel/cli/commands/info.rb, line 5
# Registers the `info` subcommand on the given program object.
#
# The registered action prints a hint on standard error when it is invoked
# without any filenames; otherwise it forwards the arguments and options to
# `process`.
#
# @param prog program object that responds to `command` and yields a command
#   builder responding to `syntax`, `description` and `action`
def init_with_program(prog)
  prog.command(:info) do |cmd|
    cmd.syntax 'info [options] filename(s)'
    cmd.description 'Show information about the treebank'

    cmd.action do |filenames, opts|
      # Without at least one input file there is nothing to report on.
      next STDERR.puts('Missing filename(s). Use --help for more information.') if filenames.empty?

      process(filenames, opts)
    end
  end
end
pretty_electronic_text_info(source) click to toggle source
# File lib/proiel/cli/commands/info.rb, line 104
# Builds a one-line, comma-separated description of the electronic text a
# source is based on.
#
# Fields that are nil are left out; the editor, when present, is prefixed
# with "ed. ".
#
# @param source object responding to the `electronic_text_*` readers
# @return [String] the joined description (empty when every field is nil)
def pretty_electronic_text_info(source)
  editor = source.electronic_text_editor
  editor_note = "ed. #{editor}" if editor

  parts = [
    source.electronic_text_title,
    editor_note,
    source.electronic_text_publisher,
    source.electronic_text_place,
    source.electronic_text_date
  ]

  parts.reject(&:nil?).join(', ')
end
pretty_language(source) click to toggle source
# File lib/proiel/cli/commands/info.rb, line 87
# Returns a human-readable name for the language of a source.
#
# Only the code 'lat' is mapped to a name; every other value is reported as
# unknown together with the raw language code.
#
# @param source object responding to `language`
# @return [String] the display name for the language
def pretty_language(source)
  code = source.language
  return 'Latin' if code == 'lat'

  "Unknown (language code #{code})"
end
pretty_license(source) click to toggle source
# File lib/proiel/cli/commands/info.rb, line 112
# Formats the license of a source for display.
#
# When the source has a license URL it is appended in parentheses after the
# license; otherwise the license is returned unchanged.
#
# @param source object responding to `license` and `license_url`
# @return the license, followed by " (url)" when a URL is available
def pretty_license(source)
  url = source.license_url
  return source.license unless url

  "#{source.license} (#{url})"
end
pretty_printed_text_info(source) click to toggle source
# File lib/proiel/cli/commands/info.rb, line 96
# Builds a one-line, comma-separated description of the printed text a source
# is based on.
#
# Fields that are nil are left out; the editor, when present, is prefixed
# with "ed. ".
#
# @param source object responding to the `printed_text_*` readers
# @return [String] the joined description (empty when every field is nil)
def pretty_printed_text_info(source)
  editor = source.printed_text_editor
  editor_note = "ed. #{editor}" if editor

  parts = [
    source.printed_text_title,
    editor_note,
    source.printed_text_publisher,
    source.printed_text_place,
    source.printed_text_date
  ]

  parts.reject(&:nil?).join(', ')
end
pretty_title(source) click to toggle source
# File lib/proiel/cli/commands/info.rb, line 120
# Combines the author and title of a source into a single heading.
#
# Whichever of the two is nil is omitted, so the result never contains a
# dangling separator.
#
# @param source object responding to `author` and `title`
# @return [String] "author, title", or just the part that is present
def pretty_title(source)
  author = source.author
  title = source.title

  [author, title].reject(&:nil?).join(', ')
end
process(args, options) click to toggle source
# File lib/proiel/cli/commands/info.rb, line 20
# Loads the given treebank files and prints a summary report on standard
# output.
#
# The report starts with the overall number of sources, sentences and tokens,
# followed by one numbered section per source with its metadata, size and the
# percentage of reviewed and annotated sentences.
#
# @param args [Array<String>] names of the treebank XML files to load
# @param options hash-like object; a truthy 'verbose' entry makes the method
#   announce each file on standard error before loading it
# @return the value of `tb.sources.each_with_index`
def process(args, options)
  tb = PROIEL::Treebank.new

  args.each do |filename|
    STDERR.puts "Reading #{filename}...".green if options['verbose']

    tb.load_from_xml(filename)
  end

  t = treebank_statistics(tb)

  puts "Loaded treebank files contain #{tb.sources.count} source(s)".yellow
  puts "   Overall size: #{t.sentence_count} sentence(s), #{t.token_count} token(s)"
  puts

  # A source without sentences would make `count * 100.0 / total` evaluate to
  # 0.0 / 0, i.e. NaN, which would be printed as "NaN%". Report 0% instead.
  percentage = ->(count, total) { total.zero? ? 0.0 : count * 100.0 / total }

  tb.sources.each_with_index do |source, i|
    s = source_statistics(source)
    n = s.sentence_count
    r = percentage.call(s.reviewed_sentence_count, n)
    a = percentage.call(s.annotated_sentence_count, n)

    puts "#{i + 1}. #{pretty_title(source)}".yellow
    puts "   Version:      #{source.date}"
    puts "   License:      #{pretty_license(source)}"
    puts "   Language:     #{pretty_language(source)}"
    puts "   Printed text: #{pretty_printed_text_info(source)}"
    puts "   Electr. text: #{pretty_electronic_text_info(source)}"
    puts "   Size:         #{n} sentence(s), #{s.token_count} token(s)"
    puts "   Annotation:   %.2f%% reviewed, %.2f%% annotated" % [r, a]
  end
end
source_statistics(source) click to toggle source
# File lib/proiel/cli/commands/info.rb, line 68
# Tallies sentence and token counts for a single source.
#
# Every div of the source is visited once, and for each div the number of
# sentences, the tokens in those sentences, and the sentences answering true
# to `annotated?` and `reviewed?` are added to the running totals.
#
# @param source object whose `divs` yield objects responding to `sentences`
# @return [OpenStruct] with `sentence_count`, `token_count`,
#   `annotated_sentence_count` and `reviewed_sentence_count`
def source_statistics(source)
  totals = OpenStruct.new(
    sentence_count: 0,
    token_count: 0,
    annotated_sentence_count: 0,
    reviewed_sentence_count: 0
  )

  source.divs.each do |div|
    sentences = div.sentences

    totals.sentence_count += sentences.count
    totals.token_count += sentences.inject(0) { |sum, sentence| sum + sentence.tokens.count }
    totals.annotated_sentence_count += sentences.count(&:annotated?)
    totals.reviewed_sentence_count += sentences.count(&:reviewed?)
  end

  totals
end
treebank_statistics(tb) click to toggle source
# File lib/proiel/cli/commands/info.rb, line 52
# Sums the per-source statistics of every source in a treebank.
#
# @param tb object whose `sources` yield the sources to aggregate
# @return [OpenStruct] with `sentence_count`, `token_count`,
#   `annotated_sentence_count` and `reviewed_sentence_count` totals
def treebank_statistics(tb)
  OpenStruct.new.tap do |s|
    s.sentence_count = 0
    s.token_count = 0
    s.annotated_sentence_count = 0
    s.reviewed_sentence_count = 0

    tb.sources.each do |source|
      # source_statistics walks the whole source, so compute it once per
      # source and reuse the result for all four totals.
      stats = source_statistics(source)

      s.token_count += stats.token_count
      s.sentence_count += stats.sentence_count
      s.annotated_sentence_count += stats.annotated_sentence_count
      s.reviewed_sentence_count += stats.reviewed_sentence_count
    end
  end
end