require 'csv'
# Inflects +name+ into the grammatical case +gcase+ and compares the
# upcased result with +expected+, updating the running statistics.
#
# @param errors [Array] receives a [lemma, expected, actual, [gender, gcase]]
#   row for every mismatch
# @param correct [Hash] per-[gender, gcase] counter of matches; incremented
#   with +=, so callers are expected to pass a Hash with a numeric default
# @param total [Hash] per-[gender, gcase] counter of all checked examples
# @param name [Hash] name parts handed to Petrovich (e.g. lastname => lemma)
# @param gender [Symbol] gender merged into the name parts
# @param gcase [Symbol] name of the case method sent to the Petrovich object
# @param expected [String] expected form, compared against the upcased output
# @return [true, String] true on a match, otherwise the actual upcased form
def check!(errors, correct, total, name, gender, gcase, expected)
  stats_key = [gender, gcase]
  inflector = Petrovich(name.merge(gender: gender))
  actual = Petrovich::Unicode.upcase(inflector.public_send(gcase).to_s)

  # Every example counts towards the total, whether or not it matches.
  total[stats_key] += 1

  unless actual == expected
    errors << [name.values.join(' '), expected, actual, [gender, gcase]]
    return actual
  end

  correct[stats_key] += 1
  true
end
# Derives the evaluation file stem and the name-part key from task arguments.
#
# The :namepart argument (default "surnames") is pluralised with a trailing
# "s" when it lacks one; its singular form becomes the name-part symbol, with
# :surname mapped to :lastname and :midname mapped to :middlename. When a
# :subset argument is present it is appended to the file stem after a dot.
#
# @param args [#[]] task arguments responding to [:namepart] and [:subset]
# @return [Array(String, Symbol)] the file stem and the name-part symbol
def figure_namepart(args)
  stem = args[:namepart] || "surnames"
  stem += 's' unless stem.end_with?('s')

  singular = stem.chop.to_sym
  aliases = { :surname => :lastname, :midname => :middlename }
  part = aliases.fetch(singular, singular)

  subset = args[:subset]
  stem += ".#{subset}" if subset

  [stem, part]
end
# Umbrella task: its prerequisites are the evaluate:rules and evaluate:gender
# tasks, and it declares the same :namepart and :subset arguments they take.
desc 'Evaluate Petrovich'
task :evaluate, [:namepart, :subset] => %w(evaluate:rules evaluate:gender)
namespace :evaluate do
  # Gender labels found in the gender evaluation file, mapped to gender symbols.
  # Defined once at namespace level (not inside the task body) and frozen.
  GENDER_MAP = { 'мр' => :male, 'жр' => :female, 'мр-жр' => :androgynous }.freeze

  # Suffix appended to the sort key of gender errors, by expected gender.
  PART_INDEX = { :female => 0, :male => 1, :androgynous => 3 }.freeze

  # Case grammemes in the order they are tested; the first one present in a
  # row decides which single case is checked.
  case_by_grammeme = {
    'им' => :nominative,
    'рд' => :genitive,
    'дт' => :dative,
    'вн' => :accusative,
    'тв' => :instrumental,
    'пр' => :prepositional
  }.freeze

  # Writes the header followed by every error row to a tab-separated file.
  write_errors = lambda do |path, header, rows|
    CSV.open(path, 'w', col_sep: "\t") do |errors_file|
      errors_file << header
      rows.each { |row| errors_file << row }
    end
  end

  # Prints the overall accuracy and the correct/mistake breakdown.
  print_summary = lambda do |correct, total|
    correct_size = correct.values.inject(0, :+)
    total_size = total.values.inject(0, :+)

    puts 'Well, the accuracy on %d examples is about %.4f%%.' %
      [total_size, (correct_size / total_size.to_f * 100)]

    puts 'Sum of the %d correct examples and %d mistakes is %d.' %
      [correct_size, total_size - correct_size, total_size]
  end

  # Reads eval/<namepart>[.<subset>].tsv (columns: word, lemma, grammemes),
  # inflects every lemma with check!, stores mismatches in the file named by
  # ENV['errors'] (default errors.tsv) and prints per-case and overall accuracy.
  # The task is skipped with a warning when the evaluation file is missing.
  desc 'Evaluate the inflector on lastnames'
  task :rules, [:namepart, :subset] => :petrovich do |_, args|
    namepart_filename, namepart_symbol = figure_namepart(args)

    filename = File.expand_path("../../../eval/#{namepart_filename}.tsv", __FILE__)
    unless File.file?(filename)
      warn "File #{filename} not found, skipping task"
      next
    end

    errors_filename = ENV['errors'] || 'errors.tsv'
    correct, total = Hash.new(0), Hash.new(0)

    puts 'I will evaluate the inflector on "%s" ' \
         'and store errors to "%s".' % [filename, errors_filename]

    errors = []

    # Block form so the input file is closed once all rows have been read.
    CSV.open(filename, "r:BINARY", col_sep: "\t", headers: true) do |table|
      table.each do |row|
        # The file is read as binary, so each cell is re-tagged as UTF-8.
        word = row['word'].force_encoding('UTF-8')
        lemma = row['lemma'].force_encoding('UTF-8')

        cell = row['grammemes']
        grammemes = cell ? cell.force_encoding('UTF-8').split(',') : []

        gender = grammemes.include?('мр') ? :male : :female

        gcases =
          if grammemes.include?('0')
            # some words are aptotic, so every case must produce the same form
            Petrovich::CASES
          else
            marker = case_by_grammeme.keys.find { |grammeme| grammemes.include?(grammeme) }
            marker ? [case_by_grammeme[marker]] : []
          end

        gcases.each do |gcase|
          check!(errors, correct, total, { namepart_symbol => lemma }, gender, gcase, word)
        end
      end
    end

    # Sort by reversed lemma (groups similar endings), then by gender.
    errors.sort_by! { |array| array.first.reverse + array.last.first.to_s }

    write_errors.call(errors_filename, %w(lemma expected actual params), errors)

    total.each do |(gender, gcase), count|
      accuracy = correct[[gender, gcase]] / count.to_f * 100
      puts "\tAc(%s|%s) = %.4f%%" % [gcase, gender, accuracy]
    end

    print_summary.call(correct, total)
  end

  # Reads eval/<namepart>[.<subset>].gender.tsv (columns: lemma, gender),
  # compares the detected gender of every lemma with the expected one, stores
  # mismatches in the file named by ENV['errors'] (default errors.gender.tsv)
  # and prints per-gender and overall accuracy. A mismatch whose detected
  # gender is not :androgynous is additionally counted as a hard error.
  # The task is skipped with a warning when the evaluation file is missing.
  desc 'Evaluate the gender detector'
  task :gender, [:namepart, :subset] => :petrovich do |_, args|
    namepart_filename, namepart_symbol = figure_namepart(args)

    filename = File.expand_path("../../../eval/#{namepart_filename}.gender.tsv", __FILE__)
    unless File.file?(filename)
      warn "File #{filename} not found, skipping task"
      next
    end

    errors_filename = ENV['errors'] || 'errors.gender.tsv'
    correct, total = Hash.new(0), Hash.new(0)

    puts 'I will evaluate gender detector on "%s" ' \
         'and store errors to "%s".' % [filename, errors_filename]

    errors = []
    hard_error_count = 0

    # Block form so the input file is closed once all rows have been read.
    CSV.open(filename, "r:BINARY", col_sep: "\t", headers: true) do |table|
      table.each do |row|
        lemma = row['lemma'].force_encoding('UTF-8')
        gender_name = row['gender'].force_encoding('UTF-8')

        expected_gender = GENDER_MAP[gender_name]
        detected_gender = Petrovich(namepart_symbol => lemma).gender

        total[expected_gender] += 1

        if detected_gender == expected_gender
          correct[expected_gender] += 1
        else
          errors << [lemma, expected_gender, detected_gender]

          if detected_gender != :androgynous
            hard_error_count += 1
            warn " - #{Petrovich::Unicode.downcase(lemma)}"
          end
        end
      end
    end

    puts 'Hard error count: %d.' % [hard_error_count]

    # Sort by reversed lemma, then by the expected gender's index.
    errors.sort_by! { |array| array.first.reverse + PART_INDEX[array[1]].to_s }

    write_errors.call(errors_filename, %w(lemma expected actual), errors)

    total.each do |gender, count|
      accuracy = correct[gender] / count.to_f * 100
      puts "\tAc(%s) = %.4f%%" % [gender, accuracy]
    end

    print_summary.call(correct, total)
  end
end