class SynonymFinder

Constants

AUTH_MATCH
AUTH_NO_MATCH
NO_AUTH_INFO
PARTIAL_AUTH_INFO

Attributes

input[RW]

Public Class Methods

logger() click to toggle source
# File lib/synonym-finder.rb, line 17
def self.logger
  @@logger ||= Logger.new(nil)
end
logger=(logger) click to toggle source
# File lib/synonym-finder.rb, line 21
def self.logger=(logger)
  @@logger = logger
end
logger_reset() click to toggle source
# File lib/synonym-finder.rb, line 25
def self.logger_reset
  self.logger = Logger.new(nil)
end
logger_write(obj_id, message, method = :info) click to toggle source
# File lib/synonym-finder.rb, line 29
def self.logger_write(obj_id, message, method = :info)
  self.logger.send(method, "|%s|%s|" % [obj_id, message])
end
new(input, in_memory = true) click to toggle source
# File lib/synonym-finder.rb, line 34
def initialize(input, in_memory = true)
  @input = input
  @atomizer = Taxamatch::Atomizer.new
  @tm = Taxamatch::Base.new
  @stemmer = Lingua::Stemmer.new(:language => "latin")
  @db = init_db(in_memory)
  #tmp_populate
  build_tree unless @db.execute("select count(*) from names")[0][0].to_i > 0
  @matches = {}
  @part_matches = {}
  @duplicate_finder = DuplicateFinder.new(self)
  @group_organizer = GroupOrganizer.new(self)
end

Public Instance Methods

find_matches(threshold = 5) click to toggle source
# File lib/synonym-finder.rb, line 48
def find_matches(threshold = 5)
  @duplicate_finder.canonical_duplicates
  matches = @duplicate_finder.species_epithet_duplicates(threshold)
  matches = compare_authorship(matches)
  clean_up(matches)
  @group_organizer.organize
end

Private Instance Methods

build_tree() click to toggle source
# File lib/synonym-finder.rb, line 86
def build_tree
  SynonymFinder.logger_write(self.object_id, "Ingesting data")
  @input.each_with_index do |row, i|
    i += 1
    SynonymFinder.logger_write(self.object_id, "Ingesting record %s" % i) if i % 1000 == 0
    atomized_name = @atomizer.parse row[:name] rescue nil
    next unless atomized_name && atomized_name[:species]
    species_string = get_species(atomized_name)
    canonical_name = atomized_name[:genus][:string] + " " + species_string
    @db.execute("insert into names (id, name, authors, years) values (?, ?, ?, ?)", [row[:id], row[:name], Marshal.dump(atomized_name[:all_authors]), Marshal.dump(atomized_name[:all_years])])
    @db.execute("insert into name_parts (name_id, path, canonical, epithet, epithet_stem) values (?, ?, ?, ?, ?)", [row[:id], row[:path], canonical_name, species_string, stem_epithet(species_string)])
  end
end
clean_up(matches) click to toggle source
# File lib/synonym-finder.rb, line 58
def clean_up(matches)
  matches.each do |key, value|
    next if value[:type] != :chresonym && value[:auth_match] < 20
    value[:auth_match] == 100 || value[:type] == :chresonym ? @matches[key] = value : @part_matches[key] = value
  end
end
compare_authorship(matches) click to toggle source
# File lib/synonym-finder.rb, line 65
def compare_authorship(matches)
  SynonymFinder.logger_write(self.object_id, "Matching authorship")
  count = 0
  matches.each do |key, value|
    count += 1
    SynonymFinder.logger_write(self.object_id, "Matching authors %s" % count) if count % 1000 == 0
    ids = key.join(",")
    res = @db.execute("select authors, years from names where id in (#{ids})")
    data1 = {:all_authors => Marshal.load(res[0][0]), :all_years =>Marshal.load(res[0][1])}
    data2 = {:all_authors => Marshal.load(res[1][0]), :all_years =>Marshal.load(res[1][1])}
    if (data1[:all_authors] + data1[:all_years] + data2[:all_authors] + data2 [:all_years]) == []
      value[:auth_match] = NO_AUTH_INFO
    elsif (data1[:all_authors] + data1[:all_years]).empty? || (data2[:all_authors] + data2[:all_years]).empty?
      value[:auth_match] = PARTIAL_AUTH_INFO
    else
      value[:auth_match] = @tm.match_authors(data1, data2) == 0 ? AUTH_NO_MATCH : AUTH_MATCH
    end
  end
  matches
end
create_tables(db) click to toggle source
# File lib/synonym-finder.rb, line 115
def create_tables(db)
  db.execute("create table names (id string primary key, name string, authors, years)")
  # db.execute("create table paths (id integer primary key autoincrement, path)")
  # db.execute("create table paths_names (path_id integer, name_id string, level integer, primary key (path_id, name_id))")
  db.execute("create table name_parts (name_id string, path string, canonical string, epithet string, epithet_stem string)")
  db.execute("create index idx_name_parts_1 on name_parts (canonical)")
  db.execute("create index idx_name_parts_2 on name_parts (epithet_stem)")
  db.execute("create table groups (id integer primary key, type)")
  db.execute("create table names_groups (name_id integer, group_id integer, score_max integer, score_sum integer, score_num integer, primary key (name_id, group_id))")
  db.execute("create index idx_names_groups_2 on names_groups (group_id)")
end
get_species(atomized_name) click to toggle source
# File lib/synonym-finder.rb, line 127
def get_species(atomized_name)
  species = [atomized_name[:species][:string]]
  species += atomized_name[:infraspecies].map {|i| i[:string]} if atomized_name[:infraspecies]
  species.join(" ")
end
init_db(in_memory) click to toggle source
# File lib/synonym-finder.rb, line 100
def init_db(in_memory)
  if in_memory == true
    db = SQLite3::Database.new( ":memory:" )
    create_tables(db)
  else
    db_file = "/tmp/syn_finder.sql"
    db_exist = File.exist?(db_file)
    db = SQLite3::Database.new("/tmp/syn_finder.sql")
    unless db_exist
      create_tables(db)
    end
  end
  db
end
stem_epithet(epithet) click to toggle source
# File lib/synonym-finder.rb, line 133
def stem_epithet(epithet)
  epithet.split(" ").map { |e| @stemmer.stem(e) }.join(" ")
end
tmp_populate() click to toggle source
# File lib/synonym-finder.rb, line 137
def tmp_populate
  f = open("/tmp/dump.sql")
  f.each_with_index do |line, i|
    i += 1
    puts "loading from dump line %s" % i if i % 10000 == 0
    if line.match /INSERT/
      @db.execute(line.strip)
    end
  end
end