class SynonymFinder::DuplicateFinder
Public Class Methods
new(synonym_finder)
click to toggle source
# File lib/synonym-finder/duplicate_finder.rb, line 4
# Sets up a duplicate finder around +synonym_finder+.
#
# Keeps a reference to the finder, caches the database handle returned by
# its +db+ reader, and starts with an empty matches hash.
def initialize(synonym_finder)
  @matches = {}
  @synonym_finder = synonym_finder
  @db = synonym_finder.db
end
Public Instance Methods
canonical_duplicates()
click to toggle source
# File lib/synonym-finder/duplicate_finder.rb, line 10
# Collects pairs of names that share a canonical form and labels every
# stored match.
#
# For each canonical value occurring more than once in +name_parts+, the
# matching (name_id, path) rows are handed to +find_pairs+. Afterwards each
# entry in <tt>@matches</tt> is typed +:chresonym+ when its total distance
# is 0 and +:alt_placement+ otherwise.
#
# NOTE(review): every entry in <tt>@matches</tt> is (re)typed here, including
# entries that already carry a +:type+ -- confirm this is intended when other
# finders have populated <tt>@matches</tt> beforehand.
#
# Returns the <tt>@matches</tt> hash.
def canonical_duplicates
  owner_id = @synonym_finder.object_id
  SynonymFinder.logger_write(owner_id, "Processing canonical forms")

  shared_forms = @db.execute("select canonical from name_parts group by canonical having count(*) > 1")
  shared_forms.each_with_index do |canonical, index|
    position = index + 1
    # Progress is reported once per hundred candidates.
    if position % 100 == 0
      SynonymFinder.logger_write(owner_id, "Processing canonical form candidate %s" % position)
    end
    find_pairs(@db.execute("select name_id, path from name_parts where canonical = ?", canonical))
  end

  @matches.each_value do |match|
    match[:type] = match[:total_distance] == 0 ? :chresonym : :alt_placement
  end
  @matches
end
find_pairs(names, threshold = 0)
click to toggle source
# File lib/synonym-finder/duplicate_finder.rb, line 28
# Records a match for every pair that can be built from +names+.
#
# +names+ is a list of (name_id, path) rows; +get_pairs+ turns it into
# pairs. Each pair is keyed by its two name ids and stored in
# <tt>@matches</tt> with its total distance, unless the key is already
# present. A +threshold+ of 0 accepts every pair; any other value accepts
# only pairs whose distance does not exceed it.
#
# Returns the list of pairs produced by +get_pairs+.
def find_pairs(names, threshold = 0)
  get_pairs(names).each do |first, second|
    key = [first[0], second[0]]
    distance = get_total_distance(first[1], second[1])
    # The first match recorded for a key wins.
    next if @matches.key?(key)
    next unless threshold == 0 || distance <= threshold

    @matches[key] = { :total_distance => distance }
  end
end
get_pairs(names)
click to toggle source
# File lib/synonym-finder/duplicate_finder.rb, line 48
# Builds every unordered pair from +names+.
#
# Each row of +names+ is read as (id, path); the path is converted to a
# string and split on "|" into an array of segments. Pairs are produced
# starting from the last row (paired with each earlier row in order), then
# the row before it, and so on. The two members of each pair are sorted.
#
# Returns an array of two-element arrays of [id, segments] entries.
def get_pairs(names)
  entries = names.map { |row| [row[0], row[1].to_s.split("|")] }
  pairs = []
  (entries.size - 1).downto(1) do |last|
    entries.first(last).each { |earlier| pairs << [entries[last], earlier].sort }
  end
  pairs
end
get_total_distance(path1, path2)
click to toggle source
# File lib/synonym-finder/duplicate_finder.rb, line 38
# Measures how far apart two paths are.
#
# +path1+ and +path2+ are arrays of segments. The result is the combined
# length of both paths minus twice the length of their common leading run,
# i.e. the number of segments that remain once the shared prefix is removed
# from each path.
def get_total_distance(path1, path2)
  combined_length = path1.size + path2.size
  # zip pads the shorter second path with nil, which ends the shared run.
  shared_prefix = path1.zip(path2).take_while { |left, right| left == right }.size
  combined_length - shared_prefix * 2
end
species_epithet_duplicates(threshold_distance)
click to toggle source
# File lib/synonym-finder/duplicate_finder.rb, line 58
# Collects pairs of names that share a species epithet stem and assigns a
# type to every match that does not have one yet.
#
# For each +epithet_stem+ occurring more than once in +name_parts+, the
# matching (name_id, path) rows are passed to +find_pairs+ together with
# +threshold_distance+ (a value of 0 makes +find_pairs+ accept every pair).
#
# Matches that already carry a +:type+ are left untouched. The remaining
# ones are typed as follows:
# * total distance other than 0 -> +:homotypic+
# * total distance 0 and a single distinct epithet -> +:misplaced_synonym+
# * total distance 0, several epithets, one genus (first word of the
#   canonical form) -> +:lexical_variant+
# * total distance 0, several epithets, several genera -> +:misplaced_synonym+
#
# Returns the <tt>@matches</tt> hash.
def species_epithet_duplicates(threshold_distance)
  owner_id = @synonym_finder.object_id
  SynonymFinder.logger_write(owner_id, "Processing species epithets")

  stems = @db.execute("select epithet_stem from name_parts group by epithet_stem having count(*) > 1")
  stems.each_with_index do |stem, index|
    position = index + 1
    # Progress is reported once per hundred candidates.
    if position % 100 == 0
      SynonymFinder.logger_write(owner_id, "Processing species epithet candidate %s" % position)
    end
    names = @db.execute("select name_id, path from name_parts where epithet_stem = ?", stem)
    find_pairs(names, threshold_distance)
  end

  count = 0
  SynonymFinder.logger_write(owner_id, "Assigning type to found matches")
  @matches.each do |key, value|
    next if value.key?(:type)

    count += 1
    SynonymFinder.logger_write(owner_id, "Processing match %s" % count) if count % 10000 == 0

    if value[:total_distance] == 0
      # Bind the name ids as parameters instead of interpolating them into
      # the SQL text, so ids of any type are quoted correctly by the driver.
      id_filter = "name_id in (#{Array.new(key.size, "?").join(",")})"
      epithets = @db.execute("select distinct epithet from name_parts where #{id_filter}", key)
      if epithets.size == 1
        value[:type] = :misplaced_synonym
      else
        canonicals = @db.execute("select canonical from name_parts where #{id_filter}", key)
        genera = canonicals.map { |row| row[0].split(" ")[0] }.uniq
        value[:type] = genera.size == 1 ? :lexical_variant : :misplaced_synonym
      end
    else
      value[:type] = :homotypic
    end
  end
  @matches
end