class FamilyReunion::TaxamatchPreprocessor
Public Class Methods
new(cache)
click to toggle source
# File lib/family-reunion/taxamatch_preprocessor.rb, line 4
# Builds a preprocessor around a shared cache object.
#
# cache - object kept in @cache; the other methods of this class read and
#         write its `word_letters` and `similar_words` collections via
#         `[]` / `[]=`.
def initialize(cache)
  @cache = cache
end
Public Instance Methods
get_letters(word)
click to toggle source
# File lib/family-reunion/taxamatch_preprocessor.rb, line 93
# Returns the distinct characters of +word+ (in order of first appearance),
# memoised in @cache.word_letters so each word is split only once.
#
# word - String to decompose.
#
# Returns an Array of one-character Strings. The same Array instance is
# stored in the cache and handed back on later calls.
def get_letters(word)
  @cache.word_letters[word] ||= word.chars.uniq
end
get_match_candidates(list1, list2)
click to toggle source
# File lib/family-reunion/taxamatch_preprocessor.rb, line 8
# Collects match candidates between two lists of canonical names.
#
# Both lists are split by word count with partition_canonicals, then the
# uninomial, binomial and trinomial buckets are compared pairwise by the
# matching process_<bucket> method. Names that land in the :multinomials
# bucket are not compared.
#
# list1 - names to find candidates for.
# list2 - names to draw candidates from.
#
# Returns a Hash with the keys :uninomials, :binomials and :trinomials, each
# holding the Hash produced by the corresponding process_<bucket> call.
def get_match_candidates(list1, list2)
  partitioned_names1 = partition_canonicals(list1)
  partitioned_names2 = partition_canonicals(list2)

  [:uninomials, :binomials, :trinomials].each_with_object({}) do |bucket, match_candidates|
    # Dispatches to process_uninomials / process_binomials / process_trinomials.
    match_candidates[bucket] = send("process_#{bucket}",
                                    partitioned_names1[bucket],
                                    partitioned_names2[bucket])
  end
end
partition_canonicals(canonicals)
click to toggle source
# File lib/family-reunion/taxamatch_preprocessor.rb, line 19
# Groups canonical names by how many space-separated words they contain.
#
# canonicals - enumerable of name Strings.
#
# Returns a Hash with the keys :uninomials (1 word), :binomials (2 words),
# :trinomials (3 words) and :multinomials (any other word count). Each value
# is an Array of [name, words] pairs, where words is the result of
# name.split(' ').
def partition_canonicals(canonicals)
  bucket_by_word_count = { 1 => :uninomials, 2 => :binomials, 3 => :trinomials }
  buckets = {
    :uninomials => [],
    :binomials => [],
    :trinomials => [],
    :multinomials => []
  }

  canonicals.each do |canonical|
    tokens = canonical.split(' ')
    # Any word count other than 1, 2 or 3 falls through to :multinomials.
    bucket = bucket_by_word_count.fetch(tokens.size, :multinomials)
    buckets[bucket].push([canonical, tokens])
  end

  buckets
end
process_binomials(names1, names2)
click to toggle source
# File lib/family-reunion/taxamatch_preprocessor.rb, line 49
# Finds, for every two-word name in +names1+, the names in +names2+ whose
# first and second words both pass similar_words?.
#
# names1 - Array of [name, words] pairs to find candidates for.
# names2 - Array of [name, words] pairs to draw candidates from.
#
# Returns a Hash keyed by the name from names1. Each value is
# { :words => words_of_that_name, :candidates => [matching names2 pairs] }.
# Names without any candidate are absent from the Hash.
def process_binomials(names1, names2)
  names1.each_with_object({}) do |(name1, words1), res|
    names2.each do |n2|
      words2 = n2[1]
      next unless similar_words?(words1[0], words2[0]) &&
                  similar_words?(words1[1], words2[1])

      # Create the entry on the first hit, then keep appending to it.
      entry = (res[name1] ||= { :words => words1, :candidates => [] })
      entry[:candidates] << n2
    end
  end
end
process_trinomials(names1, names2)
click to toggle source
# File lib/family-reunion/taxamatch_preprocessor.rb, line 60
# Finds, for every three-word name in +names1+, the names in +names2+ whose
# first, second and third words all pass similar_words?.
#
# names1 - Array of [name, words] pairs to find candidates for.
# names2 - Array of [name, words] pairs to draw candidates from.
#
# Returns a Hash keyed by the name from names1. Each value is
# { :words => words_of_that_name, :candidates => [matching names2 pairs] }.
# Names without any candidate are absent from the Hash.
def process_trinomials(names1, names2)
  names1.each_with_object({}) do |(name1, words1), res|
    names2.each do |n2|
      words2 = n2[1]
      next unless similar_words?(words1[0], words2[0]) &&
                  similar_words?(words1[1], words2[1]) &&
                  similar_words?(words1[2], words2[2])

      # Create the entry on the first hit, then keep appending to it.
      entry = (res[name1] ||= { :words => words1, :candidates => [] })
      entry[:candidates] << n2
    end
  end
end
process_uninomials(names1, names2)
click to toggle source
# File lib/family-reunion/taxamatch_preprocessor.rb, line 38
# Finds, for every one-word name in +names1+, the names in +names2+ whose
# first word passes similar_words?.
#
# names1 - Array of [name, words] pairs to find candidates for.
# names2 - Array of [name, words] pairs to draw candidates from.
#
# Returns a Hash keyed by the name from names1. Each value is
# { :words => words_of_that_name, :candidates => [matching names2 pairs] }.
# Names without any candidate are absent from the Hash.
def process_uninomials(names1, names2)
  names1.each_with_object({}) do |(name1, words1), res|
    names2.each do |n2|
      next unless similar_words?(words1[0], n2[1][0])

      # Create the entry on the first hit, then keep appending to it.
      entry = (res[name1] ||= { :words => words1, :candidates => [] })
      entry[:candidates] << n2
    end
  end
end
similar_words?(word1, word2)
click to toggle source
# File lib/family-reunion/taxamatch_preprocessor.rb, line 71
# Cheap similarity pre-check for two words, memoised in @cache.similar_words.
#
# Two different words count as similar when both hold:
# * letters present in only one of the words make up at most 30% of the
#   combined distinct-letter counts, and
# * the length difference is at most 20% of the longer word's length.
# Identical words are always similar.
#
# The verdict is cached under an order-independent key (the two words sorted
# and joined with ':'), so the computation itself must not depend on argument
# order; that is why the length difference is measured against the longer
# word rather than against word1.
#
# word1 - String.
# word2 - String.
#
# Returns true or false.
# Raises RuntimeError if either argument is not a String.
def similar_words?(word1, word2)
  unless word1.is_a?(String) && word2.is_a?(String)
    raise RuntimeError, "similar_words? expects two Strings, got #{word1.class} and #{word2.class}"
  end

  key = [word1, word2].sort.join(':')
  cached = @cache.similar_words[key]
  # false is a valid cached verdict, so only nil means "not computed yet".
  return cached unless cached.nil?

  are_similar =
    if word1 == word2
      true
    else
      letters1 = get_letters(word1)
      letters2 = get_letters(word2)
      symmetric_difference = (letters1 - letters2) + (letters2 - letters1)
      similar_letters = symmetric_difference.size.to_f / (letters1.size + letters2.size) <= 0.3
      # The words differ here, so the longer one is never empty (no division by zero).
      longest = [word1.size, word2.size].max
      similar_length = (word1.size - word2.size).abs.to_f / longest <= 0.2
      similar_letters && similar_length
    end

  @cache.similar_words[key] = are_similar
  are_similar
end