class Text::WhiteSimilarity

Ruby implementation of the string similarity described by Simon White at: www.catalysoft.com/articles/StrikeAMatch.html

                     2 * |pairs(s1) INTERSECT pairs(s2)|
similarity(s1, s2) = -----------------------------------
                         |pairs(s1)| + |pairs(s2)|

e.g.

                                          2 * |{FR, NC}|
similarity(FRANCE, FRENCH) = ---------------------------------------
                             |{FR,RA,AN,NC,CE}| + |{FR,RE,EN,NC,CH}|

                           = (2 * 2) / (5 + 5)

                           = 0.4

Text::WhiteSimilarity.new.similarity("FRANCE", "FRENCH") # => 0.4

Public Class Methods

new() click to toggle source
# File lib/text/white_similarity.rb, line 32
# Starts each instance with an empty cache in @word_letter_pairs; the
# cache is filled lazily, keyed by the input string.
def initialize
  @word_letter_pairs = Hash.new
end
similarity(str1, str2) click to toggle source
# File lib/text/white_similarity.rb, line 28
# Class-level convenience entry point: builds a throwaway instance and
# returns the result of its #similarity for the two given strings.
def self.similarity(str1, str2)
  scorer = new
  scorer.similarity(str1, str2)
end

Public Instance Methods

similarity(str1, str2) click to toggle source
# File lib/text/white_similarity.rb, line 36
# Scores how alike two strings are using the letter-pair measure:
# twice the number of shared adjacent-letter pairs, divided by the total
# number of pairs found in both strings.
#
# Pairs are matched as a multiset: a pair occurring twice in +str1+ only
# counts twice if it also occurs at least twice in +str2+.
#
# When neither string yields any pair (empty strings, or only one-letter
# words) the ratio would be 0/0. Instead of NaN this returns 1.0 if both
# strings contain the same words, ignoring case and whitespace layout,
# and 0.0 otherwise.
#
# Returns a Float between 0.0 and 1.0.
def similarity(str1, str2)
  pairs1 = word_letter_pairs(str1)
  pairs2 = word_letter_pairs(str2)

  total_pairs = pairs1.length + pairs2.length
  if total_pairs.zero?
    return str1.upcase.split == str2.upcase.split ? 1.0 : 0.0
  end

  # How many copies of each pair from str2 are still available to match.
  unmatched = Hash.new(0)
  pairs2.each { |pair| unmatched[pair] += 1 }

  # Each pair from str1 consumes one matching copy from str2, if any.
  intersection = pairs1.count do |pair|
    next false unless unmatched[pair] > 0

    unmatched[pair] -= 1
    true
  end

  (2.0 * intersection) / total_pairs
end

Private Instance Methods

word_letter_pairs(str) click to toggle source
# File lib/text/white_similarity.rb, line 54
# Returns the frozen list of adjacent two-character pairs for +str+,
# computed once per distinct string and cached in @word_letter_pairs.
#
# The string is upcased and split on runs of whitespace; pairs are taken
# within each word only, so no pair spans a word boundary. Words shorter
# than two characters contribute nothing.
def word_letter_pairs(str)
  cached = @word_letter_pairs[str]
  return cached if cached

  words = str.upcase.split(/\s+/)
  pairs = words.flat_map do |word|
    word.chars.each_cons(2).map(&:join)
  end
  @word_letter_pairs[str] = pairs.freeze
end