class StringSimilarity
Constants
- MIN_NGRAM_LENGTH
- VERSION
Public Class Methods
bigram_score(string_1, string_2)
click to toggle source
# File lib/string_similarity.rb, line 13
#
# Scores the similarity of two strings using n-grams of the smallest
# supported size (MIN_NGRAM_LENGTH; presumably 2, given the method name --
# the constant's value is defined elsewhere in the class).
#
# string_1 - first String to compare.
# string_2 - second String to compare.
#
# Returns whatever score returns for these strings at that n-gram length.
def self.bigram_score(string_1, string_2)
  ngram_length = MIN_NGRAM_LENGTH
  score(string_1, string_2, ngram_length)
end
ngram_score(string_1, string_2)
click to toggle source
# File lib/string_similarity.rb, line 7
#
# Scores the similarity of two strings using an n-gram size that grows with
# the length of string_1: one tenth of its length (rounded), but never less
# than MIN_NGRAM_LENGTH.
#
# Only string_1 influences the n-gram size, so swapping the arguments can
# change the result.
#
# string_1 - first String to compare; its length picks the n-gram size.
# string_2 - second String to compare.
#
# Returns whatever score returns for these strings at that n-gram length.
def self.ngram_score(string_1, string_2)
  tenth_of_length = (string_1.length.to_f * 0.1).round
  score(string_1, string_2, [MIN_NGRAM_LENGTH, tenth_of_length].max)
end
Private Class Methods
capitalization_differences(original_phrase, found_phrase)
click to toggle source
# File lib/string_similarity.rb, line 46
#
# Tells whether two phrases differ only in letter case.
#
# Both phrases are case-folded before comparing, so any mix of upper- and
# lower-case letters is recognised (e.g. "Hello World" vs "HeLLo world"),
# not just the case where one side is entirely lower- or upper-case.
# Comparing both the downcased and the upcased forms keeps pairs such as
# "ß" / "SS" (equal only after upcasing) counted as case variants.
#
# original_phrase - the reference String.
# found_phrase    - the String being compared against it.
#
# Returns true when the phrases are equal ignoring case but not byte-for-byte
# equal; false otherwise (including when they are exactly equal).
def self.capitalization_differences(original_phrase, found_phrase)
  same_ignoring_case =
    original_phrase.downcase == found_phrase.downcase ||
    original_phrase.upcase == found_phrase.upcase

  same_ignoring_case && found_phrase != original_phrase
end
normalize_score(score, original_phrase, found_phrase)
click to toggle source
# File lib/string_similarity.rb, line 52
#
# Applies a one-point penalty to a score when the two phrases differ in
# capitalization, as decided by capitalization_differences.
#
# score           - the numeric score computed so far.
# original_phrase - the reference String.
# found_phrase    - the String that was compared against it.
#
# Returns score unchanged, or score - 1 when a capitalization difference
# is reported.
def self.normalize_score(score, original_phrase, found_phrase)
  return score unless capitalization_differences(original_phrase, found_phrase)

  score - 1
end
remove_special_characters(phrase)
click to toggle source
# File lib/string_similarity.rb, line 39
#
# Cleans a phrase for comparison: drops punctuation (plus "<" and ">"),
# trims surrounding whitespace and squeezes every whitespace run down to a
# single space.
#
# NOTE: the receiver is modified in place -- after the call, phrase holds
# the cleaned text with its original letter case. Only the returned value
# is downcased.
#
# phrase - a mutable String; it is overwritten with its cleaned form.
#
# Returns a new, downcased String containing the cleaned phrase.
def self.remove_special_characters(phrase)
  without_punctuation = phrase.gsub(/[[:punct:]<>]+/, "")
  collapsed = without_punctuation.strip.gsub(/\s+/, " ")

  # Write the cleaned text back into the caller's string object.
  phrase.replace(collapsed)
  phrase.downcase
end
score(string_1, string_2, ngram_length)
click to toggle source
# File lib/string_similarity.rb, line 19
#
# Computes a Sorensen-Dice similarity between two strings over their sets of
# character n-grams, scaled to a rounded percentage.
#
# string_1     - first String to compare (not modified; a copy is used).
# string_2     - second String to compare (not modified; a copy is used).
# ngram_length - number of characters per n-gram.
#
# Returns 0 when the strings share no n-grams; otherwise the rounded
# percentage, passed through normalize_score (which may subtract one point
# for a capitalization-only difference).
def self.score(string_1, string_2, ngram_length)
  # Work on private copies: remove_special_characters rewrites its argument
  # in place, and those cleaned (but still original-case) copies are what
  # normalize_score inspects at the end.
  string_1 = String.new(string_1)
  string_2 = String.new(string_2)

  cleaned_string_1 = remove_special_characters(string_1)
  cleaned_string_2 = remove_special_characters(string_2)

  # Identical, non-empty inputs that are shorter than one n-gram yield empty
  # n-gram sets and would otherwise fall through to the "no overlap" result
  # of 0. Treat them as the perfect match they are.
  if cleaned_string_1 == cleaned_string_2 &&
     !cleaned_string_1.empty? &&
     cleaned_string_1.length < ngram_length
    return normalize_score(100, string_1, string_2)
  end

  string_1_ngrams = cleaned_string_1.each_char.each_cons(ngram_length).to_set
  string_2_ngrams = cleaned_string_2.each_char.each_cons(ngram_length).to_set

  overlap = (string_1_ngrams & string_2_ngrams).size
  total = string_1_ngrams.size + string_2_ngrams.size

  # No shared n-grams (or no n-grams at all): nothing in common.
  return 0 unless overlap > 0 && total > 0

  # Dice coefficient: 2 * |A & B| / (|A| + |B|), expressed as a percentage.
  sorensen_dice = overlap * 2.0 / total
  score = (sorensen_dice * 100).round

  normalize_score(score, string_1, string_2)
end