module Eco::Data::FuzzyMatch::NGramsScore

Public Instance Methods

ngrams_score(str1, str2, range: 3..5, normalized: false) click to toggle source

Keeps a score of the ngram combinations of `str2` that are found in `str1`. @note This algorithm is best suited for matching sentences, or 'firstname lastname' compared with 'lastname firstname' combinations. @param range [Integer, Range] determines the length of the generated values. @param normalized [Boolean] set to `true` when the strings are already normalized, to avoid normalizing them twice. @return [Score] the score object with the result.

# File lib/eco/data/fuzzy_match/ngrams_score.rb, line 42
# Scores how many of the n-grams generated from `str2` are contained in `str1`.
#
# The total of the returned score is the length of `str1` (`0` when `str1` is `nil`).
# Identical strings get the full score; if either string is `nil`, empty or shorter
# than 2 characters (and they are not identical) the score stays at `0`.
#
# @param str1 [String, nil] the string the n-grams are searched in.
# @param str2 [String, nil] the string the n-grams are generated from.
# @param range [Integer, Range] the length(s) of the generated n-grams.
# @param normalized [Boolean] pass `true` when the strings are already normalized,
#   so `normalize_string` is not applied to them again.
# @return [Score] the score object with the result.
def ngrams_score(str1, str2, range: 3..5, normalized: false)
  str1, str2 = normalize_string([str1, str2]) unless normalized

  Score.new(0, str1 ? str1.length : 0).tap do |score|
    next if !str1 || !str2
    next if str1.empty? || str2.empty?

    score.total = str1.length
    next score.increase(score.total) if str1 == str2
    next if str1.length < 2 || str2.length < 2

    grams       = word_ngrams(str2, range, normalized: true)
    grams_count = grams.length
    next unless grams_count > 0

    if range.is_a?(Integer)
      # Single n-gram length: every n-gram carries the same share of the total.
      item_weight = score.total.to_f / grams_count
      matches     = grams.count { |gram| str1.include?(gram) }
      score.increase(matches * item_weight)
    else
      # Several n-gram lengths: each length gets an equal share of the total.
      groups       = grams.group_by(&:length)
      group_weight = (1.0 / groups.length).round(3)

      groups.each_value do |len_grams|
        len_max_score = score.total * group_weight
        # NOTE(review): the share of a length group is divided by the count of ALL
        # n-grams, not by the size of the group itself, so the summed score cannot
        # reach the total when there is more than one length. Kept as is; confirm intent.
        item_weight   = len_max_score / grams_count
        matches       = len_grams.count { |gram| str1.include?(gram) }
        score.increase(matches * item_weight)
      end
    end
  end
end
words_ngrams_score(str1, str2, range: 3..5, normalized: false) click to toggle source

It does the following:

1. It splits both strings into words
2. Pairs all words by best `ngrams_score` match
3. Gives `0` score to those words of `str2` that lost their pair (a word of `str1` cannot be paired twice)
4. Merges the `ngrams_score` of all the paired words of `str2` against their `str1` word pair

@param range [Integer, Range] determines the length of the generated values for each `word`. @param normalized [Boolean] set to `true` when the strings are already normalized, to avoid normalizing them twice. @return [Score] the score object with the result.

# File lib/eco/data/fuzzy_match/ngrams_score.rb, line 13
# Scores `str2` against `str1` word by word, merging the `ngrams_score` of every
# pair of words returned by `paired_words` into a single score.
#
# @param str1 [String, nil] the reference string.
# @param str2 [String, nil] the string compared against `str1`.
# @param range [Integer, Range] the length(s) of the n-grams generated for each word.
# @param normalized [Boolean] pass `true` when the strings are already normalized,
#   so `normalize_string` is not applied to them again.
# @return [Score] the score object with the result.
def words_ngrams_score(str1, str2, range: 3..5, normalized: false)
  str1, str2 = normalize_string([str1, str2]) unless normalized

  Score.new(0, 0).tap do |score|
    next if !str1 || !str2
    next score.increase_total(str1.length) if str1.empty? || str2.empty?

    # NOTE(review): neither of the two branches below leaves the block, so the
    # per-word scores are still merged on top of them. Kept as is; confirm intent.
    if str1 == str2
      score.total = str1.length
      score.increase(score.total)
    end
    # Either string being too short counts (the original tested `str1` twice).
    score.increase_total(str1.length) if str1.length < 2 || str2.length < 2

    paired_words(str1, str2, normalized: true) do |needle, item|
      ngrams_score(needle, item, range: range, normalized: true)
    end.each do |_needle, (_item, pair_score)|
      score.merge!(pair_score)
    end
  end
end