module Eco::Data::FuzzyMatch::Pairing

Public Instance Methods

paired_words(str1, str2, normalized: false) { |needle, item| ... } click to toggle source

Pair words using some algorithm. It does the following:

1. It splits both strings into words.
2. Pairs all words by using `block` to score the best match.
3. Gives `0` score to those words of `str1` that lost their pair (a word of `str2` cannot be paired twice).
4. Merges the `Score` of all the paired words of `str2` against their `str1` word pair.

@yield [needle, item] offers a comparison algorithm between two strings. @yieldparam needle [String] the string of reference. @yieldparam item [String] one of the haystack items. @yieldreturn [Eco::Data::FuzzyMatch::Score] the `Score` object with the results of comparing `needle` and `item`. @param str1 [String] the string of reference. @param str2 [String] one of the haystack items. @param normalized [Boolean] set to `true` when both strings are already normalized, to avoid normalizing them twice. @return [Hash] where `keys` are the words of `str1` and their `values` a pair array of `pair` and `Score`

# File lib/eco/data/fuzzy_match/pairing.rb, line 20
# Pairs each word of `str1` (needles) with at most one word of `str2`
# (haystack items), using the given block to score every needle/item combination.
#
# A haystack word is handed to a single needle only; needles that end up
# without a partner are reported with a `nil` pair and a zero `Score`.
#
# @yield [needle, item] scores one word of `str1` against one word of `str2`.
# @yieldparam needle [String] a word of `str1`.
# @yieldparam item [String] a word of `str2`.
# @yieldreturn [Eco::Data::FuzzyMatch::Score] the score of `needle` vs `item`.
# @param str1 [String] the string of reference.
# @param str2 [String] one of the haystack items.
# @param normalized [Boolean] when `true`, `normalize_string` is not called.
# @raise [RuntimeError] if a selected score is not a `Eco::Data::FuzzyMatch::Score`.
# @return [Hash] keyed by the words of `str1`; each value is `[pair, Score]`,
#   where `pair` is the matched word of `str2` or `nil`.
def paired_words(str1, str2, normalized: false)
  str1, str2 = normalize_string([str1, str2]) unless normalized
  return {nil => [nil, Score.new(0, 0)]} unless str1 && str2
  # Both strings must be long enough (the original checked `str1` twice).
  return {str1 => [nil, Score.new(0, 0)]} if str1.length < 2 || str2.length < 2

  needles   = get_words(str1, normalized: true)
  haystack  = get_words(str2, normalized: true)
  # Combinations scoring at or below this ratio are never considered a match.
  min_ratio = 0.05

  # Sanity check applied to every score right before it is stored as a pairing.
  check_score = lambda do |score, needle, item|
    unless score.is_a?(Eco::Data::FuzzyMatch::Score)
      raise "Parining ('#{str1}' vs '#{str2}') -> Something got sour with needle '#{needle}' and item #{item}"
    end
  end

  # ranking: item   => needles that scored above the threshold against it.
  # faceted: needle => every item with its score, best ratio first.
  ranking = {}
  faceted = needles.each_with_object({}) do |needle, memo|
    scored = haystack.map do |item|
      score = yield(needle, item)
      ranking[item] ||= []
      ranking[item] << ({needle: needle, score: score}) if score.ratio > min_ratio
      {pair: item, score: score}
    end
    memo[needle] = scored.sort_by { |result| result[:score].ratio }.reverse
  end

  # First pass: each item claims its best-scoring needle that is still free.
  paired = {}
  ranking.each do |item, candidates|
    best = candidates
           .reject  { |candidate| paired.key?(candidate[:needle]) }
           .sort_by { |candidate| candidate[:score].ratio }
           .reverse
           .first
    next unless best

    check_score.call(best[:score], best[:needle], item)
    paired[best[:needle]] = {pair: item, score: best[:score]}
  end

  # Second pass: needles still unpaired may take their best item among those
  # that nobody claimed. `paired` values are hashes, so the claimed words
  # must be extracted before subtracting them from the haystack.
  pending_items = haystack - paired.values.map { |data| data[:pair] }
  faceted.each do |needle, results|
    next if paired.key?(needle)

    best = results.find do |result|
      pending_items.include?(result[:pair]) && result[:score].ratio > min_ratio
    end
    next unless best

    check_score.call(best[:score], needle, best[:pair])
    paired[needle] = best
    pending_items.delete(best[:pair])
  end

  # Needles left without a partner get an explicit zero score.
  (needles - paired.keys).each do |needle|
    paired[needle] = {pair: nil, score: Score.new(0, needle.length)}
  end

  paired.each_with_object({}) do |(needle, data), out|
    out[needle] = data.values_at(:pair, :score)
  end
end