module Eco::Data::FuzzyMatch::InstanceMethods

Constants

FUZZY_MATCH_OPTIONS
JARO_OPTIONS
NGRAMS_OPTIONS
POSITION_OPTIONS
RESULTS_OPTIONS

Attributes

fuzzy_options[RW]

Public Instance Methods

find_all_with_score(needle, needle_str: nil, haystack: nil, **options) click to toggle source

TODO: integrate `options` so that repeated words do not bring the score down and cause results to be cut by the threshold. @note

- When the `haystack` elements are **non** `String` objects, it excludes the needle itself from the results

@param needle [String, Object] a non-`String` object is allowed when `fuzzy_options` includes the `read:` key. @param needle_str [String, nil] the actual string value of `needle` to be used. @param haystack [Enumerable] the items to find `needle` among. @return [Eco::Data::FuzzyMatch::Results]

# File lib/eco/data/fuzzy_match.rb, line 78
# Finds every haystack item that matches `needle` and scores each of them.
# @note the item that equals `needle` itself (`item == needle`) is left out of the results.
# @param needle [String, Object] what to look for.
# @param needle_str [String, nil] the string to use for `needle`; when absent it is
#   obtained through `item_string(needle)`.
# @param haystack [Enumerable, nil] the items to search among (handed over to `fuzzy_match`).
# @param options [Hash] forwarded to `fuzzy_match`.
# @return the `relevant_results` of the `Results` object built from the scored items.
def find_all_with_score(needle, needle_str: nil, haystack: nil, **options)
  matcher      = fuzzy_match(haystack, **options)
  raw_matches  = matcher.find_all_with_score(needle_str || needle)
  needle_str ||= item_string(needle)

  scored = []
  raw_matches.each do |raw_match|
    item, dice, lev = raw_match
    next if item == needle

    item_str = item_string(item)
    blank    = item_str.to_s.strip.empty? || needle_str.to_s.strip.empty?
    # A blank string on either side leaves nothing to compare: every score is 0.
    scores =
      if blank
        Array.new(6, 0)
      else
        [
          dice,
          lev,
          jaro(needle_str, item_str),
          ngram(needle_str, item_str),
          words_ngram(needle_str, item_str),
          position(needle_str, item_str)
        ]
      end

    scored << Result.new(item, item_str, needle_str, *scores)
  end

  outcome = Results.new(needle, needle_str, scored)
  outcome.order     = fuzzy_options[:order]     if fuzzy_options[:order]
  outcome.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
  outcome.relevant_results
end
fuzzy_match(haystack_data = nil, **options) click to toggle source
# File lib/eco/data/fuzzy_match.rb, line 61
# Builds (or reuses) the underlying `::FuzzyMatch` matcher.
# The memoized matcher is returned only when no `haystack_data` is given and
# the matcher options derived from `options` equal the current ones.
# @param haystack_data [Enumerable, nil] the items to search among.
# @param options [Hash] stored as the new `fuzzy_options` when a matcher is (re)built.
# @return [::FuzzyMatch] the matcher.
def fuzzy_match(haystack_data = nil, **options)
  reusable = instance_variable_defined?(:@fuzzy_match) && !haystack_data
  return @fuzzy_match if reusable && fuzzy_match_options == fuzzy_match_options(options)

  @fuzzy_options = options
  # Use the native C extension engine (~130 % performance increase).
  ::FuzzyMatch.engine = :amatch
  @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
end
recalculate_results(results, needle_str: nil, **options) { |needle_str || value, value, needle, match| ... } click to toggle source
# File lib/eco/data/fuzzy_match.rb, line 105
# Re-scores an existing set of results, using the needle/item strings that the
# given block returns for each result.
# @param results the results to re-score; it is iterated with `each_with_object`
#   and asked for `value` and `needle` (presumably a `Results` object -- confirm).
# @param needle_str [String, nil] when given, it is yielded instead of `results.value`.
# @param options [Hash] only `:order` and `:threshold` are read here.
# @yield [needle_str, item_str, needle, item] called once per result.
# @yieldreturn [Array] the pair `[needle_str, item_str]` to score against each other.
# @raise [RuntimeError] when no block is given.
# @return the `relevant_results` of the newly built `Results`.
def recalculate_results(results, needle_str: nil, **options)
  raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given?
  new_results = results.each_with_object([]) do |result, new_results|
    nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match)

    # A blank item string forces every score to 1; a blank needle string is
    # handled by the `elsif` branch.
    if istr.to_s.strip.empty?
      dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res  = 1
    elsif nstr.to_s.strip.empty?
      # NOTE(review): `=` below is an ASSIGNMENT, not a comparison. As written,
      # `istr` is overwritten with the `needle_str` keyword argument; the scores
      # are zeroed only when that argument is nil/false, otherwise scoring goes
      # on with the replaced `istr`. Confirm whether `istr == needle_str` or
      # `nstr = needle_str` was intended.
      unless istr = needle_str
        dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res  = 0
      end
    end

    # The score object is only built when dice/lev were not preset above.
    res          = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev
    dice       ||= res&.dices_coefficient_similar || 0
    lev        ||= res&.levenshtein_similar       || 0
    jaro_res   ||= jaro(nstr, istr)
    ngram_res  ||= ngram(nstr, istr)
    wngram_res ||= words_ngram(nstr, istr)
    pos_res    ||= position(nstr, istr)

    # The original match, value and needle_str of the result are kept; only the scores change.
    new_results << Result.new(*result.values_at(:match, :value, :needle_str), dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
  end
  Results.new(results.needle, results.value, new_results).tap do |res|
    res.order     = options[:order]     if options[:order]
    res.threshold = options[:threshold] if options[:threshold]
  end.relevant_results
end

Private Instance Methods

fuzzy_match_options(options = nil) click to toggle source
# File lib/eco/data/fuzzy_match.rb, line 178
# Extracts the options that are passed on to the matcher.
# @param options [Hash, nil] the options to filter; `fuzzy_options` is used when absent.
# @return [Hash] the `FUZZY_MATCH_OPTIONS` keys of the options, plus a `:stop_words`
#   list made of `PREPOSITIONS`, `PRONOUNS` and `ARTICLES`.
def fuzzy_match_options(options = nil)
  source     = options || fuzzy_options
  stop_words = PREPOSITIONS + PRONOUNS + ARTICLES
  source.slice(*FUZZY_MATCH_OPTIONS).merge(stop_words: stop_words)
end
fuzzy_read_method() click to toggle source
# File lib/eco/data/fuzzy_match.rb, line 185
# @return the `:read` entry of the current matcher options (`nil` when there is none).
def fuzzy_read_method
  matcher_options = fuzzy_match_options
  matcher_options[:read]
end
haystack(data = nil) click to toggle source

@note

- When used in an `Enumerable` it will use `to_a`, or `values` if it's a `Hash`

@param data [Enumerable, nil] @return [Array<Object>] the non-repeated values of `data`

# File lib/eco/data/fuzzy_match.rb, line 160
# Normalizes the data to search among into a flat list of candidates.
# @note
#   - When no `data` is given and `self` is an `Enumerable`, `self` is used.
#   - A `Hash` contributes its `values`; any other `Enumerable` its `to_a`.
# @param data [Enumerable, nil] the items to search among.
# @raise [RuntimeError] when `data` is not an `Enumerable`, or when it holds
#   non-`String` elements and no `read:` option is available.
# @return [Array<Object>] the flattened, non-repeated, non-`nil` values of `data`.
def haystack(data = nil)
  data = self if self.is_a?(Enumerable) && !data
  raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)

  # Read from `data` (not from `self`), so an explicitly given haystack is honoured.
  data = data.is_a?(Hash) ? data.values.flatten : data.to_a.flatten
  data.uniq.compact.tap do |items|
    # With a `read:` method in place, non-String items are fine.
    next if fuzzy_read_method

    found = items.find { |item| !item.is_a?(String) }
    raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}" if found
  end
end
item_string(item, attr = fuzzy_read_method) click to toggle source
# File lib/eco/data/fuzzy_match.rb, line 171
# Obtains the string to compare for a given item.
# @param item [String, Object, nil] the item to read the string from.
# @param attr [Proc, Symbol, String, nil] how to read it: a `Proc` is called with
#   the item, anything else is converted with `to_sym` and sent to the item.
#   Defaults to `fuzzy_read_method`.
# @return the `item` itself when it is falsey, a `String`, or there is no `attr`;
#   otherwise the value read, or `nil` when the item does not respond to `attr`.
def item_string(item, attr = fuzzy_read_method)
  readable = item && !item.is_a?(String) && attr
  return item unless readable

  if attr.is_a?(Proc)
    attr.call(item)
  else
    reader = attr.to_sym
    item.send(reader) if item.respond_to?(reader)
  end
end
jaro(str1, str2) click to toggle source
# File lib/eco/data/fuzzy_match.rb, line 136
# Scores two strings through `self.class.jaro_winkler`, passing along the
# `JARO_OPTIONS` keys found in `fuzzy_options`.
# @param str1 [String] the first string.
# @param str2 [String] the second string.
# @return whatever `jaro_winkler` returns.
def jaro(str1, str2)
  self.class.jaro_winkler(str1, str2, **fuzzy_options.slice(*JARO_OPTIONS))
end
ngram(str1, str2) click to toggle source
# File lib/eco/data/fuzzy_match.rb, line 141
# Scores two strings through `self.class.ngrams_score`.
# The `range` defaults to `3..5`; the `NGRAMS_OPTIONS` keys found in
# `fuzzy_options` take precedence over that default.
# @param str1 [String] the first string.
# @param str2 [String] the second string.
# @return the `ratio` of the score object.
def ngram(str1, str2)
  overrides = fuzzy_options.slice(*NGRAMS_OPTIONS)
  settings  = { range: 3..5 }.merge(overrides)
  score     = self.class.ngrams_score(str1, str2, **settings)
  score.ratio
end
position(str1, str2) click to toggle source
# File lib/eco/data/fuzzy_match.rb, line 151
# Scores two strings through `self.class.chars_position_score`, passing along
# the `POSITION_OPTIONS` keys found in `fuzzy_options`.
# @param str1 [String] the first string.
# @param str2 [String] the second string.
# @return the `ratio` of the score object.
def position(str1, str2)
  settings = fuzzy_options.slice(*POSITION_OPTIONS)
  score    = self.class.chars_position_score(str1, str2, **settings)
  score.ratio
end
words_ngram(str1, str2) click to toggle source
# File lib/eco/data/fuzzy_match.rb, line 146
# Scores two strings through `self.class.words_ngrams_score`.
# The `range` defaults to `3..7`; the `NGRAMS_OPTIONS` keys found in
# `fuzzy_options` take precedence over that default.
# @param str1 [String] the first string.
# @param str2 [String] the second string.
# @return the `ratio` of the score object.
def words_ngram(str1, str2)
  overrides = fuzzy_options.slice(*NGRAMS_OPTIONS)
  settings  = { range: 3..7 }.merge(overrides)
  score     = self.class.words_ngrams_score(str1, str2, **settings)
  score.ratio
end