class Matching::Matcher

Attributes

compare_pairs[R]
custom_functions[R]
filter_functions[R]
join_pairs[R]
left_matches[R]
left_store[R]
min_score[RW]
right_index[R]
right_matches[R]
right_store[R]

Public Class Methods

define(opts=nil, &block) click to toggle source
# File lib/matching/matcher.rb, line 10
# Builds a matcher from +opts+ and configures it by evaluating the given
# block in the context of the new instance.
#
# opts  - options hash forwarded to +new+; nil is treated as an empty hash.
# block - optional configuration block handed to the instance-level +define+.
#
# Returns the new matcher.
def self.define(opts=nil, &block)
  # opts defaults to nil here, but the constructor indexes into it,
  # so fall back to an empty hash.
  m = new(opts || {})
  # The instance-level define accepts only a block, so the proc must be
  # passed with & rather than as a positional argument.
  m.define(&block) if block
  m
end
new(opts={}) click to toggle source
# File lib/matching/matcher.rb, line 16
# Sets up an empty matcher.
#
# opts - hash read for the keys :left_store, :right_store, :min_score
#        (falls back to 1.0) and :redis_db.
def initialize(opts={})
  @left_store, @right_store = opts[:left_store], opts[:right_store]
  @min_score = opts[:min_score] || 1.0

  # Rule collections, filled in later through join/compare/custom/filter
  @join_pairs, @compare_pairs = [], []
  @custom_functions, @filter_functions = [], []

  # Keyed on right_class records; used during the main record loop
  @right_matches = {}
  # Keyed on left_class records; built after the main record loop as the
  # reverse of @right_matches
  @left_matches = {}
  # Left objects that were matched to a right record and then unmatched,
  # so they need another match attempt
  @left_losers = []

  # Back @right_index with Redis only when a redis_db whose integer value
  # is at least 1 was supplied; otherwise use the hash-backed index
  redis_db = opts[:redis_db]
  use_redis = redis_db && redis_db.to_i >= 1
  @right_index = use_redis ? RedisIndex.new(redis_db) : HashIndex.new
end

Public Instance Methods

compare(left_attr, right_attr, weight, is_fuzzy = false) click to toggle source

For records matched via join attributes, comparisons may be applied to adjust the score.

# File lib/matching/matcher.rb, line 64
# Registers a comparison rule: builds an AttributePair from the arguments
# and appends it to @compare_pairs.
#
# Returns @compare_pairs.
def compare(left_attr, right_attr, weight, is_fuzzy = false)
  pair = AttributePair.new(left_attr, right_attr, weight, is_fuzzy)
  @compare_pairs.push(pair)
end
compare_values(left,right,opts={}) click to toggle source

Compare left and right arguments and return similarity as a floating point value where 0.0 represents no similarity and 1.0 represents equality.

# File lib/matching/matcher.rb, line 39
# Scores the similarity of two values.
#
# left, right - values to compare; if either is nil or false the result
#               is 0.0.
# opts        - hash; a truthy opts[:fuzzy] delegates to
#               left.similarity_to(right, opts).
#
# Returns 1.0 or 0.0 for the exact comparison, or whatever similarity_to
# returns for the fuzzy one.
# Raises ArgumentError when the two values are of different classes, or
# when a fuzzy comparison is requested for a value that does not respond
# to similarity_to.
def compare_values(left,right,opts={})
  return 0.0 if !left || !right

  if left.class != right.class
    raise ArgumentError, "Cannot compare values of dissimilar type - left = #{left}, right = #{right}"
  end

  # Exact comparison is the default
  return (left == right ? 1.0 : 0.0) unless opts[:fuzzy]

  unless left.respond_to?(:similarity_to)
    raise ArgumentError, "Cannot calculate fuzzy comparison for type #{left.class}"
  end
  left.similarity_to(right,opts)
end
custom(lmbda) click to toggle source

Custom functions may adjust the score beyond the simple comparisons performed via @compare_pairs.

# File lib/matching/matcher.rb, line 70
# Appends a custom scoring callable to @custom_functions.
#
# Returns @custom_functions.
def custom(lmbda)
  @custom_functions.push(lmbda)
end
define(&block) click to toggle source
# File lib/matching/matcher.rb, line 52
# Evaluates the given block with this object as self, so calls made
# inside the block are dispatched to this instance.
#
# Returns the value of the block.
def define(&definition)
  instance_eval(&definition)
end
evaluate_left_losers() click to toggle source

Attempt to find matches while any left losers remain

# File lib/matching/matcher.rb, line 212
# Re-runs matching for every object in @left_losers, repeating for as
# long as those attempts put further objects into @left_losers.
#
# Returns nil.
def evaluate_left_losers
  while @left_losers.any?
    # Take the current batch and reset the list first: it may be
    # refilled while the batch is being processed
    batch, @left_losers = @left_losers, []

    batch.each do |left_obj|
      pair_matches(left_obj, find_matches(left_obj))
    end
  end
end
exceptions(side) click to toggle source
# File lib/matching/matcher.rb, line 242
# Collects the objects of one store that have no entry in the matching
# lookup for that side.
#
# side - :left selects @left_store/@left_matches; any other value selects
#        @right_store/@right_matches.
#
# Returns an array of the unmatched objects in the order the store
# yields them.
def exceptions(side)
  store, matches =
    if side == :left
      [@left_store, @left_matches]
    else
      [@right_store, @right_matches]
    end

  # An earlier ArrayStore shortcut was removed: its guard inspected the
  # freshly created result array, so it never ran, and its body
  # subtracted a Hash from an Array, which would have raised.
  unmatched = []
  store.each do |obj|
    unmatched << obj unless matches[obj]
  end
  unmatched
end
filter(lmbda) click to toggle source

Filter lambdas must return a boolean. Returning true will prevent a match.

# File lib/matching/matcher.rb, line 75
# Appends a filter callable to @filter_functions.
#
# Returns @filter_functions.
def filter(lmbda)
  @filter_functions.push(lmbda)
end
find_matches(left_obj) click to toggle source

Returns the scored matches for the left_obj argument. Results are an array ordered from highest to lowest score, of the form [[right_obj_a, score_a], [right_obj_b, score_b], …]

# File lib/matching/matcher.rb, line 151
# Scores every potential match for left_obj and keeps those whose score
# is at least @min_score.
#
# Returns an array of [right_obj, score] pairs, highest score first.
def find_matches(left_obj)
  scored = find_potential_matches(left_obj).map do |right_obj|
    [right_obj, score_pair(left_obj, right_obj)]
  end

  qualifying = scored.select { |entry| entry[1] >= @min_score }

  # Ascending sort followed by a reversal puts the best score first
  qualifying.sort { |a,b| a[1] <=> b[1] }.reverse
end
find_potential_matches(left_obj) click to toggle source

Return an array of right_objects that match the left_object by join criteria. This is equivalent to an index lookup. No scoring is done by this method.

# File lib/matching/matcher.rb, line 167
# Looks up, through @right_index, the right-side IDs sharing a join value
# with left_obj, then resolves each ID through @right_store.find.
#
# Returns an array of whatever @right_store.find returns for each ID.
def find_potential_matches(left_obj)
  right_ids = @join_pairs.inject([]) do |ids, jp|
    left_val = left_obj.send(jp.left_attr)
    # Nil and empty-string join values are never looked up
    next ids if left_val.nil? || left_val == ''

    found = @right_index.get(jp.right_attr, left_val)
    # Union keeps each ID once, even when several join pairs hit it
    found ? ids | found : ids
  end

  right_ids.map { |r_id| @right_store.find(r_id) }
end
index_right_objects() click to toggle source

Indexes attributes from the right objects in @right_index (either hash or Redis, see initialize). For each join_pair, store the attribute’s values in the form:

attr:val -> [array_of_ids]
# File lib/matching/matcher.rb, line 139
# Stores, for every object yielded by @right_store and every join pair,
# the object's right-attribute value and its ID in @right_index.
#
# Raises RuntimeError when no join pair has been defined.
def index_right_objects
  # Without at least one join pair every record would have to be compared
  # with every other, i.e. quadratic time
  raise 'Matcher requires at least one join pair to be defined' if @join_pairs.none?

  @right_store.each do |right_obj, id|
    @join_pairs.each do |jp|
      attr = jp.right_attr
      @right_index.put(attr, right_obj.send(attr), id)
    end
  end
end
join(left_attr, right_attr, weight) click to toggle source

One or more join attributes are required for a match between two records to occur. Attributes must be equal.

# File lib/matching/matcher.rb, line 58
# Registers a join rule: builds an AttributePair from the arguments and
# appends it to @join_pairs.
#
# Returns @join_pairs.
def join(left_attr, right_attr, weight)
  pair = AttributePair.new(left_attr, right_attr, weight)
  @join_pairs.push(pair)
end
left_exceptions() click to toggle source

Returns array of non-matched left objects

# File lib/matching/matcher.rb, line 229
# Returns the result of exceptions(:left), computed on the first call and
# cached in @left_exceptions for later calls.
def left_exceptions
  @left_exceptions ||= exceptions(:left)
end
match() { |left_obj| ... } click to toggle source

Perform matching

# File lib/matching/matcher.rb, line 104
# Runs the whole matching process.
#
# Yields each left object just before it is evaluated, when a block is
# given.
#
# Returns @right_matches.
# Raises ArgumentError when @left_store or @right_store is not set.
def match
  if !@left_store || !@right_store
    raise ArgumentError, "Matcher requires left_store and right_store attributes"
  end

  # Build the index over the right objects before any lookups
  index_right_objects

  # find_matches returns candidates best-first; pair_matches then tries
  # to award one of them to the left object. A right object that is
  # already taken is only reassigned to a better-scoring left object,
  # and the displaced left object is queued for another attempt.
  @left_store.each do |left_obj|
    yield left_obj if block_given?

    pair_matches(left_obj, find_matches(left_obj))
  end

  # Retry the left objects that lost their match along the way
  evaluate_left_losers

  # Mirror the final right-keyed matches into the left-keyed lookup
  @right_matches.each_value { |found| @left_matches[found.left_obj] = found }
end
matches() click to toggle source
# File lib/matching/matcher.rb, line 260
# Returns a new array holding the values of @left_matches, in the hash's
# iteration order.
def matches
  @left_matches.values
end
pair_matches(left_obj, ranked_matches) click to toggle source

Evaluate and possibly create Match objects to join the left_object to one of the right_objects from the ranked_matches array

# File lib/matching/matcher.rb, line 186
# Walks ranked_matches in order and records a Match in @right_matches for
# the first right object that is either unclaimed or currently held with
# a strictly lower score.
#
# left_obj       - the left object seeking a match.
# ranked_matches - array of [right_obj, score] pairs, tried in order.
#
# When an existing holder is displaced, its left object is appended to
# @left_losers.
def pair_matches(left_obj, ranked_matches)
  ranked_matches.each do |right_obj, score|
    incumbent = @right_matches[right_obj]

    # Already claimed by a left object that fits at least as well:
    # move on to the next candidate
    next if incumbent && !(score > incumbent.score)

    # Taking over an existing match: remember the displaced left object
    @left_losers << incumbent.left_obj if incumbent

    @right_matches[right_obj] = Match.new(left_obj, right_obj, score)
    break
  end
end
right_exceptions() click to toggle source

Returns array of non-matched right objects

# File lib/matching/matcher.rb, line 236
# Returns the result of exceptions(:right), computed on the first call
# and cached in @right_exceptions for later calls.
def right_exceptions
  @right_exceptions ||= exceptions(:right)
end
score_pair(left_obj, right_obj) click to toggle source

Given join, compare, and custom rules, return the floating point matching score of two objects.

# File lib/matching/matcher.rb, line 81
# Computes the match score for a left/right pair of objects.
#
# The score is the sum of:
# * weight * compare_values(...) for each pair in @join_pairs (exact),
# * weight * compare_values(...) for each pair in @compare_pairs, using
#   the pair's fuzzy setting,
# * the return value of each callable in @custom_functions.
# After summing, the score is reset to 0 if any callable in
# @filter_functions returns a falsy value.
#
# Returns the score.
def score_pair(left_obj, right_obj)
  score = 0

  @join_pairs.each do |pair|
    score += pair.weight * compare_values(left_obj.send(pair.left_attr), right_obj.send(pair.right_attr))
  end

  @compare_pairs.each do |pair|
    # compare_values expects an options hash. is_fuzzy defaults to a
    # plain boolean, so wrap it; a hash supplied by the caller is passed
    # through untouched.
    fuzzy = pair.is_fuzzy
    opts = fuzzy.is_a?(Hash) ? fuzzy : { :fuzzy => fuzzy }
    score += pair.weight * compare_values(left_obj.send(pair.left_attr), right_obj.send(pair.right_attr), opts)
  end

  @custom_functions.each do |lmbda|
    score += lmbda.call(left_obj, right_obj)
  end

  # NOTE(review): the description of `filter` says a lambda returning
  # true prevents a match, but the score is zeroed here when the lambda
  # returns a falsy value - confirm which polarity is intended.
  @filter_functions.each do |lmbda|
    score = 0 unless lmbda.call(left_obj, right_obj)
  end

  score
end