class Matching::Deduplicator

Attributes

criteria[RW]
grouped[RW]
groups[RW]
index[RW]
store[RW]

Public Class Methods

new(store,opts={}) click to toggle source

Stored in form { id => index_of_groups_object }

# File lib/matching/deduplicator.rb, line 9
def initialize(store,opts={})
  raise 'Store parameter required' unless store
  @store = store

  @criteria = []
 
  # Create an index using either a hash or Redis as the backing store
  if opts[:redis_db] && opts[:redis_db].to_i >= 1
    @index = RedisIndex.new(opts[:redis_db])
  else
    @index = HashIndex.new
  end
end

Public Instance Methods

create_index() click to toggle source
# File lib/matching/deduplicator.rb, line 106
def create_index
  raise 'Deduplicator requires at least one match attribute be defined' unless @criteria.any?

  @store.each do |obj, id|
    unique_attrs.each do |ma|
      @index.put(ma, obj.send(ma), id)
    end
  end
end
deduplicate() click to toggle source
# File lib/matching/deduplicator.rb, line 35
def deduplicate
  @groups = []      # Array of arrays containing ids of grouped objects
  @nil_group = []   # Special array of objects whose indexed values are all nil (because index isn't tracking them)
  @grouped = {}     # Hash of each object's id to the index of @groups in which its found
  
  # Index all records in the store to speed search
  create_index

  # Place each object into an array in @groups that contain all
  # records that match the defined matching logic.
  @store.each do |obj,store_idx|

    puts "On #{store_idx}" if store_idx % 100 == 0 && store_idx > 0

    # Shortcut the process if there is only one array in criteria
    # and this object is already present (because it can't possibly match
    # a second time)
    next if @criteria.size == 1 && @grouped[obj.id]

    @criteria.each do |arr|

      # Find matching objects
      all_matches = nil
      arr.each do |match_attr|
        val = obj.send(match_attr)

        if val != nil
          matches = @index.get(match_attr, val)
          all_matches = (all_matches ? all_matches & matches : matches)
        end
      end

      if all_matches.nil?
        @nil_group << obj.id
        next
      end

      # Assign matched objects to a group.
      # Groups may be merged in this process.
      current_group_indexes = all_matches.inject([]) do |arr,id| 
        arr << @grouped[id] if @grouped[id] 
        arr
      end.uniq.compact

      next if current_group_indexes.size == 1 # can only be [obj_id]

      if current_group_indexes.size > 1
        # Merge related groups into mega_group based on first group
        mega_group = @groups[current_group_indexes[0]] 
        current_group_indexes[1..-1].each do |idx| 
          @groups[idx].each { |id| mega_group << id } 
          @groups.delete_at(idx)
        end
      
        # Re-assign @grouped for all objects to new mega-group
        mega_group.each { |obj_id| @grouped[obj_id] = current_group_indexes[0] }
      else
        # Create new group
        @groups << all_matches
        group_idx = @groups.size - 1
        all_matches.each { |obj_id| @grouped[obj_id] = group_idx }
      end
    end   
  end

  # Add the contents of nil group as a single group
  @groups << @nil_group if @nil_group.any?

  #puts "Results: #{@groups.inspect}"
end
define(&block) click to toggle source
# File lib/matching/deduplicator.rb, line 31
def define(&block)
  instance_eval(&block)
end
each_with_groups() { |find, grp_idx, obj_idx| ... } click to toggle source

Returns each object in store along with its group’s index and index within the group. For example… group_idx | idx | name

0 |   0 | Fred Smith
0 |   1 | Fred Smith
1 |   0 | Jane Green
2 |   0 | Linda Smythe
2 |   1 | Linda Smythe
# File lib/matching/deduplicator.rb, line 124
def each_with_groups
  @groups.each_with_index do |arr,grp_idx|
    arr.each_with_index do |obj_id,obj_idx|
      yield(@store.find(obj_id), grp_idx, obj_idx) 
    end
  end
end
match_attrs(attrs) click to toggle source
# File lib/matching/deduplicator.rb, line 23
def match_attrs(attrs)
  @criteria << [*attrs] #converts to array if not already, doesn't affect arrays
end
unique_attrs() click to toggle source
# File lib/matching/deduplicator.rb, line 27
def unique_attrs
  @criteria.flatten.uniq
end