class SimString::StringMatcher

Public Class Methods

new(simstring_db, measure) click to toggle source
# File lib/simstring_pure.rb, line 232
def initialize(simstring_db, measure)
  @db = simstring_db
  @measure = measure
  @feature_extractor = @db.feature_extractor
end

Public Instance Methods

Private Instance Methods

get(db, y_size, feature) click to toggle source

Returns a Set of strings that each meet the following 2 criteria:

  1. the string has a feature set size equal to <y_size>

  2. the string's feature set contains the feature <feature>

# File lib/simstring_pure.rb, line 310
def get(db, y_size, feature)
  db.lookup_strings_by_feature_set_size_and_feature(y_size, feature)
end
min_overlap(measure, query_size, y_size, alpha) click to toggle source
# File lib/simstring_pure.rb, line 272
def min_overlap(measure, query_size, y_size, alpha)
  measure.minimum_common_feature_count(query_size, y_size, alpha)
end
overlap_join(query_feature_set, tau, db, y_size) click to toggle source

implements “Algorithm 3: CPMerge algorithm” described in “Simple and Efficient Algorithm for Approximate Dictionary Matching” (see www.aclweb.org/anthology/C10-1096)

# File lib/simstring_pure.rb, line 277
def overlap_join(query_feature_set, tau, db, y_size)
  memoized_get_fn_results = query_feature_set.reduce({}) {|memo, feature| memo[feature] = get(db, y_size, feature); memo }
  query_feature_set_size = query_feature_set.size
  sorted_features = query_feature_set.sort_by {|feature| memoized_get_fn_results[feature].size }
  m = {}
  (0..(query_feature_set_size - tau)).each do |k|
    memoized_get_fn_results[sorted_features[k]].each do |s|
      m[s] ||= 0
      m[s] += 1
    end
  end
  r = []
  ((query_feature_set_size - tau + 1)..(query_feature_set_size - 1)).each do |k|
    candidate_matching_strings = m.keys
    candidate_matching_strings.each do |s|
      m[s] ||= 0
      if memoized_get_fn_results[sorted_features[k]].include?(s)
        m[s] += 1
      end
      if tau <= m[s]
        r << s
        m.delete(s)
      elsif m[s] + (query_feature_set_size - k - 1) < tau
        m.delete(s)
      end
    end
  end
  r
end