class SimString::StringMatcher
Public Class Methods
new(simstring_db, measure)
click to toggle source
# File lib/simstring_pure.rb, line 232 def initialize(simstring_db, measure) @db = simstring_db @measure = measure @feature_extractor = @db.feature_extractor end
Public Instance Methods
ranked_search(query_string, alpha, measure = @measure)
click to toggle source
Same as search
, except returns an array of Match objects indicating both the matched string(s) and their corresponding similarity scores. Example:
matcher.ranked_search("Fooo", 0.5) => [#<struct Match value="Foo", score=0.9128709291752769>, <struct Match value="Food", score=0.5>, <struct Match value="Foot", score=0.5>]
# File lib/simstring_pure.rb, line 263 def ranked_search(query_string, alpha, measure = @measure) feature_set = @feature_extractor.features(query_string) search(query_string, alpha, measure).map do |matching_string| Match.new(matching_string, measure.similarity(feature_set, @feature_extractor.features(matching_string))) end.sort_by {|match| -match.score } end
search(query_string, alpha, measure = @measure)
click to toggle source
Implements “Algorithm 1: Approximate dictionary matching” described in “Simple and Efficient Algorithm for Approximate Dictionary Matching” (see www.aclweb.org/anthology/C10-1096) Returns an array of matching strings. Example:
matcher.search("Fooo", 0.5) => ["Foo", "Food", "Foot"]
# File lib/simstring_pure.rb, line 243 def search(query_string, alpha, measure = @measure) feature_set = @feature_extractor.features(query_string) feature_set_size = feature_set.size matches = [] min_feature_size_of_matching_string = measure.min_feature_size(@db, feature_set_size, alpha) max_feature_size_of_matching_string = measure.max_feature_size(@db, feature_set_size, alpha) (min_feature_size_of_matching_string..max_feature_size_of_matching_string).each do |candidate_match_feature_size| tau = min_overlap(measure, feature_set_size, candidate_match_feature_size, alpha) additional_matches = overlap_join(feature_set, tau, @db, candidate_match_feature_size) matches.concat(additional_matches) end matches end
Private Instance Methods
get(db, y_size, feature)
click to toggle source
Returns a Set of strings that each meet the following 2 criteria:
-
the string has a feature set size equal to <y_size>
-
the string's feature set contains the feature <feature>
# File lib/simstring_pure.rb, line 310 def get(db, y_size, feature) db.lookup_strings_by_feature_set_size_and_feature(y_size, feature) end
min_overlap(measure, query_size, y_size, alpha)
click to toggle source
# File lib/simstring_pure.rb, line 272 def min_overlap(measure, query_size, y_size, alpha) measure.minimum_common_feature_count(query_size, y_size, alpha) end
overlap_join(query_feature_set, tau, db, y_size)
click to toggle source
implements “Algorithm 3: CPMerge algorithm” described in “Simple and Efficient Algorithm for Approximate Dictionary Matching” (see www.aclweb.org/anthology/C10-1096)
# File lib/simstring_pure.rb, line 277 def overlap_join(query_feature_set, tau, db, y_size) memoized_get_fn_results = query_feature_set.reduce({}) {|memo, feature| memo[feature] = get(db, y_size, feature); memo } query_feature_set_size = query_feature_set.size sorted_features = query_feature_set.sort_by {|feature| memoized_get_fn_results[feature].size } m = {} (0..(query_feature_set_size - tau)).each do |k| memoized_get_fn_results[sorted_features[k]].each do |s| m[s] ||= 0 m[s] += 1 end end r = [] ((query_feature_set_size - tau + 1)..(query_feature_set_size - 1)).each do |k| candidate_matching_strings = m.keys candidate_matching_strings.each do |s| m[s] ||= 0 if memoized_get_fn_results[sorted_features[k]].include?(s) m[s] += 1 end if tau <= m[s] r << s m.delete(s) elsif m[s] + (query_feature_set_size - k - 1) < tau m.delete(s) end end end r end