class Eco::API::Organization::PeopleSimilarity
Class to find out duplicates in the People
Manager
@attr_writer attribute [String, Proc, nil] the target attribute to be read.
Attributes
Public Instance Methods
Analyses People based on `options`.
@param needle_read [Proc, Symbol] used when the value to read from the `needle` object is different from the `:read` (`attribute`).
This allows, for example, faceting `needle.name` (needle_read) against `haystack_item.details[alt_id]` (read).
@param keep_empty [Boolean] to indicate if it should get rid of people with no results (based on threshold) @return [Hash] where the keys are the people `id`s and the values the `Eco::Data::FuzzyMatch::Results`
# File lib/eco/api/organization/people_similarity.rb, line 124
# Scores every person of this collection via `find_all_with_score`.
# @param needle_read [Proc, Symbol, nil] when given, the needle string is
#   obtained with `item_string(person, needle_read)`; otherwise `nil` is passed.
# @param keep_empty [Boolean] when `false`, the outcome goes through `clean_empty`.
# @param options [Hash] forwarded to `find_all_with_score`; `:read` defaults to `attribute`.
# @return [Hash] person `id` => whatever `find_all_with_score` returned for that person.
def analyse(needle_read: nil, keep_empty: false, **options)
  options = { read: attribute }.merge(options)
  total = count
  position = 0

  analysed = each_with_object({}) do |person, results|
    needle_str = item_string(person, needle_read) if needle_read
    results[person.id] = find_all_with_score(person, needle_str: needle_str, **options)
    position += 1
    print_progress("Analysed", total, position)
  end

  keep_empty ? analysed : clean_empty(analysed)
end
@!group Config @return [String, Proc, nil] the target attribute to be read.
# File lib/eco/api/organization/people_similarity.rb, line 15
# Stores the target attribute to be read.
# @param attr [String, Proc, nil] the new target attribute.
def attribute=(attr)
  @attribute = attr
end
It returns all the entries with `attribute` not empty @return [Eco::API::Organization::PeopleSimilarity]
# File lib/eco/api/organization/people_similarity.rb, line 107
# Keeps the entries whose `item_value`, stringified and stripped, is at
# least 2 characters long, and wraps them with `newFrom`.
# @return [Eco::API::Organization::PeopleSimilarity]
def attribute_present
  with_value = reject do |person|
    item_value(person).to_s.strip.length < 2
  end
  newFrom(with_value)
end
It returns all the entries with `attribute` empty @return [Eco::API::Organization::PeopleSimilarity]
# File lib/eco/api/organization/people_similarity.rb, line 97
# Keeps the entries whose `item_value`, stringified and stripped, is shorter
# than 2 characters, and wraps them with `newFrom`.
# @return [Eco::API::Organization::PeopleSimilarity]
def blank_attribute
  without_value = select do |person|
    item_value(person).to_s.strip.length < 2
  end
  newFrom(without_value)
end
Removes from results those that do not have similar entries
# File lib/eco/api/organization/people_similarity.rb, line 160
# Drops the entries whose results are empty.
# @param analysed [Hash] person `id` => results (anything responding to `empty?`).
# @return [Hash] only the entries with non-empty results.
def clean_empty(analysed)
  analysed.reject { |_id, results| results.empty? }
end
Reanalyses by ignoring matching words between the `needle` and those found in `results`
# File lib/eco/api/organization/people_similarity.rb, line 198
# Delegates to `reanalyse`, treating each needle/item string pair with the
# class-level `remove_matching_words`.
# @param analysed [Hash] passed through to `reanalyse`.
# @param options [Hash] passed through to `reanalyse`.
# @return whatever `reanalyse` returns.
def ignore_matching_words(analysed, **options)
  reanalyse(analysed, msg: "Reanalysing by ignoring matching words", **options) do |needle_str, item_str, _needle, _item|
    self.class.remove_matching_words(needle_str, item_str)
  end
end
Reanalyses by ignoring matching words between the `needle` and those found in `results`
# File lib/eco/api/organization/people_similarity.rb, line 206
# Walks `analysed` through `with_analysed`, replacing each entry's results with
# the outcome of `ignore_same_words_score`, and reports progress along the way.
# @param analysed [Hash] person `id` => results.
# @param options [Hash] forwarded to `ignore_same_words_score`; `:read` defaults to `attribute`.
# @return whatever `with_analysed` returns.
def ignore_matching_words_old(analysed, **options)
  options = { read: attribute }.merge(options)
  total = analysed.count
  step = 0

  with_analysed(analysed) do |_person, results|
    step += 1
    print_progress("Reanalysing by ignoring matching words", total, step)
    ignore_same_words_score(results, **options)
  end
end
Returns the target value to analyse @param person [Ecoportal::API::V1::Person]
# File lib/eco/api/organization/people_similarity.rb, line 25
# Reads the target value to analyse from `person`, as configured by `attribute`.
# @param person [Ecoportal::API::V1::Person] the object to read from.
# @return [Object, nil] the result of calling the `attribute` Proc with `person`,
#   or of sending the `attribute` method to `person`; `nil` when `attribute` is
#   not set or `person` does not respond to it.
def item_value(person)
  reader = attribute
  return if reader.nil?
  # A Proc gets the person itself; the original referenced an undefined `item`
  # and called a not-yet-assigned local here.
  return reader.call(person) if reader.is_a?(Proc)

  reader = reader.to_sym
  person.public_send(reader) if person.respond_to?(reader)
end
It returns all people with a name (at least 2 characters once stripped) @return [Eco::API::Organization::PeopleSimilarity]
# File lib/eco/api/organization/people_similarity.rb, line 87
# Keeps the people whose `name`, stringified and stripped, is at least
# 2 characters long, and wraps them with `newFrom`.
# @return [Eco::API::Organization::PeopleSimilarity]
def named
  with_name = reject do |person|
    person.name.to_s.strip.length < 2
  end
  newFrom(with_name)
end
Generates a new object with same config but different base `data`. @return [Eco::API::Organization::PeopleSimilarity]
Eco::API::Organization::People#newFrom
# File lib/eco/api/organization/people_similarity.rb, line 54
# Builds a new object through the parent implementation and copies this
# instance's `threshold`, `order` and `attribute` onto it.
# @param data the base data handed to the parent `newFrom`.
# @return [Eco::API::Organization::PeopleSimilarity]
def newFrom(data)
  copy = super(data)
  copy.threshold = threshold
  copy.order = order
  copy.attribute = attribute
  copy
end
Gets a new instance object of this class, with only people in results @param analysed [Hash] where the keys are the people `id`s and values the `Eco::Data::FuzzyMatch::Results` @return [Eco::API::Organization::PeopleSimilarity]
# File lib/eco/api/organization/people_similarity.rb, line 146
# Builds a new instance (via `newFrom`) out of the people collected by
# `people_in_results`.
# @param analysed [Hash] person `id` => results.
# @return [Eco::API::Organization::PeopleSimilarity]
def newSimilarity(analysed)
  involved = people_in_results(analysed)
  newFrom(involved)
end
# File lib/eco/api/organization/people_similarity.rb, line 38
# @return [Array<Symbol>] the configured order; it is initialised to
#   `[:words_ngrams, :dice]` the first time it is read while unset.
def order
  return @order if @order

  @order = %i[words_ngrams dice]
end
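Defines the order of relevance of the per-user matches @param values [Array<Symbol>] the criteria to order by (the default order is `[:words_ngrams, :dice]`)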
# File lib/eco/api/organization/people_similarity.rb, line 34
# Stores the order later returned by `order`.
# @param values the new order (the default held by `order` is an Array of Symbols).
def order=(values)
  @order = values
end
# File lib/eco/api/organization/people_similarity.rb, line 150
# Collects, without repetitions, the person behind every `id` of `analysed`
# together with the `match` of each of its results.
# @param analysed [Hash] person `id` => results whose elements respond to `match`.
# @return [Array] the people, in order of first appearance.
def people_in_results(analysed)
  people = []

  analysed.each do |id, results|
    # The group starts with the person looked up by id, followed by its matches
    group = results.each_with_object([self[id]]) do |result, members|
      members << result.match
    end

    group.each do |person|
      people << person unless people.include?(person)
    end
  end

  people
end
@note
1. Unless `:analysed` is provided, it launches an analysis cutting with Jaro-Winkler min 0.5 2. It then re-sorts and cuts based on `options`
@return [Hash] where the keys are the people `id`s and the values the `Eco::Data::FuzzyMatch::Results`
# File lib/eco/api/organization/people_similarity.rb, line 235
# Prints the `report` of an analysis to standard output.
# @param options [Hash] `:analysed` may carry a ready analysis; otherwise one is
#   launched with `results_with_false_positives.analyse(**options)`.
# @return [Hash] the analysis that was printed (person `id` => results).
def print_analysis(**options)
  analysed = options[:analysed] || results_with_false_positives.analyse(**options)
  # `report` already covers every entry, so it is printed once rather than once
  # per entry; nothing is printed for an empty analysis.
  puts report(analysed) unless analysed.empty?
  analysed
end
Reanalyses by using a block to treat the needle and item values
# File lib/eco/api/organization/people_similarity.rb, line 187
# Walks `analysed` through `with_analysed`, replacing each entry's results with
# `recalculate_results(results, &block)` and reporting progress with `msg`.
# @param analysed [Hash] person `id` => results.
# @param msg [String] the label handed to `print_progress`.
# @param options [Hash] extra options (`:read` defaults to `attribute`).
# @yield forwarded untouched to `recalculate_results`.
# @return whatever `with_analysed` returns.
def reanalyse(analysed, msg: "Reanalysing", **options, &block)
  # NOTE(review): the merged options are not used further down in this method
  options = { read: attribute }.merge(options)
  total = analysed.count
  step = 0

  with_analysed(analysed) do |_person, results|
    step += 1
    print_progress(msg, total, step)
    recalculate_results(results, &block)
  end
end
Launches a reanalysis on `analysed` based on `options` @param analysed [Hash] where the keys are the people `id`s and the values the `Eco::Data::FuzzyMatch::Results`
# File lib/eco/api/organization/people_similarity.rb, line 180
# Replaces each entry's results with `results.relevant_results(**options)`,
# going through `with_analysed`.
# @param analysed [Hash] person `id` => results responding to `relevant_results`.
# @param options [Hash] forwarded to `relevant_results`.
# @return whatever `with_analysed` returns.
def rearrange(analysed, **options)
  with_analysed(analysed) { |_person, results| results.relevant_results(**options) }
end
It gathers those that have the same `email` @return [Hash] where `keys` are `email`s and `values` an `Array<Person>`
# File lib/eco/api/organization/people_similarity.rb, line 68
# Picks the `@by_email` entries that hold more than one person.
# NOTE(review): presumably `init_caches` is what populates `@by_email` — confirm.
# @return [Hash] `email` => people, only for emails with more than one person.
def repeated_emails
  init_caches
  @by_email.select { |_email, people| people.count > 1 }
end
@return [String] well structured text
# File lib/eco/api/organization/people_similarity.rb, line 221
# Renders `analysed` as text: for each entry, the person's `identify` followed
# by the `print` of each of its results, one per line.
# @param analysed [Hash] person `id` => object whose `results` elements respond to `print`.
# @param format [Symbol] only `:txt` is handled.
# @return [String, nil] the text for `:txt`; `nil` for any other format.
def report(analysed, format: :txt)
  return unless format == :txt

  analysed.each_with_object(+"") do |(id, results), text|
    lines = results.results.map(&:print).join("\n ")
    text << "#{self[id].identify}:\n " + lines + "\n"
  end
end
# File lib/eco/api/organization/people_similarity.rb, line 48
# @return [Float] the configured threshold; it is initialised to `0.15` the
#   first time it is read while unset.
def threshold
  return @threshold if @threshold

  @threshold = 0.15
end
Sets the threshold applied to the per-user matches @param value [Float] the threshold that all of the algorithms should comply with
# File lib/eco/api/organization/people_similarity.rb, line 44
# Stores the threshold later returned by `threshold`.
# @param value [Float] the threshold that all of the algorithms should comply with.
def threshold=(value)
  @threshold = value
end
It returns all people with no name @return [Eco::API::Organization::PeopleSimilarity]
# File lib/eco/api/organization/people_similarity.rb, line 77
# Keeps the people whose `name`, stringified and stripped, is shorter than
# 2 characters, and wraps them with `newFrom`.
# @return [Eco::API::Organization::PeopleSimilarity]
def unnamed
  nameless = select do |person|
    person.name.to_s.strip.length < 2
  end
  newFrom(nameless)
end
Helper to do some treatment of the results @param analysed [Hash] where the keys are the people `id`s and values the `Eco::Data::FuzzyMatch::Results` @return [Hash] where the keys are the people `id`s and values the `Eco::Data::FuzzyMatch::Results`
# File lib/eco/api/organization/people_similarity.rb, line 169
# Rebuilds `analysed` by replacing each entry's results with the block's value.
# @param analysed [Hash] person `id` => results.
# @param keep_empty [Boolean] when `false`, the outcome goes through `clean_empty`.
# @yield [person, results] the person looked up by `id` and its current results.
# @yieldreturn the new results for that `id`.
# @return [Hash] person `id` => new results.
def with_analysed(analysed, keep_empty: false)
  reanalysed = analysed.each_with_object({}) do |(id, results), out|
    out[id] = yield(self[id], results)
  end
  # The former trailing `tap` only built a message string and discarded it.
  keep_empty ? reanalysed : clean_empty(reanalysed)
end
Protected Instance Methods
@!endgroup
Eco::API::Organization::People#on_change
# File lib/eco/api/organization/people_similarity.rb, line 245
# Drops the `@fuzzy_match` instance variable (when set) and then runs the
# parent's `on_change`.
# @return whatever the parent `on_change` returns.
def on_change
  # `remove_instance_variable` takes the variable NAME; passing the variable's
  # value raised instead of clearing it. The guard avoids a NameError when the
  # variable was never set.
  remove_instance_variable(:@fuzzy_match) if instance_variable_defined?(:@fuzzy_match)
  super
end
Private Instance Methods
# File lib/eco/api/organization/people_similarity.rb, line 252
# Prints a one-line progress indicator that rewrites itself via `\r`.
# Nothing is printed unless `total` is above 10.
# @param msg [String] the label shown before the percentage.
# @param total [Integer] the total amount of items.
# @param num [Integer] the item currently reached.
def print_progress(msg, total, num)
  return unless total > 10

  # Start on a fresh line for the first item
  puts "" if num <= 1

  percent = (100 * num.to_f / total).round(1)
  line = " #{msg}: #{percent}% (#{num} of #{total})\r"
  # Remember the longest line printed so far, so it can be blanked out at the end
  @print_msg_len = [@print_msg_len || 0, line.length].max
  print line
  $stdout.flush
  return unless percent > 99.9

  # Finished: pause briefly, then wipe the progress line
  sleep(0.2)
  blank = " " * @print_msg_len
  print "#{blank}\r"
  $stdout.flush
end