class Fuzzy::Scorer

Constants

TermSet
Token

Public Class Methods

new(corpus) click to toggle source
# File lib/fuzzy.rb, line 23
# Build a scorer from a corpus of weighted term sets.
#
# corpus - Array of Hashes, each with :weight (numeric) and :terms
#          (Array of Strings). Entries with a blank weight or blank
#          terms list are dropped, as are blank individual terms.
#
# NOTE(review): relies on ActiveSupport's #blank? — confirm this file is
# only loaded where ActiveSupport is available.
def initialize corpus
  # Use `||` rather than the low-precedence `or` keyword inside an
  # expression — `or` is a well-known precedence trap.
  corpus = (corpus || []).reject { |c| c[:weight].blank? || c[:terms].blank? }
  @corpus = corpus.map { |c| TermSet.new(c[:weight], c[:terms].reject(&:blank?)) }
  @total_weight = @corpus.sum(&:weight)
  @weighted_tokens = @corpus.flat_map { |c| c.tokens @total_weight }
end

Public Instance Methods

normalized_tokens() click to toggle source
# File lib/fuzzy.rb, line 50
# Linearly rescale token weights into the range 0..1.
#
# Applies y = m*x + c with m and c chosen so the minimum weight maps
# to 0 and the maximum maps to 1.
#
# Returns an Array of Token objects with normalized weights. Returns []
# for an empty token list (the original raised NoMethodError), and maps
# every token to weight 1 when all weights are equal (the original
# divided by zero, yielding Infinity/NaN weights).
def normalized_tokens
  basic_tokens = tokens
  return [] if basic_tokens.empty?
  max = basic_tokens.max_by(&:weight).weight
  min = basic_tokens.min_by(&:weight).weight
  # Degenerate case: a zero spread would make the fdiv below divide by
  # zero; treat uniform weights as all-maximal instead.
  return basic_tokens.map { |t| Token.new(t.token, 1) } if max == min
  # Calculate m and c values for the linear transform y=mx+c
  # m = (y' - y)/(x' - x)
  m = (1 - 0).fdiv(max - min)
  # Substituting the max values in, we get 1 = m(max) + c
  c = 1 - (m * max)
  basic_tokens.map { |t| Token.new(t.token, (t.weight * m + c)) }
end
rank(query) click to toggle source
# File lib/fuzzy.rb, line 30
# Score a query string against the weighted token corpus.
#
# query - String prefix to look up.
#
# Each weighted token contributes (query.length / token.length) scaled
# by the token's weight when the token starts with the query, else 0.
# Returns the mean of the positive contributions, or 0 when no token
# matches.
def rank query
  scores = @weighted_tokens.map do |wt|
    # start_with? is core Ruby; starts_with? is an ActiveSupport alias,
    # so the core method avoids an unnecessary dependency here.
    length_score = wt.token.start_with?(query) ? query.length.fdiv(wt.token.length) : 0
    length_score * wt.weight
  end
  score_count = scores.count { |s| s > 0 }
  return 0 unless score_count > 0
  scores.sum / score_count
end
tokenize() click to toggle source
# File lib/fuzzy.rb, line 40
# Build the set of every prefix of every cleaned term in the corpus.
#
# Returns a Set of Strings: for each term, all of its leading
# substrings ("foo" yields "f", "fo", "foo").
def tokenize
  prefixes = Set.new
  @corpus.each do |term_set|
    term_set.cleaned_terms.to_a.each do |term|
      1.upto(term.length) { |len| prefixes << term[0, len] }
    end
  end
  prefixes
end
tokens() click to toggle source
# File lib/fuzzy.rb, line 46
# Pair every prefix produced by #tokenize with its #rank score.
#
# Returns an Array of Token objects, one per tokenized prefix.
def tokens
  tokenize.map do |token_string|
    Token.new(token_string, rank(token_string))
  end
end