class TextProfileSignature

Constants

VERSION

Public Class Methods

new(options={}) click to toggle source
# File lib/text_profile_signature.rb, line 25
# Stores the configuration used by #generate_sign.
#
# The caller's hash is copied before defaults are applied, so the argument is
# never modified and may safely be frozen or shared between instances.
#
# @param options [Hash] optional settings:
#   :min_token_length - tokens must be strictly longer than this to be
#                       counted (defaults to 2 when missing or nil)
#   :quant_rate       - multiplied by the highest token frequency to derive
#                       the quantization step (defaults to 0.01 when missing
#                       or nil)
def initialize(options={})
  # dup first: filling defaults in place would leak into the caller's hash
  # and would raise if that hash is frozen.
  @options = options.dup
  @options[:min_token_length] ||= 2
  @options[:quant_rate] ||= 0.01
end

Public Instance Methods

generate_sign(text) click to toggle source
# File lib/text_profile_signature.rb, line 32
# Computes a fuzzy signature for +text+ from its quantized term-frequency
# profile.
#
# Steps:
# 1. Walk the text character by character. Alphanumeric characters are
#    lower-cased (via Unicode::downcase, defined outside this method) and
#    appended to the token in progress; any other character ends the token.
# 2. Count only tokens strictly longer than @options[:min_token_length].
# 3. Derive the quantization step: highest frequency * @options[:quant_rate],
#    rounded; when that is below 2 the step is 2 if some token occurs more
#    than once, otherwise 1.
# 4. Round every count down to a multiple of the step and drop terms whose
#    rounded count is below the step.
# 5. Order the survivors by descending count, then ascending term, render
#    each as "term count", join the lines with "\n" and hash the result.
#
# @param text [String] the text to fingerprint (anything responding to
#   each_char with single-character strings)
# @return [String] hexadecimal MD5 digest of the rendered profile
def generate_sign(text)
  min_length = @options[:min_token_length]
  frequencies = Hash.new(0)
  buffer = String.new

  # Closes the token in progress: counts it when it is long enough, then
  # starts a fresh buffer for the next token.
  flush = lambda do
    frequencies[buffer] += 1 if buffer.length > min_length
    buffer = String.new
  end

  text.each_char do |char|
    if char =~ /[[:alnum:]]/
      buffer << Unicode::downcase(char)
    elsif !buffer.empty?
      flush.call
    end
  end
  # The text may end in the middle of a token; this final check is
  # deliberately not guarded by "non-empty", unlike the one inside the loop.
  flush.call

  # Highest raw frequency seen; 0 when no token qualified.
  peak = frequencies.values.max || 0

  quant = (peak * @options[:quant_rate]).round
  quant = (peak > 1 ? 2 : 1) if quant < 2

  # Integer division rounds each count down to a multiple of the step.
  quantized = frequencies.map { |term, count| [term, (count / quant) * quant] }
  kept = quantized.select { |_term, count| count >= quant }

  # Highest count first; ties are broken alphabetically by term.
  ranked = kept.sort_by { |term, count| [-count, term] }

  rendered = ranked.map { |term, count| "#{term} #{count}" }
  Digest::MD5.hexdigest(rendered.join("\n"))
end