class Statlysis::Similar

Attributes

corpus[RW]
id_to_similar_ids[RW]
id_to_text_hash_proc[RW]
matrix[RW]

Public Class Methods

new(model_name, id_to_text_hash_proc) click to toggle source
# File lib/statlysis/similar.rb, line 9
# Prepares a similarity job for one model.
#
# model_name           - used as the last segment of the stat table name.
# id_to_text_hash_proc - callable stored for later use; `process` invokes it
#                        to obtain the id => text mapping.
def initialize(model_name, id_to_text_hash_proc)
  # Initialize data.
  cron.id_to_text_hash_proc = id_to_text_hash_proc

  # Initialize the table and model. Nil segments (e.g. an unset prefix) are
  # dropped before the remaining parts are joined with underscores.
  table_name_parts = [Statlysis.tablename_default_pre, "similar", model_name]
  cron.stat_table_name = table_name_parts.compact.join("_")
  Utils.setup_pattern_table_and_model cron.stat_table_name

  # Start with an empty id => similar-ids result cache.
  cron.id_to_similar_ids = {}
  cron
end

Public Instance Methods

process() click to toggle source
# File lib/statlysis/similar.rb, line 21
    # Computes, for every document returned by cron.id_to_text_hash_proc, the
    # ids of the most similar documents and stores them through
    # cron.stat_model.
    #
    # Documents are handled in batches of 1200. Similarity is computed only
    # among documents of the same batch, so documents that fall into different
    # batches are never compared with each other.
    #
    # For each document, up to 100 ids (highest score first) are kept in
    # cron.id_to_similar_ids and written as JSON to the :result field of the
    # record whose :pattern is the document id.
    #
    # Returns true.
    def process
      logger.info "SimilarProcess #{cron.stat_model} at #{DateTime.now}"
      # Required here rather than at file load, so the gems are only needed
      # when a similarity run actually happens.
      require 'gsl'
      require 'tf-idf-similarity'

      # Fetch the documents.
      logger.info "开始取出 cron.id_to_text_hash_proc"
      @id_to_text_hash = cron.id_to_text_hash_proc.call

      logger.info "开始把@id_to_text_hash转化为数组"
      id_text_pairs = @id_to_text_hash.to_a

      logger.info "开始把as slice为1200每次"
      id_text_pairs.each_slice(1200) do |batch|
        logger.info "开始跑 #{batch.size}  个条目的相似性"
        cron.corpus = TfIdfSimilarity::Collection.new
        batch.each do |id, text|
          cron.corpus << TfIdfSimilarity::Document.new(text, :id => id)
        end

        cron.matrix = cron.corpus.similarity_matrix
        matrix_array = cron.matrix.to_a

        # Position in the matrix => real item id.
        documents = cron.corpus.documents
        item_ids = documents.map(&:id)

        # For every document, rank the other items of the batch by descending
        # similarity score and remember the best ones.
        documents.each_with_index do |document, row_idx|
          item_id_to_score = {}
          matrix_array[row_idx].each_with_index do |num, col_idx|
            # NaN scores are treated as "no similarity".
            item_id_to_score[item_ids[col_idx]] = (num.nan? ? 0.0 : num)
          end
          # A document is not a candidate for its own similar list.
          item_id_to_score.delete document.id

          logger.info "对比文档:"
          logger.info "#{document.id} # #{summary(document.id)}"
          logger.info "相关文档:"
          ranked = item_id_to_score.sort { |a1, b1| b1[1] <=> a1[1] }
          ranked[0..9].each do |item_id, score|
            logger.info "#{score} #  #{summary(item_id)}"
          end
          cron.id_to_similar_ids[document.id] = ranked[0..99].map(&:first)
          # NOTE(review): called without a message, as before; presumably meant
          # as a separator line - confirm how the logger renders it.
          logger.info
        end

        # Save results to the database. Only the ids of the current batch are
        # written: cron.id_to_similar_ids keeps growing across batches, and
        # iterating all of it here would rewrite every earlier batch again on
        # each pass.
        item_ids.uniq.each do |id|
          s = cron.stat_model.find_or_create(:pattern => id)
          s.update :result => cron.id_to_similar_ids[id].to_json
        end
      end

      true
    end
Also aliased as: run
run()
Alias for: process
summary(doc_id) click to toggle source
# File lib/statlysis/similar.rb, line 84
# Returns a short single-line preview of a document's text: the first 42
# characters of @id_to_text_hash[doc_id] with newlines removed.
#
# doc_id - key into @id_to_text_hash (populated by `process`).
def summary(doc_id)
  text = @id_to_text_hash[doc_id]
  # mb_chars is not core Ruby; presumably ActiveSupport's multibyte-aware
  # string wrapper - verify it is loaded wherever this runs.
  preview = text.mb_chars[0..41]
  preview.split("\n").join
end