class SimilarityTree::SimilarityMatrix

Table of the diff/similarity scores between different text documents

Public Class Methods

new(sources, options = {}) click to toggle source

Initialize a matrix for a set of documents

# File lib/similarity_tree/similarity_matrix.rb, line 11
def initialize(sources, options = {})
  @sources = sources
  @config = default_options.merge(options)

  @id = -1
  @source_index = Hash.new
  @matrix = nil
end

Public Instance Methods

build_tree(root_id, score_threshold = 0) click to toggle source
# File lib/similarity_tree/similarity_matrix.rb, line 32
def build_tree(root_id, score_threshold = 0)
  # build the similarity tree
  @matrix = self.calculate if @matrix.nil?
  tree = SimilarityTree.new(root_id, @matrix, score_threshold).build

  # populate the nodes with the sources for the compatibility matrix
  tree.each_node {|n| n.content = @source_index[n.id] }
  tree
end
calculate() click to toggle source

calculate and output results as an array of arrays; optional block is run each comparison to help with any progress bars

# File lib/similarity_tree/similarity_matrix.rb, line 22
def calculate
  if @config[:calculation_method] == :tf_idf
    @matrix = calculate_with_tf_idf
  elsif @config[:calculation_method] == :diff
    @matrix = calculate_with_diff
  else
    raise "Unknown calculation type"
  end
end

Private Instance Methods

calculate_with_diff() click to toggle source

Create a similarity matrix, using diff as the similarity measure, based on the difference of WORDS (not characters) (only counts insertions and deletions, not substitution and transposition).

# File lib/similarity_tree/similarity_matrix.rb, line 82
def calculate_with_diff
  progress_bar = nil
  if @config[:show_progress]
    progress_bar = ProgressBar.create format: '%a |%B| %p%% %e', length: 80, smoothing: 0.5,
                                     total: @sources.length*(@sources.length-1)/2
  end

  matrix = {}
  @sources.each_with_index do |a,i|
    a_id = id_of(a)
    a_text = text_of(a)

    @sources[i + 1..-1].each do |b|
      b_id   = id_of(b)
      b_text = text_of(b)

      stats = FastHtmlDiff::DiffBuilder.new(a_text, b_text).statistics

      # http://en.wikipedia.org/wiki/Dice%27s_coefficient
      total_count = 2 * stats[:matches][:words] + stats[:insertions][:words] + stats[:deletions][:words]
      similarity = 2 * stats[:matches][:words] / total_count.to_f

      # Build the similarity matrix,
      matrix[a_id] ||= {a_id => 1}
      matrix[a_id][b_id] = similarity
      matrix[b_id] ||= {b_id => 1}
      matrix[b_id][a_id] = similarity

      progress_bar.increment unless progress_bar.nil?
    end
  end
  matrix
end
calculate_with_tf_idf() click to toggle source
# File lib/similarity_tree/similarity_matrix.rb, line 52
def calculate_with_tf_idf
  progress_bar = nil
  if @config[:show_progress]
    progress_bar = ProgressBar.create format: '%a |%B| %p%% %e', length: 80, smoothing: 0.5,
                                     total: @sources.length
  end

  # iterate through the input texts and build the tf_idf corpus
  corpus = []
  ids = @sources.map do |source|
    corpus << TfIdfSimilarity::Document.new(text_of(source))
    progress_bar.increment unless progress_bar.nil?
    id_of(source)
  end
  model = TfIdfSimilarity::TfIdfModel.new(corpus, function: :tf_idf, library: :gsl)
  similarity_matrix = model.similarity_matrix

  # compile the results into an ordinary m*n array
  matrix = {}
  ids.each_with_index do |a,i|
    matrix[a] = {}
    ids.each_with_index do |b,j|
      matrix[a][b] = similarity_matrix[i, j].round(6)
    end
  end
  matrix
end
default_options() click to toggle source
# File lib/similarity_tree/similarity_matrix.rb, line 43
def default_options
  {
      id_func: nil,
      content_func: nil,
      calculation_method: :tf_idf,
      show_progress: false
  }
end
id_of(source) click to toggle source
# File lib/similarity_tree/similarity_matrix.rb, line 116
def id_of(source)
  id = nil
  if !@config[:id_func].nil?
    id = source.send @config[:id_func].to_s
  else
    if is_a_filename? source
      id = File.basename(source)
    else
      id = @sources.find_index(source)
    end
  end

  # maintain an index of id => source
  @source_index[id] = source if @source_index[id].nil?
  id
end
is_a_filename?(filename) click to toggle source

quick and dirty check on whether a string is a filename based on the string length and whether the file exists

# File lib/similarity_tree/similarity_matrix.rb, line 144
def is_a_filename?(filename)
  (filename.length < 512) && File.exists?(filename)
end
text_of(source) click to toggle source
# File lib/similarity_tree/similarity_matrix.rb, line 133
def text_of(source)
  if !@config[:content_func].nil?
    txt = source.send @config[:content_func].to_s
  else
    txt = source
  end
  txt = File.read(txt) if is_a_filename?(txt)
  txt
end