class HTMLDOMDiff::Differ

Public Instance Methods

diff(ldoc, rdoc) click to toggle source
# File lib/html-dom-diff/differ.rb, line 15
# Matches the nodes of two trees against each other and returns the builder
# that collected the matches.
#
# The passes run in a fixed order; each later pass relies on the matches
# recorded by the earlier ones.
#
# ldoc - root node of the left-hand tree.
# rdoc - root node of the right-hand tree.
#
# Returns the DeltaTreeBuilder instance created by #reset.
def diff(ldoc, rdoc)
  # Start from a clean slate: new builder, signature tables and queue.
  reset(ldoc, rdoc)

  # Pair up elements carrying the same id attribute.
  match_by_ids(ldoc, rdoc)

  # Compute weights, depths and subtree signatures for both sides.
  prep_with(@lsignatures, ldoc)
  prep_with(@rsignatures, rdoc)

  # Seed matches for nodes whose name is unique among their siblings.
  perform_initial_top_down_matching([ldoc], [rdoc])

  # Signature-based matching, driven by the queue seeded with the right root.
  @matchqueue.push(rdoc)
  perform_initial_matching

  # Propagate matches through the left tree in both directions.
  match_bottom_up(ldoc)
  match_top_down(ldoc)

  @builder
end
diff_fragments(left, right) click to toggle source
# File lib/html-dom-diff/differ.rb, line 11
# Diffs two HTML fragment strings.
#
# Each string is parsed with #parse_fragments and the node returned by
# +child+ on the parsed fragment is handed to #diff.
#
# left  - HTML fragment source for the left-hand side.
# right - HTML fragment source for the right-hand side.
#
# Returns whatever #diff returns.
def diff_fragments(left, right)
  left_node  = parse_fragments(left).child
  right_node = parse_fragments(right).child
  diff(left_node, right_node)
end
diff_strings(left, right) click to toggle source
# File lib/html-dom-diff/differ.rb, line 7
# Diffs two complete HTML document strings.
#
# Each string is parsed with #parse and the +root+ of the resulting
# document is handed to #diff.
#
# left  - HTML source for the left-hand side.
# right - HTML source for the right-hand side.
#
# Returns whatever #diff returns.
def diff_strings(left, right)
  left_root  = parse(left).root
  right_root = parse(right).root
  diff(left_root, right_root)
end

Private Instance Methods

find_best_match(element) click to toggle source
# File lib/html-dom-diff/differ.rb, line 134
# Looks for the left-hand node that should be paired with +element+.
#
# Candidates are the not-yet-matched left nodes whose signature equals the
# signature stored for +element+ in @rsignatures.
#
# * no candidate            -> nil
# * exactly one candidate   -> that candidate
# * several candidates      -> the single one whose parent is already
#                              matched to element's parent, or nil when
#                              zero or more than one qualify
#
# element - a node of the right-hand tree.
#
# Returns a left-hand node or nil.
def find_best_match(element)
  wanted = @rsignatures[element]

  candidates = @lsignatures.each_with_object([]) do |(node, signature), found|
    found << node if !left_matched?(node) && signature == wanted
  end

  # Zero candidates yields nil here, one candidate yields itself.
  return candidates.first if candidates.size <= 1

  # Ambiguous: only accept a candidate if the parents disambiguate uniquely.
  by_parent = candidates.select { |node| left_matches?(node.parent, element.parent) }
  by_parent.first if by_parent.size == 1
end
hash_for(array) click to toggle source
# File lib/html-dom-diff/differ.rb, line 100
# Builds a signature from a list of parts.
#
# The parts are joined with ";" and the joined string is hashed with
# SHA-256.
#
# array - Array of signature parts (anything Array#join can stringify).
#
# Returns the raw (binary) SHA-256 digest String.
def hash_for(array)
  joined = array.join(";")
  Digest::SHA256.digest(joined)
end
match_all_children(left, right) click to toggle source
# File lib/html-dom-diff/differ.rb, line 159
# Records +left+ and +right+ as a match and then does the same for their
# descendants, pairing children strictly by position.
#
# NOTE(review): pairing is positional, so this presumably relies on the
# caller passing subtrees of identical shape — confirm against
# #find_best_match.
#
# left  - node of the left-hand tree.
# right - node of the right-hand tree.
def match_all_children(left, right)
  record_matching(left, right)

  right_children = right.children
  left.children.each_with_index do |left_child, position|
    match_all_children(left_child, right_children[position])
  end
end
match_bottom_up(element) click to toggle source
# File lib/html-dom-diff/differ.rb, line 174
# Post-order pass over the left tree: children are handled before the
# element itself.
#
# An element that is still unmatched, but whose parent is matched, is
# paired with the first child of the parent's counterpart that is not yet
# matched and has the same name.
#
# element - node of the left-hand tree.
def match_bottom_up(element)
  element.children.each { |child| match_bottom_up(child) }

  return if left_matched?(element)
  # Nodes without a #parent method are skipped.
  return unless element.respond_to?(:parent) && left_matched?(element.parent)

  partner = left_match(element.parent).children.find do |candidate|
    !right_matched?(candidate) && candidate.name == element.name
  end
  record_matching(element, partner) if partner
end
match_by_ids(ldoc, rdoc) click to toggle source
# File lib/html-dom-diff/differ.rb, line 57
# Pairs up elements of both documents that carry the same id attribute.
#
# Left elements are visited in the order returned by css("[id]"); each one
# is matched with the first right element of the same id that has not been
# used yet, so duplicated ids are paired off in document order and any
# surplus stays unmatched.
#
# The right-hand elements are grouped by id once up front, which avoids
# rescanning the whole right-hand list for every left element.
#
# ldoc - left-hand tree; must respond to css.
# rdoc - right-hand tree; must respond to css.
def match_by_ids(ldoc, rdoc)
  # id => right-hand elements with that id, still in document order.
  unused_by_id = rdoc.css("[id]").group_by { |element| element[:id] }

  ldoc.css("[id]").each do |element|
    queue = unused_by_id[element[:id]]
    next unless queue

    # shift consumes the partner so it cannot be matched twice.
    partner = queue.shift
    record_matching(element, partner) if partner
  end
end
match_parents(left, right) click to toggle source
# File lib/html-dom-diff/differ.rb, line 166
# Matches the parents of an already matched pair, provided neither parent
# is matched yet and both parents have the same name.
#
# left  - node of the left-hand tree.
# right - node of the right-hand tree.
def match_parents(left, right)
  # TODO implement multi-ancestor matching
  left_parent  = left.parent
  right_parent = right.parent
  return if left_matched?(left_parent) || right_matched?(right_parent)

  record_matching(left_parent, right_parent) if left_parent.name == right_parent.name
end
match_top_down(element) click to toggle source
# File lib/html-dom-diff/differ.rb, line 186
# Pre-order pass over the left tree.
#
# For an unmatched element, the parents of its matched children's
# counterparts are collected (duplicates and already matched parents
# removed). If exactly one such parent remains and it has the same name as
# the element, the two are recorded as a match. The element's children are
# then visited regardless of the outcome.
#
# element - node of the left-hand tree.
def match_top_down(element)
  unless left_matched?(element)
    parents = element.children
                     .select { |child| left_matched?(child) }
                     .map { |child| left_match(child).parent }
                     .uniq
                     .reject { |candidate| right_matched?(candidate) }

    sole = parents.first
    record_matching(element, sole) if parents.size == 1 && sole.name == element.name
  end

  element.children.each { |child| match_top_down(child) }
end
parse(string) click to toggle source
# File lib/html-dom-diff/differ.rb, line 41
# Parses a complete HTML document.
#
# The parser runs with Nokogiri's default HTML options plus NOBLANKS. The
# two flag sets are combined with a bitwise OR: ANDing them yields 0,
# because NOBLANKS is not one of the default HTML flags, and that would
# switch every option off instead of adding one.
#
# string - HTML source.
#
# Returns the value of Nokogiri::HTML (the parsed document).
def parse(string)
  options = Nokogiri::XML::ParseOptions::DEFAULT_HTML |
            Nokogiri::XML::ParseOptions::NOBLANKS
  Nokogiri::HTML(string, nil, nil, options)
end
parse_fragments(string) click to toggle source
# File lib/html-dom-diff/differ.rb, line 45
# Parses an HTML fragment (markup without a surrounding document).
#
# string - HTML fragment source.
#
# Returns the value of Nokogiri::HTML::DocumentFragment.parse.
def parse_fragments(string)
  fragment_parser = Nokogiri::HTML::DocumentFragment
  fragment_parser.parse(string)
end
perform_initial_matching() click to toggle source
# File lib/html-dom-diff/differ.rb, line 122
# Drains @matchqueue, matching whole subtrees where possible.
#
# For every popped right-hand node that is not matched yet, a partner is
# looked up with #find_best_match. When one is found, both subtrees and
# their parents are matched; otherwise (or when the node was already
# matched) the node's children are queued so they get their own attempt.
def perform_initial_matching
  until @matchqueue.size.zero?
    element = @matchqueue.pop
    # Already matched nodes are never looked up again.
    match = right_matched?(element) ? nil : find_best_match(element)

    if match
      match_all_children(match, element)
      match_parents(match, element)
    else
      element.children.each { |child| @matchqueue.push(child) }
    end
  end
end
perform_initial_top_down_matching(lnodes, rnodes) click to toggle source
# File lib/html-dom-diff/differ.rb, line 108
# Matches sibling lists level by level, starting from the given nodes.
#
# Text nodes are ignored. A left node is matched only when its name occurs
# exactly once among the left siblings and exactly once among the right
# siblings; the pass then recurses into the children of the matched pair.
#
# lnodes - sibling nodes of the left-hand tree.
# rnodes - sibling nodes of the right-hand tree.
def perform_initial_top_down_matching(lnodes, rnodes)
  left_elements = lnodes.reject(&:text?)
  left_by_name  = left_elements.group_by(&:name)
  right_by_name = rnodes.reject(&:text?).group_by(&:name)

  left_elements.each do |lnode|
    # The name must be unambiguous on both sides.
    next unless left_by_name[lnode.name].size == 1

    partners = right_by_name.fetch(lnode.name, [])
    next unless partners.size == 1

    record_matching(lnode, partners.first)
    perform_initial_top_down_matching(lnode.children, partners.first.children)
  end
end
prep_with(sig_hash, element, level=0) click to toggle source
# File lib/html-dom-diff/differ.rb, line 68
# Walks a subtree and precomputes, for every node in it:
#
# * its weight (own weight plus the weights of all descendants), reported
#   to the builder via add_weight,
# * its signature (hash of its own signature part followed by the
#   signatures of its children), stored in +sig_hash+,
# * its depth relative to the starting node, stored in @depths.
#
# sig_hash - Hash that receives node => signature entries.
# element  - root of the subtree to process.
# level    - depth assigned to +element+ (children get level + 1).
#
# Returns a two-element Array: [subtree weight, subtree signature].
def prep_with(sig_hash, element, level=0)
  own_weight = weight_for(element)
  own_part   = signature_part_for(element)

  # Children first; each result is a [weight, signature] pair.
  child_results = element.children.map do |child|
    prep_with(sig_hash, child, level + 1)
  end

  total_weight = child_results.inject(own_weight) do |sum, (child_weight, _signature)|
    sum + child_weight
  end
  @builder.add_weight(element, total_weight)

  signature = hash_for([own_part] + child_results.map(&:last))
  sig_hash[element] = signature
  @depths[element]  = level

  [total_weight, signature]
end
record_matching(left, right) click to toggle source
# File lib/html-dom-diff/differ.rb, line 104
# Registers +left+ and +right+ as a matched pair with the builder.
#
# left  - node of the left-hand tree.
# right - node of the right-hand tree.
#
# Returns whatever the builder's match method returns.
def record_matching(left, right)
  builder = @builder
  builder.match(left, right)
end
reset(ldoc, rdoc) click to toggle source
# File lib/html-dom-diff/differ.rb, line 49
# Discards all state from a previous run and prepares a new one.
#
# Creates a new builder for the two trees, empties the depth and signature
# tables and creates a new match queue. The queue's comparison block
# yields true when the first node's builder weight is greater than the
# second's.
#
# ldoc - root node of the left-hand tree.
# rdoc - root node of the right-hand tree.
def reset(ldoc, rdoc)
  @builder = DeltaTreeBuilder.new(ldoc, rdoc)
  @lsignatures, @rsignatures, @depths = {}, {}, {}
  # The block reads @builder lazily, i.e. the builder current at compare time.
  @matchqueue = PQueue.new do |a, b|
    @builder.weight(a) > @builder.weight(b)
  end
end
signature_part_for(element) click to toggle source
# File lib/html-dom-diff/differ.rb, line 92
# Returns the piece of information a single node contributes to its
# subtree signature: the text content for text and CDATA nodes, the node
# name for everything else.
#
# NOTE(review): a text node whose content equals a tag name (e.g. "br")
# yields the same part as that element — confirm this cannot cause
# unwanted signature matches.
#
# element - node to describe.
#
# Returns a String.
def signature_part_for(element)
  textual = element.text? || element.cdata?
  textual ? element.text : element.name
end
weight_for(element) click to toggle source
# File lib/html-dom-diff/differ.rb, line 84
# Returns the weight a single node contributes to its subtree.
#
# Elements weigh 1. Text and CDATA nodes weigh 1 plus the natural
# logarithm of their text length, so longer text counts more.
#
# Empty text is treated like an element (weight 1): Math.log(0) is
# -Infinity, and since #prep_with sums the weights up the tree, a single
# empty text or CDATA node would otherwise turn the weight of every
# ancestor into -Infinity.
#
# element - node to weigh.
#
# Returns a Numeric that is always >= 1.
def weight_for(element)
  return 1 unless element.text? || element.cdata?

  length = element.text.size
  length.zero? ? 1 : 1 + Math.log(length)
end