class FastHtmlDiff::DiffBuilder
Public Class Methods
new(html_str_a,html_str_b,config={})
click to toggle source
# File lib/fast_html_diff.rb, line 6 def initialize(html_str_a,html_str_b,config={}) # merge specified config with defaults @config = default_config.merge(config) if config[:tokenizer_regexp].nil? if @config[:ignore_punctuation] @config[:tokenizer_regexp] = %r{([^A-Za-z0-9]+)} else @config[:tokenizer_regexp] = %r{(\s+)} end end @word_list = {} @insertions = [] @deletions = [] @matches = [] @split_nodes = Hash.new @insertion_nodes = Hash.new # parse, tokenize and index the input documents @a = Nokogiri::HTML(html_str_a) @b = Nokogiri::HTML(html_str_b) if @config[:simplify_html] simplify_html(@a) simplify_html(@b) end index_document(@a, :a) index_document(@b, :b) # find the insertions and deletions diff_words end
Public Instance Methods
build()
click to toggle source
build output HTML
# File lib/fast_html_diff.rb, line 39 def build # update doc_a with tags for the insertions and deletions update_dom @a.to_html end
statistics()
click to toggle source
output statistics on insertions and deletions
# File lib/fast_html_diff.rb, line 46 def statistics result = { insertions: { segments: 0, words: 0, chars: 0 }, deletions: { segments: 0, words: 0, chars: 0 }, matches: { segments: 0, words: 0, chars: 0} } @insertions.each do |i| result[:insertions][:segments] += 1 result[:insertions][:words] += i[:b_end] - i[:b_start] + 1 result[:insertions][:chars] += @word_list[:b][i[:b_end]][:end_pos] - @word_list[:b][i[:b_start]][:start_pos] end @deletions.each do |i| result[:deletions][:segments] += 1 result[:deletions][:words] += i[:a_end] - i[:a_start] + 1 result[:deletions][:chars] += @word_list[:a][i[:a_end]][:end_pos] - @word_list[:a][i[:a_start]][:start_pos] end @matches.each do |i| result[:matches][:segments] += 1 result[:matches][:words] += i[:a_end] - i[:a_start] + 1 result[:matches][:chars] += @word_list[:a][i[:a_end]][:end_pos] - @word_list[:a][i[:a_start]][:start_pos] end result end
Private Instance Methods
default_config()
click to toggle source
# File lib/fast_html_diff.rb, line 465 def default_config { ignore_punctuation: true, case_insensitive: true, tokenizer_regexp: %r{([^A-Za-z0-9]+)}, # overrides any ignore_punctuation setting diff_cmd: 'diff', try_hard: false, simplify_html: false, simplified_html_tags: ['html','body','p','strong','em','ul','ol','li'] } end
diff_words()
click to toggle source
# File lib/fast_html_diff.rb, line 106 def diff_words # run diff on the word lists, using it as a quick, natively-run lcs algorithm diff_result = nil begin file_a = Tempfile.new('fast_html_diff_a') file_a.write @word_list[:a].map{|w| w[:index_word]}.join("\n") + "\n" file_a.close file_b = Tempfile.new('fast_html_diff_b') file_b.write @word_list[:b].map{|w| w[:index_word]}.join("\n") + "\n" file_b.close diff_args = "-U 100000" + (@config[:try_hard] ? ' -d' : '') diff_result = `#{@config[:diff_cmd]} #{diff_args} #{file_a.path} #{file_b.path}` ensure file_a.close file_a.unlink file_b.close file_b.unlink end # remap output back to the indexed word list doca_i = 0 docb_i = 0 prev_operation = :none diff_result.each_line do |word| next if word.match /^(---|\+\+\+|@@|\\\\)/ # skip info lines case word[0] when '+' if prev_operation == :insertion @insertions.last[:b_end] = docb_i else if prev_operation == :deletion @deletions.last[:next_operation] = :insertion end @insertions << { a_position: doca_i-1, #insert before the current word b_start: docb_i, b_end: docb_i, prev_operation: prev_operation } prev_operation = :insertion end docb_i += 1 when '-' if prev_operation == :deletion @deletions.last[:a_end] = doca_i else if prev_operation == :insertion @insertions.last[:next_operation] = :insertion end @deletions << { a_start: doca_i, a_end: doca_i, prev_operation: prev_operation } prev_operation = :deletion end doca_i += 1 else if prev_operation == :match @matches.last[:a_end] = doca_i else if prev_operation == :insertion @insertions.last[:next_operation] = :match elsif prev_operation == :deletion @deletions.last[:next_operation] = :match end @matches << { a_start: doca_i, a_end: doca_i } end prev_operation = :match doca_i += 1 docb_i += 1 end # if an additon is one past the end, keep the marker at the end doca_i = (@word_list[:a].length-1) if doca_i >= @word_list[:a].length docb_i = (@word_list[:b].length-1) if docb_i >= @word_list[:b].length end end
index_document(doc, doc_name)
click to toggle source
index the words in the document
# File lib/fast_html_diff.rb, line 73 def index_document(doc, doc_name) @word_list[doc_name] = Array.new # index each word of each text node preceding_chars = "" doc.xpath('//text()').each do |text_node| position = 0 is_a_word = true text_node.content.split(@config[:tokenizer_regexp]).each_with_index do |word,i| # check whether we're starting with a word or a split itself if (i == 0) || (i == 1) is_a_word = !word.empty? && !word.match(@config[:tokenizer_regexp]) else is_a_word = !is_a_word end if !is_a_word preceding_chars = word unless word.empty? else @word_list[doc_name] << { node: text_node, index_word: (@config[:case_insensitive] ? word.downcase : word), start_pos: position, end_pos: position + word.length, preceding_chars: preceding_chars } preceding_chars = "" end position += word.length end end end
modify_each_node_between(node, start_char, end_char) { |node_in_set| ... }
click to toggle source
splits nodes (if necessary) between the specified character positions and runs the block for each node between the start and end
# File lib/fast_html_diff.rb, line 347 def modify_each_node_between(node, start_char, end_char) prev_node_set = nil if @split_nodes[node].nil? prev_node_set = [node] else prev_node_set = @split_nodes[node] end # skip over inserted nodes, as they're not included in the character # counts (and there's no further operations on them) prev_node_set.delete_if {|n| @insertion_nodes[n] } new_node_set = [] inside_nodes = [] insertion_queue = Hash.new cur_char = 0 start_trimmed = false end_trimmed = false prev_node_set.each do |n| cur_node = n new_node_set << cur_node node_end_char = cur_char + cur_node.content.length # split node at the start_char unless start_trimmed if start_char > node_end_char cur_char = node_end_char next else if start_char == cur_char start_trimmed = true else # start_char beteen cur_char and node_end_char after_node = cur_node.dup cur_node.content = after_node.content[0..(start_char-cur_char-1)] after_node.content = after_node.content[(start_char-cur_char)..-1] insertion_queue[after_node] = cur_node # don't actually add_next_sibling yet, as Nokogiri will merge them start_trimmed = true cur_char += cur_node.content.length cur_node = after_node new_node_set << cur_node end end end # split node at the end_char unless end_trimmed || !start_trimmed inside_nodes << cur_node if end_char > node_end_char cur_char = node_end_char next elsif end_char == node_end_char end_trimmed = true cur_char = node_end_char next else # end_char < node_end_char after_node = cur_node.dup if (end_char-cur_char) > 0 cur_node.content = after_node.content[0..(end_char-cur_char-1)] after_node.content = after_node.content[(end_char-cur_char)..-1] else cur_node.content = "" end insertion_queue[after_node] = cur_node end_trimmed = true new_node_set << after_node end end cur_char = node_end_char end new_node_set.map! do |node_in_set| insert_after = insertion_queue[node_in_set] if inside_nodes.include?(node_in_set) && block_given? modified_node = nil if !insert_after.nil? modified_node = yield node_in_set insert_after.add_next_sibling(modified_node) else node_parent = node_in_set.parent node_position = node_parent.children.index(node_in_set) modified_node = yield node_in_set # if the actual node has changed, need to rehook to parent (assume the origial has been removed) if modified_node != node_in_set if node_parent.children.length > node_position node_parent.children[node_position].add_previous_sibling(modified_node) else node_parent.add_child(modified_node) end end end # also need to update the insertion queue if a node referenced by # another has changed if modified_node != node_in_set insertion_queue.each do |floating_node, target_node| if target_node == node_in_set insertion_queue[floating_node] = modified_node end end end modified_node else insert_after.add_next_sibling(node_in_set) unless insert_after.nil? node_in_set end end @split_nodes[node] = new_node_set end
prepare_insertion(insertion)
click to toggle source
build the exact DOM tree for an insertion
# File lib/fast_html_diff.rb, line 272 def prepare_insertion(insertion) start_node = @word_list[:b][insertion[:b_start]][:node] start_char = @word_list[:b][insertion[:b_start]][:start_pos] end_node = @word_list[:b][insertion[:b_end]][:node] end_char = @word_list[:b][insertion[:b_end]][:end_pos] # find the closest common ancestor of the start and end, and clone this portion cca = (start_node.ancestors & end_node.ancestors).first cca_clone = cca.dup # find the start node in the clone by retracing the path path_to_cca = [] target_node = start_node until target_node == cca path_to_cca.unshift target_node.parent.children.index(target_node) target_node = target_node.parent end start_node = cca_clone path_to_cca.each {|i| start_node = start_node.children[i]} # find the end node in the clone by retracing the path path_to_cca = [] target_node = end_node until target_node == cca path_to_cca.unshift target_node.parent.children.index(target_node) target_node = target_node.parent end end_node = cca_clone path_to_cca.each {|i| end_node = end_node.children[i]} # trim away NODES up the tree that fall to the left of the start # or to the right of the end left_node = start_node while left_node != cca_clone siblings = left_node.parent.children self_index = siblings.index(left_node) unless self_index == 0 left_of_self = siblings.slice(0..(self_index-1)) left_of_self.each {|n| n.remove} unless left_of_self.nil? end left_node = left_node.parent end right_node = end_node while right_node != cca_clone siblings = right_node.parent.children self_index = siblings.index(right_node) right_of_self = siblings.slice((self_index+1)..-1) right_of_self.each {|n| n.remove} unless right_of_self.nil? right_node = right_node.parent end # trim away the TEXT that falls to the left of the start or to the right of # the end; also include the preceding characters to the insertion end_node.content = end_node.content[0..(end_char-1)] start_node.content = start_node.content[start_char..-1] # unless there's a deletion immediately before, include the preceding chars in the insertion unless (insertion[:prev_operation] == :deletion) || (insertion[:b_start] <= 0) start_node.content = @word_list[:b][insertion[:b_start]][:preceding_chars] + start_node.content end #unless (insertion[:next_operation] == :deletion) || (insertion[:b_end] >= @word_list[:b].length-1) # end_node.content += @word_list[:b][insertion[:b_end]+1][:preceding_chars] #end insertion_data = { new_nodes: cca_clone, insertion_point_node: @word_list[:a][insertion[:a_position]][:node], insertion_char_index: @word_list[:a][insertion[:a_position]][:end_pos] } insertion.merge insertion_data end
simplify_html(html)
click to toggle source
# File lib/fast_html_diff.rb, line 459 def simplify_html(html) (html.css('*') - html.css(@config[:simplified_html_tags].join(','))).each do |node| node.replace(node.children) end end
update_dom()
click to toggle source
mark insertions and deletions in doc a
# File lib/fast_html_diff.rb, line 195 def update_dom # prepare the nodes to insert before making any modifications @insertions.map! do |insertion| prepare_insertion(insertion) end # perform the insertions @insertions.each do |insertion| # if the insertion point's parent is the same type as the cca, merge the children # together; otherwise, insert the cca wholesale # TODO: handle case where a_position is -1 (insertion before start of document) # add whole nodes as-is and wrap partial nodes in a span additional_node = nil touches_node_start = @word_list[:b][insertion[:b_start]-1].nil? || (@word_list[:b][insertion[:b_start]-1][:node] != @word_list[:b][insertion[:b_start]][:node]) touches_node_end = @word_list[:b][insertion[:b_end]+1].nil? || (@word_list[:b][insertion[:b_end]+1][:node] != @word_list[:b][insertion[:b_end]][:node]) if touches_node_start && touches_node_end additional_node = insertion[:new_nodes] # bump the end char past whitespace/punctuation unless @word_list[:b][insertion[:b_end]+1].nil? insertion[:insertion_char_index] += @word_list[:b][insertion[:b_end]+1][:preceding_chars].length end else additional_node = Nokogiri::XML::Node.new('span', @a) if insertion[:new_nodes].children.length > 0 insertion[:new_nodes].children.each {|c| additional_node.add_child(c) } else additional_node.add_child(insertion[:new_nodes]) end end @insertion_nodes[additional_node] = true # insertions need to wrap around the text nodes additional_node.search('text()').each do |text_node| parent = text_node.parent wrapper = Nokogiri::XML::Node.new('ins', @a) wrapper.add_child(text_node) parent.add_child(wrapper) end # split the insertion point node (if necessary) and insert the new nodes modify_each_node_between(insertion[:insertion_point_node], insertion[:insertion_char_index], insertion[:insertion_char_index]) do |n| additional_node end end @deletions.each do |deletion| start_node = @word_list[:a][deletion[:a_start]][:node] start_char = @word_list[:a][deletion[:a_start]][:start_pos] end_node = @word_list[:a][deletion[:a_end]][:node] end_char = @word_list[:a][deletion[:a_end]][:end_pos] # wrap deletions in del tags just above each text node (so as to preserve # the original formatting) prev_node = cur_node = nil for word_i in deletion[:a_start]..deletion[:a_end] cur_node = @word_list[:a][word_i][:node] if cur_node != prev_node first = (cur_node == start_node) ? start_char : 0 last = (cur_node == end_node) ? end_char : cur_node.content.length modify_each_node_between(cur_node, first, last) do |n| wrapper = Nokogiri::XML::Node.new('del', @a) wrapper.add_child(n) wrapper end end prev_node = cur_node end end end