class PatienceDiff::SequenceMatcher
Matches indexed data (generally text) using the Patience diff algorithm.
Constants
- Card
Attributes
context[RW]
Public Class Methods
new(opts = {})
click to toggle source
Options:
* :context: number of lines of context to use when grouping
# File lib/patience_diff/sequence_matcher.rb, line 10 def initialize(opts = {}) @context = opts[:context] || 3 end
Public Instance Methods
diff_opcodes(a, b)
click to toggle source
Generate a diff of a and b, and return an array of opcodes describing that diff. Each opcode represents a range in a and b that is either equal, only in a, or only in b. Opcodes are 5-tuples, in the format:
0: code A symbol indicating the diff operation. Can be :equal, :delete, or :insert. 1: a_start Index in a where the range begins 2: a_end Index in a where the range ends. 3: b_start Index in b where the range begins 4: b_end Index in b where the range ends.
For :equal, (a_end - a_start) == (b_end - b_start). For :delete, a_start == a_end. For :insert, b_start == b_end.
# File lib/patience_diff/sequence_matcher.rb, line 80 def diff_opcodes(a, b) sequences = collapse_matches(match(a, b)) sequences << [a.length, b.length, 0] a_pos = b_pos = 0 opcodes = [] sequences.each do |(i, j, len)| if a_pos < i opcodes << [:delete, a_pos, i-1, b_pos, b_pos] end if b_pos < j opcodes << [:insert, a_pos, a_pos, b_pos, j-1] end if len > 0 opcodes << [:equal, i, i+len-1, j, j+len-1] end a_pos = i+len b_pos = j+len end opcodes end
grouped_opcodes(a, b)
click to toggle source
Generate a diff of a and b using diff_opcodes
, and split the opcode into groups whenever an :equal range is encountered that is longer than @context * 2. Returns an array of arrays of 5-tuples as described for diff_opcodes
.
# File lib/patience_diff/sequence_matcher.rb, line 17 def grouped_opcodes(a, b) groups = [] last_group = [] diff_opcodes(a, b).each do |opcode| if opcode[0] == :equal if @context.zero? groups << last_group last_group = [] next end code, a_start, a_end, b_start, b_end = *opcode if (a_start.zero? and b_start.zero?) or (a_end == a.length-1 and b_end == b.length-1) threshold = @context else threshold = @context * 2 end if (b_end - b_start + 1) > threshold unless last_group.empty? last_group << [ code, a_start, a_start + @context - 1, b_start, b_start + @context - 1 ] groups << last_group last_group = [] end opcode = [ code, a_end - @context + 1, a_end, b_end - @context + 1, b_end ] end end last_group << opcode end groups << last_group unless last_group.one? and last_group.first[0] == :equal groups end
Private Instance Methods
bisect(piles, target)
click to toggle source
# File lib/patience_diff/sequence_matcher.rb, line 242 def bisect(piles, target) low = 0 high = piles.size - 1 while (low <= high) mid = (low + high)/2 if piles[mid].value < target low = mid + 1 else high = mid - 1 end end low end
collapse_matches(matches)
click to toggle source
# File lib/patience_diff/sequence_matcher.rb, line 158 def collapse_matches(matches) return [] if matches.empty? sequences = [] start_a, start_b = *(matches.first) len = 1 matches[1..-1].each do |(i_a, i_b)| if i_a == start_a + len and i_b == start_b + len len += 1 else sequences << [start_a, start_b, len] start_a = i_a start_b = i_b len = 1 end end sequences << [start_a, start_b, len] sequences end
longest_unique_subsequence(a, b)
click to toggle source
# File lib/patience_diff/sequence_matcher.rb, line 177 def longest_unique_subsequence(a, b) deck = Array.new(b.length) unique_a = {} unique_b = {} a.each_with_index do |val, index| if unique_a.has_key? val unique_a[val] = nil else unique_a[val] = index end end b.each_with_index do |val, index| a_index = unique_a[val] next unless a_index dupe_index = unique_b[val] if dupe_index deck[dupe_index] = nil unique_a.delete(val) else unique_b[val] = index deck[index] = a_index end end card = patience_sort(deck).last result = [] while card result.unshift [card.value, card.index] card = card.previous end result end
match(a, b)
click to toggle source
# File lib/patience_diff/sequence_matcher.rb, line 103 def match(a, b) matches = [] recursively_match(a, b, 0, 0, a.length, b.length) do |match| matches << match end matches end
patience_sort(deck)
click to toggle source
# File lib/patience_diff/sequence_matcher.rb, line 212 def patience_sort(deck) piles = [] pile = 0 deck.each_with_index do |card_value, index| next if card_value.nil? card = Card.new(index, card_value) if piles.any? and piles.last.value < card_value pile = piles.size elsif piles.any? and piles[pile].value < card_value and (pile == piles.size-1 or piles[pile+1].value > card_value) pile += 1 else pile = bisect(piles, card_value) end card.previous = piles[pile-1] if pile > 0 if pile < piles.size #puts "putting card #{card.value} on pile #{pile}" piles[pile] = card else #puts "putting card #{card.value} on new pile" piles << card end end piles end
recursively_match(a, b, a_lo, b_lo, a_hi, b_hi) { |match| ... }
click to toggle source
# File lib/patience_diff/sequence_matcher.rb, line 111 def recursively_match(a, b, a_lo, b_lo, a_hi, b_hi) return if a_lo == a_hi or b_lo == b_hi last_a_pos = a_lo - 1 last_b_pos = b_lo - 1 longest_unique_subsequence(a[a_lo...a_hi], b[b_lo...b_hi]).each do |(a_pos, b_pos)| # recurse betwen unique lines a_pos += a_lo b_pos += b_lo if (last_a_pos+1 != a_pos) or (last_b_pos+1 != b_pos) recursively_match(a, b, last_a_pos+1, last_b_pos+1, a_pos, b_pos) { |match| yield match } end last_a_pos = a_pos last_b_pos = b_pos yield [a_pos, b_pos] end if last_a_pos >= a_lo or last_b_pos >= b_lo # there was at least one match # recurse between last match and end recursively_match(a, b, last_a_pos+1, last_b_pos+1, a_hi, b_hi) { |match| yield match } elsif a[a_lo] == b[b_lo] # no unique lines # diff forward from beginning while a_lo < a_hi and b_lo < b_hi and a[a_lo] == b[b_lo] yield [a_lo, b_lo] a_lo += 1 b_lo += 1 end recursively_match(a, b, a_lo, b_lo, a_hi, b_hi) { |match| yield match } elsif a[a_hi-1] == b[b_hi-1] # no unique lines # diff back from end a_mid = a_hi - 1 b_mid = b_hi - 1 while a_mid > a_lo and b_mid > b_lo and a[a_mid-1] == b[b_mid-1] a_mid -= 1 b_mid -= 1 end recursively_match(a, b, a_lo, b_lo, a_mid, b_mid) { |match| yield match } (0...(a_hi-a_mid)).each do |i| yield [a_mid+i, b_mid+i] end end end