class TextAlignment::LCSMin
It finds minimal lcs and sdiff of the given strings, str1 and str2. It relies on the diff-lcs gem for the computation of lcs table.
Constants
- PLACEHOLDER_CHAR
Attributes
lcs[R]
m1_final[R]
m1_initial[R]
m2_final[R]
m2_initial[R]
sdiff[R]
Public Class Methods
new(str1, str2)
click to toggle source
# File lib/text_alignment/lcs_min.rb, line 18 def initialize (str1, str2) raise ArgumentError, "nil string" if str1.nil? || str2.nil? raise ArgumentError, "empty string" if str1.empty? || str2.empty? # str1 is copied as it is. # str2 is copied with w/s characters replaced with the placeholder characters, # to avoid overfitting to w/s characters during LCS computation. @str1 = str1 @str2 = str2.gsub(/\s/, PLACEHOLDER_CHAR) # find the corresponding minimal range of the two strings r = _find_min_range(0, @str1.length - 1, 0, @str2.length - 1) if r.nil? @sdiff = nil @lcs = 0 return end @m1_initial, @m1_final, @m2_initial, @m2_final = r[:m1_initial], r[:m1_final], r[:m2_initial], r[:m2_final] if @m1_initial.nil? @sdiff = nil @lcs = 0 else # compute sdiff and lcs # here the original str2 is used with all the w/s characters preserved. @sdiff = Diff::LCS.sdiff(@str1[@m1_initial..@m1_final], str2[@m2_initial..@m2_final]) @lcs = @sdiff.count{|d| d.action == '='} # adjust the position values of sdiff @sdiff.each do |h| h.old_position += @m1_initial unless h.old_position.nil? h.new_position += @m2_initial unless h.new_position.nil? end (0 ... @m2_initial).reverse_each{|i| @sdiff.unshift(Diff::LCS::ContextChange.new('+', nil, nil, i, @str2[i]))} (0 ... @m1_initial).reverse_each{|i| @sdiff.unshift(Diff::LCS::ContextChange.new('-', i, @str1[i], nil, nil))} (@m1_final + 1 ... @str1.length).each{|i| @sdiff.push(Diff::LCS::ContextChange.new('-', i, @str1[i], nil, nil))} (@m2_final + 1 ... @str2.length).each{|i| @sdiff.push(Diff::LCS::ContextChange.new('+', nil, nil, i, @str2[i]))} end end
Public Instance Methods
_find_min_range(m1_initial, m1_final, m2_initial, m2_final, clcs = 0)
click to toggle source
# File lib/text_alignment/lcs_min.rb, line 60 def _find_min_range (m1_initial, m1_final, m2_initial, m2_final, clcs = 0) return nil if (m1_final - m1_initial < 0) || (m2_final - m2_initial < 0) sdiff = Diff::LCS.sdiff(@str1[m1_initial..m1_final], @str2[m2_initial..m2_final]) lcs = sdiff.count{|d| d.action == '='} return nil if lcs == 0 return nil if lcs < clcs match_last = sdiff.rindex{|d| d.action == '='} m1_final = sdiff[match_last].old_position + m1_initial m2_final = sdiff[match_last].new_position + m2_initial match_first = sdiff.index{|d| d.action == '='} m1_initial = sdiff[match_first].old_position + m1_initial m2_initial = sdiff[match_first].new_position + m2_initial # attempt for shorter match if ((m1_final - m1_initial) > (m2_final - m2_initial)) r = _find_min_range(m1_initial + 1, m1_final, m2_initial, m2_final, lcs) return r unless r.nil? r = _find_min_range(m1_initial, m1_final - 1, m2_initial, m2_final, lcs) return r unless r.nil? else r = _find_min_range(m1_initial, m1_final, m2_initial + 1, m2_final, lcs) return r unless r.nil? r = _find_min_range(m1_initial, m1_final, m2_initial, m2_final - 1, lcs) return r unless r.nil? end return { m1_initial: m1_initial, m1_final: m1_final, m2_initial: m2_initial, m2_final: m2_final } end
num_big_gaps(sdiff, initial, last)
click to toggle source
# File lib/text_alignment/lcs_min.rb, line 97 def num_big_gaps (sdiff, initial, last) raise ArgumentError, "nil sdiff" if sdiff.nil? raise ArgumentError, "invalid indice: #{initial}, #{last}" unless last >= initial state1 = :initial state2 = :initial gaps1 = [] gaps2 = [] (initial .. last).each do |i| case sdiff[i].action when '=' state1 = :continue state2 = :continue when '!' gaps1 << 1 state1 = :break if state2 == :break gaps2[-1] += 1 else gaps2 << 1 end state2 = :continue when '+' if state1 == :break gaps1[-1] += 1 else gaps1 << 1 end state1 = :break when '-' if state2 == :break gaps2[-1] += 1 else gaps2 << 1 end state2 = :break end end num_big_gaps1 = gaps1.select{|g| g > MAX_LEN_BIG_GAP}.length num_big_gaps2 = gaps2.select{|g| g > MAX_LEN_BIG_GAP}.length num_big_gaps1 + num_big_gaps2 end