class TextAlignment::LCSMin

It finds minimal lcs and sdiff of the given strings, str1 and str2. It relies on the diff-lcs gem for the computation of lcs table.

Constants

PLACEHOLDER_CHAR

Attributes

lcs[R]
m1_final[R]
m1_initial[R]
m2_final[R]
m2_initial[R]
sdiff[R]

Public Class Methods

new(str1, str2) click to toggle source
# File lib/text_alignment/lcs_min.rb, line 18
def initialize (str1, str2)
        raise ArgumentError, "nil string" if str1.nil? || str2.nil?
        raise ArgumentError, "empty string" if str1.empty? || str2.empty?

        # str1 is copied as it is.
        # str2 is copied with w/s characters replaced with the placeholder characters,
        # to avoid overfitting to w/s characters during LCS computation.
        @str1 = str1
        @str2 = str2.gsub(/\s/, PLACEHOLDER_CHAR)

        # find the corresponding minimal range of the two strings
        r = _find_min_range(0, @str1.length - 1, 0, @str2.length - 1)
        if r.nil?
                @sdiff = nil
                @lcs = 0
                return
        end

        @m1_initial, @m1_final, @m2_initial, @m2_final = r[:m1_initial], r[:m1_final], r[:m2_initial], r[:m2_final]

        if @m1_initial.nil?
                @sdiff = nil
                @lcs = 0
        else
                # compute sdiff and lcs
                # here the original str2 is used with all the w/s characters preserved.
                @sdiff = Diff::LCS.sdiff(@str1[@m1_initial..@m1_final], str2[@m2_initial..@m2_final])
                @lcs = @sdiff.count{|d| d.action == '='}

                # adjust the position values of sdiff
                @sdiff.each do |h|
                        h.old_position += @m1_initial unless h.old_position.nil?
                        h.new_position += @m2_initial unless h.new_position.nil?
                end

                (0 ... @m2_initial).reverse_each{|i| @sdiff.unshift(Diff::LCS::ContextChange.new('+', nil, nil, i, @str2[i]))}
                (0 ... @m1_initial).reverse_each{|i| @sdiff.unshift(Diff::LCS::ContextChange.new('-', i, @str1[i], nil, nil))}
                (@m1_final + 1 ... @str1.length).each{|i| @sdiff.push(Diff::LCS::ContextChange.new('-', i, @str1[i], nil, nil))}
                (@m2_final + 1 ... @str2.length).each{|i| @sdiff.push(Diff::LCS::ContextChange.new('+', nil, nil, i, @str2[i]))}
        end
end

Public Instance Methods

_find_min_range(m1_initial, m1_final, m2_initial, m2_final, clcs = 0) click to toggle source
# File lib/text_alignment/lcs_min.rb, line 60
def _find_min_range (m1_initial, m1_final, m2_initial, m2_final, clcs = 0)
        return nil if (m1_final - m1_initial < 0) || (m2_final - m2_initial < 0)
        sdiff = Diff::LCS.sdiff(@str1[m1_initial..m1_final], @str2[m2_initial..m2_final])
        lcs = sdiff.count{|d| d.action == '='}

        return nil if lcs == 0
        return nil if lcs < clcs

        match_last  = sdiff.rindex{|d| d.action == '='}
        m1_final    = sdiff[match_last].old_position + m1_initial
        m2_final    = sdiff[match_last].new_position + m2_initial

        match_first = sdiff.index{|d| d.action == '='}
        m1_initial  = sdiff[match_first].old_position + m1_initial
        m2_initial  = sdiff[match_first].new_position + m2_initial

        # attempt for shorter match
        if ((m1_final - m1_initial) > (m2_final - m2_initial))
                r = _find_min_range(m1_initial + 1, m1_final, m2_initial, m2_final, lcs)
                return r unless r.nil?
                r = _find_min_range(m1_initial, m1_final - 1, m2_initial, m2_final, lcs)
                return r unless r.nil?
        else
                r = _find_min_range(m1_initial, m1_final, m2_initial + 1, m2_final, lcs)
                return r unless r.nil?
                r = _find_min_range(m1_initial, m1_final, m2_initial, m2_final - 1, lcs)
                return r unless r.nil?
        end

        return {
                m1_initial: m1_initial,
                m1_final: m1_final,
                m2_initial: m2_initial,
                m2_final: m2_final
        }
end
num_big_gaps(sdiff, initial, last) click to toggle source
# File lib/text_alignment/lcs_min.rb, line 97
def num_big_gaps (sdiff, initial, last)
        raise ArgumentError, "nil sdiff" if sdiff.nil?
        raise ArgumentError, "invalid indice: #{initial}, #{last}" unless last >= initial

        state1 = :initial
        state2 = :initial
        gaps1 = []
        gaps2 = []

        (initial .. last).each do |i|
                case sdiff[i].action
                when '='
                        state1 = :continue
                        state2 = :continue
                when '!'
                        gaps1 << 1
                        state1 = :break

                        if state2 == :break
                                gaps2[-1] += 1
                        else
                                gaps2 << 1
                        end
                        state2 = :continue
                when '+'
                        if state1 == :break
                                gaps1[-1] += 1
                        else
                                gaps1 << 1
                        end
                        state1 = :break
                when '-'
                        if state2 == :break
                                gaps2[-1] += 1
                        else
                                gaps2 << 1
                        end
                        state2 = :break
                end
        end

        num_big_gaps1 = gaps1.select{|g| g > MAX_LEN_BIG_GAP}.length
        num_big_gaps2 = gaps2.select{|g| g > MAX_LEN_BIG_GAP}.length
        num_big_gaps1 + num_big_gaps2
end