class TextAlignment

Constants

BUFFER_MIN
BUFFER_RATE
CHAR_MAPPING
MIN_LENGTH_FOR_APPROXIMATION

approximate the location of str1 in str2

NIL_CHARACTER
SIMILARITY_THRESHOLD

to work on the hash representation of denotations to assume that there is no bag representation to this method

SIZE_NGRAM
SIZE_WINDOW
TEXT_SIMILARITY_THRESHOLD
VERSION

Public Class Methods

_find_divisions(_source, _targets) click to toggle source
# File lib/text_alignment/find_divisions.rb, line 35
def _find_divisions(_source, _targets)
        indice = []
        history = []
        cache = {}
        source = _source.dup
        targets = _targets.dup
        until source.strip.empty? || targets.empty?
                mode, cmp = nil, nil
                candidates = []
                targets.each_with_index do |target, i|
                        if source.size < target[:text].size
                                mode = :t_in_s
                                str1 = source
                                str2 = target[:text]
                        else
                                mode = :s_in_t
                                str1 = target[:text]
                                str2 = source
                        end

                        len1 = str1.length
                        len2 = str2.length

                        offset_begin, offset_end = if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
                                approximate_fit(str1, str2)
                        else
                                # the whole source
                                [0, -1]
                        end

                        unless offset_begin.nil?
                                key = str1 + ' _:_ ' + str2[offset_begin .. offset_end]
                                cmp = if cache.has_key? key
                                        cache[key]
                                else
                                        cmp = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
                                end
                                cache[key] = cmp

                                if (cmp.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
                                        candidates << {idx:i, offset:offset_begin, mode:mode, cmp:cmp}
                                end
                        end
                end

                # return remaining source and targets if m.nil?
                break if candidates.empty?

                choice = candidates.max{|a, b| a[:cmp].similarity <=> a[:cmp].similarity}
                m = choice[:idx]
                mode = choice[:mode]

                index = if mode == :t_in_s
                        {divid:targets[m][:divid], region:[0, source.size]}
                else # :s_in_t
                        cmp = choice[:cmp]
                        offset = choice[:offset]
                        {divid:targets[m][:divid], region:[cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]}
                end

                source = source[0 ... index[:region][0]] + source[index[:region][1] .. -1]
                history << index[:region].dup

                before_begin = index[:region][0]
                before_end = index[:region][1]

                rhistory = history.reverse
                rhistory.shift
                rhistory.each do |h|
                        gap = h[1] - h[0]
                        index[:region][0] += gap if index[:region][0] >= h[0]
                        index[:region][1] += gap if index[:region][1] >  h[0]
                end

                indice << index

                targets.delete_at(m)
        end

        unless source.strip.empty? && targets.empty?
                index = {divid:nil}
                index[:remaining_source] = source unless source.strip.empty?
                index[:remaining_targets] = targets.collect{|s| s[:divid]} unless targets.empty?
                indice << index
        end

        indice
end
_find_divisions_old(source, targets) click to toggle source
# File lib/text_alignment/find_divisions.rb, line 124
def _find_divisions_old(source, targets)
        mode, m, c, offset_begin = nil, nil, nil, nil

        targets.each_with_index do |target, i|
                if source.size < target[:text].size
                        mode = :t_in_s
                        str1 = source
                        str2 = target[:text]
                else
                        mode = :s_in_t
                        str1 = target[:text]
                        str2 = source
                end

                len1 = str1.length
                len2 = str2.length

                offset_begin, offset_end = 0, -1
                offset_begin, offset_end = approximate_fit(str1, str2) if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)

                unless offset_begin.nil?
                        c = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
                        if (c.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (c.str1_match_final - c.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
                                m = i
                                break
                        end
                end
        end

        # return remaining source and targets if m.nil?
        return [[-1, [source, targets.collect{|s| s[:divid]}]]] if m.nil?

        index = if mode == :t_in_s
                [targets[m][:divid], [0, source.size]]
        else # :s_in_t
                [targets[m][:divid], [c.str2_match_initial + offset_begin, c.str2_match_final + offset_begin + 1]]
        end

        next_source = source[0 ... index[1][0]] + source[index[1][1] .. -1]
        targets.delete_at(m)

        if next_source.strip.empty? || targets.empty?
                return [index]
        else
                more_index = _find_divisions(next_source, targets)
                gap = index[1][1] - index[1][0]
                more_index.each do |i|
                        if (i[0] > -1)
                                i[1][0] += gap if i[1][0] >= index[1][0]
                                i[1][1] += gap if i[1][1] >  index[1][0]
                        end
                end
                return [index] + more_index
        end
end
approximate_fit(str1, str2) click to toggle source

If finds an approximate region of str2 that contains str1

# File lib/text_alignment/approximate_fit.rb, line 13
def approximate_fit(str1, str2)
        raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
        return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION

        ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
        ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
        ngram_shared = ngram1 & ngram2

        # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
        return nil, nil if ngram_shared.empty?

        signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
        return nil, nil if signature_ngrams.empty? #raise "no signature ngram"

        cache = {}
        fit_begin, fit_end = nil, nil
        signature_ngrams.each do |signature_ngram|
                loc_signature_ngram_in_str1 = str1.index(signature_ngram)
                loc_signature_ngram_in_str2 = str2.index(signature_ngram)

                # approximate the beginning of the fit
                fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
                fit_begin = 0 if fit_begin < 0

                # approximate the end of the fit
                offset_end = str1.length - loc_signature_ngram_in_str1
                fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
                fit_end = str2.length if fit_end > str2.length

                next if cache.has_key?("#{fit_begin}-#{fit_end}")
                text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
                cache["#{fit_begin}-#{fit_end}"] = text_similarity

                break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
                fit_begin, fit_end = nil, nil
        end
        return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
        return nil, nil
end
cdiff(str1, str2) click to toggle source
# File lib/text_alignment/lcs_cdiff.rb, line 10
def cdiff(str1, str2)
        raise ArgumentError, "nil string" if str1.nil? || str2.nil?
        raise "a nil character appears in the input string" if str1.index(TextAlignment::NIL_CHARACTER) || str2.index(TextAlignment::NIL_CHARACTER)
        sdiff2cdiff(Diff::LCS.sdiff(str1, str2))
end
find_divisions(source, targets, mappings = []) click to toggle source

It finds, among the targets, the right divisions for the taraget text to fit in.

# File lib/text_alignment/find_divisions.rb, line 15
def find_divisions(source, targets, mappings = [])
        raise ArgumentError, "nil source"           if source == nil
        raise ArgumentError, "nil or empty targets" if targets == nil || targets.empty?
        raise ArgumentError, "nil mappings"         if mappings == nil

        character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
        mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
        characters_from = character_mappings.collect{|m| m[0]}.join
        characters_to   = character_mappings.collect{|m| m[1]}.join
        characters_to.gsub!(/-/, '\-')

        source.tr!(characters_from, characters_to)
        targets.each{|target| target[:text].tr!(characters_from, characters_to)}

        # to process smaller ones first
        targets.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}

        TextAlignment._find_divisions(source, targets)
end
glcs_required?(str1, mappings = []) click to toggle source
# File lib/text_alignment/glcs_required.rb, line 5
def glcs_required?(str1, mappings = [])
        raise ArgumentError, "nil string" if str1.nil?
        raise ArgumentError, "nil mappings" if mappings.nil?

        # character mappings can be safely applied to the strings withoug changing the position of other characters
        character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
        characters_from = character_mappings.collect{|m| m[0]}.join
        characters_to   = character_mappings.collect{|m| m[1]}.join
        characters_to.gsub!(/-/, '\-')

        str1.tr!(characters_from, characters_to)

        str1 =~/([^\p{ASCII}][^\p{ASCII}])/
        $1
end
sdiff2cdiff(sdiff) click to toggle source
# File lib/text_alignment/lcs_cdiff.rb, line 16
def sdiff2cdiff (sdiff)
        raise ArgumentError, "nil sdiff" if sdiff.nil?

        cdiff_str1, cdiff_str2 = '', ''

        sdiff.each do |h|
                case h.action
                when '='
                        cdiff_str1 += h.old_element
                        cdiff_str2 += h.new_element
                when '!'
                        cdiff_str1 += h.old_element + TextAlignment::NIL_CHARACTER
                        cdiff_str2 += TextAlignment::NIL_CHARACTER + h.new_element
                when '-'
                        cdiff_str1 += h.old_element
                        cdiff_str2 += TextAlignment::NIL_CHARACTER
                when '+'
                        cdiff_str1 += TextAlignment::NIL_CHARACTER
                        cdiff_str2 += h.new_element
                end
        end

        cdiff_str1.gsub(/\n/, ' ') + "\n>>>>><<<<<\n" + cdiff_str2.gsub(/\n/, ' ')
end

Private Class Methods

text_similarity(str1, str2, ngram_order = 3) click to toggle source
# File lib/text_alignment/approximate_fit.rb, line 55
def text_similarity(str1, str2, ngram_order = 3)
        _str1 = str1.delete(" \t\r\n")
        _str2 = str2.delete(" \t\r\n")
        String::Similarity.cosine(_str1, _str2, ngram:2)
end