class TextAlignment::TextAlignment

Attributes

block_alignment[R]
lost_annotations[R]
similarity[R]

Public Class Methods

new(reference_text, options = {}) click to toggle source

Initialize with a reference text, against which texts will be aligned

# File lib/text_alignment/text_alignment.rb, line 15
# Prepares an aligner around a reference text; texts given to later calls are
# aligned against it.
#
# reference_text - the text to align against; must not be nil.
# options        - optional Hash (nil is treated as an empty Hash). The keys
#                  :duplicate_texts, :to_ignore_whitespaces and
#                  :to_ignore_text_order are read; each falls back to false
#                  when missing or falsy.
#
# Raises ArgumentError when reference_text is nil.
def initialize(reference_text, options = {})
        raise ArgumentError, "nil text" if reference_text.nil?

        settings = options || {}
        @duplicate_texts, @to_ignore_whitespaces, @to_ignore_text_order =
                [:duplicate_texts, :to_ignore_whitespaces, :to_ignore_text_order].map { |key| settings[key] || false }

        # The reference text is kept both as given and in its character-mapped form.
        @original_reference_text = reference_text
        @rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @to_ignore_whitespaces)
        @mapped_reference_text = @rtext_mapping.mapped_text

        # Nothing has been aligned yet.
        @original_text = nil
        @blocks = nil
        @cultivation_map = TextAlignment::CultivationMap.new
end

Public Instance Methods

align(text, denotations = nil) click to toggle source
# File lib/text_alignment/text_alignment.rb, line 32
# Aligns +text+ against the reference text and stores the outcome in
# @blocks (mapped positions) and @block_alignment (demapped positions).
#
# text        - the text to align.
# denotations - optional denotations; they are mapped with the text's
#               character mapping and handed to the block-wise search.
#
# Returns the @block_alignment Hash with the keys :text, :reference_text,
# :denotations (as passed in) and :blocks.
def align(text, denotations = nil)
        # The cultivation map is only maintained when duplicate texts are not expected
        update_cultivation_map unless @duplicate_texts

        # A fresh character mapping is built only when the text differs from the previous one
        text_changed = !@original_text || @original_text != text
        if text_changed
                @original_text = text
                @text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
        end

        @mapped_text = @text_mapping.mapped_text
        mapped_denotations = @text_mapping.enmap_denotations(denotations)

        # Try to match the text as a whole first; otherwise search block by block
        @blocks = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map) ||
                find_block_alignment(@mapped_text, @mapped_reference_text, mapped_denotations, @cultivation_map)

        @block_alignment = {
                text: @original_text,
                reference_text: @original_reference_text,
                denotations: denotations,
                blocks: demap_blocks(@blocks)
        }
end
alignment_show() click to toggle source
# File lib/text_alignment/text_alignment.rb, line 141
# Renders the stored block alignment (@block_alignment) as a human-readable
# report, one section per block:
#
# * :block / :term blocks show the common text taken from the aligned text;
# * :empty blocks show the unmatched text of both sides ("[-]" when the
#   block has no target);
# * any other alignment object is walked through its sdiff, printing both
#   sides with '_' as the gap filler.
#
# Returns the report as a String.
def alignment_show
        source_text = @block_alignment[:text]
        reference_text = @block_alignment[:reference_text]

        # Formats a span as "[begin - end]"
        bracket = ->(span) { "[#{span[:begin]} - #{span[:end]}]" }

        sections = @block_alignment[:blocks].map do |blk|
                src = blk[:source]
                tgt = blk[:target]

                case blk[:alignment]
                when :block
                        "===== common (block) ===== " + bracket.call(src) + " " + bracket.call(tgt) + "\n" +
                        source_text[src[:begin] ... src[:end]] + "\n\n"
                when :term
                        "===== common (term) ===== " + bracket.call(src) + " " + bracket.call(tgt) + "\n" +
                        source_text[src[:begin] ... src[:end]] + "\n\n"
                when :empty
                        report = "xxxxx disparate texts (similarity: #{blk[:similarity]})\n" +
                                "<<<<< string 1 " + bracket.call(src) + "\n" +
                                source_text[src[:begin] ... src[:end]] + "\n\n" +
                                ">>>>> string 2 "

                        if tgt
                                report + bracket.call(tgt) + "\n" + reference_text[tgt[:begin] ... tgt[:end]] + "\n\n"
                        else
                                report + "[-]\n\n"
                        end
                else
                        edits = blk[:alignment].sdiff

                        # Row for the aligned text: insertions on the other side become '_'
                        source_offset = src[:begin]
                        source_row = edits.map do |edit|
                                case edit.action
                                when '=', '-' then source_text[edit.old_position + source_offset]
                                when '+' then '_'
                                when '!' then source_text[edit.old_position + source_offset] + '_'
                                end
                        end.join('')

                        # Row for the reference text: deletions on the other side become '_'
                        target_offset = tgt[:begin]
                        target_row = edits.map do |edit|
                                case edit.action
                                when '=', '+' then reference_text[edit.new_position + target_offset]
                                when '-' then '_'
                                when '!' then '_' + reference_text[edit.new_position + target_offset]
                                end
                        end.join('')

                        "***** local mismatch " + bracket.call(src) + " " + bracket.call(tgt) + " (similarity: #{blk[:similarity]})\n" +
                        "[#{source_row}]\n" +
                        "[#{target_row.gsub("\n", " ")}]\n\n"
                end
        end

        sections.inject('') { |report, section| report + section }
end
transform_a_span(span) click to toggle source
# File lib/text_alignment/text_alignment.rb, line 99
# Transforms one span of the aligned text into a span of the reference text.
#
# span - Hash with :begin and :end positions.
#
# Returns a new Hash with :begin and :end set to whatever
# transform_begin_position and transform_end_position return for them.
def transform_a_span(span)
        new_begin = transform_begin_position(span[:begin])
        new_end = transform_end_position(span[:end])
        {begin: new_begin, end: new_end}
end
transform_begin_position(_begin_position) click to toggle source
# File lib/text_alignment/text_alignment.rb, line 55
# Transforms a begin position in the aligned text into the corresponding
# position in the reference text, using the blocks of the last alignment.
#
# The position is first mapped with @text_mapping, looked up in the first
# block whose source end lies strictly after it, translated according to that
# block's alignment, and finally passed to @rtext_mapping.demap_position.
#
# _begin_position - begin position in the original (unmapped) text.
#
# Returns whatever @rtext_mapping.demap_position returns for the translated
# position; the translated position is nil when the block gives no counterpart.
def transform_begin_position(_begin_position)
        mapped_begin = @text_mapping.enmap_position(_begin_position)

        block = @blocks[@blocks.index { |candidate| candidate[:source][:end] > mapped_begin }]

        reference_begin = case block[:alignment]
        when :block, :term
                # Exact matches only shift the position by the block's delta
                mapped_begin + block[:delta]
        when :empty
                # Inside an unmatched block only its very first position has a counterpart
                block[:target][:begin] if mapped_begin == block[:source][:begin]
        else
                # Otherwise the alignment object translates the block-relative position
                relative = block[:alignment].transform_begin_position(mapped_begin - block[:source][:begin])
                relative + block[:target][:begin] unless relative.nil?
        end

        @rtext_mapping.demap_position(reference_begin)
end
transform_denotations!(denotations) click to toggle source
# File lib/text_alignment/text_alignment.rb, line 107
# Transforms the spans of the given denotations in place, from positions in
# the aligned text to positions in the reference text.
#
# A denotation whose transformation raises, or yields a span that is nil,
# negative, empty/inverted or beyond the reference text, is recorded in
# @lost_annotations and gets its begin and end set to nil.
#
# denotations - objects responding to begin, end, begin= and end=; may be nil.
#
# Returns nil when denotations is nil, otherwise the Array of lost
# annotations (Hashes with :source and :target spans).
def transform_denotations!(denotations)
        return nil if denotations.nil?

        @lost_annotations = []
        reference_length_limit = nil

        denotations.each do |denotation|
                begin
                        original_span = {begin: denotation.begin, end: denotation.end}
                        denotation.begin = transform_begin_position(denotation.begin)
                        denotation.end = transform_end_position(denotation.end)

                        invalid = denotation.begin.nil? || denotation.end.nil? ||
                                denotation.begin < 0 || denotation.end <= denotation.begin ||
                                denotation.end > @original_reference_text.length
                        raise "invalid transform" if invalid
                rescue
                        # The target records whatever was assigned before the failure
                        @lost_annotations << {source: original_span, target: {begin: denotation.begin, end: denotation.end}}
                        denotation.begin = nil
                        denotation.end = nil
                end
        end

        @lost_annotations
end
transform_end_position(_end_position) click to toggle source
# File lib/text_alignment/text_alignment.rb, line 77
# Transforms an end position in the aligned text into the corresponding
# position in the reference text, using the blocks of the last alignment.
#
# The position is first mapped with @text_mapping, looked up in the first
# block whose source end is at or after it, translated according to that
# block's alignment, and finally passed to @rtext_mapping.demap_position.
#
# _end_position - end position in the original (unmapped) text.
#
# Returns whatever @rtext_mapping.demap_position returns for the translated
# position; the translated position is nil when the block gives no counterpart.
def transform_end_position(_end_position)
        mapped_end = @text_mapping.enmap_position(_end_position)

        block = @blocks[@blocks.index { |candidate| candidate[:source][:end] >= mapped_end }]

        reference_end = case block[:alignment]
        when :block, :term
                # Exact matches only shift the position by the block's delta
                mapped_end + block[:delta]
        when :empty
                # Inside an unmatched block only its very last position has a counterpart
                block[:target][:end] if mapped_end == block[:source][:end]
        else
                # Otherwise the alignment object translates the block-relative position
                relative = block[:alignment].transform_end_position(mapped_end - block[:source][:begin])
                relative + block[:target][:begin] unless relative.nil?
        end

        @rtext_mapping.demap_position(reference_end)
end
transform_hdenotations(hdenotations) click to toggle source
# File lib/text_alignment/text_alignment.rb, line 125
# Transforms the spans of Hash-style denotations from positions in the
# aligned text to positions in the reference text, without modifying the
# input Hashes.
#
# A denotation whose transformation raises, or yields a span that is nil,
# negative, empty/inverted or beyond the reference text, is dropped from the
# result and recorded in @lost_annotations.
#
# hdenotations - Array of Hashes that carry a :span ({begin:, end:}); may be nil.
#
# Returns nil when hdenotations is nil, otherwise an Array of copies of the
# successfully transformed denotations with their :span replaced.
def transform_hdenotations(hdenotations)
        return nil if hdenotations.nil?

        @lost_annotations = []

        hdenotations.each_with_object([]) do |denotation, transformed|
                begin
                        new_span = transform_a_span(denotation[:span])

                        invalid = new_span[:begin].nil? || new_span[:end].nil? ||
                                new_span[:begin] < 0 || new_span[:end] <= new_span[:begin] ||
                                new_span[:end] > @original_reference_text.length
                        raise "invalid transform" if invalid

                        transformed << denotation.merge(span: new_span)
                rescue
                        # new_span is still nil when the span transformation itself failed
                        @lost_annotations << {source: denotation[:span], target: new_span}
                end
        end
end
transform_spans(spans) click to toggle source
# File lib/text_alignment/text_alignment.rb, line 103
# Transforms every span in +spans+ with transform_a_span.
#
# spans - collection of span Hashes ({begin:, end:}).
#
# Returns an Array with the transformed spans, in the same order.
def transform_spans(spans)
        spans.map(&method(:transform_a_span))
end

Private Instance Methods

demap_blocks(_blocks) click to toggle source
# File lib/text_alignment/text_alignment.rb, line 432
# Produces copies of the given blocks whose positions are demapped: source
# spans through @text_mapping and target spans through @rtext_mapping.
# The given blocks themselves are left untouched.
#
# _blocks - Array of block Hashes, or nil.
#
# Returns nil for nil input, otherwise a new Array of (shallow) copies with
# freshly built :source / :target span Hashes where those were present.
def demap_blocks(_blocks)
        return nil if _blocks.nil?

        _blocks.map do |block|
                demapped = block.dup
                source = demapped[:source]
                target = demapped[:target]

                if source
                        demapped[:source] = {
                                begin: @text_mapping.demap_position(source[:begin]),
                                end: @text_mapping.demap_position(source[:end])
                        }
                end

                if target
                        demapped[:target] = {
                                begin: @rtext_mapping.demap_position(target[:begin]),
                                end: @rtext_mapping.demap_position(target[:end])
                        }
                end

                demapped
        end
end
find_block_alignment(str1, str2, denotations, cultivation_map) click to toggle source
# File lib/text_alignment/text_alignment.rb, line 207
# Builds the list of alignment blocks for str1 (the text) against str2 (the reference text).
#
# Step 1 collects anchors from TextAlignment::AnchorFinder as :block entries, extending the
# previous entry when the next anchor starts exactly one position past both its source end
# and its target end.
# Step 2 walks the anchors in order and, for the stretch between consecutive anchors (and
# before the first / after the last one), inserts an entry that is either marked :empty
# directly or produced by local_alignment over a window of str2 chosen from
# cultivation_map.region_state.
#
# str1            - the text to align (callers in this file pass the mapped text).
# str2            - the reference text (callers in this file pass the mapped reference text).
# denotations     - denotations handed through to local_alignment; may be nil.
# cultivation_map - object answering region_state([b, e]); also handed to AnchorFinder
#                   and local_alignment.
#
# Returns an Array of block Hashes: the anchor blocks interleaved with the gap entries.
def find_block_alignment(str1, str2, denotations, cultivation_map)
        ## to find block alignments
        anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @to_ignore_whitespaces, @to_ignore_text_order)

        blocks = []
        while block = anchor_finder.get_next_anchor
                last = blocks.last
                # NOTE(review): ends are used as exclusive bounds below (str1[b1 ... e1]), so
                # "end + 1" means a one-position gap is bridged - confirm against AnchorFinder.
                if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
                        last[:source][:end] = block[:source][:end]
                        last[:target][:end] = block[:target][:end]
                else
                        blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
                end
        end

        # pp blocks
        # puts "-----"
        # puts
        # exit
        # blocks.each do |b|
        #     p [b[:source], b[:target]]
        #     puts "---"
        #     puts str1[b[:source][:begin] ... b[:source][:end]]
        #     puts "---"
        #     puts str2[b[:target][:begin] ... b[:target][:end]]
        #     puts "====="
        #     puts
        # end
        # puts "-=-=-=-=-"
        # puts

        ## To fill the gaps
        ## lblock: last block, cblock: current block
        # The trailing nil makes the loop run once more for the stretch after the last anchor.
        # The value of this inject (the assignment to blocks2) is the method's return value.
        lblock = nil
        blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
                # [b1, e1): stretch of str1 between the previous and the current anchor
                b1 = lblock.nil? ? 0 : lblock[:source][:end]
                e1 = cblock.nil? ? str1.length : cblock[:source][:begin]

                if b1 <= e1
                        _str1 = str1[b1 ... e1]

                        # [b2, e2): stretch of str2 between the previous and the current anchor
                        b2 = lblock.nil? ? 0 : lblock[:target][:end]
                        e2 = cblock.nil? ? str2.length : cblock[:target][:begin]

                        if b2 < e2
                                _str2 = str2[b2 ... e2]

                                # When either side is blank (or empty), the stretch is recorded as :empty without further search
                                sum += if _str1.strip.empty? || _str2.strip.empty?
                                        [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
                                else
                                        # Upper bound for the length of the str2 window handed to local_alignment
                                        len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
                                        region_state, state_region = cultivation_map.region_state([b2, e2])
                                        case region_state
                                        when :closed
                                                # No search at all: the source stretch is recorded without a target
                                                [{source:{begin:b1, end:e1}, alignment: :empty}]
                                        when :front_open
                                                # Search a window starting at b2, ending at state_region[1] or after len_buffer
                                                if sum.empty? # when there is no preceding matched block
                                                        [{source:{begin:b1, end:e1}, alignment: :empty}]
                                                else
                                                        oe2 = state_region[1]
                                                        me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
                                                        local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
                                                end
                                        when :rear_open
                                                # Search a window ending at e2, starting at state_region[0] or len_buffer before e2
                                                if cblock.nil? # when there is no following matched block
                                                        [{source:{begin:b1, end:e1}, alignment: :empty}]
                                                else
                                                        ob2 = state_region[0]
                                                        mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
                                                        local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
                                                end
                                        when :middle_closed
                                                # First try the front window (up to state_region[0]); if that gives nothing
                                                # usable and an anchor follows, try the rear window (from state_region[1])
                                                attempt1 = if sum.empty?
                                                        [{source:{begin:b1, end:e1}, alignment: :empty}]
                                                else
                                                        oe2 = state_region[0]
                                                        me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
                                                        local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
                                                end
                                                if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
                                                        ob2 = state_region[1]
                                                        mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
                                                        local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
                                                else
                                                        attempt1
                                                end
                                        else # :open
                                                if (e2 - b2) > len_buffer
                                                        # The target stretch is longer than the buffer: try its head first, then its tail
                                                        attempt1 = if sum.empty?
                                                                [{source:{begin:b1, end:e1}, alignment: :empty}]
                                                        else
                                                                local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
                                                        end
                                                        if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
                                                                local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
                                                        else
                                                                attempt1
                                                        end
                                                else
                                                        local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
                                                end
                                        end
                                end
                        elsif b2 > e2 # when out of order
                                # ToDo
                                # NOTE(review): nothing is added here, nor when b2 == e2, so a non-empty
                                # source stretch [b1, e1) can end up covered by no block - confirm intended.
                        end

                end

                lblock = cblock
                cblock.nil? ? sum : sum << cblock
        end

end
lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map) click to toggle source
# File lib/text_alignment/text_alignment.rb, line 394
# Aligns str1[b1 ... e1] with str2[b2 ... e2] through a case-insensitive
# TextAlignment::MixedAlignment.
#
# The stretch is reported as :empty, without attempting the alignment, when
# the source side is longer than 2000 characters, and also when the computed
# similarity is below 0.5.
#
# cultivation_map is accepted but not used by this method.
#
# Returns a one-element Array holding a block Hash with :source, :target and
# :alignment (plus :similarity when an alignment was accepted).
def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
        unaligned = {source: {begin: b1, end: e1}, target: {begin: b2, end: e2}, alignment: :empty}
        return [unaligned] if (e1 - b1) > 2000

        mixed = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
        score = mixed.similarity
        return [unaligned] if score < 0.5

        [unaligned.merge(alignment: mixed, similarity: score)]
end
local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map) click to toggle source
# File lib/text_alignment/text_alignment.rb, line 332
# Aligns str1[b1 ... e1] against str2[b2 ... e2]: the term-based alignment is
# tried first, and lcs_alignment is used when it yields nothing or starts
# with an :empty block.
#
# Returns the Array of blocks from whichever alignment was used.
def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
        term_blocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)

        term_hit = !term_blocks.empty? && term_blocks.first[:alignment] != :empty
        return term_blocks if term_hit

        lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
end
term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map) click to toggle source
# File lib/text_alignment/text_alignment.rb, line 341
# Aligns str1[b1 ... e1] against str2 by locating the texts of the denotations
# that fall inside the source stretch.
#
# The denotations in scope are ordered by begin position and, for equal begin
# positions, by end position descending. Each term is then searched in
# str2[0 ... e2] through cultivation_map.index, starting at b2 and continuing
# after the previous hit. The term alignment is abandoned as a whole when any
# term cannot be found, or when any term can be found again after the last
# hit (the match would be ambiguous).
#
# str1, b1, e1    - the text and the source stretch to align.
# str2, b2, e2    - the reference text and the target stretch.
# denotations     - Array of Hashes with a :span ({begin:, end:}) in str1
#                   positions; may be nil.
# cultivation_map - object answering index(term, string, position).
#
# Returns an Array of blocks: the :term blocks (each with :delta and a fixed
# :similarity of 0.9) interleaved with :empty blocks for the source stretches
# between them. Without usable terms this is a single :empty block, or an
# empty Array when the source stretch itself is empty.
def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
        str2_block = str2[0 ... e2]

        ## term-based alignment
        tblocks = if denotations
                in_scope = denotations.select { |d| d[:span][:begin] >= b1 && d[:span][:end] <= e1 }

                # `a <=> b || c <=> d` never reaches the tie-breaker because 0 is truthy in Ruby,
                # so the fallback to the end position has to be spelled out.
                ordered = in_scope.sort do |d1, d2|
                        by_begin = d1[:span][:begin] <=> d2[:span][:begin]
                        (by_begin.nil? || by_begin.zero?) ? (d2[:span][:end] <=> d1[:span][:end]) : by_begin
                end

                denotations_in_scope = ordered.map { |d| d.merge(lex: str1[d[:span][:begin] ... d[:span][:end]]) }

                search_position = b2
                _tblocks = denotations_in_scope.map do |denotation|
                        lex = denotation[:lex]
                        term_begin = cultivation_map.index(lex, str2_block, search_position)
                        break [] if term_begin.nil? # break the loop if a missing term is found
                        search_position = term_begin + lex.length
                        {source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
                end

                # redundant matching found
                unless _tblocks.empty?
                        search_position = _tblocks.last[:target][:end]
                        _tblocks = [] if denotations_in_scope.any? { |term| !cultivation_map.index(term[:lex], str2_block, search_position).nil? }
                end

                _tblocks
        else
                []
        end

        # Fill the source stretches between the term blocks with :empty blocks.
        # The trailing nil makes the loop run once more for the stretch after the last term.
        ltblock = nil
        (tblocks + [nil]).inject([]) do |sum, ctblock|
                tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
                te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]

                if te1 > tb1
                        tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
                        te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
                        sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
                end

                ltblock = ctblock
                ctblock.nil? ? sum : sum << ctblock
        end
end
update_cultivation_map() click to toggle source
# File lib/text_alignment/text_alignment.rb, line 410
# Feeds the target regions matched by the previous alignment (@blocks) into
# the cultivation map.
#
# Only :block and :term blocks contribute. Their target regions are sorted
# and then condensed: a region that overlaps, touches or starts right after
# (end + 1) the previous one is merged into it.
#
# Sorting and keeping the larger end matter when targets are not in
# ascending order or are nested: merging such regions as they come would
# produce regions that run backwards or are cut short.
#
# Returns nil when nothing has been aligned yet, otherwise whatever
# @cultivation_map.cultivate returns.
def update_cultivation_map
        return if @blocks.nil?

        ## To update the cultivation map
        matched_regions = @blocks.
                select { |b| b[:alignment] == :block || b[:alignment] == :term }.
                map { |b| [b[:target][:begin], b[:target][:end]] }.
                sort

        newly_cultivated_regions = matched_regions.each_with_object([]) do |region, condensed|
                if condensed.empty? || (condensed.last.last + 1 < region.first)
                        condensed.push region
                else
                        # never let a nested region pull the merged end backwards
                        condensed.last[1] = [condensed.last.last, region.last].max
                end
        end

        @cultivation_map.cultivate(newly_cultivated_regions)
end
whole_block_alignment(str1, str2, cultivation_map) click to toggle source
# File lib/text_alignment/text_alignment.rb, line 322
# Tries to locate str1 as a whole inside str2 through cultivation_map.index,
# first as given and then with both strings downcased.
#
# Returns a one-element Array with a :block entry covering all of str1
# (its :delta being the position found), or nil when str1 is not found
# either way.
def whole_block_alignment(str1, str2, cultivation_map)
        offset = cultivation_map.index(str1, str2)
        offset = cultivation_map.index(str1.downcase, str2.downcase) if offset.nil?
        return nil if offset.nil?

        [{
                source: {begin: 0, end: str1.length},
                target: {begin: offset, end: offset + str1.length},
                delta: offset,
                alignment: :block
        }]
end