class TextAlignment::AnchorFinder
Public Class Methods
new(source_str, target_str, cultivation_map, to_ignore_whitespaces = false, to_ignore_text_order = false)
click to toggle source
# File lib/text_alignment/anchor_finder.rb, line 9 def initialize(source_str, target_str, cultivation_map, to_ignore_whitespaces = false, to_ignore_text_order = false) @method_get_left_windows, @method_get_right_windows = if to_ignore_whitespaces [method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)] else [method(:get_left_windows), method(:get_right_windows)] end @s1 = source_str.downcase @s2 = target_str.downcase @cultivation_map = cultivation_map @to_ignore_text_order = to_ignore_text_order @size_ngram = TextAlignment::SIZE_NGRAM @size_window = TextAlignment::SIZE_WINDOW @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1 @pos_s2_final_possible_end = @s2.length # positions of last match @pos_s1_last_match = 0 @pos_s2_last_match = 0 end
Public Instance Methods
get_next_anchor()
click to toggle source
# File lib/text_alignment/anchor_finder.rb, line 33 def get_next_anchor # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin # To skip whitespace letters next if [' ', "\n", "\t"].include? @s1[beg_s1] _beg_s2 = get_beg_s2(beg_s1) break _beg_s2 unless _beg_s2.nil? end # To return nil when it fails to find an anchor return nil if beg_s2.class == Range # To extend the block to the left b1 = beg_s1 b2 = beg_s2 left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1] b1 -= 1; b2 -= 1 end # To extend the block to the right e1 = beg_s1 + @size_ngram e2 = beg_s2 + @size_ngram right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2] e1 += 1; e2 += 1 end @pos_s1_last_match = e1 @pos_s2_last_match = e2 {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}} end
Private Instance Methods
find_beg_s2_candidates(anchor, search_position)
click to toggle source
To find beg_s2 which match to the anchor return nil if the anchor is too much frequent
# File lib/text_alignment/anchor_finder.rb, line 84 def find_beg_s2_candidates(anchor, search_position) candidates = [] while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position) candidates << _beg_s2 # for speed, skip anchor of high frequency if candidates.length > 5 candidates.clear break end search_position = _beg_s2 + 1 end candidates end
find_valid_beg_s2(beg_s1, beg_s2_candidates)
click to toggle source
# File lib/text_alignment/anchor_finder.rb, line 100 def find_valid_beg_s2(beg_s1, beg_s2_candidates) valid_beg_s2 = nil (10 .. 30).step(10).each do |size_window| valid_beg_s2 = nil r = beg_s2_candidates.each do |beg_s2| # if both the begining points are sufficiantly close to the end points of the last match # break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5) if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5) break unless valid_beg_s2.nil? valid_beg_s2 = beg_s2 next end left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window) if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold) break unless valid_beg_s2.nil? valid_beg_s2 = beg_s2 next end right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window) if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold) break unless valid_beg_s2.nil? valid_beg_s2 = beg_s2 next end end # r == nil means that the inner loop was broken (multiple candidates had passed the tests) # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found) if r.nil? valid_beg_s2 = nil else break end end valid_beg_s2 end
get_beg_s2(beg_s1)
click to toggle source
# File lib/text_alignment/anchor_finder.rb, line 71 def get_beg_s2(beg_s1) # to get the anchor to search for in s2 anchor = @s1[beg_s1, @size_ngram] search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match beg_s2_candidates = find_beg_s2_candidates(anchor, search_position) return nil if beg_s2_candidates.empty? find_valid_beg_s2(beg_s1, beg_s2_candidates) end
get_left_windows(beg_s1, beg_s2, size_window = nil)
click to toggle source
# File lib/text_alignment/anchor_finder.rb, line 142 def get_left_windows(beg_s1, beg_s2, size_window = nil) size_window ||= @size_window # comment out below with the assumption that the beginning of a document gives a significant locational information # return if beg_s1 < size_window || beg_s2 < size_window window_s1 = '' loc = beg_s1 - 1 count = 0 while count < size_window && loc >= 0 if @s1[loc] =~ /[0-9a-zA-Z]/ window_s1 += @s1[loc] count += 1 end loc -= 1 end window_s2 = '' loc = beg_s2 - 1 count = 0 while count < size_window && loc >= 0 if @s2[loc] =~ /[0-9a-zA-Z]/ window_s2 += @s2[loc] count += 1 end loc -= 1 end [window_s1, window_s2] end
get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
click to toggle source
# File lib/text_alignment/anchor_finder.rb, line 206 def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil) size_window ||= @size_window # comment out below with the assumption that the beginning of a document gives a significant locational information # return if beg_s1 < size_window || beg_s2 < size_window wbeg = beg_s1 - size_window wbeg = 0 if wbeg < 0 window_s1 = @s1[wbeg ... beg_s1] wbeg = beg_s2 - size_window wbeg = 0 if wbeg < 0 window_s2 = @s2[wbeg ... beg_s2] [window_s1, window_s2] end
get_right_windows(beg_s1, beg_s2, size_window = nil)
click to toggle source
# File lib/text_alignment/anchor_finder.rb, line 173 def get_right_windows(beg_s1, beg_s2, size_window = nil) size_window ||= @size_window # commend below with the assumption that the end of a document gives a significant locational # return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window)) window_s1 = '' loc = beg_s1 + @size_ngram len_s1 = @s1.length count = 0 while count < size_window && loc < len_s1 if @s1[loc] =~ /[0-9a-zA-Z]/ window_s1 += @s1[loc] count += 1 end loc += 1 end window_s2 = '' loc = beg_s2 + @size_ngram len_s2 = @s2.length count = 0 while count < size_window && loc < len_s2 if @s2[loc] =~ /[0-9a-zA-Z]/ window_s2 += @s2[loc] count += 1 end loc += 1 end [window_s1, window_s2] end
get_right_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
click to toggle source
# File lib/text_alignment/anchor_finder.rb, line 223 def get_right_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil) size_window ||= @size_window # commend below with the assumption that the end of a document gives a significant locational # return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window)) slen = @s1.length wbeg = beg_s1 + @size_ngram wend = wbeg + size_window wend = slen if wend > slen window_s1 = @s1[wbeg ... wend] slen = @s2.length wbeg = beg_s2 + @size_ngram wend = wbeg + size_window wend = slen if wend > slen window_s2 = @s2[wbeg ... wend] [window_s1, window_s2] end
text_similarity(str1, str2, ngram_order = 2)
click to toggle source
# File lib/text_alignment/anchor_finder.rb, line 244 def text_similarity(str1, str2, ngram_order = 2) return 0 if str1.nil? || str2.nil? String::Similarity.cosine(str1, str2, ngram:ngram_order) end