class TextAlignment
Constants
- BUFFER_MIN
- BUFFER_RATE
- CHAR_MAPPING
- MIN_LENGTH_FOR_APPROXIMATION
approximate the location of str1 in str2
- NIL_CHARACTER
- SIMILARITY_THRESHOLD
to work on the hash representation of denotations to assume that there is no bag representation to this method
- SIZE_NGRAM
- SIZE_WINDOW
- TEXT_SIMILARITY_THRESHOLD
- VERSION
Public Class Methods
_find_divisions(_source, _targets)
click to toggle source
# File lib/text_alignment/find_divisions.rb, line 35 def _find_divisions(_source, _targets) indice = [] history = [] cache = {} source = _source.dup targets = _targets.dup until source.strip.empty? || targets.empty? mode, cmp = nil, nil candidates = [] targets.each_with_index do |target, i| if source.size < target[:text].size mode = :t_in_s str1 = source str2 = target[:text] else mode = :s_in_t str1 = target[:text] str2 = source end len1 = str1.length len2 = str2.length offset_begin, offset_end = if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD) approximate_fit(str1, str2) else # the whole source [0, -1] end unless offset_begin.nil? key = str1 + ' _:_ ' + str2[offset_begin .. offset_end] cmp = if cache.has_key? key cache[key] else cmp = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end]) end cache[key] = cmp if (cmp.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)) candidates << {idx:i, offset:offset_begin, mode:mode, cmp:cmp} end end end # return remaining source and targets if m.nil? break if candidates.empty? choice = candidates.max{|a, b| a[:cmp].similarity <=> a[:cmp].similarity} m = choice[:idx] mode = choice[:mode] index = if mode == :t_in_s {divid:targets[m][:divid], region:[0, source.size]} else # :s_in_t cmp = choice[:cmp] offset = choice[:offset] {divid:targets[m][:divid], region:[cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]} end source = source[0 ... index[:region][0]] + source[index[:region][1] .. -1] history << index[:region].dup before_begin = index[:region][0] before_end = index[:region][1] rhistory = history.reverse rhistory.shift rhistory.each do |h| gap = h[1] - h[0] index[:region][0] += gap if index[:region][0] >= h[0] index[:region][1] += gap if index[:region][1] > h[0] end indice << index targets.delete_at(m) end unless source.strip.empty? && targets.empty? index = {divid:nil} index[:remaining_source] = source unless source.strip.empty? index[:remaining_targets] = targets.collect{|s| s[:divid]} unless targets.empty? indice << index end indice end
_find_divisions_old(source, targets)
click to toggle source
# File lib/text_alignment/find_divisions.rb, line 124 def _find_divisions_old(source, targets) mode, m, c, offset_begin = nil, nil, nil, nil targets.each_with_index do |target, i| if source.size < target[:text].size mode = :t_in_s str1 = source str2 = target[:text] else mode = :s_in_t str1 = target[:text] str2 = source end len1 = str1.length len2 = str2.length offset_begin, offset_end = 0, -1 offset_begin, offset_end = approximate_fit(str1, str2) if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD) unless offset_begin.nil? c = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end]) if (c.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (c.str1_match_final - c.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)) m = i break end end end # return remaining source and targets if m.nil? return [[-1, [source, targets.collect{|s| s[:divid]}]]] if m.nil? index = if mode == :t_in_s [targets[m][:divid], [0, source.size]] else # :s_in_t [targets[m][:divid], [c.str2_match_initial + offset_begin, c.str2_match_final + offset_begin + 1]] end next_source = source[0 ... index[1][0]] + source[index[1][1] .. -1] targets.delete_at(m) if next_source.strip.empty? || targets.empty? return [index] else more_index = _find_divisions(next_source, targets) gap = index[1][1] - index[1][0] more_index.each do |i| if (i[0] > -1) i[1][0] += gap if i[1][0] >= index[1][0] i[1][1] += gap if i[1][1] > index[1][0] end end return [index] + more_index end end
approximate_fit(str1, str2)
click to toggle source
If finds an approximate region of str2 that contains str1
# File lib/text_alignment/approximate_fit.rb, line 13 def approximate_fit(str1, str2) raise ArgumentError, 'nil string' if str1.nil? || str2.nil? return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]} ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]} ngram_shared = ngram1 & ngram2 # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings return nil, nil if ngram_shared.empty? signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1} return nil, nil if signature_ngrams.empty? #raise "no signature ngram" cache = {} fit_begin, fit_end = nil, nil signature_ngrams.each do |signature_ngram| loc_signature_ngram_in_str1 = str1.index(signature_ngram) loc_signature_ngram_in_str2 = str2.index(signature_ngram) # approximate the beginning of the fit fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i fit_begin = 0 if fit_begin < 0 # approximate the end of the fit offset_end = str1.length - loc_signature_ngram_in_str1 fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i fit_end = str2.length if fit_end > str2.length next if cache.has_key?("#{fit_begin}-#{fit_end}") text_similarity = text_similarity(str1, str2[fit_begin ... fit_end]) cache["#{fit_begin}-#{fit_end}"] = text_similarity break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD fit_begin, fit_end = nil, nil end return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end return nil, nil end
cdiff(str1, str2)
click to toggle source
# File lib/text_alignment/lcs_cdiff.rb, line 10 def cdiff(str1, str2) raise ArgumentError, "nil string" if str1.nil? || str2.nil? raise "a nil character appears in the input string" if str1.index(TextAlignment::NIL_CHARACTER) || str2.index(TextAlignment::NIL_CHARACTER) sdiff2cdiff(Diff::LCS.sdiff(str1, str2)) end
find_divisions(source, targets, mappings = [])
click to toggle source
It finds, among the targets, the right divisions for the taraget text to fit in.
# File lib/text_alignment/find_divisions.rb, line 15 def find_divisions(source, targets, mappings = []) raise ArgumentError, "nil source" if source == nil raise ArgumentError, "nil or empty targets" if targets == nil || targets.empty? raise ArgumentError, "nil mappings" if mappings == nil character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1} mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1} characters_from = character_mappings.collect{|m| m[0]}.join characters_to = character_mappings.collect{|m| m[1]}.join characters_to.gsub!(/-/, '\-') source.tr!(characters_from, characters_to) targets.each{|target| target[:text].tr!(characters_from, characters_to)} # to process smaller ones first targets.sort!{|s1, s2| s1[:text].size <=> s2[:text].size} TextAlignment._find_divisions(source, targets) end
glcs_required?(str1, mappings = [])
click to toggle source
# File lib/text_alignment/glcs_required.rb, line 5 def glcs_required?(str1, mappings = []) raise ArgumentError, "nil string" if str1.nil? raise ArgumentError, "nil mappings" if mappings.nil? # character mappings can be safely applied to the strings withoug changing the position of other characters character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1} characters_from = character_mappings.collect{|m| m[0]}.join characters_to = character_mappings.collect{|m| m[1]}.join characters_to.gsub!(/-/, '\-') str1.tr!(characters_from, characters_to) str1 =~/([^\p{ASCII}][^\p{ASCII}])/ $1 end
sdiff2cdiff(sdiff)
click to toggle source
# File lib/text_alignment/lcs_cdiff.rb, line 16 def sdiff2cdiff (sdiff) raise ArgumentError, "nil sdiff" if sdiff.nil? cdiff_str1, cdiff_str2 = '', '' sdiff.each do |h| case h.action when '=' cdiff_str1 += h.old_element cdiff_str2 += h.new_element when '!' cdiff_str1 += h.old_element + TextAlignment::NIL_CHARACTER cdiff_str2 += TextAlignment::NIL_CHARACTER + h.new_element when '-' cdiff_str1 += h.old_element cdiff_str2 += TextAlignment::NIL_CHARACTER when '+' cdiff_str1 += TextAlignment::NIL_CHARACTER cdiff_str2 += h.new_element end end cdiff_str1.gsub(/\n/, ' ') + "\n>>>>><<<<<\n" + cdiff_str2.gsub(/\n/, ' ') end
Private Class Methods
text_similarity(str1, str2, ngram_order = 3)
click to toggle source
# File lib/text_alignment/approximate_fit.rb, line 55 def text_similarity(str1, str2, ngram_order = 3) _str1 = str1.delete(" \t\r\n") _str2 = str2.delete(" \t\r\n") String::Similarity.cosine(_str1, _str2, ngram:2) end