class Mizuho::JaroWinklerPure

Constants

THRESHOLD

Public Instance Methods

getDistance( s1, s2 ) click to toggle source
# File lib/mizuho/fuzzystringmatch.rb, line 26
# Computes a Jaro-style similarity score between two strings, with a
# common-prefix bonus applied once the base score reaches THRESHOLD.
#
# @param s1 [String] first string
# @param s2 [String] second string
# @return [Float] 0.0 when no characters match; otherwise the base score,
#   plus the prefix bonus when the base score is not below THRESHOLD
def getDistance( s1, s2 )
  chars1 = s1.split( // )
  chars2 = s2.split( // )

  # The strictly longer input (by String#size) is scanned for matches;
  # on a tie the second argument takes the "longer" role.
  longer, shorter = s1.size > s2.size ? [chars1, chars2] : [chars2, chars1]

  # Half-width of the search window around each position of the shorter input.
  window = [longer.size / 2 - 1, 0].max
  taken  = Array.new( longer.size, false )

  # For every character of the shorter input, claim the first still-unclaimed
  # equal character of the longer input that lies inside the window.
  matched_short = []
  shorter.each_with_index do |ch, pos|
    lo  = [pos - window, 0].max
    hi  = [pos + window + 1, longer.size].min
    hit = (lo ... hi).find { |k| !taken[k] && ch == longer[k] }
    next if hit.nil?

    taken[hit] = true
    matched_short << ch
  end

  match_count = matched_short.size
  return 0.0 if match_count.zero?

  # Matched characters of the longer input, in their own order.
  matched_long = longer.select.with_index { |_ch, k| taken[k] }

  # Positions where the two matched sequences disagree.
  transpositions = matched_short.zip( matched_long ).count { |a, b| a != b }

  # Length of the shared leading run, limited to the shorter input's length.
  prefix = (0 ... shorter.size).take_while { |k| s1[k] == s2[k] }.size

  m = match_count.to_f
  t = transpositions / 2 # integer division: each swapped pair counts once
  j = ((m / s1.size) + (m / s2.size) + ((m - t) / m)) / 3.0

  if j < THRESHOLD
    j
  else
    # The bonus per shared prefix character is capped at 0.1.
    j + [0.1, 1.0 / longer.size].min * prefix * (1 - j)
  end
end