/usr/lib/ruby/vendor_ruby/text/levenshtein.rb is in ruby-text 1.3.0-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | #
# Levenshtein distance algorithm implementation for Ruby, with UTF-8 support.
#
# The Levenshtein distance is a measure of how similar two strings s and t are,
# calculated as the number of deletions/insertions/substitutions needed to
# transform s into t. The greater the distance, the more the strings differ.
#
# The Levenshtein distance is also sometimes referred to as the
# easier-to-pronounce-and-spell 'edit distance'.
#
# Author: Paul Battley (pbattley@gmail.com)
#
module Text # :nodoc:
module Levenshtein
# Calculate the Levenshtein distance between two strings +str1+ and +str2+.
#
# The optional argument max_distance can reduce the number of iterations by
# stopping if the Levenshtein distance exceeds this value. This increases
# performance where it is only necessary to compare the distance with a
# reference value instead of calculating the exact distance.
#
# The distance is calculated in terms of Unicode codepoints. Be aware that
# this algorithm does not perform normalisation: if there is a possibility
# of different normalised forms being used, normalisation should be performed
# beforehand.
#
def distance(str1, str2, max_distance = nil)
if max_distance
distance_with_maximum(str1, str2, max_distance)
else
distance_without_maximum(str1, str2)
end
end
private
def distance_with_maximum(str1, str2, max_distance) # :nodoc:
s, t = [str1, str2].sort_by(&:length).
map{ |str| str.encode(Encoding::UTF_8).unpack("U*") }
n = s.length
m = t.length
big_int = n * m
return m if n.zero?
return n if m.zero?
return 0 if s == t
# If the length difference is already greater than the max_distance, then
# there is nothing else to check
if (n - m).abs >= max_distance
return max_distance
end
# The values necessary for our threshold are written; the ones after must
# be filled with large integers since the tailing member of the threshold
# window in the bottom array will run min across them
d = (m + 1).times.map { |i|
if i < m || i < max_distance + 1
i
else
big_int
end
}
x = nil
e = nil
n.times do |i|
# Since we're reusing arrays, we need to be sure to wipe the value left
# of the starting index; we don't have to worry about the value above the
# ending index as the arrays were initially filled with large integers
# and we progress to the right
if e.nil?
e = i + 1
else
e = big_int
end
diag_index = t.length - s.length + i
# If max_distance was specified, we can reduce second loop. So we set
# up our threshold window.
# See:
# Gusfield, Dan (1997). Algorithms on strings, trees, and sequences:
# computer science and computational biology.
# Cambridge, UK: Cambridge University Press. ISBN 0-521-58519-8.
# pp. 263–264.
min = [0, i - max_distance - 1].max
max = [m - 1, i + max_distance].min
(min .. max).each do |j|
# If the diagonal value is already greater than the max_distance
# then we can safety return: the diagonal will never go lower again.
# See: http://www.levenshtein.net/
if j == diag_index && d[j] >= max_distance
return max_distance
end
cost = s[i] == t[j] ? 0 : 1
x = [
d[j+1] + 1, # insertion
e + 1, # deletion
d[j] + cost # substitution
].min
d[j] = e
e = x
end
d[m] = x
end
if x > max_distance
return max_distance
else
return x
end
end
def distance_without_maximum(str1, str2) # :nodoc:
s, t = [str1, str2].map{ |str| str.encode(Encoding::UTF_8).unpack("U*") }
n = s.length
m = t.length
return m if n.zero?
return n if m.zero?
d = (0..m).to_a
x = nil
n.times do |i|
e = i + 1
m.times do |j|
cost = (s[i] == t[j]) ? 0 : 1
x = [
d[j+1] + 1, # insertion
e + 1, # deletion
d[j] + cost # substitution
].min
d[j] = e
e = x
end
d[m] = x
end
return x
end
extend self
end
end
|