1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """A class to calculate a similarity based on the Levenshtein
22 distance. See http://en.wikipedia.org/wiki/Levenshtein_distance.
23
24 If available, the python-Levenshtein package will be used which will provide
25 better performance as it is implemented natively. See
26 http://trific.ath.cx/python/levenshtein/
27 """
28
29 import math
30 import sys
31
33 """Calculates the distance for use in similarity calculation. Python
34 version."""
35 l1 = len(a)
36 l2 = len(b)
37 if stopvalue == -1:
38 stopvalue = l2
39 current = range(l1+1)
40 for i in range(1, l2+1):
41 previous, current = current, [i]+[0]*l1
42 least = l2
43 for j in range(1, l1 + 1):
44 change = previous[j-1]
45 if a[j-1] != b[i-1]:
46 change = change + 1
47 insert = previous[j] + 1
48 delete = current[j-1] + 1
49 current[j] = min(insert, delete, change)
50 if least > current[j]:
51 least = current[j]
52
53
54 if least > stopvalue:
55 return least
56
57 return current[l1]
58
60 """Same as python_distance in functionality. This uses the fast C
61 version if we detected it earlier.
62
63 Note that this does not support arbitrary sequence types, but only
64 string types."""
65 return Levenshtein.distance(a, b)
66
67 try:
68 import Levenshtein as Levenshtein
69 distance = native_distance
70 except Exception:
71 print >> sys.stderr, "Python-Levenshtein not found. Continuing with built-in (slower) fuzzy matching."
72 distance = python_distance
73
76 self.MAX_LEN = max_len
77
97
99 """Returns the similarity between a and b based on Levenshtein distance. It
100 can stop prematurely as soon as it sees that a and b will be no simmilar than
101 the percentage specified in stoppercentage.
102
103 The Levenshtein distance is calculated, but the following should be noted:
104 * Only the first MAX_LEN characters are considered. Long strings differing
105 at the end will therefore seem to match better than they should. See the
106 use of the variable penalty to lessen the effect of this.
107 * Strings with widely different lengths give the opportunity for shortcut.
108 This is by definition of the Levenshtein distance: the distance will be
109 at least as much as the difference in string length.
110 * Calculation is stopped as soon as a similarity of stoppercentage becomes
111 unattainable. See the use of the variable stopvalue.
112 * Implementation uses memory O(min(len(a), len(b))
113 * Excecution time is O(len(a)*len(b))
114 """
115 l1, l2 = len(a), len(b)
116 if l1 == 0 or l2 == 0:
117 return 0
118
119 if l1 > l2:
120 l1, l2 = l2, l1
121 a, b = b, a
122
123
124
125 maxsimilarity = 100 - 100.0*abs(l1 - l2)/l2
126 if maxsimilarity < stoppercentage:
127 return maxsimilarity * 1.0
128
129
130 penalty = 0
131 if l2 > self.MAX_LEN:
132 b = b[:self.MAX_LEN]
133 l2 = self.MAX_LEN
134 penalty += 7
135 if l1 > self.MAX_LEN:
136 a = a[:self.MAX_LEN]
137 l1 = self.MAX_LEN
138 penalty += 7
139
140
141 stopvalue = math.ceil((100.0 - stoppercentage)/100 * l2)
142 dist = distance(a, b, stopvalue)
143 if dist > stopvalue:
144 return stoppercentage - 1.0
145
146
147
148 if dist != 0:
149 penalty = 0
150 return 100 - (dist*1.0/l2)*100 - penalty
151
152
153 if __name__ == "__main__":
154 from sys import argv
155 comparer = LevenshteinComparer()
156 print "Similarity:\n%s" % comparer.similarity(argv[1], argv[2], 50)
157