1
2 """Levenshtein (edit) distance between two strings of symbols.
3 """
4
5 import numpy
6
7 -def dist(s, t, csub, cinsert, cdel):
8 """Cost for converting string s into string t. This function
9 takes three dictionaries that contain the cost of insertion, deletion,
10 and substitutions.
11
12 @param s: starting string
13 @param t: ending string
14 @type s: string or array
15 @type t: string or array
16 @param csub: cost of converting a to b
17 @type csub: dict((a,b) : float)
18 @param cinsert: cost of insertion
19 @type cinsert: dict(a: float)
20 @param cdel: cost of deletion
21 @type cdel: dict(a: float)
22 """
23 def f(a, b):
24 if a is None:
25 return cinsert[b]
26 elif b is None:
27 return cdel[a]
28 return csub[(a, b)]
29 return distf(s, t, f)
30
31
33 """Cost for converting string C{s} into string C{t}. This function takes
34 a function that computes the cost of insertions, deletions, or substitutions.
35 (This is an alternative implementation for C{distf2()}.)
36
37 @param s: starting string
38 @param t: ending string
39 @type s: string or array
40 @type t: string or array
41 @param csub: cost of converting a to b
42 @type csub: function(a,b) : float
43 """
44 m = len(s)
45 n = len(t)
46 d = numpy.zeros((m+1, n+1), numpy.float)
47 for i in range(1, m+1):
48 d[i,0] = d[i-1,0] + csub(s[i-1], None)
49 for j in range(1, n+1):
50 d[0,j] = d[0,j-1] + csub(None, t[j-1])
51
52 for j in range(1, n+1):
53 for i in range(1, m+1):
54 d[i,j] = min(d[i-1,j] + csub(s[i-1], None),
55 d[i,j-1] + csub(None, t[j-1]),
56 d[i-1, j-1] + csub(s[i-1], t[j-1])
57 )
58 return d[m, n]
59
60
62 """Cost for converting string s into string t. This function takes
63 a function that computes the cost of insertions, deletions, or substitutions.
64
65 @param s: starting string
66 @param t: ending string
67 @type s: string or array
68 @type t: string or array
69 @param csub: cost of converting a to b
70 @type csub: function(a,b) : float
71 """
72 m = len(s)
73 n = len(t)
74 dj = numpy.zeros((m+1,))
75 djm1 = numpy.zeros((m+1,))
76 for i in range(1, m+1):
77 dj[i] = dj[i-1] + csub(s[i-1], None)
78
79 for j in range(1, n+1):
80 tmp = djm1
81 djm1 = dj
82 dj = tmp
83 dj[:] = 0.0
84 dj[0] = djm1[0] + csub(None, t[j-1])
85 for i in range(1, m+1):
86 dj[i] = min(dj[i-1] + csub(s[i-1], None),
87 djm1[i] + csub(None, t[j-1]),
88 djm1[i-1] + csub(s[i-1], t[j-1])
89 )
90 return dj[m]
91
92
93 distf = distf2
94
95
97 """This is the default cost function for converting C{a} into C{b}.
98 Insertions, deletions, and substitutions all have a unit cost. Normally,
99 this is passed as an arg to L{distf}.
100 @param a: a symbol (or None to indicate an insertion).
101 @param b: a symbol (or None to indicate an deletion).
102 @return: the cost of changing C{a} into C{b}
103 @rtype: int
104 """
105 if a is None or b is None:
106 return 1
107 if a==b:
108 return 0
109 return 1
110
111
113 """This is a sample cost function for converting C{a} into C{b}.
114 Insertions and deletions have a unit cost; substitutions are free. Normally,
115 this is passed as an arg to L{distf}.
116 @param a: a symbol (or None to indicate an insertion).
117 @param b: a symbol (or None to indicate an deletion).
118 @return: the cost of changing C{a} into C{b}
119 @rtype: int
120 """
121 if a is None or b is None:
122 return 1
123 return 0
124
125
127 """This is a sample cost function for converting C{a} into C{b}.
128 Insertions and substitutions have a unit cost; deletion are free. Normally,
129 this is passed as an arg to L{distf}.
130 @param a: a symbol (or None to indicate an insertion).
131 @param b: a symbol (or None to indicate an deletion).
132 @return: the cost of changing C{a} into C{b}
133 @rtype: int
134 """
135 if b is None:
136 return 0
137 if a is None:
138 return 1
139 if a==b:
140 return 0
141 return 1
142
143
144 -class text_cost(object):
145 """This is useful for differencing documents that have been parsed
146 into lists of words. It computes the cost of a change of words
147 by an edit distance computation on the characters within the words.
148 (It caches the costs of commonly encountered words for speed.)
149 Instances of this class can be passed as a cost function to
150 L{distf}.
151 """
152
153 - def __init__(self, costfac=None, cachesize=None, word_cost=None, frac_exp=0.0):
154 """Create an instance.
155 @param costfac: None, or an arbitrary function that multiplies the scaled
156 edit distance between the two words. Called as C{self.costfac(a,b)}
157 where either C{a} or C{b} can be None for insertions or deletions.
158 @type costfac: C{function(str,str): float} or C{None}
159 @param cachesize: how large a cache of word-to-word distances should be kept?
160 @type cachesize: None (meaning unlimited) or int
161 @param word_cost: cost function to use inside words: this is a cost for insertion,
162 deletion, or substitution of letters.
163 @type word_cost: C{function(str,str): float} or None (meaning use L{def_cost}).
164 @param frac_exp: An exponent for scaling the edit distance by the length of words.
165 @type frac_exp: float
166 """
167 self.costfac = costfac
168 self.cache = {}
169 assert cachesize is None or cachesize >= 0
170 self.cachesize = cachesize
171
172 if word_cost is None:
173 self.word_cost = def_cost
174 else:
175 self.word_cost = word_cost
176 self.frac_exp = float(frac_exp)
177
178
179
180 - def __call__(self, a, b):
181 """To be used by L{distf}.
182 @type a: str or None
183 @type b: str or None
184 @return: a cost
185 @rtype: presumably a float
186 """
187 if a is None:
188 a = ''
189 else:
190 if a == b:
191 return 0.0
192 if b is None:
193 b = ''
194
195 try:
196 return self.cache[(a,b)]
197 except KeyError:
198 pass
199
200 cost = distf(a, b, self.word_cost)
201 cost /= float(max(len(a), len(b)))**self.frac_exp
202 if self.cachesize is not None and len(self.cache) > self.cachesize:
203 self.cache.pop()
204 if self.costfac is not None:
205 cost *= self.costfac(a, b)
206 self.cache[(a,b)] = cost
207 return cost
208
209
210
212 assert distf("hello", "helpo", def_cost) == 1
213 assert distf("hello", "helo", def_cost) == 1
214 assert distf("hello", "hellox", def_cost) == 1
215 assert distf("hello", "helloxx", def_cost) == 2
216 assert distf("hello", "elo", def_cost) == 2
217 assert distf("hello", "eelpo", def_cost) == 2
218 assert distf("kitten", "sitting", def_cost) == 3
219 assert distf("saturday", "sunday", def_cost) == 3
220 assert distf("hello", "xxxxo", free_sub) == 0
221 assert distf("hello", "xxxo", free_sub) == 1
222 assert distf("hello", "xxo", free_sub) == 2
223 assert distf("hello", "yxxxxo", free_sub) == 1
224 assert distf("hello", "yxxxxoy", free_sub) == 2
225 assert distf("hello", "hello", free_del) == 0
226 assert distf("hello", "helo", free_del) == 0
227 assert distf("hello", "hlo", free_del) == 0
228 assert distf("helo", "hello", free_del) == 1
229 assert distf("hlo", "hello", free_del) == 2
230 assert distf("hello", "hallo", free_del) == 1
231 assert distf("hlo", "hellp", free_del) == 3
232
233
235 a = ['Now', ' ', 'is', ' ', 'the', ' ', 'time', ' ', 'for', '\n', 'testing', '.']
236 b = list(a)
237 b[-1] = '!'
238 assert distf(a, b, text_cost()) == 1
239 b[-3] = ' '
240 assert distf(a, b, text_cost()) == 3
241 b = list(a)
242 b[-4] = 'from'
243 assert distf(a, b, text_cost()) == 2
244 b[-4:-3] = []
245 assert distf(a, b, text_cost()) == 3
246 b = list(a)
247 b[-4] = 'fir'
248 assert abs( distf(a, b, text_cost(frac_exp=1.0)) - 1./3.) < 0.001
249
250
251 if __name__ == '__main__':
252 test()
253 test2()
254