Package gmisclib :: Module edit_distance
[frames] | no frames]

Source Code for Module gmisclib.edit_distance

  1   
  2  """Levenshtein (edit) distance between two strings of symbols. 
  3  """ 
  4   
  5  import numpy 
  6   
7 -def dist(s, t, csub, cinsert, cdel):
8 """Cost for converting string s into string t. This function 9 takes three dictionaries that contain the cost of insertion, deletion, 10 and substitutions. 11 12 @param s: starting string 13 @param t: ending string 14 @type s: string or array 15 @type t: string or array 16 @param csub: cost of converting a to b 17 @type csub: dict((a,b) : float) 18 @param cinsert: cost of insertion 19 @type cinsert: dict(a: float) 20 @param cdel: cost of deletion 21 @type cdel: dict(a: float) 22 """ 23 def f(a, b): 24 if a is None: 25 return cinsert[b] 26 elif b is None: 27 return cdel[a] 28 return csub[(a, b)]
29 return distf(s, t, f) 30 31
32 -def distf1(s, t, csub):
33 """Cost for converting string C{s} into string C{t}. This function takes 34 a function that computes the cost of insertions, deletions, or substitutions. 35 (This is an alternative implementation for C{distf2()}.) 36 37 @param s: starting string 38 @param t: ending string 39 @type s: string or array 40 @type t: string or array 41 @param csub: cost of converting a to b 42 @type csub: function(a,b) : float 43 """ 44 m = len(s) 45 n = len(t) 46 d = numpy.zeros((m+1, n+1), numpy.float) 47 for i in range(1, m+1): 48 d[i,0] = d[i-1,0] + csub(s[i-1], None) 49 for j in range(1, n+1): 50 d[0,j] = d[0,j-1] + csub(None, t[j-1]) 51 52 for j in range(1, n+1): 53 for i in range(1, m+1): 54 d[i,j] = min(d[i-1,j] + csub(s[i-1], None), # insertion 55 d[i,j-1] + csub(None, t[j-1]), # deletion 56 d[i-1, j-1] + csub(s[i-1], t[j-1]) # sub 57 ) 58 return d[m, n]
59 60
61 -def distf2(s, t, csub):
62 """Cost for converting string s into string t. This function takes 63 a function that computes the cost of insertions, deletions, or substitutions. 64 65 @param s: starting string 66 @param t: ending string 67 @type s: string or array 68 @type t: string or array 69 @param csub: cost of converting a to b 70 @type csub: function(a,b) : float 71 """ 72 m = len(s) 73 n = len(t) 74 dj = numpy.zeros((m+1,)) 75 djm1 = numpy.zeros((m+1,)) 76 for i in range(1, m+1): 77 dj[i] = dj[i-1] + csub(s[i-1], None) 78 79 for j in range(1, n+1): 80 tmp = djm1 81 djm1 = dj 82 dj = tmp 83 dj[:] = 0.0 84 dj[0] = djm1[0] + csub(None, t[j-1]) 85 for i in range(1, m+1): 86 dj[i] = min(dj[i-1] + csub(s[i-1], None), # insertion 87 djm1[i] + csub(None, t[j-1]), # deletion 88 djm1[i-1] + csub(s[i-1], t[j-1]) # sub 89 ) 90 return dj[m]
91 92 93 distf = distf2 94 95
96 -def def_cost(a, b):
97 """This is the default cost function for converting C{a} into C{b}. 98 Insertions, deletions, and substitutions all have a unit cost. Normally, 99 this is passed as an arg to L{distf}. 100 @param a: a symbol (or None to indicate an insertion). 101 @param b: a symbol (or None to indicate an deletion). 102 @return: the cost of changing C{a} into C{b} 103 @rtype: int 104 """ 105 if a is None or b is None: 106 return 1 107 if a==b: 108 return 0 109 return 1
110 111
112 -def free_sub(a, b):
113 """This is a sample cost function for converting C{a} into C{b}. 114 Insertions and deletions have a unit cost; substitutions are free. Normally, 115 this is passed as an arg to L{distf}. 116 @param a: a symbol (or None to indicate an insertion). 117 @param b: a symbol (or None to indicate an deletion). 118 @return: the cost of changing C{a} into C{b} 119 @rtype: int 120 """ 121 if a is None or b is None: 122 return 1 123 return 0
124 125
126 -def free_del(a, b):
127 """This is a sample cost function for converting C{a} into C{b}. 128 Insertions and substitutions have a unit cost; deletion are free. Normally, 129 this is passed as an arg to L{distf}. 130 @param a: a symbol (or None to indicate an insertion). 131 @param b: a symbol (or None to indicate an deletion). 132 @return: the cost of changing C{a} into C{b} 133 @rtype: int 134 """ 135 if b is None: 136 return 0 137 if a is None: 138 return 1 139 if a==b: 140 return 0 141 return 1
142 143
144 -class text_cost(object):
145 """This is useful for differencing documents that have been parsed 146 into lists of words. It computes the cost of a change of words 147 by an edit distance computation on the characters within the words. 148 (It caches the costs of commonly encountered words for speed.) 149 Instances of this class can be passed as a cost function to 150 L{distf}. 151 """ 152
153 - def __init__(self, costfac=None, cachesize=None, word_cost=None, frac_exp=0.0):
154 """Create an instance. 155 @param costfac: None, or an arbitrary function that multiplies the scaled 156 edit distance between the two words. Called as C{self.costfac(a,b)} 157 where either C{a} or C{b} can be None for insertions or deletions. 158 @type costfac: C{function(str,str): float} or C{None} 159 @param cachesize: how large a cache of word-to-word distances should be kept? 160 @type cachesize: None (meaning unlimited) or int 161 @param word_cost: cost function to use inside words: this is a cost for insertion, 162 deletion, or substitution of letters. 163 @type word_cost: C{function(str,str): float} or None (meaning use L{def_cost}). 164 @param frac_exp: An exponent for scaling the edit distance by the length of words. 165 @type frac_exp: float 166 """ 167 self.costfac = costfac 168 self.cache = {} 169 assert cachesize is None or cachesize >= 0 170 self.cachesize = cachesize 171 172 if word_cost is None: 173 self.word_cost = def_cost 174 else: 175 self.word_cost = word_cost 176 self.frac_exp = float(frac_exp)
177 178 179
180 - def __call__(self, a, b):
181 """To be used by L{distf}. 182 @type a: str or None 183 @type b: str or None 184 @return: a cost 185 @rtype: presumably a float 186 """ 187 if a is None: 188 a = '' 189 else: 190 if a == b: 191 return 0.0 192 if b is None: 193 b = '' 194 195 try: 196 return self.cache[(a,b)] 197 except KeyError: 198 pass 199 200 cost = distf(a, b, self.word_cost) 201 cost /= float(max(len(a), len(b)))**self.frac_exp 202 if self.cachesize is not None and len(self.cache) > self.cachesize: 203 self.cache.pop() 204 if self.costfac is not None: 205 cost *= self.costfac(a, b) 206 self.cache[(a,b)] = cost 207 return cost
208 209 210
211 -def test():
212 assert distf("hello", "helpo", def_cost) == 1 213 assert distf("hello", "helo", def_cost) == 1 214 assert distf("hello", "hellox", def_cost) == 1 215 assert distf("hello", "helloxx", def_cost) == 2 216 assert distf("hello", "elo", def_cost) == 2 217 assert distf("hello", "eelpo", def_cost) == 2 218 assert distf("kitten", "sitting", def_cost) == 3 219 assert distf("saturday", "sunday", def_cost) == 3 220 assert distf("hello", "xxxxo", free_sub) == 0 221 assert distf("hello", "xxxo", free_sub) == 1 222 assert distf("hello", "xxo", free_sub) == 2 223 assert distf("hello", "yxxxxo", free_sub) == 1 224 assert distf("hello", "yxxxxoy", free_sub) == 2 225 assert distf("hello", "hello", free_del) == 0 226 assert distf("hello", "helo", free_del) == 0 227 assert distf("hello", "hlo", free_del) == 0 228 assert distf("helo", "hello", free_del) == 1 229 assert distf("hlo", "hello", free_del) == 2 230 assert distf("hello", "hallo", free_del) == 1 231 assert distf("hlo", "hellp", free_del) == 3
232 233
234 -def test2():
235 a = ['Now', ' ', 'is', ' ', 'the', ' ', 'time', ' ', 'for', '\n', 'testing', '.'] 236 b = list(a) 237 b[-1] = '!' 238 assert distf(a, b, text_cost()) == 1 239 b[-3] = ' ' 240 assert distf(a, b, text_cost()) == 3 241 b = list(a) 242 b[-4] = 'from' 243 assert distf(a, b, text_cost()) == 2 244 b[-4:-3] = [] 245 assert distf(a, b, text_cost()) == 3 246 b = list(a) 247 b[-4] = 'fir' 248 assert abs( distf(a, b, text_cost(frac_exp=1.0)) - 1./3.) < 0.001
249 250 251 if __name__ == '__main__': 252 test() 253 test2() 254