1""" 2Contains functions implementing edit distance algorithms. 3""" 4 5from whoosh.compat import xrange 6 7 8def levenshtein(seq1, seq2, limit=None): 9 """Returns the Levenshtein edit distance between two strings. 10 """ 11 12 oneago = None 13 thisrow = list(range(1, len(seq2) + 1)) + [0] 14 for x in xrange(len(seq1)): 15 # Python lists wrap around for negative indices, so put the 16 # leftmost column at the *end* of the list. This matches with 17 # the zero-indexed strings and saves extra calculation. 18 oneago, thisrow = thisrow, [0] * len(seq2) + [x + 1] 19 for y in xrange(len(seq2)): 20 delcost = oneago[y] + 1 21 addcost = thisrow[y - 1] + 1 22 subcost = oneago[y - 1] + (seq1[x] != seq2[y]) 23 thisrow[y] = min(delcost, addcost, subcost) 24 25 if limit and x > limit and min(thisrow) > limit: 26 return limit + 1 27 28 return thisrow[len(seq2) - 1] 29 30 31def damerau_levenshtein(seq1, seq2, limit=None): 32 """Returns the Damerau-Levenshtein edit distance between two strings. 33 """ 34 35 oneago = None 36 thisrow = list(range(1, len(seq2) + 1)) + [0] 37 for x in xrange(len(seq1)): 38 # Python lists wrap around for negative indices, so put the 39 # leftmost column at the *end* of the list. This matches with 40 # the zero-indexed strings and saves extra calculation. 41 twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] 42 for y in xrange(len(seq2)): 43 delcost = oneago[y] + 1 44 addcost = thisrow[y - 1] + 1 45 subcost = oneago[y - 1] + (seq1[x] != seq2[y]) 46 thisrow[y] = min(delcost, addcost, subcost) 47 # This block deals with transpositions 48 if (x > 0 and y > 0 and seq1[x] == seq2[y - 1] 49 and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]): 50 thisrow[y] = min(thisrow[y], twoago[y - 2] + 1) 51 52 if limit and x > limit and min(thisrow) > limit: 53 return limit + 1 54 55 return thisrow[len(seq2) - 1] 56 57 58def relative(a, b): 59 """Returns the relative distance between two strings, in the range 60 [0-1] where 1 means total equality. 61 """ 62 63 d = distance(a, b) 64 longer = float(max((len(a), len(b)))) 65 shorter = float(min((len(a), len(b)))) 66 r = ((longer - d) / longer) * (shorter / longer) 67 return r 68 69 70distance = damerau_levenshtein 71