1## Copyright (C) 2006 Muthiah Annamalai <muthiah.annamalai@uta.edu> 2## Copyright (C) 2014 Max Görner <max.goerner@tu-dresden.de> 3## 4## This program is free software; you can redistribute it and/or modify it under 5## the terms of the GNU General Public License as published by the Free Software 6## Foundation; either version 3 of the License, or (at your option) any later 7## version. 8## 9## This program is distributed in the hope that it will be useful, but WITHOUT 10## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 12## details. 13## 14## You should have received a copy of the GNU General Public License along with 15## this program; if not, see <http://www.gnu.org/licenses/>. 16 17## -*- texinfo -*- 18## @deftypefn {Function File} {[@var{dist}, @var{L}] =} editdistance (@var{str1}, @var{str2}) 19## @deftypefnx {Function File} {[@var{dist}, @var{L}] =} editdistance (@var{str1}, @var{str2}, @var{weights}) 20## @deftypefnx {Function File} {[@var{dist}, @var{L}] =} editdistance (@var{str1}, @var{str2}, @var{weights}, @var{modus}) 21## Compute the Levenshtein edit distance between the two strings. 22## 23## The optional argument @var{weights} specifies weights for the deletion, 24## matched, and insertion operations; by default it is set to +1, 0, +1 25## respectively, so that a least editdistance means a closer match between the 26## two strings. This function implements the Levenshtein edit distance as 27## presented in Wikipedia article, accessed Nov 2006. Also the levenshtein edit 28## distance of a string with the empty string is defined to be its length. 29## 30## For the special case that there are no weights given and the array L is not 31## requested, an algorithm of Berghel and Roach, which improves an algorithm 32## introduced by Ukkonen in 1985, will be applied. This algorithm is 33## significantly faster most of the times. Its main strength lies in cases with 34## small edit distances, where huge speedups and memory savings are suspectible. 35## The time (and space) complexity is O(((dist^2 - (n - m)^2)/2) + dist), where 36## n and m denote the length of both strings. 37## 38## The optional argument @var{modus} specifies the algorithm to be used. For 39## @var{modus} = 0, Berghel and Roach's algorithm will be used whenever 40## possible. For @var{modus} = 1, the classic algorithm by Fisher and Wagner 41## will be used. If @var{L} is omitted, and @var{modus} = 1, a variant of Fisher 42## and Wagner's algorithm using only a linear amount of memory with respect to 43## the input length, but O(m*n) runtime, will be used. Again, n and m denote the 44## length of both strings. 45## 46## The default return value @var{dist} is the edit distance, and 47## the other return value @var{L} is the distance matrix. 48## 49## @example 50## @group 51## editdistance ('marry', 'marie') 52## @result{} 2 53## @end group 54## @end example 55## 56## @end deftypefn 57 58function [dist, L] = editdistance (str1, str2, weights = [1 0 1], modus = 0) 59 #Checking correct call of the function. 60 if (nargin < 2 || nargin > 4) 61 print_usage (); 62 endif 63 if (nargin >= 3 && numel (weights) != 3) 64 error ("editdistance: WEIGTHS must be a 3 element vector.") 65 endif 66 if (!isvector(str1) || !isvector(str2)) 67 error ("editdistance: Both strings must be a vector.") 68 endif 69 70 #Using the approach of Berghel and Roach, if possible. 71 if (modus == 0 && 72 nargout < 2 && 73 (weights(1) == weights(3) && weights(2) == 0) 74 ) 75 dist = weights(1)*editdistance_berghel_roach (str1,str2); 76 return; 77 endif 78 79 saveMemory = nargout < 2; 80 81 L1 = numel (str1) + 1; 82 L2 = numel (str2) + 1; 83 84 #Checking whether str1 or str2 are empty strings. If so, the determination of the minimal edit distance is trivial. 85 if (L1 == 1) 86 dist = L2-1; 87 return; 88 elseif (L2 == 1) 89 dist = L1-1; 90 return; 91 endif 92 93 if (saveMemory) 94 L = zeros (2,L2); 95 else 96 L = zeros (L1,L2); 97 endif 98 99 %Setting weights 100 g = weights(1); %insertion 101 m = weights(2); %match 102 d = weights(3); %deletion 103 104 if (not (saveMemory)) 105 L(:,1) = [0:L1-1]'*g; 106 endif 107 L(1,:) = [0:L2-1]*g; 108 109 for idx = 2:L1; 110 if (saveMemory) 111 L(2, 1) = idx-1; 112 endif 113 for idy = 2:L2 114 if (str1(idx-1) == str2(idy-1)) 115 score = m; 116 else 117 score = d; 118 endif 119 if (saveMemory) 120 x = 2; 121 else 122 x = idx; 123 endif 124 m1 = L(x-1,idy-1) + score; 125 m2 = L(x-1,idy) + g; 126 m3 = L(x,idy-1) + g; 127 L(x,idy) = min ([m1, m2, m3]); 128 endfor 129 if (saveMemory) 130 L(1, :) = L(2, :); 131 endif 132 endfor 133 134 if (saveMemory) 135 x = 2; 136 else 137 x = L1; 138 endif 139 140 dist = L(x,L2); 141endfunction 142 143function dist = editdistance_berghel_roach (a,b) 144 #Variables named according to Berghel and Roach 1996 (except s, which is called dist here) 145 146 if (!isvector(a) || !isvector(b)) 147 print_usage(); 148 endif 149 150 if (numel (a) > numel (b)) 151 ans = a; 152 a = b; 153 b = ans; 154 endif 155 m = numel (a); n = numel (b); 156 if (m == 0) 157 dist = n; 158 return; 159 endif 160 161 MIN_K = -m-1; #The diagonal with the lowest number is -m 162 MIN_P = -1; #minimum p that has to be cached is 0 163 FKP = sparse (m+n+2,n+1); 164 FKP_cached = sparse (m+n+2,n+1); 165 166 p = n-m; 167 do 168 inc = p; 169 for temp_p = 0:p-1 170 if (abs (n-m - inc) <= temp_p) 171 get_f ((n-m) - inc, temp_p); 172 endif 173 if (abs(n-m + inc) <= temp_p) 174 get_f(n-m+inc,temp_p); 175 endif 176 inc--; 177 endfor 178 get_f (n-m,p); 179 p++; 180 until (get_f (n-m,p-1) == m); 181 182 dist = p-1; 183 184 function f = get_f (k,p) 185 if (p == abs (k)-1) 186 if (k < 0) 187 f = abs (k) - 1; 188 else 189 f = -1; 190 endif 191 return; 192 endif 193 if (p < abs (k)-1) 194 f = -inf; 195 return; 196 endif 197 198 if (FKP_cached(k-MIN_K,p-MIN_P) == 1) 199 f = FKP(k-MIN_K,p-MIN_P); 200 return; 201 endif 202 203 c1 = get_f (k,p-1) + 1; 204 c2 = get_f (k-1,p-1); 205 c3 = get_f (k+1,p-1) + 1; 206 t = max ([c1 c2 c3]); 207 #if (a([t, t+1]) == b([k+t+1, k+t]) t2 = t+1; endif #taking adjacent transpositions into account 208 while (t < m && t+k < n && a(t+1) == b(t+1+k)) 209 t += 1; 210 endwhile 211 if (t > m || t+k > n) 212 f = FKP(k-MIN_K,p-MIN_P) = NaN; 213 else 214 f = FKP(k-MIN_K,p-MIN_P) = t; 215 endif 216 FKP_cached(k-MIN_K,p-MIN_P) = 1; 217 endfunction 218endfunction 219 220%!test 221%! l = 50; 222%! n = 20; 223%! rand('state',31513); 224%! abc = 'A':'Z'; 225%! 226%! for it = 1:n 227%! #Generate two Strings 228%! #This kind of generation produces worst case examples for the new algorithm, 229%! #so runtime comparisons won't be informative. 230%! str1 = str2 = abc(randi([1 length(abc)],1,l)); 231%! m = randi(l/2); 232%! str1(randi([1 l],1,m)) = abc(randi([1 length(abc)],1,m)); 233%! m = randi(l/2); 234%! str2(randi([1 l],1,m)) = abc(randi([1 length(abc)],1,m)); 235%! 236%! d1 = editdistance(str1,str2); #Berghel and Roach 237%! tempDist = editdistance(str2,str1); 238%! assert(d1 == tempDist, "editdistance: Berghel and Roach algorithm is not symmetric."); 239%! [d2 ~] = editdistance(str1,str2); #Fisher and Wagner 240%! [tempDist ~] = editdistance(str2,str1); 241%! assert(d2 == tempDist, "editdistance: Fisher and Wagner algorithm is not symmetric."); 242%! d3 = editdistance(str1,str2,[1 0 1],1); #Fisher and Wagner with linear memory usage 243%! tempDist = editdistance(str2,str1,[1 0 1],1); 244%! assert(d3 == tempDist, "editdistance: Fisher and Wagner algorithm with linear memory usage is not symmetric."); 245%! assert(d1 == d2, "editdistance: The result of the algorithm by Berghel and Roach and that of the algorithm by Fisher and Wagners differ."); 246%! assert(d2 == d3, "editdistance: The results of the algorithm by Fisher and Wagner with and without linear memory usage differ."); 247%! endfor 248