1## Copyright (C) 2006 Muthiah Annamalai <muthiah.annamalai@uta.edu>
2## Copyright (C) 2014 Max Görner <max.goerner@tu-dresden.de>
3##
4## This program is free software; you can redistribute it and/or modify it under
5## the terms of the GNU General Public License as published by the Free Software
6## Foundation; either version 3 of the License, or (at your option) any later
7## version.
8##
9## This program is distributed in the hope that it will be useful, but WITHOUT
10## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12## details.
13##
14## You should have received a copy of the GNU General Public License along with
15## this program; if not, see <http://www.gnu.org/licenses/>.
16
17## -*- texinfo -*-
18## @deftypefn  {Function File} {[@var{dist}, @var{L}] =} editdistance (@var{str1}, @var{str2})
19## @deftypefnx {Function File} {[@var{dist}, @var{L}] =} editdistance (@var{str1}, @var{str2}, @var{weights})
20## @deftypefnx {Function File} {[@var{dist}, @var{L}] =} editdistance (@var{str1}, @var{str2}, @var{weights}, @var{modus})
21## Compute the Levenshtein edit distance between the two strings.
22##
23## The optional argument @var{weights} specifies weights for the deletion,
24## matched, and insertion operations; by default it is set to +1, 0, +1
25## respectively, so that a least editdistance means a closer match between the
26## two strings. This function implements the Levenshtein edit distance as
27## presented in Wikipedia article, accessed Nov 2006. Also the levenshtein edit
28## distance of a string with the empty string is defined to be its length.
29##
30## For the special case that there are no weights given and the array L is not
31## requested, an algorithm of Berghel and Roach, which improves an algorithm
32## introduced by Ukkonen in 1985, will be applied. This algorithm is
33## significantly faster most of the times. Its main strength lies in cases with
34## small edit distances, where huge speedups and memory savings are suspectible.
35## The time (and space) complexity is O(((dist^2 - (n - m)^2)/2) + dist), where
36## n and m denote the length of both strings.
37##
38## The optional argument @var{modus} specifies the algorithm to be used. For
39## @var{modus} = 0, Berghel and Roach's algorithm will be used whenever
40## possible. For @var{modus} = 1, the classic algorithm by Fisher and Wagner
41## will be used. If @var{L} is omitted, and @var{modus} = 1, a variant of Fisher
42## and Wagner's algorithm using only a linear amount of memory with respect to
43## the input length, but O(m*n) runtime, will be used. Again, n and m denote the
44## length of both strings.
45##
46## The default return value @var{dist} is the edit distance, and
47## the other return value @var{L} is the distance matrix.
48##
49## @example
50## @group
51## editdistance ('marry', 'marie')
52##   @result{}  2
53## @end group
54## @end example
55##
56## @end deftypefn
57
58function [dist, L] = editdistance (str1, str2, weights = [1 0 1], modus = 0)
59  #Checking correct call of the function.
60  if (nargin < 2 || nargin > 4)
61    print_usage ();
62  endif
63  if (nargin >= 3 && numel (weights) != 3)
64    error ("editdistance: WEIGTHS must be a 3 element vector.")
65  endif
66  if (!isvector(str1) || !isvector(str2))
67    error ("editdistance: Both strings must be a vector.")
68  endif
69
70  #Using the approach of Berghel and Roach, if possible.
71  if (modus == 0 &&
72      nargout < 2 &&
73      (weights(1) == weights(3) && weights(2) == 0)
74     )
75    dist = weights(1)*editdistance_berghel_roach (str1,str2);
76    return;
77  endif
78
79  saveMemory = nargout < 2;
80
81  L1 = numel (str1) + 1;
82  L2 = numel (str2) + 1;
83
84  #Checking whether str1 or str2 are empty strings. If so, the determination of the minimal edit distance is trivial.
85  if (L1 == 1)
86    dist = L2-1;
87    return;
88  elseif (L2 == 1)
89    dist = L1-1;
90    return;
91  endif
92
93  if (saveMemory)
94    L = zeros (2,L2);
95  else
96    L  = zeros (L1,L2);
97  endif
98
99  %Setting weights
100  g = weights(1); %insertion
101  m = weights(2); %match
102  d = weights(3); %deletion
103
104  if (not (saveMemory))
105    L(:,1) = [0:L1-1]'*g;
106  endif
107  L(1,:) = [0:L2-1]*g;
108
109  for idx = 2:L1;
110    if (saveMemory)
111      L(2, 1) = idx-1;
112    endif
113    for idy = 2:L2
114      if (str1(idx-1) == str2(idy-1))
115        score = m;
116      else
117        score = d;
118      endif
119      if (saveMemory)
120        x = 2;
121      else
122        x = idx;
123      endif
124      m1 = L(x-1,idy-1) + score;
125      m2 = L(x-1,idy) + g;
126      m3 = L(x,idy-1) + g;
127      L(x,idy) = min ([m1, m2, m3]);
128    endfor
129    if (saveMemory)
130      L(1, :) = L(2, :);
131    endif
132  endfor
133
134  if (saveMemory)
135    x = 2;
136  else
137    x = L1;
138  endif
139
140  dist = L(x,L2);
141endfunction
142
143function dist = editdistance_berghel_roach (a,b)
144  #Variables named according to Berghel and Roach 1996 (except s, which is called dist here)
145
146  if (!isvector(a) || !isvector(b))
147    print_usage();
148  endif
149
150  if (numel (a) > numel (b))
151    ans = a;
152    a = b;
153    b = ans;
154  endif
155  m = numel (a); n = numel (b);
156  if (m == 0)
157    dist = n;
158    return;
159  endif
160
161  MIN_K = -m-1; #The diagonal with the lowest number is -m
162  MIN_P = -1; #minimum p that has to be cached is 0
163  FKP = sparse (m+n+2,n+1);
164  FKP_cached = sparse (m+n+2,n+1);
165
166  p = n-m;
167  do
168    inc = p;
169    for temp_p = 0:p-1
170      if (abs (n-m - inc) <= temp_p)
171        get_f ((n-m) - inc, temp_p);
172      endif
173      if (abs(n-m + inc) <= temp_p)
174        get_f(n-m+inc,temp_p);
175      endif
176      inc--;
177    endfor
178    get_f (n-m,p);
179    p++;
180  until (get_f (n-m,p-1) == m);
181
182  dist = p-1;
183
184  function f = get_f (k,p)
185    if (p == abs (k)-1)
186      if (k < 0)
187        f = abs (k) - 1;
188      else
189        f = -1;
190      endif
191      return;
192    endif
193    if (p < abs (k)-1)
194      f = -inf;
195      return;
196    endif
197
198    if (FKP_cached(k-MIN_K,p-MIN_P) == 1)
199      f = FKP(k-MIN_K,p-MIN_P);
200      return;
201    endif
202
203    c1 = get_f (k,p-1) + 1;
204    c2 = get_f (k-1,p-1);
205    c3 = get_f (k+1,p-1) + 1;
206    t = max ([c1 c2 c3]);
207    #if (a([t, t+1]) == b([k+t+1, k+t]) t2 = t+1; endif #taking adjacent transpositions into account
208    while (t < m && t+k < n && a(t+1) == b(t+1+k))
209      t += 1;
210    endwhile
211    if (t > m || t+k > n)
212      f = FKP(k-MIN_K,p-MIN_P) = NaN;
213    else
214      f = FKP(k-MIN_K,p-MIN_P) = t;
215    endif
216    FKP_cached(k-MIN_K,p-MIN_P) = 1;
217  endfunction
218endfunction
219
220%!test
221%! l = 50;
222%! n = 20;
223%! rand('state',31513);
224%! abc = 'A':'Z';
225%!
226%! for it = 1:n
227%!   #Generate two Strings
228%!   #This kind of generation produces worst case examples for the new algorithm,
229%!   #so runtime comparisons won't be informative.
230%!   str1 = str2 = abc(randi([1 length(abc)],1,l));
231%!   m = randi(l/2);
232%!   str1(randi([1 l],1,m)) = abc(randi([1 length(abc)],1,m));
233%!   m = randi(l/2);
234%!   str2(randi([1 l],1,m)) = abc(randi([1 length(abc)],1,m));
235%!
236%!   d1 = editdistance(str1,str2); #Berghel and Roach
237%!   tempDist = editdistance(str2,str1);
238%!   assert(d1 == tempDist, "editdistance: Berghel and Roach algorithm is not symmetric.");
239%!   [d2 ~] = editdistance(str1,str2); #Fisher and Wagner
240%!   [tempDist ~] = editdistance(str2,str1);
241%!   assert(d2 == tempDist, "editdistance: Fisher and Wagner algorithm is not symmetric.");
242%!   d3 = editdistance(str1,str2,[1 0 1],1); #Fisher and Wagner with linear memory usage
243%!   tempDist = editdistance(str2,str1,[1 0 1],1);
244%!   assert(d3 == tempDist, "editdistance: Fisher and Wagner algorithm with linear memory usage is not symmetric.");
245%!   assert(d1 == d2, "editdistance: The result of the algorithm by Berghel and Roach and that of the algorithm by Fisher and Wagners differ.");
246%!   assert(d2 == d3, "editdistance: The results of the algorithm by Fisher and Wagner with and without linear memory usage differ.");
247%! endfor
248