1 /* The authors make no claims as to the fitness or correctness of this software
2  * for any use whatsoever, and it is provided as is. Any use of this software
3  * is at the user's own risk.
4  */
5 
6 #include "EXTERN.h"
7 #include "common.h"		/* Declare MEM_SIZE */
8 #include "util.h"		/* Declare safemalloc() */
9 
10 #ifdef EDIT_DISTANCE
11 
12 /* edit_dist -- returns the minimum edit distance between two strings
13 
14 	Program by:  Mark Maimone   CMU Computer Science   13 Nov 89
15 	Last Modified:  28 Jan 90
16 
17    If the input strings have length n and m, the algorithm runs in time
18    O(nm) and space O(min(m,n)).
19 
20 HISTORY
21    13 Nov 89 (mwm) Created edit_dist() and set_costs().
22 
23    28 Jan 90 (mwm) Added view_costs().  Should verify that THRESHOLD
24    computations will work even when THRESHOLD is not a multiple of
25    sizeof(int).
26 
27    17 May 93 (mwm) Improved performance when used with trn's newsgroup
28    processing; assume all costs are 1, and you can terminate when a
29    threshold is exceeded.
30 */
31 
32 
33 #define	TRN_SPEEDUP		/* Use a less-general version of the
34 				   routine, one that's better for trn.
35 				   All change costs are 1, and it's okay
36 				   to terminate if the edit distance is
37 				   known to exceed MIN_DIST */
38 
39 #define THRESHOLD 4000		/* worry about allocating more memory only
40 				   when this # of bytes is exceeded */
41 #define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))
42 
43 #define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))
44 
45 #define swap_int(x,y)  (_iswap = (x), (x) = (y), (y) = _iswap)
46 #define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
47 #define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my))
48 #define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))
49 
50 
51 static int insert_cost = 1;
52 static int delete_cost = 1;
53 static int change_cost = 1;
54 static int swap_cost   = 1;
55 
56 static int _iswap;			/* swap_int temp variable */
57 static char *_cswap;			/* swap_char temp variable */
58 static int _mx, _my, _mz;		/* min2, min3 temp variables */
59 
60 
61 
62 void
view_costs(ins,del,ch,swap)63 view_costs(ins, del, ch, swap)
64 int *ins, *del, *ch, *swap;
65 {
66     SAFE_ASSIGN(ins, insert_cost);
67     SAFE_ASSIGN(del, delete_cost);
68     SAFE_ASSIGN(ch, change_cost);
69     SAFE_ASSIGN(swap, swap_cost);
70 } /* view_costs */
71 
72 void
set_costs(ins,del,ch,swap)73 set_costs(ins, del, ch, swap)
74 int ins, del, ch, swap;
75 {
76     insert_cost = ins;
77     delete_cost = del;
78     change_cost = ch;
79     swap_cost   = swap;
80 } /* set_costs */
81 
82 
83 /* edit_distn -- returns the edit distance between two strings, or -1 on
84    failure */
85 
86 int
edit_distn(from,from_len,to,to_len)87 edit_distn(from, from_len, to, to_len)
88 char *from, *to;
89 register int from_len, to_len;
90 {
91 #ifndef TRN_SPEEDUP
92     register int ins, del, ch;	  	/* local copies of edit costs */
93 #endif
94     register int row, col, index;	/* dynamic programming counters */
95     register int radix;			/* radix for modular indexing */
96 #ifdef TRN_SPEEDUP
97     register int low;
98 #endif
99     int *buffer;			/* pointer to storage for one row
100 					   of the d.p. array */
101     static int store[THRESHOLD / sizeof (int)];
102 					/* a small amount of static
103 					   storage, to be used when the
104 					   input strings are small enough */
105 
106 /* Handle trivial cases when one string is empty */
107 
108     if (from == NULL || !from_len)
109 	if (to == NULL || !to_len)
110 	    return 0;
111 	else
112 	    return to_len * insert_cost;
113     else if (to == NULL || !to_len)
114 	return from_len * delete_cost;
115 
116 /* Initialize registers */
117 
118     radix = 2 * from_len + 3;
119 #ifdef TRN_SPEEDUP
120 #define ins 1
121 #define del 1
122 #define ch 1
123 #define swap_cost 1
124 #else
125     ins  = insert_cost;
126     del  = delete_cost;
127     ch   = change_cost;
128 #endif
129 
130 /* Make   from   short enough to fit in the static storage, if it's at all
131    possible */
132 
133     if (from_len > to_len && from_len > STRLENTHRESHOLD) {
134 	swap_int(from_len, to_len);
135 	swap_char(from, to);
136 #ifndef TRN_SPEEDUP
137 	swap_int(ins, del);
138 #endif
139     } /* if from_len > to_len */
140 
141 /* Allocate the array storage (from the heap if necessary) */
142 
143     if (from_len <= STRLENTHRESHOLD)
144 	buffer = store;
145     else
146 	buffer = (int *) safemalloc((MEM_SIZE) radix * sizeof (int));
147 
148 /* Here's where the fun begins.  We will find the minimum edit distance
149    using dynamic programming.  We only need to store two rows of the matrix
150    at a time, since we always progress down the matrix.  For example,
151    given the strings "one" and "two", and insert, delete and change costs
152    equal to 1:
153 
154 	   _  o  n  e
155 	_  0  1  2  3
156 	t  1  1  2  3
157 	w  2  2  2  3
158 	o  3  2  3  3
159 
160    The dynamic programming recursion is defined as follows:
161 
162 	ar(x,0) := x * insert_cost
163 	ar(0,y) := y * delete_cost
164 	ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
165 		       a(x - 1, y) + insert_cost,
166 		       a(x, y - 1) + delete_cost,
167 		       a(x - 2, y - 2) + (from[x] == to[y-1] &&
168 					  from[x-1] == to[y] ? swap_cost :
169 					  infinity))
170 
171    Since this only looks at most two rows and three columns back, we need
172    only store the values for the two preceeding rows.  In this
173    implementation, we do not explicitly store the zero column, so only 2 *
174    from_len + 2   words are needed.  However, in the implementation of the
175    swap_cost   check, the current matrix value is used as a buffer; we
176    can't overwrite the earlier value until the   swap_cost   check has
177    been performed.  So we use   2 * from_len + 3   elements in the buffer.
178 */
179 
180 #define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
181 	buffer[mod(index)]))
182 #define NW(x,y)	  ar(x, y, index + from_len + 2)
183 #define N(x,y)	  ar(x, y, index + from_len + 3)
184 #define W(x,y)	  ar(x, y, index + radix - 1)
185 #define NNWW(x,y) ar(x, y, index + 1)
186 #define mod(x) ((x) % radix)
187 
188     index = 0;
189 
190 #ifdef DEBUG_EDITDIST
191     printf("      ");
192     for (col = 0; col < from_len; col++)
193 	printf(" %c ", from[col]);
194     printf("\n   ");
195 
196     for (col = 0; col <= from_len; col++)
197 	printf("%2d ", col * del);
198 #endif
199 
200 /* Row 0 is handled implicitly; its value at a given column is   col*del.
201    The loop below computes the values for Row 1.  At this point we know the
202    strings are nonempty.  We also don't need to consider swap costs in row
203    1.
204 
205    COMMENT:  the indicies   row and col   below point into the STRING, so
206    the corresponding MATRIX indicies are   row+1 and col+1.
207 */
208 
209     buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));
210 #ifdef TRN_SPEEDUP
211     low = buffer[mod(index + radix - 1)];
212 #endif
213 
214 #ifdef DEBUG_EDITDIST
215     printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
216 #endif
217 
218     for (col = 1; col < from_len; col++) {
219 	buffer[index] = min3(
220 		col * del + ((from[col] == to[0]) ? 0 : ch),
221 		(col + 1) * del + ins,
222 		buffer[index - 1] + del);
223 #ifdef TRN_SPEEDUP
224 	if (buffer[index] < low)
225 	    low = buffer[index];
226 #endif
227 	index++;
228 
229 #ifdef DEBUG_EDITDIST
230 	printf("%2d ", buffer[index - 1]);
231 #endif
232 
233     } /* for col = 1 */
234 
235 #ifdef DEBUG_EDITDIST
236     printf("\n %c %2d ", to[1], 2 * ins);
237 #endif
238 
239 /* Now handle the rest of the matrix */
240 
241     for (row = 1; row < to_len; row++) {
242 	for (col = 0; col < from_len; col++) {
243 	    buffer[index] = min3(
244 		    NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
245 		    N(row, col + 1) + ins,
246 		    W(row + 1, col) + del);
247 	    if (from[col] == to[row - 1] && col > 0 &&
248 		    from[col - 1] == to[row])
249 		buffer[index] = min2(buffer[index],
250 			NNWW(row - 1, col - 1) + swap_cost);
251 
252 #ifdef DEBUG_EDITDIST
253 	    printf("%2d ", buffer[index]);
254 #endif
255 #ifdef TRN_SPEEDUP
256 	    if (buffer[index] < low || col == 0)
257 		low = buffer[index];
258 #endif
259 
260 	    index = mod(index + 1);
261 	} /* for col = 1 */
262 #ifdef DEBUG_EDITDIST
263 	if (row < to_len - 1)
264 	    printf("\n %c %2d ", to[row+1], (row + 2) * ins);
265 	else
266 	    printf("\n");
267 #endif
268 #ifdef TRN_SPEEDUP
269 	if (low > MIN_DIST)
270 	    break;
271 #endif
272     } /* for row = 1 */
273 
274     row = buffer[mod(index + radix - 1)];
275     if (buffer != store)
276 	free((char *) buffer);
277     return row;
278 } /* edit_distn */
279 
280 #endif /* EDIT_DISTANCE */
281