1 /* The authors make no claims as to the fitness or correctness of this software
2 * for any use whatsoever, and it is provided as is. Any use of this software
3 * is at the user's own risk.
4 */
5
6 #include "EXTERN.h"
7 #include "common.h" /* Declare MEM_SIZE */
8 #include "util.h" /* Declare safemalloc() */
9
10 #ifdef EDIT_DISTANCE
11
12 /* edit_dist -- returns the minimum edit distance between two strings
13
14 Program by: Mark Maimone CMU Computer Science 13 Nov 89
15 Last Modified: 28 Jan 90
16
17 If the input strings have length n and m, the algorithm runs in time
18 O(nm) and space O(min(m,n)).
19
20 HISTORY
21 13 Nov 89 (mwm) Created edit_dist() and set_costs().
22
23 28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD
24 computations will work even when THRESHOLD is not a multiple of
25 sizeof(int).
26
27 17 May 93 (mwm) Improved performance when used with trn's newsgroup
28 processing; assume all costs are 1, and you can terminate when a
29 threshold is exceeded.
30 */
31
32
33 #define TRN_SPEEDUP /* Use a less-general version of the
34 routine, one that's better for trn.
35 All change costs are 1, and it's okay
36 to terminate if the edit distance is
37 known to exceed MIN_DIST */
38
39 #define THRESHOLD 4000 /* worry about allocating more memory only
40 when this # of bytes is exceeded */
41 #define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))
42
43 #define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))
44
45 #define swap_int(x,y) (_iswap = (x), (x) = (y), (y) = _iswap)
46 #define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
47 #define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my))
48 #define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))
49
50
51 static int insert_cost = 1;
52 static int delete_cost = 1;
53 static int change_cost = 1;
54 static int swap_cost = 1;
55
56 static int _iswap; /* swap_int temp variable */
57 static char *_cswap; /* swap_char temp variable */
58 static int _mx, _my, _mz; /* min2, min3 temp variables */
59
60
61
62 void
view_costs(ins,del,ch,swap)63 view_costs(ins, del, ch, swap)
64 int *ins, *del, *ch, *swap;
65 {
66 SAFE_ASSIGN(ins, insert_cost);
67 SAFE_ASSIGN(del, delete_cost);
68 SAFE_ASSIGN(ch, change_cost);
69 SAFE_ASSIGN(swap, swap_cost);
70 } /* view_costs */
71
72 void
set_costs(ins,del,ch,swap)73 set_costs(ins, del, ch, swap)
74 int ins, del, ch, swap;
75 {
76 insert_cost = ins;
77 delete_cost = del;
78 change_cost = ch;
79 swap_cost = swap;
80 } /* set_costs */
81
82
83 /* edit_distn -- returns the edit distance between two strings, or -1 on
84 failure */
85
86 int
edit_distn(from,from_len,to,to_len)87 edit_distn(from, from_len, to, to_len)
88 char *from, *to;
89 register int from_len, to_len;
90 {
91 #ifndef TRN_SPEEDUP
92 register int ins, del, ch; /* local copies of edit costs */
93 #endif
94 register int row, col, index; /* dynamic programming counters */
95 register int radix; /* radix for modular indexing */
96 #ifdef TRN_SPEEDUP
97 register int low;
98 #endif
99 int *buffer; /* pointer to storage for one row
100 of the d.p. array */
101 static int store[THRESHOLD / sizeof (int)];
102 /* a small amount of static
103 storage, to be used when the
104 input strings are small enough */
105
106 /* Handle trivial cases when one string is empty */
107
108 if (from == NULL || !from_len)
109 if (to == NULL || !to_len)
110 return 0;
111 else
112 return to_len * insert_cost;
113 else if (to == NULL || !to_len)
114 return from_len * delete_cost;
115
116 /* Initialize registers */
117
118 radix = 2 * from_len + 3;
119 #ifdef TRN_SPEEDUP
120 #define ins 1
121 #define del 1
122 #define ch 1
123 #define swap_cost 1
124 #else
125 ins = insert_cost;
126 del = delete_cost;
127 ch = change_cost;
128 #endif
129
130 /* Make from short enough to fit in the static storage, if it's at all
131 possible */
132
133 if (from_len > to_len && from_len > STRLENTHRESHOLD) {
134 swap_int(from_len, to_len);
135 swap_char(from, to);
136 #ifndef TRN_SPEEDUP
137 swap_int(ins, del);
138 #endif
139 } /* if from_len > to_len */
140
141 /* Allocate the array storage (from the heap if necessary) */
142
143 if (from_len <= STRLENTHRESHOLD)
144 buffer = store;
145 else
146 buffer = (int *) safemalloc((MEM_SIZE) radix * sizeof (int));
147
148 /* Here's where the fun begins. We will find the minimum edit distance
149 using dynamic programming. We only need to store two rows of the matrix
150 at a time, since we always progress down the matrix. For example,
151 given the strings "one" and "two", and insert, delete and change costs
152 equal to 1:
153
154 _ o n e
155 _ 0 1 2 3
156 t 1 1 2 3
157 w 2 2 2 3
158 o 3 2 3 3
159
160 The dynamic programming recursion is defined as follows:
161
162 ar(x,0) := x * insert_cost
163 ar(0,y) := y * delete_cost
164 ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
165 a(x - 1, y) + insert_cost,
166 a(x, y - 1) + delete_cost,
167 a(x - 2, y - 2) + (from[x] == to[y-1] &&
168 from[x-1] == to[y] ? swap_cost :
169 infinity))
170
171 Since this only looks at most two rows and three columns back, we need
172 only store the values for the two preceeding rows. In this
173 implementation, we do not explicitly store the zero column, so only 2 *
174 from_len + 2 words are needed. However, in the implementation of the
175 swap_cost check, the current matrix value is used as a buffer; we
176 can't overwrite the earlier value until the swap_cost check has
177 been performed. So we use 2 * from_len + 3 elements in the buffer.
178 */
179
180 #define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
181 buffer[mod(index)]))
182 #define NW(x,y) ar(x, y, index + from_len + 2)
183 #define N(x,y) ar(x, y, index + from_len + 3)
184 #define W(x,y) ar(x, y, index + radix - 1)
185 #define NNWW(x,y) ar(x, y, index + 1)
186 #define mod(x) ((x) % radix)
187
188 index = 0;
189
190 #ifdef DEBUG_EDITDIST
191 printf(" ");
192 for (col = 0; col < from_len; col++)
193 printf(" %c ", from[col]);
194 printf("\n ");
195
196 for (col = 0; col <= from_len; col++)
197 printf("%2d ", col * del);
198 #endif
199
200 /* Row 0 is handled implicitly; its value at a given column is col*del.
201 The loop below computes the values for Row 1. At this point we know the
202 strings are nonempty. We also don't need to consider swap costs in row
203 1.
204
205 COMMENT: the indicies row and col below point into the STRING, so
206 the corresponding MATRIX indicies are row+1 and col+1.
207 */
208
209 buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));
210 #ifdef TRN_SPEEDUP
211 low = buffer[mod(index + radix - 1)];
212 #endif
213
214 #ifdef DEBUG_EDITDIST
215 printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
216 #endif
217
218 for (col = 1; col < from_len; col++) {
219 buffer[index] = min3(
220 col * del + ((from[col] == to[0]) ? 0 : ch),
221 (col + 1) * del + ins,
222 buffer[index - 1] + del);
223 #ifdef TRN_SPEEDUP
224 if (buffer[index] < low)
225 low = buffer[index];
226 #endif
227 index++;
228
229 #ifdef DEBUG_EDITDIST
230 printf("%2d ", buffer[index - 1]);
231 #endif
232
233 } /* for col = 1 */
234
235 #ifdef DEBUG_EDITDIST
236 printf("\n %c %2d ", to[1], 2 * ins);
237 #endif
238
239 /* Now handle the rest of the matrix */
240
241 for (row = 1; row < to_len; row++) {
242 for (col = 0; col < from_len; col++) {
243 buffer[index] = min3(
244 NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
245 N(row, col + 1) + ins,
246 W(row + 1, col) + del);
247 if (from[col] == to[row - 1] && col > 0 &&
248 from[col - 1] == to[row])
249 buffer[index] = min2(buffer[index],
250 NNWW(row - 1, col - 1) + swap_cost);
251
252 #ifdef DEBUG_EDITDIST
253 printf("%2d ", buffer[index]);
254 #endif
255 #ifdef TRN_SPEEDUP
256 if (buffer[index] < low || col == 0)
257 low = buffer[index];
258 #endif
259
260 index = mod(index + 1);
261 } /* for col = 1 */
262 #ifdef DEBUG_EDITDIST
263 if (row < to_len - 1)
264 printf("\n %c %2d ", to[row+1], (row + 2) * ins);
265 else
266 printf("\n");
267 #endif
268 #ifdef TRN_SPEEDUP
269 if (low > MIN_DIST)
270 break;
271 #endif
272 } /* for row = 1 */
273
274 row = buffer[mod(index + radix - 1)];
275 if (buffer != store)
276 free((char *) buffer);
277 return row;
278 } /* edit_distn */
279
280 #endif /* EDIT_DISTANCE */
281