1 /* diff - compute a shortest edit script (SES) given two sequences
2 * Copyright (c) 2004 Michael B. Allen <mba2000 ioplex.com>
3 *
4 * The MIT License
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /* This algorithm is basically Myers' solution to SES/LCS with
26 * the Hirschberg linear space refinement as described in the
27 * following publication:
28 *
29 * E. Myers, ``An O(ND) Difference Algorithm and Its Variations,''
30 * Algorithmica 1, 2 (1986), 251-266.
31 * http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps
32 *
33 * This is the same algorithm used by GNU diff(1).
34 */
35
36 #include <stdlib.h>
37 #include <limits.h>
38 #include <errno.h>
39
40 #include "mba/msgno.h"
41 #include "mba/diff.h"
42
43 #define FV(k) _v(ctx, (k), 0)
44 #define RV(k) _v(ctx, (k), 1)
45
46 struct _ctx {
47 idx_fn idx;
48 cmp_fn cmp;
49 void *context;
50 struct varray *buf;
51 struct varray *ses;
52 int si;
53 int dmax;
54 };
55
56 struct middle_snake {
57 int x, y, u, v;
58 };
59
60 static void
_setv(struct _ctx * ctx,int k,int r,int val)61 _setv(struct _ctx *ctx, int k, int r, int val)
62 {
63 int j;
64 int *i;
65 /* Pack -N to N into 0 to N * 2
66 */
67 j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2);
68
69 i = (int *)varray_get(ctx->buf, j);
70 *i = val;
71 }
72 static int
_v(struct _ctx * ctx,int k,int r)73 _v(struct _ctx *ctx, int k, int r)
74 {
75 int j;
76
77 j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2);
78
79 return *((int *)varray_get(ctx->buf, j));
80 }
81
82 static int
_find_middle_snake(const void * a,int aoff,int n,const void * b,int boff,int m,struct _ctx * ctx,struct middle_snake * ms)83 _find_middle_snake(const void *a, int aoff, int n,
84 const void *b, int boff, int m,
85 struct _ctx *ctx,
86 struct middle_snake *ms)
87 {
88 int delta, odd, mid, d;
89
90 delta = n - m;
91 odd = delta & 1;
92 mid = (n + m) / 2;
93 mid += odd;
94
95 _setv(ctx, 1, 0, 0);
96 _setv(ctx, delta - 1, 1, n);
97
98 for (d = 0; d <= mid; d++) {
99 int k, x, y;
100
101 if ((2 * d - 1) >= ctx->dmax) {
102 return ctx->dmax;
103 }
104
105 for (k = d; k >= -d; k -= 2) {
106 if (k == -d || (k != d && FV(k - 1) < FV(k + 1))) {
107 x = FV(k + 1);
108 } else {
109 x = FV(k - 1) + 1;
110 }
111 y = x - k;
112
113 ms->x = x;
114 ms->y = y;
115 if (ctx->cmp) {
116 while (x < n && y < m && ctx->cmp(ctx->idx(a, aoff + x, ctx->context),
117 ctx->idx(b, boff + y, ctx->context), ctx->context) == 0) {
118 x++; y++;
119 }
120 } else {
121 const unsigned char *a0 = (const unsigned char *)a + aoff;
122 const unsigned char *b0 = (const unsigned char *)b + boff;
123 while (x < n && y < m && a0[x] == b0[y]) {
124 x++; y++;
125 }
126 }
127 _setv(ctx, k, 0, x);
128
129 if (odd && k >= (delta - (d - 1)) && k <= (delta + (d - 1))) {
130 if (x >= RV(k)) {
131 ms->u = x;
132 ms->v = y;
133 return 2 * d - 1;
134 }
135 }
136 }
137 for (k = d; k >= -d; k -= 2) {
138 int kr = (n - m) + k;
139
140 if (k == d || (k != -d && RV(kr - 1) < RV(kr + 1))) {
141 x = RV(kr - 1);
142 } else {
143 x = RV(kr + 1) - 1;
144 }
145 y = x - kr;
146
147 ms->u = x;
148 ms->v = y;
149 if (ctx->cmp) {
150 while (x > 0 && y > 0 && ctx->cmp(ctx->idx(a, aoff + (x - 1), ctx->context),
151 ctx->idx(b, boff + (y - 1), ctx->context), ctx->context) == 0) {
152 x--; y--;
153 }
154 } else {
155 const unsigned char *a0 = (const unsigned char *)a + aoff;
156 const unsigned char *b0 = (const unsigned char *)b + boff;
157 while (x > 0 && y > 0 && a0[x - 1] == b0[y - 1]) {
158 x--; y--;
159 }
160 }
161 _setv(ctx, kr, 1, x);
162
163 if (!odd && kr >= -d && kr <= d) {
164 if (x <= FV(kr)) {
165 ms->x = x;
166 ms->y = y;
167 return 2 * d;
168 }
169 }
170 }
171 }
172
173 errno = EFAULT;
174
175 return -1;
176 }
177
178 static void
_edit(struct _ctx * ctx,int op,int off,int len)179 _edit(struct _ctx *ctx, int op, int off, int len)
180 {
181 struct diff_edit *e;
182
183 if (len == 0 || ctx->ses == NULL) {
184 return;
185 } /* Add an edit to the SES (or
186 * coalesce if the op is the same)
187 */
188 e = varray_get(ctx->ses, ctx->si);
189 if (e->op != op) {
190 if (e->op) {
191 ctx->si++;
192 e = varray_get(ctx->ses, ctx->si);
193 }
194 e->op = op;
195 e->off = off;
196 e->len = len;
197 } else {
198 e->len += len;
199 }
200 }
201
202 static int
_ses(const void * a,int aoff,int n,const void * b,int boff,int m,struct _ctx * ctx)203 _ses(const void *a, int aoff, int n,
204 const void *b, int boff, int m,
205 struct _ctx *ctx)
206 {
207 struct middle_snake ms;
208 int d;
209
210 if (n == 0) {
211 _edit(ctx, DIFF_INSERT, boff, m);
212 d = m;
213 } else if (m == 0) {
214 _edit(ctx, DIFF_DELETE, aoff, n);
215 d = n;
216 } else {
217 /* Find the middle "snake" around which we
218 * recursively solve the sub-problems.
219 */
220 d = _find_middle_snake(a, aoff, n, b, boff, m, ctx, &ms);
221 if (d == -1) {
222 return -1;
223 } else if (d >= ctx->dmax) {
224 return ctx->dmax;
225 } else if (ctx->ses == NULL) {
226 return d;
227 } else if (d > 1) {
228 if (_ses(a, aoff, ms.x, b, boff, ms.y, ctx) == -1) {
229 return -1;
230 }
231
232 _edit(ctx, DIFF_MATCH, aoff + ms.x, ms.u - ms.x);
233
234 aoff += ms.u;
235 boff += ms.v;
236 n -= ms.u;
237 m -= ms.v;
238 if (_ses(a, aoff, n, b, boff, m, ctx) == -1) {
239 return -1;
240 }
241 } else {
242 int x = ms.x;
243 int u = ms.u;
244
245 /* There are only 4 base cases when the
246 * edit distance is 1.
247 *
248 * n > m m > n
249 *
250 * - |
251 * \ \ x != u
252 * \ \
253 *
254 * \ \
255 * \ \ x == u
256 * - |
257 */
258
259 if (m > n) {
260 if (x == u) {
261 _edit(ctx, DIFF_MATCH, aoff, n);
262 _edit(ctx, DIFF_INSERT, boff + (m - 1), 1);
263 } else {
264 _edit(ctx, DIFF_INSERT, boff, 1);
265 _edit(ctx, DIFF_MATCH, aoff, n);
266 }
267 } else {
268 if (x == u) {
269 _edit(ctx, DIFF_MATCH, aoff, m);
270 _edit(ctx, DIFF_DELETE, aoff + (n - 1), 1);
271 } else {
272 _edit(ctx, DIFF_DELETE, aoff, 1);
273 _edit(ctx, DIFF_MATCH, aoff + 1, m);
274 }
275 }
276 }
277 }
278
279 return d;
280 }
281 int
diff(const void * a,int aoff,int n,const void * b,int boff,int m,idx_fn idx,cmp_fn cmp,void * context,int dmax,struct varray * ses,int * sn,struct varray * buf)282 diff(const void *a, int aoff, int n,
283 const void *b, int boff, int m,
284 idx_fn idx, cmp_fn cmp, void *context, int dmax,
285 struct varray *ses, int *sn,
286 struct varray *buf)
287 {
288 struct _ctx ctx;
289 int d, x, y;
290 struct diff_edit *e = NULL;
291 struct varray tmp;
292
293 if (!idx != !cmp) { /* ensure both NULL or both non-NULL */
294 errno = EINVAL;
295 return -1;
296 }
297
298 ctx.idx = idx;
299 ctx.cmp = cmp;
300 ctx.context = context;
301 if (buf) {
302 ctx.buf = buf;
303 } else {
304 varray_init(&tmp, sizeof(int), NULL);
305 ctx.buf = &tmp;
306 }
307 ctx.ses = ses;
308 ctx.si = 0;
309 ctx.dmax = dmax ? dmax : INT_MAX;
310
311 if (ses && sn) {
312 if ((e = varray_get(ses, 0)) == NULL) {
313 AMSG("");
314 if (buf == NULL) {
315 varray_deinit(&tmp);
316 }
317 return -1;
318 }
319 e->op = 0;
320 }
321
322 /* The _ses function assumes the SES will begin or end with a delete
323 * or insert. The following will insure this is true by eating any
324 * beginning matches. This is also a quick to process sequences
325 * that match entirely.
326 */
327 x = y = 0;
328 if (cmp) {
329 while (x < n && y < m && cmp(idx(a, aoff + x, context),
330 idx(b, boff + y, context), context) == 0) {
331 x++; y++;
332 }
333 } else {
334 const unsigned char *a0 = (const unsigned char *)a + aoff;
335 const unsigned char *b0 = (const unsigned char *)b + boff;
336 while (x < n && y < m && a0[x] == b0[y]) {
337 x++; y++;
338 }
339 }
340 _edit(&ctx, DIFF_MATCH, aoff, x);
341
342 if ((d = _ses(a, aoff + x, n - x, b, boff + y, m - y, &ctx)) == -1) {
343 AMSG("");
344 if (buf == NULL) {
345 varray_deinit(&tmp);
346 }
347 return -1;
348 }
349 if (ses && sn) {
350 *sn = e->op ? ctx.si + 1 : 0;
351 }
352
353 if (buf == NULL) {
354 varray_deinit(&tmp);
355 }
356
357 return d;
358 }
359
360