1 /* diff - compute a shortest edit script (SES) given two sequences
2  * Copyright (c) 2004 Michael B. Allen <mba2000 ioplex.com>
3  *
4  * The MIT License
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included
14  * in all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 /* This algorithm is basically Myers' solution to SES/LCS with
26  * the Hirschberg linear space refinement as described in the
27  * following publication:
28  *
29  *   E. Myers, ``An O(ND) Difference Algorithm and Its Variations,''
30  *   Algorithmica 1, 2 (1986), 251-266.
31  *   http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps
32  *
33  * This is the same algorithm used by GNU diff(1).
34  */
35 
36 #include <stdlib.h>
37 #include <limits.h>
38 #include <errno.h>
39 
40 #include "mba/msgno.h"
41 #include "mba/diff.h"
42 
43 #define FV(k) _v(ctx, (k), 0)
44 #define RV(k) _v(ctx, (k), 1)
45 
46 struct _ctx {
47 	idx_fn idx;
48 	cmp_fn cmp;
49 	void *context;
50 	struct varray *buf;
51 	struct varray *ses;
52 	int si;
53 	int dmax;
54 };
55 
56 struct middle_snake {
57 	int x, y, u, v;
58 };
59 
60 static void
_setv(struct _ctx * ctx,int k,int r,int val)61 _setv(struct _ctx *ctx, int k, int r, int val)
62 {
63 	int j;
64 	int *i;
65                 /* Pack -N to N into 0 to N * 2
66                  */
67 	j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2);
68 
69 	i = (int *)varray_get(ctx->buf, j);
70 	*i = val;
71 }
72 static int
_v(struct _ctx * ctx,int k,int r)73 _v(struct _ctx *ctx, int k, int r)
74 {
75 	int j;
76 
77 	j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2);
78 
79 	return *((int *)varray_get(ctx->buf, j));
80 }
81 
82 static int
_find_middle_snake(const void * a,int aoff,int n,const void * b,int boff,int m,struct _ctx * ctx,struct middle_snake * ms)83 _find_middle_snake(const void *a, int aoff, int n,
84 		const void *b, int boff, int m,
85 		struct _ctx *ctx,
86 		struct middle_snake *ms)
87 {
88 	int delta, odd, mid, d;
89 
90 	delta = n - m;
91 	odd = delta & 1;
92 	mid = (n + m) / 2;
93 	mid += odd;
94 
95 	_setv(ctx, 1, 0, 0);
96 	_setv(ctx, delta - 1, 1, n);
97 
98 	for (d = 0; d <= mid; d++) {
99 		int k, x, y;
100 
101 		if ((2 * d - 1) >= ctx->dmax) {
102 			return ctx->dmax;
103 		}
104 
105 		for (k = d; k >= -d; k -= 2) {
106 			if (k == -d || (k != d && FV(k - 1) < FV(k + 1))) {
107 				x = FV(k + 1);
108 			} else {
109 				x = FV(k - 1) + 1;
110 			}
111 			y = x - k;
112 
113 			ms->x = x;
114 			ms->y = y;
115 			if (ctx->cmp) {
116 				while (x < n && y < m && ctx->cmp(ctx->idx(a, aoff + x, ctx->context),
117 							ctx->idx(b, boff + y, ctx->context), ctx->context) == 0) {
118 					x++; y++;
119 				}
120 			} else {
121 				const unsigned char *a0 = (const unsigned char *)a + aoff;
122 				const unsigned char *b0 = (const unsigned char *)b + boff;
123 				while (x < n && y < m && a0[x] == b0[y]) {
124 					x++; y++;
125 				}
126 			}
127 			_setv(ctx, k, 0, x);
128 
129 			if (odd && k >= (delta - (d - 1)) && k <= (delta + (d - 1))) {
130 				if (x >= RV(k)) {
131 					ms->u = x;
132 					ms->v = y;
133 					return 2 * d - 1;
134 				}
135 			}
136 		}
137 		for (k = d; k >= -d; k -= 2) {
138 			int kr = (n - m) + k;
139 
140 			if (k == d || (k != -d && RV(kr - 1) < RV(kr + 1))) {
141 				x = RV(kr - 1);
142 			} else {
143 				x = RV(kr + 1) - 1;
144 			}
145 			y = x - kr;
146 
147 			ms->u = x;
148 			ms->v = y;
149 			if (ctx->cmp) {
150 				while (x > 0 && y > 0 && ctx->cmp(ctx->idx(a, aoff + (x - 1), ctx->context),
151 							ctx->idx(b, boff + (y - 1), ctx->context), ctx->context) == 0) {
152 					x--; y--;
153 				}
154 			} else {
155 				const unsigned char *a0 = (const unsigned char *)a + aoff;
156 				const unsigned char *b0 = (const unsigned char *)b + boff;
157 				while (x > 0 && y > 0 && a0[x - 1] == b0[y - 1]) {
158 					x--; y--;
159 				}
160 			}
161 			_setv(ctx, kr, 1, x);
162 
163 			if (!odd && kr >= -d && kr <= d) {
164 				if (x <= FV(kr)) {
165 					ms->x = x;
166 					ms->y = y;
167 					return 2 * d;
168 				}
169 			}
170 		}
171 	}
172 
173 	errno = EFAULT;
174 
175 	return -1;
176 }
177 
178 static void
_edit(struct _ctx * ctx,int op,int off,int len)179 _edit(struct _ctx *ctx, int op, int off, int len)
180 {
181 	struct diff_edit *e;
182 
183 	if (len == 0 || ctx->ses == NULL) {
184 		return;
185 	}               /* Add an edit to the SES (or
186                      * coalesce if the op is the same)
187                      */
188 	e = varray_get(ctx->ses, ctx->si);
189 	if (e->op != op) {
190 		if (e->op) {
191 			ctx->si++;
192 			e = varray_get(ctx->ses, ctx->si);
193 		}
194 		e->op = op;
195 		e->off = off;
196 		e->len = len;
197 	} else {
198 		e->len += len;
199 	}
200 }
201 
202 static int
_ses(const void * a,int aoff,int n,const void * b,int boff,int m,struct _ctx * ctx)203 _ses(const void *a, int aoff, int n,
204 		const void *b, int boff, int m,
205 		struct _ctx *ctx)
206 {
207 	struct middle_snake ms;
208 	int d;
209 
210 	if (n == 0) {
211 		_edit(ctx, DIFF_INSERT, boff, m);
212 		d = m;
213 	} else if (m == 0) {
214 		_edit(ctx, DIFF_DELETE, aoff, n);
215 		d = n;
216 	} else {
217                     /* Find the middle "snake" around which we
218                      * recursively solve the sub-problems.
219                      */
220 		d = _find_middle_snake(a, aoff, n, b, boff, m, ctx, &ms);
221 		if (d == -1) {
222 			return -1;
223 		} else if (d >= ctx->dmax) {
224 			return ctx->dmax;
225 		} else if (ctx->ses == NULL) {
226 			return d;
227 		} else if (d > 1) {
228 			if (_ses(a, aoff, ms.x, b, boff, ms.y, ctx) == -1) {
229 				return -1;
230 			}
231 
232 			_edit(ctx, DIFF_MATCH, aoff + ms.x, ms.u - ms.x);
233 
234 			aoff += ms.u;
235 			boff += ms.v;
236 			n -= ms.u;
237 			m -= ms.v;
238 			if (_ses(a, aoff, n, b, boff, m, ctx) == -1) {
239 				return -1;
240 			}
241 		} else {
242 			int x = ms.x;
243 			int u = ms.u;
244 
245                  /* There are only 4 base cases when the
246                   * edit distance is 1.
247                   *
248                   * n > m   m > n
249                   *
250                   *   -       |
251                   *    \       \    x != u
252                   *     \       \
253                   *
254                   *   \       \
255                   *    \       \    x == u
256                   *     -       |
257                   */
258 
259 			if (m > n) {
260 				if (x == u) {
261 					_edit(ctx, DIFF_MATCH, aoff, n);
262 					_edit(ctx, DIFF_INSERT, boff + (m - 1), 1);
263 				} else {
264 					_edit(ctx, DIFF_INSERT, boff, 1);
265 					_edit(ctx, DIFF_MATCH, aoff, n);
266 				}
267 			} else {
268 				if (x == u) {
269 					_edit(ctx, DIFF_MATCH, aoff, m);
270 					_edit(ctx, DIFF_DELETE, aoff + (n - 1), 1);
271 				} else {
272 					_edit(ctx, DIFF_DELETE, aoff, 1);
273 					_edit(ctx, DIFF_MATCH, aoff + 1, m);
274 				}
275 			}
276 		}
277 	}
278 
279 	return d;
280 }
281 int
diff(const void * a,int aoff,int n,const void * b,int boff,int m,idx_fn idx,cmp_fn cmp,void * context,int dmax,struct varray * ses,int * sn,struct varray * buf)282 diff(const void *a, int aoff, int n,
283 		const void *b, int boff, int m,
284 		idx_fn idx, cmp_fn cmp, void *context, int dmax,
285 		struct varray *ses, int *sn,
286 		struct varray *buf)
287 {
288 	struct _ctx ctx;
289 	int d, x, y;
290 	struct diff_edit *e = NULL;
291 	struct varray tmp;
292 
293 	if (!idx != !cmp) { /* ensure both NULL or both non-NULL */
294 		errno = EINVAL;
295 		return -1;
296 	}
297 
298 	ctx.idx = idx;
299 	ctx.cmp = cmp;
300 	ctx.context = context;
301 	if (buf) {
302 		ctx.buf = buf;
303 	} else {
304 		varray_init(&tmp, sizeof(int), NULL);
305 		ctx.buf = &tmp;
306 	}
307 	ctx.ses = ses;
308 	ctx.si = 0;
309 	ctx.dmax = dmax ? dmax : INT_MAX;
310 
311 	if (ses && sn) {
312 		if ((e = varray_get(ses, 0)) == NULL) {
313 			AMSG("");
314 			if (buf == NULL) {
315 				varray_deinit(&tmp);
316 			}
317 			return -1;
318 		}
319 		e->op = 0;
320 	}
321 
322          /* The _ses function assumes the SES will begin or end with a delete
323           * or insert. The following will insure this is true by eating any
324           * beginning matches. This is also a quick to process sequences
325           * that match entirely.
326           */
327 	x = y = 0;
328 	if (cmp) {
329 		while (x < n && y < m && cmp(idx(a, aoff + x, context),
330 					idx(b, boff + y, context), context) == 0) {
331 			x++; y++;
332 		}
333 	} else {
334 		const unsigned char *a0 = (const unsigned char *)a + aoff;
335 		const unsigned char *b0 = (const unsigned char *)b + boff;
336 		while (x < n && y < m && a0[x] == b0[y]) {
337 			x++; y++;
338 		}
339 	}
340 	_edit(&ctx, DIFF_MATCH, aoff, x);
341 
342 	if ((d = _ses(a, aoff + x, n - x, b, boff + y, m - y, &ctx)) == -1) {
343 		AMSG("");
344 		if (buf == NULL) {
345 			varray_deinit(&tmp);
346 		}
347 		return -1;
348 	}
349 	if (ses && sn) {
350 		*sn = e->op ? ctx.si + 1 : 0;
351 	}
352 
353 	if (buf == NULL) {
354 		varray_deinit(&tmp);
355 	}
356 
357 	return d;
358 }
359 
360