1 /* diff - compute a shortest edit script (SES) given two sequences
2 * Copyright (c) 2004 Michael B. Allen <mba2000 ioplex.com>
3 *
4 * The MIT License
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /* This algorithm is basically Myers' solution to SES/LCS with
26 * the Hirschberg linear space refinement as described in the
27 * following publication:
28 *
29 * E. Myers, ``An O(ND) Difference Algorithm and Its Variations,''
30 * Algorithmica 1, 2 (1986), 251-266.
31 * http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps
32 *
33 * This is the same algorithm used by GNU diff(1).
34 */
35
36 #include <stdlib.h>
37 #include <limits.h>
38 #include <errno.h>
39 #include <stddef.h>
40 #include <R_ext/Utils.h>
41
42 #include "cli.h"
43 #include "errors.h"
44
45 typedef enum {
46 DIFF_MATCH = 1,
47 DIFF_DELETE,
48 DIFF_INSERT
49 } diff_op;
50
51 struct diff_edit {
52 short op;
53 int off; /* off into s1 if MATCH or DELETE but s2 if INSERT */
54 int len;
55 };
56
57 typedef int (*cmp_fn)(int a, int b, void *context);
58
59 static int _diff(const void *a, int aoff, int n,
60 const void *b, int boff, int m,
61 cmp_fn cmp, void *context, int dmax,
62 struct diff_edit *ses, int *sn,
63 int *buf, int bufsize);
64
65 struct _chr_data {
66 SEXP* aptr;
67 SEXP* bptr;
68 };
69
_cmp_chr(int a,int b,void * context)70 static int _cmp_chr(int a, int b, void *context) {
71 struct _chr_data *data = (struct _chr_data*) context;
72 return data->aptr[a] != data->bptr[b];
73 }
74
clic_diff_chr(SEXP old,SEXP new,SEXP max)75 SEXP clic_diff_chr(SEXP old, SEXP new, SEXP max) {
76 int l_old = Rf_length(old);
77 int l_new = Rf_length(new);
78 int dmax = INTEGER(max)[0];
79 int snmax = l_old + l_new + 1; // upper bound is sum of lengths
80 int bufsize = snmax;
81
82 struct diff_edit *ses =
83 (struct diff_edit*) S_alloc(snmax, sizeof(struct diff_edit));
84 int *buf = (int*) S_alloc(bufsize, sizeof(int));
85 int sn;
86
87 struct _chr_data data;
88 data.aptr = STRING_PTR(old);
89 data.bptr = STRING_PTR(new);
90
91 int out = _diff(old, 0, l_old, new, 0, l_new, _cmp_chr, &data,
92 dmax, ses, &sn, buf, bufsize);
93
94 /* AFAICT we'll never error like this, but it does not hurt... */
95 if (out < 0) { // __NO_COVERAGE__
96 R_THROW_ERROR( // __NO_COVERAGE__
97 "Could not calculate diff, internal error: %d, %d", // __NO_COVERAGE__
98 out, // __NO_COVERAGE__
99 errno // __NO_COVERAGE__
100 ); // __NO_COVERAGE__
101 } // __NO_COVERAGE__
102
103 SEXP result = PROTECT(allocVector(VECSXP, 4));
104 SET_VECTOR_ELT(result, 0, allocVector(INTSXP, sn));
105 SET_VECTOR_ELT(result, 1, allocVector(INTSXP, sn));
106 SET_VECTOR_ELT(result, 2, allocVector(INTSXP, sn));
107 SET_VECTOR_ELT(result, 3, ScalarInteger(out));
108
109 int i;
110 int *op_ptr = INTEGER(VECTOR_ELT(result, 0));
111 int *off_ptr = INTEGER(VECTOR_ELT(result, 1));
112 int *len_ptr = INTEGER(VECTOR_ELT(result, 2));
113
114 for (i = 0; i < sn; i++, op_ptr++, off_ptr++, len_ptr++) {
115 struct diff_edit *e = ses + i;
116 *op_ptr = e->op;
117 *off_ptr = e->off;
118 *len_ptr = e->len;
119 }
120
121 UNPROTECT(1);
122 return result;
123 }
124
125 #define FV(k) _v(ctx, (k), 0)
126 #define RV(k) _v(ctx, (k), 1)
127
128 struct _ctx {
129 cmp_fn cmp;
130 void *context;
131 int *buf;
132 int bufsize;
133 struct diff_edit *ses;
134 int si;
135 int dmax;
136 };
137
138 struct middle_snake {
139 int x, y, u, v;
140 };
141
_setv(struct _ctx * ctx,int k,int r,int val)142 static void _setv(struct _ctx *ctx, int k, int r, int val) {
143 /* Pack -N to N into 0 to N * 2 */
144 int j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2);
145
146 if (j >= ctx->bufsize) {
147 int newsize = j + 1;
148 ctx->buf = (int*) S_realloc((char*) ctx->buf, newsize, ctx->bufsize,
149 sizeof(int));
150 ctx->bufsize = newsize;
151 }
152
153 ctx->buf[j] = val;
154 }
155
_v(struct _ctx * ctx,int k,int r)156 static int _v(struct _ctx *ctx, int k, int r) {
157 int j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2);
158
159 if (j > ctx->bufsize) {
160 int newsize = j + 1;
161 ctx->buf = (int*) S_realloc((char*) ctx->buf, newsize, ctx->bufsize,
162 sizeof(int));
163 ctx->bufsize = newsize;
164 }
165
166 return ctx->buf[j];
167 }
168
_find_middle_snake(const void * a,int aoff,int n,const void * b,int boff,int m,struct _ctx * ctx,struct middle_snake * ms)169 static int _find_middle_snake(const void *a, int aoff, int n,
170 const void *b, int boff, int m,
171 struct _ctx *ctx,
172 struct middle_snake *ms) {
173 int delta, odd, mid, d;
174
175 delta = n - m;
176 odd = delta & 1;
177 mid = (n + m) / 2;
178 mid += odd;
179
180 _setv(ctx, 1, 0, 0);
181 _setv(ctx, delta - 1, 1, n);
182
183 for (d = 0; d <= mid; d++) {
184 int k, x, y;
185
186 R_CheckUserInterrupt();
187
188 if ((2 * d - 1) >= ctx->dmax) {
189 return ctx->dmax;
190 }
191
192 for (k = d; k >= -d; k -= 2) {
193 if (k == -d || (k != d && FV(k - 1) < FV(k + 1))) {
194 x = FV(k + 1);
195 } else {
196 x = FV(k - 1) + 1;
197 }
198 y = x - k;
199
200 ms->x = x;
201 ms->y = y;
202 while (x < n && y < m && ctx->cmp(aoff + x, boff + y, ctx->context) == 0) {
203 x++; y++;
204 }
205 _setv(ctx, k, 0, x);
206
207 if (odd && k >= (delta - (d - 1)) && k <= (delta + (d - 1))) {
208 if (x >= RV(k)) {
209 ms->u = x;
210 ms->v = y;
211 return 2 * d - 1;
212 }
213 }
214 }
215 for (k = d; k >= -d; k -= 2) {
216 int kr = (n - m) + k;
217
218 if (k == d || (k != -d && RV(kr - 1) < RV(kr + 1))) {
219 x = RV(kr - 1);
220 } else {
221 x = RV(kr + 1) - 1;
222 }
223 y = x - kr;
224
225 ms->u = x;
226 ms->v = y;
227 while (x > 0 && y > 0 && ctx->cmp(aoff + (x - 1), boff + (y - 1), ctx->context) == 0) {
228 x--; y--;
229 }
230 _setv(ctx, kr, 1, x);
231
232 if (!odd && kr >= -d && kr <= d) {
233 if (x <= FV(kr)) {
234 ms->x = x;
235 ms->y = y;
236 return 2 * d;
237 }
238 }
239 }
240 }
241
242 errno = EFAULT; // __NO_COVERAGE__
243
244 return -1; // __NO_COVERAGE__
245 }
246
_edit(struct _ctx * ctx,diff_op op,int off,int len)247 static void _edit(struct _ctx *ctx, diff_op op, int off, int len) {
248 struct diff_edit *e;
249
250 if (len == 0 || ctx->ses == NULL) {
251 return;
252 } /* Add an edit to the SES (or
253 * coalesce if the op is the same)
254 */
255 e = &ctx->ses[ctx->si];
256 if (e->op != op) {
257 if (e->op) {
258 ctx->si++;
259 e = &ctx->ses[ctx->si];
260 }
261 e->op = op;
262 e->off = off;
263 e->len = len;
264 } else {
265 e->len += len;
266 }
267 }
268
_ses(const void * a,int aoff,int n,const void * b,int boff,int m,struct _ctx * ctx)269 static int _ses(const void *a, int aoff, int n,
270 const void *b, int boff, int m,
271 struct _ctx *ctx) {
272 struct middle_snake ms;
273 int d;
274
275 if (n == 0) {
276 _edit(ctx, DIFF_INSERT, boff, m);
277 d = m;
278 } else if (m == 0) {
279 _edit(ctx, DIFF_DELETE, aoff, n);
280 d = n;
281 } else {
282 /* Find the middle "snake" around which we
283 * recursively solve the sub-problems.
284 */
285 d = _find_middle_snake(a, aoff, n, b, boff, m, ctx, &ms);
286 if (d == -1) {
287 return -1;
288 } else if (d >= ctx->dmax) {
289 return ctx->dmax;
290 } else if (ctx->ses == NULL) {
291 return d;
292 } else if (d > 1) {
293 if (_ses(a, aoff, ms.x, b, boff, ms.y, ctx) == -1) {
294 return -1;
295 }
296
297 _edit(ctx, DIFF_MATCH, aoff + ms.x, ms.u - ms.x);
298
299 aoff += ms.u;
300 boff += ms.v;
301 n -= ms.u;
302 m -= ms.v;
303 if (_ses(a, aoff, n, b, boff, m, ctx) == -1) {
304 return -1;
305 }
306 } else {
307 int x = ms.x;
308 int u = ms.u;
309
310 /* There are only 4 base cases when the
311 * edit distance is 1.
312 *
313 * n > m m > n
314 *
315 * - |
316 * \ \ x != u
317 * \ \
318 *
319 * \ \
320 * \ \ x == u
321 * - |
322 */
323
324 if (m > n) {
325 if (x == u) {
326 _edit(ctx, DIFF_MATCH, aoff, n);
327 _edit(ctx, DIFF_INSERT, boff + (m - 1), 1);
328 } else {
329 _edit(ctx, DIFF_INSERT, boff, 1);
330 _edit(ctx, DIFF_MATCH, aoff, n);
331 }
332 } else {
333 if (x == u) {
334 _edit(ctx, DIFF_MATCH, aoff, m);
335 _edit(ctx, DIFF_DELETE, aoff + (n - 1), 1);
336 } else {
337 _edit(ctx, DIFF_DELETE, aoff, 1);
338 _edit(ctx, DIFF_MATCH, aoff + 1, m);
339 }
340 }
341 }
342 }
343
344 return d;
345 }
346
_diff(const void * a,int aoff,int n,const void * b,int boff,int m,cmp_fn cmp,void * context,int dmax,struct diff_edit * ses,int * sn,int * buf,int bufsize)347 static int _diff(const void *a, int aoff, int n,
348 const void *b, int boff, int m,
349 cmp_fn cmp, void *context, int dmax,
350 struct diff_edit *ses, int *sn,
351 int *buf, int bufsize) {
352 struct _ctx ctx;
353 int d, x, y;
354 struct diff_edit *e = NULL;
355
356 ctx.cmp = cmp;
357 ctx.context = context;
358 ctx.buf = buf;
359 ctx.bufsize = bufsize;
360 ctx.ses = ses;
361 ctx.si = 0;
362 ctx.dmax = dmax ? dmax : INT_MAX;
363
364 if (ses && sn) {
365 if ((e = ses) == NULL) {
366 return -1; // __NO_COVERAGE__
367 }
368 e->op = 0;
369 }
370
371 /* The _ses function assumes the SES will begin or end with a delete
372 * or insert. The following will ensure this is true by eating any
373 * beginning matches. This is also a quick way to process sequences
374 * that match entirely.
375 */
376 x = y = 0;
377 while (x < n && y < m && cmp(aoff + x, boff + y, context) == 0) {
378 x++; y++;
379 }
380 _edit(&ctx, DIFF_MATCH, aoff, x);
381
382 if ((d = _ses(a, aoff + x, n - x, b, boff + y, m - y, &ctx)) == -1) {
383 return -1;
384 }
385 if (ses && sn) {
386 *sn = e->op ? ctx.si + 1 : 0;
387 }
388
389
390 return d;
391 }
392