1 /* diff - compute a shortest edit script (SES) given two sequences
2  * Copyright (c) 2004 Michael B. Allen <mba2000 ioplex.com>
3  *
4  * The MIT License
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included
14  * in all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 /* This algorithm is basically Myers' solution to SES/LCS with
26  * the Hirschberg linear space refinement as described in the
27  * following publication:
28  *
29  *   E. Myers, ``An O(ND) Difference Algorithm and Its Variations,''
30  *   Algorithmica 1, 2 (1986), 251-266.
31  *   http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps
32  *
33  * This is the same algorithm used by GNU diff(1).
34  */
35 
36 #include <stdlib.h>
37 #include <limits.h>
38 #include <errno.h>
39 #include <stddef.h>
40 #include <R_ext/Utils.h>
41 
42 #include "cli.h"
43 #include "errors.h"
44 
45 typedef enum {
46   DIFF_MATCH = 1,
47   DIFF_DELETE,
48   DIFF_INSERT
49 } diff_op;
50 
51 struct diff_edit {
52   short op;
53   int off; /* off into s1 if MATCH or DELETE but s2 if INSERT */
54   int len;
55 };
56 
57 typedef int (*cmp_fn)(int a, int b, void *context);
58 
59 static int _diff(const void *a, int aoff, int n,
60                  const void *b, int boff, int m,
61                  cmp_fn cmp, void *context, int dmax,
62                  struct diff_edit *ses, int *sn,
63                  int *buf, int bufsize);
64 
65 struct _chr_data {
66   SEXP* aptr;
67   SEXP* bptr;
68 };
69 
_cmp_chr(int a,int b,void * context)70 static int _cmp_chr(int a, int b, void *context) {
71   struct _chr_data *data = (struct _chr_data*) context;
72   return data->aptr[a] != data->bptr[b];
73 }
74 
clic_diff_chr(SEXP old,SEXP new,SEXP max)75 SEXP clic_diff_chr(SEXP old, SEXP new, SEXP max) {
76   int l_old = Rf_length(old);
77   int l_new = Rf_length(new);
78   int dmax = INTEGER(max)[0];
79   int snmax = l_old + l_new + 1; // upper bound is sum of lengths
80   int bufsize = snmax;
81 
82   struct diff_edit *ses =
83     (struct diff_edit*) S_alloc(snmax, sizeof(struct diff_edit));
84   int *buf = (int*) S_alloc(bufsize, sizeof(int));
85   int sn;
86 
87   struct _chr_data data;
88   data.aptr = STRING_PTR(old);
89   data.bptr = STRING_PTR(new);
90 
91   int out = _diff(old, 0, l_old, new, 0, l_new, _cmp_chr, &data,
92                   dmax, ses, &sn, buf, bufsize);
93 
94   /* AFAICT we'll never error like this, but it does not hurt... */
95   if (out < 0) {                // __NO_COVERAGE__
96     R_THROW_ERROR(                                        // __NO_COVERAGE__
97       "Could not calculate diff, internal error: %d, %d", // __NO_COVERAGE__
98       out,                                                // __NO_COVERAGE__
99       errno                                               // __NO_COVERAGE__
100     );                                                    // __NO_COVERAGE__
101   }                                                       // __NO_COVERAGE__
102 
103   SEXP result = PROTECT(allocVector(VECSXP, 4));
104   SET_VECTOR_ELT(result, 0, allocVector(INTSXP, sn));
105   SET_VECTOR_ELT(result, 1, allocVector(INTSXP, sn));
106   SET_VECTOR_ELT(result, 2, allocVector(INTSXP, sn));
107   SET_VECTOR_ELT(result, 3, ScalarInteger(out));
108 
109   int i;
110   int *op_ptr  = INTEGER(VECTOR_ELT(result, 0));
111   int *off_ptr = INTEGER(VECTOR_ELT(result, 1));
112   int *len_ptr = INTEGER(VECTOR_ELT(result, 2));
113 
114   for (i = 0; i < sn; i++, op_ptr++, off_ptr++, len_ptr++) {
115     struct diff_edit *e = ses + i;
116     *op_ptr = e->op;
117     *off_ptr = e->off;
118     *len_ptr = e->len;
119   }
120 
121   UNPROTECT(1);
122   return result;
123 }
124 
125 #define FV(k) _v(ctx, (k), 0)
126 #define RV(k) _v(ctx, (k), 1)
127 
128 struct _ctx {
129   cmp_fn cmp;
130   void *context;
131   int *buf;
132   int bufsize;
133   struct diff_edit *ses;
134   int si;
135   int dmax;
136 };
137 
138 struct middle_snake {
139   int x, y, u, v;
140 };
141 
_setv(struct _ctx * ctx,int k,int r,int val)142 static void _setv(struct _ctx *ctx, int k, int r, int val) {
143   /* Pack -N to N into 0 to N * 2 */
144   int j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2);
145 
146   if (j >= ctx->bufsize) {
147     int newsize = j + 1;
148     ctx->buf = (int*) S_realloc((char*) ctx->buf, newsize, ctx->bufsize,
149                                 sizeof(int));
150     ctx->bufsize = newsize;
151   }
152 
153   ctx->buf[j] = val;
154 }
155 
_v(struct _ctx * ctx,int k,int r)156 static int _v(struct _ctx *ctx, int k, int r) {
157   int j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2);
158 
159   if (j > ctx->bufsize) {
160     int newsize = j + 1;
161     ctx->buf = (int*) S_realloc((char*) ctx->buf, newsize, ctx->bufsize,
162                                 sizeof(int));
163     ctx->bufsize = newsize;
164   }
165 
166   return ctx->buf[j];
167 }
168 
_find_middle_snake(const void * a,int aoff,int n,const void * b,int boff,int m,struct _ctx * ctx,struct middle_snake * ms)169 static int _find_middle_snake(const void *a, int aoff, int n,
170                               const void *b, int boff, int m,
171                               struct _ctx *ctx,
172                               struct middle_snake *ms) {
173   int delta, odd, mid, d;
174 
175   delta = n - m;
176   odd = delta & 1;
177   mid = (n + m) / 2;
178   mid += odd;
179 
180   _setv(ctx, 1, 0, 0);
181   _setv(ctx, delta - 1, 1, n);
182 
183   for (d = 0; d <= mid; d++) {
184     int k, x, y;
185 
186     R_CheckUserInterrupt();
187 
188     if ((2 * d - 1) >= ctx->dmax) {
189       return ctx->dmax;
190     }
191 
192     for (k = d; k >= -d; k -= 2) {
193       if (k == -d || (k != d && FV(k - 1) < FV(k + 1))) {
194         x = FV(k + 1);
195       } else {
196         x = FV(k - 1) + 1;
197       }
198       y = x - k;
199 
200       ms->x = x;
201       ms->y = y;
202       while (x < n && y < m && ctx->cmp(aoff + x, boff + y, ctx->context) == 0) {
203         x++; y++;
204       }
205       _setv(ctx, k, 0, x);
206 
207       if (odd && k >= (delta - (d - 1)) && k <= (delta + (d - 1))) {
208         if (x >= RV(k)) {
209           ms->u = x;
210           ms->v = y;
211           return 2 * d - 1;
212         }
213       }
214     }
215     for (k = d; k >= -d; k -= 2) {
216       int kr = (n - m) + k;
217 
218       if (k == d || (k != -d && RV(kr - 1) < RV(kr + 1))) {
219         x = RV(kr - 1);
220       } else {
221         x = RV(kr + 1) - 1;
222       }
223       y = x - kr;
224 
225       ms->u = x;
226       ms->v = y;
227       while (x > 0 && y > 0 && ctx->cmp(aoff + (x - 1), boff + (y - 1), ctx->context) == 0) {
228         x--; y--;
229       }
230       _setv(ctx, kr, 1, x);
231 
232       if (!odd && kr >= -d && kr <= d) {
233         if (x <= FV(kr)) {
234           ms->x = x;
235           ms->y = y;
236           return 2 * d;
237         }
238       }
239     }
240   }
241 
242   errno = EFAULT; // __NO_COVERAGE__
243 
244   return -1;      // __NO_COVERAGE__
245 }
246 
_edit(struct _ctx * ctx,diff_op op,int off,int len)247 static void _edit(struct _ctx *ctx, diff_op op, int off, int len) {
248   struct diff_edit *e;
249 
250   if (len == 0 || ctx->ses == NULL) {
251     return;
252   }               /* Add an edit to the SES (or
253                    * coalesce if the op is the same)
254                    */
255   e = &ctx->ses[ctx->si];
256   if (e->op != op) {
257     if (e->op) {
258       ctx->si++;
259       e = &ctx->ses[ctx->si];
260     }
261     e->op = op;
262     e->off = off;
263     e->len = len;
264   } else {
265     e->len += len;
266   }
267 }
268 
_ses(const void * a,int aoff,int n,const void * b,int boff,int m,struct _ctx * ctx)269 static int _ses(const void *a, int aoff, int n,
270                 const void *b, int boff, int m,
271                 struct _ctx *ctx) {
272   struct middle_snake ms;
273   int d;
274 
275   if (n == 0) {
276     _edit(ctx, DIFF_INSERT, boff, m);
277     d = m;
278   } else if (m == 0) {
279     _edit(ctx, DIFF_DELETE, aoff, n);
280     d = n;
281   } else {
282     /* Find the middle "snake" around which we
283      * recursively solve the sub-problems.
284      */
285     d = _find_middle_snake(a, aoff, n, b, boff, m, ctx, &ms);
286     if (d == -1) {
287       return -1;
288     } else if (d >= ctx->dmax) {
289       return ctx->dmax;
290     } else if (ctx->ses == NULL) {
291       return d;
292     } else if (d > 1) {
293       if (_ses(a, aoff, ms.x, b, boff, ms.y, ctx) == -1) {
294         return -1;
295       }
296 
297       _edit(ctx, DIFF_MATCH, aoff + ms.x, ms.u - ms.x);
298 
299       aoff += ms.u;
300       boff += ms.v;
301       n -= ms.u;
302       m -= ms.v;
303       if (_ses(a, aoff, n, b, boff, m, ctx) == -1) {
304         return -1;
305       }
306     } else {
307       int x = ms.x;
308       int u = ms.u;
309 
310       /* There are only 4 base cases when the
311        * edit distance is 1.
312        *
313        * n > m   m > n
314        *
315        *   -       |
316        *    \       \    x != u
317        *     \       \
318        *
319        *   \       \
320        *    \       \    x == u
321        *     -       |
322        */
323 
324       if (m > n) {
325         if (x == u) {
326           _edit(ctx, DIFF_MATCH, aoff, n);
327           _edit(ctx, DIFF_INSERT, boff + (m - 1), 1);
328         } else {
329           _edit(ctx, DIFF_INSERT, boff, 1);
330           _edit(ctx, DIFF_MATCH, aoff, n);
331         }
332       } else {
333         if (x == u) {
334           _edit(ctx, DIFF_MATCH, aoff, m);
335           _edit(ctx, DIFF_DELETE, aoff + (n - 1), 1);
336         } else {
337           _edit(ctx, DIFF_DELETE, aoff, 1);
338           _edit(ctx, DIFF_MATCH, aoff + 1, m);
339         }
340       }
341     }
342   }
343 
344   return d;
345 }
346 
_diff(const void * a,int aoff,int n,const void * b,int boff,int m,cmp_fn cmp,void * context,int dmax,struct diff_edit * ses,int * sn,int * buf,int bufsize)347 static int _diff(const void *a, int aoff, int n,
348                  const void *b, int boff, int m,
349                  cmp_fn cmp, void *context, int dmax,
350                  struct diff_edit *ses, int *sn,
351                  int *buf, int bufsize) {
352   struct _ctx ctx;
353   int d, x, y;
354   struct diff_edit *e = NULL;
355 
356   ctx.cmp = cmp;
357   ctx.context = context;
358   ctx.buf = buf;
359   ctx.bufsize = bufsize;
360   ctx.ses = ses;
361   ctx.si = 0;
362   ctx.dmax = dmax ? dmax : INT_MAX;
363 
364   if (ses && sn) {
365     if ((e = ses) == NULL) {
366       return -1;  // __NO_COVERAGE__
367     }
368     e->op = 0;
369   }
370 
371   /* The _ses function assumes the SES will begin or end with a delete
372    * or insert. The following will ensure this is true by eating any
373    * beginning matches. This is also a quick way to process sequences
374    * that match entirely.
375    */
376   x = y = 0;
377   while (x < n && y < m && cmp(aoff + x, boff + y, context) == 0) {
378     x++; y++;
379   }
380   _edit(&ctx, DIFF_MATCH, aoff, x);
381 
382   if ((d = _ses(a, aoff + x, n - x, b, boff + y, m - y, &ctx)) == -1) {
383     return -1;
384   }
385   if (ses && sn) {
386     *sn = e->op ? ctx.si + 1 : 0;
387   }
388 
389 
390   return d;
391 }
392