1 // Copyright (C) 2003 Graydon Hoare <graydon@pobox.com>
2 //
3 // This program is made available under the GNU GPL version 2.0 or
4 // greater. See the accompanying file COPYING for details.
5 //
6 // This program is distributed WITHOUT ANY WARRANTY; without even the
7 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8 // PURPOSE.
9
10 /* this is a pretty direct translation (with only vague understanding,
11 unfortunately) of aubrey jaffer's most recent O(NP) edit-script
12 calculation algorithm, which performs quite a bit better than myers,
13 manber and miller's O(NP) simple edit *distance* algorithm. this one
14 builds the entire *script* that fast.
15
16 the following is jaffer's copyright and license statement. it probably
17 still has some legal relevance here, as this is a highly derivative
18 work. if you want to see more of the original file jaffer's work came
19 from, see the SLIB repository on savannah.nongnu.org, his website at
20 http://www.swiss.ai.mit.edu/~jaffer/, or look in the journal of
21 computational biology. apparantly it's submitted for publication there
22 too.
23
24 ---
25
26 "differ.scm" O(NP) Sequence Comparison Algorithm.
27 Copyright (C) 2001, 2002, 2003 Aubrey Jaffer
28
29 Permission to copy this software, to modify it, to redistribute it, to
30 distribute modified versions, and to use it for any purpose is granted,
31 subject to the following restrictions and understandings.
32
33 1. Any copy made of this software must include this copyright notice
34 in full.
35
36 2. I have made no warrantee or representation that the operation of
37 this software will be error-free, and I am under no obligation to
38 provide any services, by way of maintenance, update, or otherwise.
39
40 3. In conjunction with products arising from the use of this material,
41 there shall be no use of my name in any advertising, promotional, or
42 sales literature without prior written consent in each case.
43
44 */
45
46 /*
47 This is now understood better. The comments below are all new,
48 most of the variable names that aren't one letter and don't
49 look like x86 registers are new, and there are some complexity
50 fixes so the recursing doesn't make it accidentally O(n^2) in
51 the input length.
52 */
53
54 #include "base.hh"
55 #include <algorithm>
56 #include "vector.hh"
57
58 #include "lcs.hh"
59 #include "sanity.hh"
60
61 using std::back_insert_iterator;
62 using std::back_inserter;
63 using std::copy;
64 using std::iterator_traits;
65 using std::max;
66 using std::min;
67 using std::sort;
68 using std::vector;
69
70 /*
71 http://read.pudn.com/downloads131/sourcecode/delphi_control/558602/O(NP).pdf
72 An O(NP) Sequence Comparison Algorithm
73 Sun Wu, Udi Manber, Gene Myers; University of Arizona
74 Webb Miller; Pennsylvania State University
75 August 1989
76
77 The above paper shows how to find the edit distance between strings in time
78 at worst O(num-deletions * longer-length), and on average
79 O(longer-length + (edit-distance * num-deletions)).
80
81
82 Name the two input strings "a" and "b", "a" being the shorter one. Consider
83 and edit graph with a going down (x coordinate) and b going across (y coord).
84
85 stringBislonger
86 s\ \.
87 t \ .
88 r \ .
89 i \ \ .
90 n \ . \ .
91 g X .
92 A .
93
94 You start in the top left corner, and want to end up in the lower right
95 corner. There are 3 ways you can move: follow a diagonal for zero cost,
96 or move directly down or directly right for a cost of one. The total cost
97 of the cheapest path is the edit distance. A movement directly down
98 corresponds to a deletion, and a movement directly right corresponds to
99 an insertion.
100
101 If you had a diagonal from the top all the way to the bottom, the cost
102 would be the difference in the lengths of the input strings ("delta").
103 For every movement directly down you need to add exactly one movement
104 directly right, so the total cost D = delta + (2 * num-deletions).
105
106 Give each diagonal in the edit graph a number. The diagonal through the
107 origin is 0; diagonals above / right of it are numbered 1, 2, ...; diagonals
108 below / left of it are numbered -1, -2, ... . The diagonal through the lower
109 right corner will be number delta (difference of input lengths).
110
111 An edit path with a particular number of deletions cannot go below
112 diagonal -(num-deletions) or above diagonal delta + (num-deletions).
113 So we have bounding diagonals for any edit path up to a given number of
114 deletions and therefore up to a given length.
115
116
117 compare() with a large p_lim and full_scan = false implements this algorithm.
118
119 compare() with a given p_lim (maximum number of deletions) calculates
120 the lowest cost of a path through each relevant point along the bottom of
121 the edit graph.
122 */
123
124
125
126 struct work_vec
127 {
128 long lo;
129 long hi;
130 static vector<long, QA(long)> vec;
work_vecwork_vec131 work_vec(long lo, long hi) :
132 lo(lo), hi(hi)
133 {
134 // I(hi >= lo);
135 size_t len = (hi - lo) + 1;
136 vec.resize(len);
137 vec.assign(len, -1);
138 }
139
operator []work_vec140 inline long & operator[](long t)
141 {
142 // I(t >= lo && t <= hi);
143 return vec[t-lo];
144 }
145 };
146
147 vector<long, QA(long)> work_vec::vec;
148
149 template <typename A,
150 typename B,
151 typename LCS>
152 struct jaffer_edit_calculator
153 {
154
155 typedef vector<long, QA(long)> cost_vec;
156 typedef vector<long, QA(long)> edit_vec;
157
158 template <typename T>
159 struct subarray
160 {
161 typedef typename iterator_traits<T>::value_type vt;
162
163 T base; // underlying representation
164 long start; // current extent
165 long end; // current extent
166
subarrayjaffer_edit_calculator::subarray167 subarray(T b, long s, long e) :
168 base(b), start(s), end(e) {}
169
sizejaffer_edit_calculator::subarray170 inline long size() const
171 {
172 if (end < start)
173 return start - end;
174 else
175 return end - start;
176 }
177
subsetjaffer_edit_calculator::subarray178 inline subarray subset(long s, long e) const
179 {
180 return subarray(base + min(start, end), s, e);
181 }
182
operator []jaffer_edit_calculator::subarray183 inline vt const & operator[](size_t idx) const
184 {
185 if (end < start)
186 return *(base + (start - (idx + 1)));
187 else
188 return *(base + (start + idx));
189 }
190 };
191
runjaffer_edit_calculator192 static long run(work_vec & fp, long k,
193 subarray<A> const & a, long a_len,
194 subarray<B> const & b, long b_len,
195 cost_vec & CC, long p)
196 {
197 long cost = k + 2*p;
198
199 // do the run
200 long y = max(fp[k-1]+1, fp[k+1]);
201 long x = y - k;
202
203 I(y >= 0);
204 I(x >= 0);
205
206 while (true)
207 {
208 // record costs along the way
209 long xcst = a_len - x;
210 if (y < static_cast<long>(CC.size()) && xcst >= 0)
211 {
212 CC[y] = min(xcst + cost, CC[y]);
213 }
214 if (x < a_len && y < b_len && a[x] == b[y])
215 {
216 ++x; ++y;
217 }
218 else
219 break;
220 }
221
222 fp[k] = y;
223 return y;
224 }
225
226 // 'compare' here is the core myers, manber and miller algorithm.
comparejaffer_edit_calculator227 static long compare(cost_vec & costs,
228 subarray<A> const & a, long len_a,
229 subarray<B> const & b, long len_b,
230 long p_lim,
231 bool full_scan = true)
232 {
233 long const delta = len_b - len_a;
234 long lo = -(len_a+1), hi = (1+len_b);
235 if (full_scan)
236 {
237 lo = -(p_lim + 1);
238 hi = p_lim + 1 + delta;
239 }
240 work_vec fp(lo, hi);
241
242 long p = 0;
243
244 for (; p <= p_lim; ++p)
245 {
246
247 // lower sweep
248 for (long k = -p; k < delta; ++k)
249 run(fp, k, a, len_a, b, len_b, costs, p);
250
251 // upper sweep
252 for (long k = delta + p; k > delta; --k)
253 run(fp, k, a, len_a, b, len_b, costs, p);
254
255 // middle
256 long fpval = run(fp, delta, a, len_a, b, len_b, costs, p);
257
258 // we can bail early if not doing a full scan
259 if (!full_scan && len_b <= fpval)
260 break;
261 }
262
263 return delta + 2*p;
264 }
265
266 // This splits the edit graph into a top half and a bottom half, calculates
267 // the (cost of the) cheapest possible path through each point along the
268 // middle, and then splits the graph into left/right portions based on that
269 // point. It then recurses on the top left and bottom right quadrants (the
270 // shortest edit path cannot possibly go through the other two quadrants).
271 //
272 // When getting costs through the top and bottom halves, it can discard the
273 // rightmost part of the top and the leftmost part of the bottom, beyond where
274 // the edit band (diagonls -(num-deletes) and delta + num-deletes) crosses
275 // the split. Even with doing this the edit band is overstated in the
276 // calls to compare(), because while max-possible-deletes (p_lim) is correct
277 // the delta value is still larger by max-possible-deletes.
divide_and_conquerjaffer_edit_calculator278 static long divide_and_conquer(subarray<A> const & a, long start_a, long end_a,
279 subarray<B> const & b, long start_b, long end_b,
280 edit_vec & edits,
281 unsigned long edx,
282 long polarity,
283 long p_lim)
284 {
285 long len_b = end_b - start_b;
286 long len_a = end_a - start_a;
287 long const delta = len_b - len_a;
288 // total edit distance
289 long tcst = (2 * p_lim) + (len_b - len_a);
290 // top/bottom split point
291 long mid_a = (start_a + end_a) / 2;
292
293 I(start_a >= 0);
294 I(start_a <= a.size());
295 I(start_b >= 0);
296 I(start_b <= b.size());
297 I(end_a >= 0);
298 I(end_a <= a.size());
299 I(end_b >= 0);
300 I(end_b <= b.size());
301
302 cost_vec cc(len_b + 1, len_a + len_b);
303 cost_vec rr(len_b + 1, len_a + len_b);
304
305 // get costs from the top left through each point on the top/bottom split
306 long const top_len_a = mid_a - start_a;
307 // trim off the rightmost part of b, past where the edit band crosses the split
308 long const top_end_b = min(end_b, start_b + (top_len_a + delta + p_lim + 1));
309 compare (cc,
310 a.subset(start_a, mid_a), top_len_a,
311 b.subset(start_b, top_end_b), top_end_b - start_b,
312 min(p_lim, len_a));
313
314 // get costs from the lower right through each point on the top/bottom split
315 long const bottom_len_a = end_a - mid_a;
316 // here we trim the leftmost part of b (before reversing it)
317 long const bottom_start_b = max(start_b, end_b - (bottom_len_a + delta + p_lim + 1));
318 compare (rr,
319 a.subset(end_a, mid_a), bottom_len_a,
320 b.subset(end_b, bottom_start_b), end_b - bottom_start_b,
321 min(p_lim, len_a));
322
323 // find the first (closest-to-center) point on the split line, which
324 // has the correct total (top + bottom) cost and is therefore on the
325 // shortest edit path
326 long b_split = mid_split(len_a, len_b, rr, cc, tcst);
327
328 // known costs of each half of the path
329 long est_c = cc[b_split];
330 long est_r = rr[len_b - b_split];
331
332 // recurse on the two halves
333
334 long cost_c = diff_to_et (a, start_a, mid_a,
335 b, start_b, start_b + b_split,
336 edits, edx, polarity,
337 (est_c - (b_split - (mid_a - start_a))) / 2);
338
339 I(cost_c == est_c);
340
341 long cost_r = diff_to_et (a, mid_a, end_a,
342 b, start_b + b_split, end_b,
343 edits, est_c + edx, polarity,
344 (est_r - ((len_b - b_split) - (end_a - mid_a))) / 2);
345
346 I(cost_r == est_r);
347
348 return est_r + est_c;
349 }
350
mid_splitjaffer_edit_calculator351 static long mid_split(long m, long n,
352 cost_vec const & rr,
353 cost_vec const & cc,
354 long cost)
355 {
356 long cdx = 1 + n/2;
357 long rdx = n/2;
358 while (true)
359 {
360 I (rdx >= 0);
361
362 if (cost == (cc[rdx] + rr[n-rdx]))
363 return rdx;
364 if (cost == (cc[cdx] + rr[n-cdx]))
365 return cdx;
366 --rdx;
367 ++cdx;
368 }
369 }
370
371
order_editsjaffer_edit_calculator372 static void order_edits(edit_vec const & edits,
373 long sign,
374 edit_vec & nedits)
375 {
376 nedits.clear();
377 nedits.resize(edits.size());
378 long cost = edits.size();
379
380 if (cost == 0)
381 {
382 nedits = edits;
383 return;
384 }
385
386 edit_vec sedits = edits;
387 sort(sedits.begin(), sedits.end());
388
389 long idx0;
390 for (idx0 = 0; idx0 < cost && sedits[idx0] < 0; ++idx0) ;
391 long len_a = max(0L, -sedits[0]);
392 long len_b = sedits[cost-1];
393
394 long ddx = idx0 - 1;
395 long idx = idx0;
396 long ndx = 0;
397 long adx = 0;
398 long bdx = 0;
399
400 while (bdx < len_b || adx < len_a)
401 {
402
403 long del = (ddx < 0) ? 0 : sedits[ddx];
404 long ins = (idx >= cost) ? 0 : sedits[idx];
405
406 if (del < 0 && adx >= (-1 - del) &&
407 ins > 0 && bdx >= (-1 + ins))
408 {
409 nedits[ndx] = del;
410 nedits[ndx+1] = ins;
411 --ddx; ++idx; ndx += 2; ++adx; ++bdx;
412 }
413 else if (del < 0 && adx >= (-1 - del))
414 {
415 nedits[ndx] = del;
416 --ddx; ++ndx; ++adx;
417 }
418 else if (ins > 0 && bdx >= (-1 + ins))
419 {
420 nedits[ndx] = ins;
421 ++idx; ++ndx; ++bdx;
422 }
423 else
424 {
425 ++adx; ++bdx;
426 }
427 }
428 }
429
430 // trims and calls diff_to_ez
diff_to_etjaffer_edit_calculator431 static long diff_to_et(subarray<A> const & a, long start_a, long end_a,
432 subarray<B> const & b, long start_b, long end_b,
433 vector<long, QA(long)> & edits,
434 long edx,
435 long polarity,
436 long p_lim)
437 {
438
439 I(start_a >= 0);
440 I(start_a <= a.size());
441 I(start_b >= 0);
442 I(start_b <= b.size());
443 I(end_a >= 0);
444 I(end_a <= a.size());
445 I(end_b >= 0);
446 I(end_b <= b.size());
447
448 I(end_a - start_a >= p_lim);
449
450 // last, not end
451 long new_last_a = end_a - 1;
452 long new_last_b = end_b - 1;
453 while ((start_b <= new_last_b) &&
454 (start_a <= new_last_a) &&
455 (a[new_last_a] == b[new_last_b]))
456 {
457 --new_last_a;
458 --new_last_b;
459 }
460
461 long new_start_a = start_a;
462 long new_start_b = start_b;
463 while((new_start_b < new_last_b) &&
464 (new_start_a < new_last_a) &&
465 (a[new_start_a] == b[new_start_b]))
466 {
467 ++new_start_a;
468 ++new_start_b;
469 }
470
471 // we've trimmed; now call diff_to_ez.
472
473 // difference between length of (new) a and length of (new) b
474 long delta = (new_last_b - new_start_b) - (new_last_a - new_start_a);
475
476 if (delta < 0)
477 return diff_to_ez (b, new_start_b, new_last_b+1,
478 a, new_start_a, new_last_a+1,
479 edits, edx, -polarity, delta + p_lim);
480 else
481 return diff_to_ez (a, new_start_a, new_last_a+1,
482 b, new_start_b, new_last_b+1,
483 edits, edx, polarity, p_lim);
484 }
485
diff_to_ezjaffer_edit_calculator486 static long diff_to_ez(subarray<A> const & a, long start_a, long end_a,
487 subarray<B> const & b, long start_b, long end_b,
488 vector<long, QA(long)> & edits,
489 long edx1,
490 long polarity,
491 long p_lim)
492 {
493
494 I(start_a >= 0);
495 I(start_a <= a.size());
496 I(start_b >= 0);
497 I(start_b <= b.size());
498 I(end_a >= 0);
499 I(end_a <= a.size());
500 I(end_b >= 0);
501 I(end_b <= b.size());
502
503 long len_a = end_a - start_a;
504 long len_b = end_b - start_b;
505
506 I(len_a <= len_b);
507
508 // easy case #1: B inserts only
509 if (p_lim == 0)
510 {
511 // A == B, no edits
512 if (len_a == len_b)
513 return 0;
514
515 long adx = start_a;
516 long bdx = start_b;
517 long edx0 = edx1;
518
519 while (true)
520 {
521 if (bdx >= end_b)
522 return len_b - len_a;
523
524 if (adx >= end_a)
525 {
526 for (long idx = bdx, edx = edx0;
527 idx < end_b;
528 ++idx, ++edx)
529 edits[edx] = polarity * (idx+1);
530
531 return len_b - len_a;
532 }
533
534 if (a[adx] == b[bdx])
535 {
536 ++adx; ++bdx;
537 }
538 else
539 {
540 edits[edx0] = polarity * (bdx+1);
541 ++bdx; ++edx0;
542 }
543 }
544 }
545
546 // easy case #2: delete all A, insert all B
547 else if (len_a <= p_lim)
548 {
549 I(len_a == p_lim);
550
551 long edx0 = edx1;
552 for (long idx = start_a; idx < end_a; ++idx, ++edx0)
553 edits[edx0] = polarity * (-1 - idx);
554
555 for (long jdx = start_b; jdx < end_b; ++jdx, ++edx0)
556 edits[edx0] = polarity * (jdx + 1);
557
558 return len_a + len_b;
559 }
560
561 // hard case: recurse on subproblems
562 else
563 {
564 return divide_and_conquer (a, start_a, end_a,
565 b, start_b, end_b,
566 edits, edx1, polarity, p_lim);
567 }
568 }
569
diff_to_editsjaffer_edit_calculator570 static void diff_to_edits(subarray<A> const & a, long len_a,
571 subarray<B> const & b, long len_b,
572 vector<long, QA(long)> & edits)
573 {
574 I(len_a <= len_b);
575 cost_vec costs(len_a + len_b); // scratch array, ignored
576 long edit_distance = compare(costs, a, len_a, b, len_b, min(len_a, len_b), false);
577
578 edits.clear();
579 edits.resize(edit_distance, 0);
580 long cost = diff_to_et(a, 0, len_a,
581 b, 0, len_b,
582 edits, 0, 1, (edit_distance - (len_b - len_a)) / 2);
583 I(cost == edit_distance);
584 }
585
edits_to_lcsjaffer_edit_calculator586 static void edits_to_lcs (vector<long, QA(long)> const & edits,
587 subarray<A> const a, long len_a, long len_b,
588 LCS output)
589 {
590 long edx = 0, sdx = 0, adx = 0;
591 typedef typename iterator_traits<A>::value_type vt;
592 vector<vt, QA(vt)> lcs(((len_a + len_b) - edits.size()) / 2);
593 while (adx < len_a)
594 {
595 long edit = (edx < static_cast<long>(edits.size())) ? edits[edx] : 0;
596
597 if (edit > 0)
598 { ++edx; }
599 else if (edit == 0)
600 { lcs[sdx++] = a[adx++]; }
601 else if (adx >= (-1 - edit))
602 { ++edx; ++adx; }
603 else
604 { lcs[sdx++] = a[adx++]; }
605 }
606
607 copy(lcs.begin(), lcs.end(), output);
608 }
609 };
610
611
612 template <typename A,
613 typename B,
614 typename LCS>
_edit_script(A begin_a,A end_a,B begin_b,B end_b,vector<long,QA (long)> & edits_out,LCS ignored_out)615 void _edit_script(A begin_a, A end_a,
616 B begin_b, B end_b,
617 vector<long, QA(long)> & edits_out,
618 LCS ignored_out)
619 {
620 typedef jaffer_edit_calculator<A,B,LCS> calc_t;
621 long len_a = end_a - begin_a;
622 long len_b = end_b - begin_b;
623 typename calc_t::edit_vec edits, ordered;
624
625 typename calc_t::template subarray<A> a(begin_a, 0, len_a);
626 typename calc_t::template subarray<B> b(begin_b, 0, len_b);
627
628 if (len_b < len_a)
629 {
630 calc_t::diff_to_edits (b, len_b, a, len_a, edits);
631 calc_t::order_edits (edits, -1, ordered);
632 for (size_t i = 0; i < ordered.size(); ++i)
633 ordered[i] *= -1;
634 }
635 else
636 {
637 calc_t::diff_to_edits (a, len_a, b, len_b, edits);
638 calc_t::order_edits (edits, 1, ordered);
639 }
640
641 edits_out.clear();
642 edits_out.reserve(ordered.size());
643 copy(ordered.begin(), ordered.end(), back_inserter(edits_out));
644 }
645
646
647 template <typename A,
648 typename B,
649 typename LCS>
_longest_common_subsequence(A begin_a,A end_a,B begin_b,B end_b,LCS out)650 void _longest_common_subsequence(A begin_a, A end_a,
651 B begin_b, B end_b,
652 LCS out)
653 {
654 typedef jaffer_edit_calculator<A,B,LCS> calc_t;
655 long len_a = end_a - begin_a;
656 long len_b = end_b - begin_b;
657 typename calc_t::edit_vec edits, ordered;
658
659 typename calc_t::template subarray<A> a(begin_a, 0, len_a);
660 typename calc_t::template subarray<B> b(begin_b, 0, len_b);
661
662 if (len_b < len_a)
663 {
664 calc_t::diff_to_edits(b, len_b, a, len_a, edits);
665 calc_t::order_edits(edits, -1, ordered);
666 calc_t::edits_to_lcs(ordered, b, len_b, len_a, out);
667 }
668 else
669 {
670 calc_t::diff_to_edits(a, len_a, b, len_b, edits);
671 calc_t::order_edits(edits, 1, ordered);
672 calc_t::edits_to_lcs(ordered, a, len_a, len_b, out);
673 }
674 }
675
676
677 void
longest_common_subsequence(vector<long,QA (long)>::const_iterator begin_a,vector<long,QA (long)>::const_iterator end_a,vector<long,QA (long)>::const_iterator begin_b,vector<long,QA (long)>::const_iterator end_b,back_insert_iterator<vector<long,QA (long)>> lcs)678 longest_common_subsequence(vector<long, QA(long)>::const_iterator begin_a,
679 vector<long, QA(long)>::const_iterator end_a,
680 vector<long, QA(long)>::const_iterator begin_b,
681 vector<long, QA(long)>::const_iterator end_b,
682 back_insert_iterator< vector<long, QA(long)> > lcs)
683 {
684 _longest_common_subsequence(begin_a, end_a,
685 begin_b, end_b,
686 lcs);
687 }
688
689 void
edit_script(vector<long,QA (long)>::const_iterator begin_a,vector<long,QA (long)>::const_iterator end_a,vector<long,QA (long)>::const_iterator begin_b,vector<long,QA (long)>::const_iterator end_b,vector<long,QA (long)> & edits_out)690 edit_script(vector<long, QA(long)>::const_iterator begin_a,
691 vector<long, QA(long)>::const_iterator end_a,
692 vector<long, QA(long)>::const_iterator begin_b,
693 vector<long, QA(long)>::const_iterator end_b,
694 vector<long, QA(long)> & edits_out)
695 {
696 vector<long, QA(long)> lcs;
697 _edit_script(begin_a, end_a,
698 begin_b, end_b,
699 edits_out,
700 back_inserter(lcs));
701 }
702
703 // Local Variables:
704 // mode: C++
705 // fill-column: 76
706 // c-file-style: "gnu"
707 // indent-tabs-mode: nil
708 // End:
709 // vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:
710