1 // Copyright (C) 2003 Graydon Hoare <graydon@pobox.com>
2 //
3 // This program is made available under the GNU GPL version 2.0 or
4 // greater. See the accompanying file COPYING for details.
5 //
6 // This program is distributed WITHOUT ANY WARRANTY; without even the
7 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8 // PURPOSE.
9 
10 /* this is a pretty direct translation (with only vague understanding,
11    unfortunately) of aubrey jaffer's most recent O(NP) edit-script
12    calculation algorithm, which performs quite a bit better than myers,
13    manber and miller's O(NP) simple edit *distance* algorithm. this one
14    builds the entire *script* that fast.
15 
16    the following is jaffer's copyright and license statement. it probably
17    still has some legal relevance here, as this is a highly derivative
18    work. if you want to see more of the original file jaffer's work came
19    from, see the SLIB repository on savannah.nongnu.org, his website at
20    http://www.swiss.ai.mit.edu/~jaffer/, or look in the journal of
21    computational biology. apparantly it's submitted for publication there
22    too.
23 
24    ---
25 
26    "differ.scm" O(NP) Sequence Comparison Algorithm.
27    Copyright (C) 2001, 2002, 2003 Aubrey Jaffer
28 
29    Permission to copy this software, to modify it, to redistribute it, to
30    distribute modified versions, and to use it for any purpose is granted,
31    subject to the following restrictions and understandings.
32 
33    1.  Any copy made of this software must include this copyright notice
34    in full.
35 
36    2.  I have made no warrantee or representation that the operation of
37    this software will be error-free, and I am under no obligation to
38    provide any services, by way of maintenance, update, or otherwise.
39 
40    3.  In conjunction with products arising from the use of this material,
41    there shall be no use of my name in any advertising, promotional, or
42    sales literature without prior written consent in each case.
43 
44 */
45 
46 /*
47   This is now understood better. The comments below are all new,
48   most of the variable names that aren't one letter and don't
49   look like x86 registers are new, and there are some complexity
50   fixes so the recursing doesn't make it accidentally O(n^2) in
51   the input length.
52  */
53 
54 #include "base.hh"
55 #include <algorithm>
56 #include "vector.hh"
57 
58 #include "lcs.hh"
59 #include "sanity.hh"
60 
61 using std::back_insert_iterator;
62 using std::back_inserter;
63 using std::copy;
64 using std::iterator_traits;
65 using std::max;
66 using std::min;
67 using std::sort;
68 using std::vector;
69 
70 /*
71   http://read.pudn.com/downloads131/sourcecode/delphi_control/558602/O(NP).pdf
72   An O(NP) Sequence Comparison Algorithm
73   Sun Wu, Udi Manber, Gene Myers; University of Arizona
74   Webb Miller; Pennsylvania State University
75   August 1989
76 
77   The above paper shows how to find the edit distance between strings in time
78   at worst O(num-deletions * longer-length), and on average
79   O(longer-length + (edit-distance * num-deletions)).
80 
81 
82   Name the two input strings "a" and "b", "a" being the shorter one. Consider
83   and edit graph with a going down (x coordinate) and b going across (y coord).
84 
85    stringBislonger
86   s\       \.
87   t \       .
88   r  \      .
89   i   \   \ .
90   n    \    . \    .
91   g     X   .
92   A      .
93 
94   You start in the top left corner, and want to end up in the lower right
95   corner. There are 3 ways you can move: follow a diagonal for zero cost,
96   or move directly down or directly right for a cost of one. The total cost
97   of the cheapest path is the edit distance. A movement directly down
98   corresponds to a deletion, and a movement directly right corresponds to
99   an insertion.
100 
101   If you had a diagonal from the top all the way to the bottom, the cost
102   would be the difference in the lengths of the input strings ("delta").
103   For every movement directly down you need to add exactly one movement
104   directly right, so the total cost D = delta + (2 * num-deletions).
105 
106   Give each diagonal in the edit graph a number. The diagonal through the
107   origin is 0; diagonals above / right of it are numbered 1, 2, ...; diagonals
108   below / left of it are numbered -1, -2, ... . The diagonal through the lower
109   right corner will be number delta (difference of input lengths).
110 
111   An edit path with a particular number of deletions cannot go below
112   diagonal -(num-deletions) or above diagonal delta + (num-deletions).
113   So we have bounding diagonals for any edit path up to a given number of
114   deletions and therefore up to a given length.
115 
116 
117   compare() with a large p_lim and full_scan = false implements this algorithm.
118 
119   compare() with a given p_lim (maximum number of deletions) calculates
120   the lowest cost of a path through each relevant point along the bottom of
121   the edit graph.
122  */
123 
124 
125 
126 struct work_vec
127 {
128   long lo;
129   long hi;
130   static vector<long, QA(long)> vec;
work_vecwork_vec131   work_vec(long lo, long hi) :
132     lo(lo), hi(hi)
133   {
134     // I(hi >= lo);
135     size_t len = (hi - lo) + 1;
136     vec.resize(len);
137     vec.assign(len, -1);
138   }
139 
operator []work_vec140   inline long & operator[](long t)
141   {
142     // I(t >= lo && t <= hi);
143     return vec[t-lo];
144   }
145 };
146 
147 vector<long, QA(long)> work_vec::vec;
148 
149 template <typename A,
150           typename B,
151           typename LCS>
152 struct jaffer_edit_calculator
153 {
154 
155   typedef vector<long, QA(long)> cost_vec;
156   typedef vector<long, QA(long)> edit_vec;
157 
158   template <typename T>
159   struct subarray
160   {
161     typedef typename iterator_traits<T>::value_type vt;
162 
163     T base;     // underlying representation
164     long start; // current extent
165     long end;   // current extent
166 
subarrayjaffer_edit_calculator::subarray167     subarray(T b, long s, long e) :
168       base(b), start(s), end(e) {}
169 
sizejaffer_edit_calculator::subarray170     inline long size() const
171     {
172       if (end < start)
173         return start - end;
174       else
175         return end - start;
176     }
177 
subsetjaffer_edit_calculator::subarray178     inline subarray subset(long s, long e) const
179     {
180       return subarray(base + min(start, end), s, e);
181     }
182 
operator []jaffer_edit_calculator::subarray183     inline vt const & operator[](size_t idx) const
184     {
185       if (end < start)
186           return *(base + (start - (idx + 1)));
187       else
188           return *(base + (start + idx));
189     }
190   };
191 
runjaffer_edit_calculator192   static long run(work_vec & fp, long k,
193                   subarray<A> const & a, long a_len,
194                   subarray<B> const & b, long b_len,
195                   cost_vec & CC, long p)
196   {
197     long cost = k + 2*p;
198 
199     // do the run
200     long y = max(fp[k-1]+1, fp[k+1]);
201     long x = y - k;
202 
203     I(y >= 0);
204     I(x >= 0);
205 
206     while (true)
207       {
208         // record costs along the way
209         long xcst = a_len - x;
210         if (y < static_cast<long>(CC.size()) && xcst >= 0)
211           {
212             CC[y] = min(xcst + cost, CC[y]);
213           }
214         if (x < a_len && y < b_len && a[x] == b[y])
215           {
216             ++x; ++y;
217           }
218         else
219           break;
220       }
221 
222     fp[k] = y;
223     return y;
224   }
225 
226   // 'compare' here is the core myers, manber and miller algorithm.
comparejaffer_edit_calculator227   static long compare(cost_vec & costs,
228                       subarray<A> const & a, long len_a,
229                       subarray<B> const & b, long len_b,
230                       long p_lim,
231                       bool full_scan = true)
232   {
233     long const delta = len_b - len_a;
234     long lo = -(len_a+1), hi = (1+len_b);
235     if (full_scan)
236       {
237         lo = -(p_lim + 1);
238         hi = p_lim + 1 + delta;
239       }
240     work_vec fp(lo, hi);
241 
242     long p = 0;
243 
244     for (; p <= p_lim; ++p)
245       {
246 
247         // lower sweep
248         for (long k = -p; k < delta; ++k)
249           run(fp, k, a, len_a, b, len_b, costs, p);
250 
251         // upper sweep
252         for (long k = delta + p; k > delta; --k)
253           run(fp, k, a, len_a, b, len_b, costs, p);
254 
255         // middle
256         long fpval = run(fp, delta, a, len_a, b, len_b, costs, p);
257 
258         // we can bail early if not doing a full scan
259         if (!full_scan && len_b <= fpval)
260           break;
261       }
262 
263     return delta + 2*p;
264   }
265 
266   // This splits the edit graph into a top half and a bottom half, calculates
267   // the (cost of the) cheapest possible path through each point along the
268   // middle, and then splits the graph into left/right portions based on that
269   // point. It then recurses on the top left and bottom right quadrants (the
270   // shortest edit path cannot possibly go through the other two quadrants).
271   //
272   // When getting costs through the top and bottom halves, it can discard the
273   // rightmost part of the top and the leftmost part of the bottom, beyond where
274   // the edit band (diagonls -(num-deletes) and delta + num-deletes) crosses
275   // the split. Even with doing this the edit band is overstated in the
276   // calls to compare(), because while max-possible-deletes (p_lim) is correct
277   // the delta value is still larger by max-possible-deletes.
divide_and_conquerjaffer_edit_calculator278   static long divide_and_conquer(subarray<A> const & a, long start_a, long end_a,
279                                  subarray<B> const & b, long start_b, long end_b,
280                                  edit_vec & edits,
281                                  unsigned long edx,
282                                  long polarity,
283                                  long p_lim)
284   {
285     long len_b = end_b - start_b;
286     long len_a = end_a - start_a;
287     long const delta = len_b - len_a;
288     // total edit distance
289     long tcst = (2 * p_lim) + (len_b - len_a);
290     // top/bottom split point
291     long mid_a = (start_a + end_a) / 2;
292 
293     I(start_a >= 0);
294     I(start_a <= a.size());
295     I(start_b >= 0);
296     I(start_b <= b.size());
297     I(end_a >= 0);
298     I(end_a <= a.size());
299     I(end_b >= 0);
300     I(end_b <= b.size());
301 
302     cost_vec cc(len_b + 1, len_a + len_b);
303     cost_vec rr(len_b + 1, len_a + len_b);
304 
305     // get costs from the top left through each point on the top/bottom split
306     long const top_len_a = mid_a - start_a;
307     // trim off the rightmost part of b, past where the edit band crosses the split
308     long const top_end_b = min(end_b, start_b + (top_len_a + delta + p_lim + 1));
309     compare (cc,
310              a.subset(start_a, mid_a), top_len_a,
311              b.subset(start_b, top_end_b), top_end_b - start_b,
312              min(p_lim, len_a));
313 
314     // get costs from the lower right through each point on the top/bottom split
315     long const bottom_len_a = end_a - mid_a;
316     // here we trim the leftmost part of b (before reversing it)
317     long const bottom_start_b = max(start_b, end_b - (bottom_len_a + delta + p_lim + 1));
318     compare (rr,
319              a.subset(end_a, mid_a), bottom_len_a,
320              b.subset(end_b, bottom_start_b), end_b - bottom_start_b,
321              min(p_lim, len_a));
322 
323     // find the first (closest-to-center) point on the split line, which
324     // has the correct total (top + bottom) cost and is therefore on the
325     // shortest edit path
326     long b_split = mid_split(len_a, len_b, rr, cc, tcst);
327 
328     // known costs of each half of the path
329     long est_c = cc[b_split];
330     long est_r = rr[len_b - b_split];
331 
332     // recurse on the two halves
333 
334     long cost_c = diff_to_et (a, start_a, mid_a,
335                               b, start_b, start_b + b_split,
336                               edits, edx, polarity,
337                               (est_c - (b_split - (mid_a - start_a))) / 2);
338 
339     I(cost_c == est_c);
340 
341     long cost_r = diff_to_et (a, mid_a, end_a,
342                               b, start_b + b_split, end_b,
343                               edits, est_c + edx, polarity,
344                               (est_r - ((len_b - b_split) - (end_a - mid_a))) / 2);
345 
346     I(cost_r == est_r);
347 
348     return est_r + est_c;
349   }
350 
mid_splitjaffer_edit_calculator351   static long mid_split(long m, long n,
352                         cost_vec const & rr,
353                         cost_vec const & cc,
354                         long cost)
355   {
356     long cdx = 1 + n/2;
357     long rdx = n/2;
358     while (true)
359       {
360         I (rdx >= 0);
361 
362         if (cost == (cc[rdx] + rr[n-rdx]))
363             return rdx;
364         if (cost == (cc[cdx] + rr[n-cdx]))
365             return cdx;
366         --rdx;
367         ++cdx;
368       }
369   }
370 
371 
order_editsjaffer_edit_calculator372   static void order_edits(edit_vec const & edits,
373                           long sign,
374                           edit_vec & nedits)
375   {
376     nedits.clear();
377     nedits.resize(edits.size());
378     long cost = edits.size();
379 
380     if (cost == 0)
381       {
382         nedits = edits;
383         return;
384       }
385 
386     edit_vec sedits = edits;
387     sort(sedits.begin(), sedits.end());
388 
389     long idx0;
390     for (idx0 = 0; idx0 < cost && sedits[idx0] < 0; ++idx0) ;
391     long len_a = max(0L, -sedits[0]);
392     long len_b = sedits[cost-1];
393 
394     long ddx = idx0 - 1;
395     long idx = idx0;
396     long ndx = 0;
397     long adx = 0;
398     long bdx = 0;
399 
400     while (bdx < len_b || adx < len_a)
401       {
402 
403         long del = (ddx < 0) ? 0 : sedits[ddx];
404         long ins = (idx >= cost) ? 0 : sedits[idx];
405 
406         if (del < 0 && adx >= (-1 - del) &&
407             ins > 0 && bdx >= (-1 + ins))
408           {
409             nedits[ndx] = del;
410             nedits[ndx+1] = ins;
411             --ddx; ++idx; ndx += 2; ++adx; ++bdx;
412           }
413         else if (del < 0 && adx >= (-1 - del))
414           {
415             nedits[ndx] = del;
416             --ddx; ++ndx; ++adx;
417           }
418         else if (ins > 0 && bdx >= (-1 + ins))
419           {
420             nedits[ndx] = ins;
421             ++idx; ++ndx; ++bdx;
422           }
423         else
424           {
425             ++adx; ++bdx;
426           }
427       }
428   }
429 
430   // trims and calls diff_to_ez
diff_to_etjaffer_edit_calculator431   static long diff_to_et(subarray<A> const & a, long start_a, long end_a,
432                          subarray<B> const & b, long start_b, long end_b,
433                          vector<long, QA(long)> & edits,
434                          long edx,
435                          long polarity,
436                          long p_lim)
437   {
438 
439     I(start_a >= 0);
440     I(start_a <= a.size());
441     I(start_b >= 0);
442     I(start_b <= b.size());
443     I(end_a >= 0);
444     I(end_a <= a.size());
445     I(end_b >= 0);
446     I(end_b <= b.size());
447 
448     I(end_a - start_a >= p_lim);
449 
450     // last, not end
451     long new_last_a = end_a - 1;
452     long new_last_b = end_b - 1;
453     while ((start_b <= new_last_b) &&
454            (start_a <= new_last_a) &&
455            (a[new_last_a] == b[new_last_b]))
456       {
457         --new_last_a;
458         --new_last_b;
459       }
460 
461     long new_start_a = start_a;
462     long new_start_b = start_b;
463     while((new_start_b < new_last_b) &&
464           (new_start_a < new_last_a) &&
465           (a[new_start_a] == b[new_start_b]))
466       {
467         ++new_start_a;
468         ++new_start_b;
469       }
470 
471     // we've trimmed; now call diff_to_ez.
472 
473     // difference between length of (new) a and length of (new) b
474     long delta = (new_last_b - new_start_b) - (new_last_a - new_start_a);
475 
476     if (delta < 0)
477       return diff_to_ez (b, new_start_b, new_last_b+1,
478                          a, new_start_a, new_last_a+1,
479                          edits, edx, -polarity, delta + p_lim);
480     else
481       return diff_to_ez (a, new_start_a, new_last_a+1,
482                          b, new_start_b, new_last_b+1,
483                          edits, edx, polarity, p_lim);
484   }
485 
diff_to_ezjaffer_edit_calculator486   static long diff_to_ez(subarray<A> const & a, long start_a, long end_a,
487                          subarray<B> const & b, long start_b, long end_b,
488                          vector<long, QA(long)> & edits,
489                          long edx1,
490                          long polarity,
491                          long p_lim)
492   {
493 
494     I(start_a >= 0);
495     I(start_a <= a.size());
496     I(start_b >= 0);
497     I(start_b <= b.size());
498     I(end_a >= 0);
499     I(end_a <= a.size());
500     I(end_b >= 0);
501     I(end_b <= b.size());
502 
503     long len_a = end_a - start_a;
504     long len_b = end_b - start_b;
505 
506     I(len_a <= len_b);
507 
508     // easy case #1: B inserts only
509     if (p_lim == 0)
510       {
511         // A == B, no edits
512         if (len_a == len_b)
513           return 0;
514 
515         long adx = start_a;
516         long bdx = start_b;
517         long edx0 = edx1;
518 
519         while (true)
520           {
521             if (bdx >= end_b)
522               return len_b - len_a;
523 
524             if (adx >= end_a)
525               {
526                 for (long idx = bdx, edx = edx0;
527                      idx < end_b;
528                      ++idx, ++edx)
529                   edits[edx] = polarity * (idx+1);
530 
531                 return len_b - len_a;
532               }
533 
534             if (a[adx] == b[bdx])
535               {
536                 ++adx; ++bdx;
537               }
538             else
539               {
540                 edits[edx0] = polarity * (bdx+1);
541                 ++bdx; ++edx0;
542               }
543           }
544       }
545 
546     // easy case #2: delete all A, insert all B
547     else if (len_a <= p_lim)
548       {
549         I(len_a == p_lim);
550 
551         long edx0 = edx1;
552         for (long idx = start_a; idx < end_a; ++idx, ++edx0)
553           edits[edx0] = polarity * (-1 - idx);
554 
555         for (long jdx = start_b; jdx < end_b; ++jdx, ++edx0)
556           edits[edx0] = polarity * (jdx + 1);
557 
558         return len_a + len_b;
559       }
560 
561     // hard case: recurse on subproblems
562     else
563       {
564         return divide_and_conquer (a, start_a, end_a,
565                                    b, start_b, end_b,
566                                    edits, edx1, polarity, p_lim);
567       }
568   }
569 
diff_to_editsjaffer_edit_calculator570   static void diff_to_edits(subarray<A> const & a, long len_a,
571                             subarray<B> const & b, long len_b,
572                             vector<long, QA(long)> & edits)
573   {
574     I(len_a <= len_b);
575     cost_vec costs(len_a + len_b); // scratch array, ignored
576     long edit_distance = compare(costs, a, len_a, b, len_b, min(len_a, len_b), false);
577 
578     edits.clear();
579     edits.resize(edit_distance, 0);
580     long cost = diff_to_et(a, 0, len_a,
581                            b, 0, len_b,
582                            edits, 0, 1, (edit_distance - (len_b - len_a)) / 2);
583     I(cost == edit_distance);
584   }
585 
edits_to_lcsjaffer_edit_calculator586   static void edits_to_lcs (vector<long, QA(long)> const & edits,
587                             subarray<A> const a, long len_a, long len_b,
588                             LCS output)
589   {
590     long edx = 0, sdx = 0, adx = 0;
591     typedef typename iterator_traits<A>::value_type vt;
592     vector<vt, QA(vt)> lcs(((len_a + len_b) - edits.size()) / 2);
593     while (adx < len_a)
594       {
595         long edit = (edx < static_cast<long>(edits.size())) ? edits[edx] : 0;
596 
597         if (edit > 0)
598           { ++edx; }
599         else if (edit == 0)
600           { lcs[sdx++] = a[adx++]; }
601         else if (adx >= (-1 - edit))
602           { ++edx; ++adx; }
603         else
604           { lcs[sdx++] = a[adx++]; }
605       }
606 
607     copy(lcs.begin(), lcs.end(), output);
608   }
609 };
610 
611 
612 template <typename A,
613           typename B,
614           typename LCS>
_edit_script(A begin_a,A end_a,B begin_b,B end_b,vector<long,QA (long)> & edits_out,LCS ignored_out)615 void _edit_script(A begin_a, A end_a,
616                   B begin_b, B end_b,
617                   vector<long, QA(long)> & edits_out,
618                   LCS ignored_out)
619 {
620   typedef jaffer_edit_calculator<A,B,LCS> calc_t;
621   long len_a = end_a - begin_a;
622   long len_b = end_b - begin_b;
623   typename calc_t::edit_vec edits, ordered;
624 
625   typename calc_t::template subarray<A> a(begin_a, 0, len_a);
626   typename calc_t::template subarray<B> b(begin_b, 0, len_b);
627 
628   if (len_b < len_a)
629     {
630       calc_t::diff_to_edits (b, len_b, a, len_a, edits);
631       calc_t::order_edits (edits, -1, ordered);
632       for (size_t i = 0; i < ordered.size(); ++i)
633         ordered[i] *= -1;
634     }
635   else
636     {
637       calc_t::diff_to_edits (a, len_a, b, len_b, edits);
638       calc_t::order_edits (edits, 1, ordered);
639     }
640 
641   edits_out.clear();
642   edits_out.reserve(ordered.size());
643   copy(ordered.begin(), ordered.end(), back_inserter(edits_out));
644 }
645 
646 
647 template <typename A,
648           typename B,
649           typename LCS>
_longest_common_subsequence(A begin_a,A end_a,B begin_b,B end_b,LCS out)650 void _longest_common_subsequence(A begin_a, A end_a,
651                                  B begin_b, B end_b,
652                                  LCS out)
653 {
654   typedef jaffer_edit_calculator<A,B,LCS> calc_t;
655   long len_a = end_a - begin_a;
656   long len_b = end_b - begin_b;
657   typename calc_t::edit_vec edits, ordered;
658 
659   typename calc_t::template subarray<A> a(begin_a, 0, len_a);
660   typename calc_t::template subarray<B> b(begin_b, 0, len_b);
661 
662   if (len_b < len_a)
663     {
664       calc_t::diff_to_edits(b, len_b, a, len_a, edits);
665       calc_t::order_edits(edits, -1, ordered);
666       calc_t::edits_to_lcs(ordered, b, len_b, len_a, out);
667     }
668   else
669     {
670       calc_t::diff_to_edits(a, len_a, b, len_b, edits);
671       calc_t::order_edits(edits, 1, ordered);
672       calc_t::edits_to_lcs(ordered, a, len_a, len_b, out);
673     }
674 }
675 
676 
677 void
longest_common_subsequence(vector<long,QA (long)>::const_iterator begin_a,vector<long,QA (long)>::const_iterator end_a,vector<long,QA (long)>::const_iterator begin_b,vector<long,QA (long)>::const_iterator end_b,back_insert_iterator<vector<long,QA (long)>> lcs)678 longest_common_subsequence(vector<long, QA(long)>::const_iterator begin_a,
679                            vector<long, QA(long)>::const_iterator end_a,
680                            vector<long, QA(long)>::const_iterator begin_b,
681                            vector<long, QA(long)>::const_iterator end_b,
682                            back_insert_iterator< vector<long, QA(long)> > lcs)
683 {
684   _longest_common_subsequence(begin_a, end_a,
685                               begin_b, end_b,
686                               lcs);
687 }
688 
689 void
edit_script(vector<long,QA (long)>::const_iterator begin_a,vector<long,QA (long)>::const_iterator end_a,vector<long,QA (long)>::const_iterator begin_b,vector<long,QA (long)>::const_iterator end_b,vector<long,QA (long)> & edits_out)690 edit_script(vector<long, QA(long)>::const_iterator begin_a,
691             vector<long, QA(long)>::const_iterator end_a,
692             vector<long, QA(long)>::const_iterator begin_b,
693             vector<long, QA(long)>::const_iterator end_b,
694             vector<long, QA(long)> & edits_out)
695 {
696   vector<long, QA(long)> lcs;
697   _edit_script(begin_a, end_a,
698                begin_b, end_b,
699                edits_out,
700                back_inserter(lcs));
701 }
702 
703 // Local Variables:
704 // mode: C++
705 // fill-column: 76
706 // c-file-style: "gnu"
707 // indent-tabs-mode: nil
708 // End:
709 // vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:
710