1 #include <math.h>
2 #include <limits>
3 #include <mummer/sw_align.hh>
4 
5 namespace mummer {
6 namespace sw_align {
7 
8 //-- Characters used in creating the alignment edit matrix, DO NOT ALTER!
9 static const int DELETE = 0;
10 static const int INSERT = 1;
11 static const int MATCH  = 2;
12 static const int START  = 3;
13 static const int NONE   = 4;
14 
15 //----------------------------------------- Private Function Declarations ----//
16 static void generateDelta
17      (const DiagonalMatrix& Diag, long int FinishCt, long int FinishCDi,
18       long int N, std::vector<long int> & Delta);
19 
20 
21 static inline int maxScore(Score S[3]);
22 
23 static inline void scoreEdit
24      (Score & curr, const long int del, const long int ins, const long int mat);
25 
26 
27 //------------------------------------------ Private Function Definitions ----//
_alignEngine(const char * A0,long int Astart,long int & Aend,const char * B0,long int Bstart,long int & Bend,std::vector<long int> & Delta,unsigned int m_o,DiagonalMatrix & Diag) const28 bool aligner::_alignEngine
29      (const char* A0, long int Astart, long int & Aend,
30       const char* B0, long int Bstart, long int & Bend,
31       std::vector<long int>& Delta, unsigned int m_o,
32       DiagonalMatrix& Diag) const
33 
34      //  A0 is a sequence such that A [1...\0]
35      //  B0 is a sequence such that B [1...\0]
36      //  The alignment should use bases A [Astart...Aend] (inclusive)
37      //  The alignment should use bases B [Bstart...Bend] (inclusive)
38      //       or [Aend...Astart] etc. if BACKWARD_SEARCH
39      //       Aend must never equal Astart, same goes for Bend and Bstart
40      //  Delta is an integer vector, not necessarily empty
41      //  m_o is the modus operandi of the function:
42      //      FORWARD_ALIGN, FORWARD_SEARCH, BACKWARD_SEARCH
43      //  Returns true on success (Aend & Bend reached) or false on failure
44 
45 {
46   bool        TargetReached;    // the target was reached
47   const char *A, *B;            // the sequence pointers to be used by this func
48 
49   static const long int min_score   = std::numeric_limits<long>::min(); // minimum possible score
50   long int              high_score  = min_score; // global maximum score
51   long int              xhigh_score = min_score; // non-optimal high score
52   const long int        max_diff    = good_score() * _break_len; // max score difference
53 
54   long int CDi;                 // conceptual diagonal index (not relating to mem)
55   long int Dct, Di;             // diagonal counter, actual diagonal index
56   long int PDi, PPDi;           // previous diagonal index and prev prev diag index
57   long int Ds, PDs, PPDs;       // diagonal size, prev, prev prev diagonal size where 'size' = rbound - lbound + 1
58 
59   long int Dl         = 2;      // current conceptual diagonal length
60   long int lbound     = 0;      // current diagonal left(lower) node bound index
61   long int rbound     = 0;      // current diagonal right(upper) node bound index
62   long int FinishCt   = 0;      // diagonal containing the high_score
63   long int FinishCDi  = 0;      // conceptual index of the high_score on FinishCt
64   long int xFinishCt  = 0;      // non-optimal ...
65   long int xFinishCDi = 0;      // non-optimal ...
66   long int N, M, L;             // maximum matrix dimensions... N rows, M columns
67 
68   long int tlb, trb;
69   double   Dmid  = .5;          // diag midpoint
70   double   Dband = _banding/2.0; // diag banding
71 
72   int Iadj, Dadj, Madj;         // insert, delete and match adjust values
73 
74 #ifdef _DEBUG_VERBOSE
75   long int MaxL = 0;             // biggest diagonal seen
76   long int TrimCt = 0;           // counter of nodes trimmed
77   long int CalcCt = 0;           // counter of nodes calculated
78 #endif
79 
80   //-- Set up character pointers for the appropriate m_o
81   if(m_o & DIRECTION_BIT) {
82     A = A0 + ( Astart - 1 );
83     B = B0 + ( Bstart - 1 );
84     N = Aend - Astart + 1;
85     M = Bend - Bstart + 1;
86   } else {
87     A = A0 + ( Astart + 1 );
88     B = B0 + ( Bstart + 1 );
89     N = Astart - Aend + 1;
90     M = Bstart - Bend + 1;
91   }
92 
93   //-- Initialize position 0,0 in the matrices
94   Diag.clear();   // clear the list of diagonals to make up edit matrix
95   { auto& D0 = Diag[0];
96     D0.lbound = lbound;
97     D0.rbound = rbound ++;
98     auto& I0 = D0.I[0];
99     I0.S[DELETE].value = min_score;
100     I0.S[INSERT].value = min_score;
101     I0.S[MATCH].value  = 0;
102     I0.max(MATCH);
103 
104     I0.S[DELETE].used = NONE;
105     I0.S[INSERT].used = NONE;
106     I0.S[MATCH].used  = START;
107   }
108 
109   L = N < M ? N : M;
110 
111   //-- **START** of diagonal processing loop
112   //-- Calculate the rest of the diagonals until goal reached or score worsens
113   for(Dct = 1; Dct <= N + M  && (Dct - FinishCt) <= _break_len  && lbound <= rbound; Dct++) {
114     auto& CurD = Diag[Dct];
115     CurD.lbound = lbound;
116     CurD.rbound = rbound;
117 
118     //-- malloc space for the edit char and score nodes
119     Ds = rbound - lbound + 1;
120 
121 #ifdef _DEBUG_VERBOSE
122     //-- Keep count of trimmed and calculated nodes
123     CalcCt += Ds;
124     TrimCt += Dl - Ds;
125     if(Ds > MaxL )
126       MaxL = Ds;
127 #endif
128 
129     //-- Set diagonal index adjustment values
130     if(Dct <= N) {
131       Iadj = 0;
132       Madj = -1;
133     } else {
134       Iadj = 1;
135       Madj = Dct == N + 1 ? 0 : 1;
136     }
137     Dadj = Iadj - 1;
138 
139     //-- Set parent diagonal values
140     const auto& PrevD = Diag[Dct - 1] ; // previous diagonal
141     PDs               = PrevD.rbound - PrevD.lbound + 1;
142     PDi               = lbound + Dadj;
143     PDi               = PDi - PrevD.lbound;
144 
145     //-- Set grandparent diagonal values
146     const long PPDct = Dct - 2; //  prev prev diagonal
147     const Diagonal& PPrevD = Diag[std::max((long)0, PPDct)]; // if PPDct < 0, not PPrevD is not used
148     if(PPDct >= 0) {
149       PPDs   = PPrevD.rbound - PPrevD.lbound + 1;
150       PPDi   = lbound + Madj;
151       PPDi   = PPDi - PPrevD.lbound;
152     } else
153       PPDi = PPDs = 0;
154 
155     //-- If forced alignment, don't keep track of global max
156     if(m_o & FORCED_BIT )
157       high_score = min_score;
158 
159     //-- **START** of internal node scoring loop
160     //-- Calculate scores for every node (within bounds) for diagonal Dct
161     for(CDi = lbound; CDi <= rbound; CDi ++) {
162       //-- Set the index (in memory) of current node and clear score
163       Di = CDi - CurD.lbound;
164       auto& CurDi = CurD.I[Di];
165 
166       //-- Calculate DELETE score
167       if(PDi >= 0  &&  PDi < PDs ) {
168         const auto& PrevDi = PrevD.I[PDi];
169         scoreEdit(CurDi.S[DELETE],
170                   PrevDi.S[DELETE].value + (PrevDi.S[DELETE].used == NONE ? 0 : CONT_GAP_SCORE [_matrix_type]),
171                   PrevDi.S[INSERT].value + (PrevDi.S[INSERT].used == NONE ? 0 : OPEN_GAP_SCORE [_matrix_type]),
172                   PrevDi.S[MATCH].value  + (PrevDi.S[MATCH].used == NONE  ? 0 : OPEN_GAP_SCORE [_matrix_type]));
173       } else {
174         CurDi.S[DELETE].value = min_score;
175         CurDi.S[DELETE].used  = NONE;
176       }
177 
178       PDi ++;
179 
180       //-- Calculate INSERT score
181       if(PDi >= 0  &&  PDi < PDs ) {
182         const auto& PrevDi = PrevD.I[PDi];
183         scoreEdit(CurDi.S[INSERT],
184                   PrevDi.S[DELETE].value + (PrevDi.S[DELETE].used == NONE ? 0 : OPEN_GAP_SCORE [_matrix_type]),
185                   PrevDi.S[INSERT].value + (PrevDi.S[INSERT].used == NONE ? 0 : CONT_GAP_SCORE [_matrix_type]),
186                   PrevDi.S[MATCH].value  + (PrevDi.S[MATCH].used == NONE  ? 0 : OPEN_GAP_SCORE [_matrix_type]));
187       } else {
188         CurDi.S[INSERT].value = min_score;
189         CurDi.S[INSERT].used  = NONE;
190       }
191 
192       //-- Calculate MATCH/MIS-MATCH score
193       if(PPDi >= 0  &&  PPDi < PPDs) {
194         const auto& PPrevDi = PPrevD.I[PPDi];
195         scoreEdit(CurDi.S[MATCH],
196                   PPrevDi.S[DELETE].value,
197                   PPrevDi.S[INSERT].value,
198                   PPrevDi.S[MATCH].value);
199         CurDi.S[MATCH].value += scoreMatch(CurD, Dct, CDi, A, B, N, m_o);
200       } else {
201         CurDi.S[MATCH].value = min_score;
202         CurDi.S[MATCH].used  = NONE;
203       }
204 
205       PPDi ++;
206 
207       CurDi.max(maxScore (CurDi.S));
208 
209       //-- Reset high_score if new global max was found
210       if(CurDi.max().value >= high_score) {
211         high_score = CurDi.max().value;
212         FinishCt   = Dct;
213         FinishCDi  = CDi;
214       }
215     }
216     //-- **END** of internal node scoring loop
217 
218 
219     //-- Calculate max non-optimal score
220     if(m_o & SEQEND_BIT  &&  Dct >= L) {
221       if(L == N) {
222         if(lbound == 0) {
223           if(CurD.I[0].max().value >= xhigh_score) {
224             xhigh_score = CurD.I[0].max().value;
225             xFinishCt   = Dct;
226             xFinishCDi  = 0;
227           }
228         }
229       } else  { // L == M
230         if(rbound == M) {
231           if(CurD.I[M-CurD.lbound].max().value >= xhigh_score) {
232             xhigh_score = CurD.I[M-CurD.lbound].max().value;
233             xFinishCt   = Dct;
234             xFinishCDi  = M;
235           }
236         }
237       }
238     }
239 
240 
241     //-- If in extender modus operandi, free soon to be greatgrandparent diag
242     // if(m_o & SEARCH_BIT  &&  Dct > 1 )
243     //   free ( PPrevD->I );
244 
245 
246     //-- Trim hopeless diagonal nodes
247     const auto CurDi_start = CurD.I.cbegin();
248     const auto CurDi_end   = CurDi_start + Ds;
249     for(auto it = CurDi_start ; it < CurDi_end; ++it) {
250       if(high_score - it->max().value > max_diff )
251         lbound ++;
252       else
253         break;
254     }
255     for(auto it = CurDi_end - 1; it >= CurDi_start; --it) {
256       if(high_score - it->max().value > max_diff )
257         rbound --;
258       else
259         break;
260     }
261 
262     //-- Grow new diagonal and reset boundaries
263     if(Dct < N && Dct < M) {
264       Dl ++; rbound ++; Dmid = (Dct+1)/2.0;
265     } else if(Dct >= N && Dct >= M) {
266       Dl --; lbound --; Dmid = N - (Dct+1)/2.0;
267     } else if(Dct >= N) {
268       lbound --; Dmid = N - (Dct+1)/2.0;
269     } else {
270       rbound ++; Dmid = (Dct+1)/2.0;
271     }
272 
273     //-- Trim at hard band
274     if(Dband > 0) {
275       tlb = (long int)ceil(Dmid - Dband);
276       if(lbound < tlb )
277         lbound = tlb;
278       trb = (long int)floor(Dmid + Dband);
279       if(rbound > trb )
280         rbound = trb;
281     }
282 
283     if(lbound < 0 )
284       lbound = 0;
285     if(rbound >= Dl )
286       rbound = Dl - 1;
287   }
288   //-- **END** of diagonal processing loop
289   Dct --;
290 
291   //-- Check if the target was reached
292   //   If OPTIMAL, backtrack to last high_score to maximize alignment score
293   TargetReached = false;
294   if(Dct == N + M) {
295     if(~m_o & OPTIMAL_BIT || m_o & SEQEND_BIT) {
296       TargetReached = true;
297       FinishCt      = N + M;
298       FinishCDi     = 0;
299     } else if(FinishCt == Dct )
300       TargetReached = true;
301   } else if(m_o & SEQEND_BIT  &&  xFinishCt != 0) {
302     //-- non-optimal, extend alignment to end of shortest seq if possible
303     FinishCt  = xFinishCt;
304     FinishCDi = xFinishCDi;
305   }
306 
307   //-- Set A/Bend to finish positions
308   long int Aadj = FinishCt <= N ? FinishCt - FinishCDi - 1 : N - FinishCDi - 1;
309   long int Badj = FinishCt <= N ? FinishCDi - 1 : FinishCt - N + FinishCDi - 1;
310   if(~m_o & DIRECTION_BIT) {
311     Aadj *= -1;
312     Badj *= -1;
313   }
314   Aend = Astart + Aadj;
315   Bend = Bstart + Badj;
316 
317 #ifdef _DEBUG_VERBOSE
318   assert (FinishCt > 1);
319 
320   //-- Ouput calculation statistics
321   if(TargetReached )
322     fprintf(stderr,"Finish score = %ld : %ld,%ld\n",
323 	    Diag[FinishCt].I[0].max().value, N, M);
324   else
325     fprintf(stderr,"High score = %ld : %ld,%ld\n", high_score,
326 	    labs(Aadj) + 1, labs(Badj) + 1);
327   fprintf(stderr, "%ld nodes calculated, %ld nodes trimmed\n", CalcCt, TrimCt);
328   if(m_o & DIRECTION_BIT )
329     fprintf(stderr, "%ld bytes used\n",
330 	    (long int)sizeof(Diagonal) * Dct + (long int)sizeof(Node) * CalcCt);
331   else
332     fprintf(stderr, "%ld bytes used\n",
333 	    ((long int)sizeof(Diagonal) + (long int)sizeof(Node) * MaxL) * 2);
334 #endif
335 
336   //-- If in forward alignment m_o, create the Delta information
337   if(~m_o & SEARCH_BIT )
338     generateDelta(Diag, FinishCt, FinishCDi, N, Delta);
339 
340   return TargetReached;
341 }
342 
scoreMatch(const Diagonal & Diag,long int Dct,long int CDi,const char * A,const char * B,long int N,unsigned int m_o) const343 long int aligner::scoreMatch
344      (const Diagonal& Diag, long int Dct, long int CDi,
345       const char * A, const char * B, long int N, unsigned int m_o) const
346 
347      //  Diag is the single diagonal that contains the node to be scored
348      //  Dct is Diag's diagonal index in the edit matrix
349      //  CDi is the conceptual node to be scored in Diag
350      //  A and B are the alignment sequences
351      //  N is the alignment target index in A
352      //  m_o is the modus operandi of the alignment:
353      //      FORWARD_ALIGN, FORWARD_SEARCH, BACKWARD_SEARCH
354 
355 {
356   int Dir;
357   char Ac, Bc;
358 
359   //-- 1 for forward, -1 for reverse
360   Dir = m_o & DIRECTION_BIT ? 1 : -1;
361 
362   //-- Locate the characters that need to be compared
363   if ( Dct <= N )
364     {
365       Ac = *( A + ( (Dct - CDi) * Dir ) );
366       Bc = *( B + ( (CDi) * Dir ) );
367     }
368   else
369     {
370       Ac = *( A + ( (N - CDi) * Dir ) );
371       Bc = *( B + ( (Dct - N + CDi) * Dir ) );
372     }
373 
374   if ( ! isalpha(Ac) )
375     Ac = STOP_CHAR;
376   if ( ! isalpha(Bc) )
377     Bc = STOP_CHAR;
378 
379   return MATCH_SCORE [_matrix_type] [toupper(Ac) - 'A'] [toupper(Bc) - 'A'];
380 }
381 
382 
383 
generateDelta(const DiagonalMatrix & Diag,long int FinishCt,long int FinishCDi,long int N,std::vector<long int> & Delta)384 static void generateDelta
385      (const DiagonalMatrix& Diag, long int FinishCt, long int FinishCDi,
386       long int N, std::vector<long int> & Delta)
387 
388      //  Diag is the list of diagonals that compose the edit matrix
389      //  FinishCt is the diagonal that contains the finishing node
390      //  FinishCDi is the conceptual finishing node, in FinishCt, for the align
391      //  N & M are the target positions for the alignment
392      //  Delta is the vector in which to store the alignment data, new data
393      //      will be appended onto any existing data.
394      //  NOTE: there will be no zero at the end of the data, end of data
395      //        is signaled by the end of the vector
396      //  Return is void
397 
398 {
399   //-- Function pre-conditions
400 #ifdef _DEBUG_ASSERT
401   assert ( Diag != NULL );
402   assert ( FinishCt > 1 );
403 #endif
404 
405   long int Count;                // delta counter
406   long int Dct = FinishCt;  // diagonal index
407   long int CDi = FinishCDi; // conceptual node index
408   long int Di = 0;          // actual node index
409   long int Pi = 0;          // path index
410   long int PSize = 100;     // capacity of the path space
411   char * Reverse_Path;       // path space
412 
413   Score curr_score;
414   int edit;
415 
416   //-- malloc space for the edit path
417   Reverse_Path = (char *) Safe_malloc ( PSize * sizeof(char) );
418 
419   //-- Which Score index is the maximum value in? Store in edit
420   Di = CDi - Diag[Dct] . lbound;
421   edit = Diag[Dct].I[Di].edit();
422 
423 
424   //-- Walk the path backwards through the edit space
425   while ( Dct >= 0 ) {
426     //-- remalloc path space if neccessary
427     if ( Pi >= PSize ) {
428       PSize *= 2;
429       Reverse_Path = (char *) Safe_realloc( Reverse_Path, sizeof(char) * PSize );
430     }
431 
432     Di = CDi - Diag[Dct].lbound;
433     curr_score = Diag[Dct].I[Di].S[edit];
434 
435     Reverse_Path[Pi ++] = edit;
436     switch ( edit ) {
437     case DELETE :
438       CDi = Dct -- <= N ? CDi - 1 : CDi;
439       break;
440     case INSERT :
441       CDi = Dct -- <= N ? CDi : CDi + 1;
442       break;
443     case MATCH :
444       CDi = Dct <= N ? CDi - 1 : ( Dct == N + 1 ? CDi : CDi + 1 );
445       Dct -= 2;
446       break;
447     case START :
448       Dct = -1;
449       break;
450     default :
451       fprintf(stderr,"\nERROR: Invalid edit matrix entry,\n"
452               "       please file a bug report\n");
453       exit ( EXIT_FAILURE );
454     }
455 
456     edit = curr_score.used;
457   }
458 
459   //-- Generate the delta information
460   Count = 1;
461   for (Pi -= 2; Pi >= 0; Pi --) {
462     switch ( Reverse_Path[Pi] ) {
463     case DELETE :
464       Delta.push_back(-Count);
465       Count = 1;
466       break;
467     case INSERT :
468       Delta.push_back(Count);
469       Count = 1;
470       break;
471     case MATCH :
472       Count ++;
473       break;
474     case START :
475       break;
476     default :
477       fprintf(stderr,"\nERROR: Invalid path matrix entry,\n"
478               "       please file a bug report\n");
479       exit ( EXIT_FAILURE );
480     }
481   }
482 
483   free (Reverse_Path);
484 
485   return;
486 }
487 
488 
489 
490 
maxScore(Score S[3])491 static inline int maxScore(Score S[3])
492 
493      //  Return a pointer to the maximum score in the score array
494 
495 {
496   if(S[DELETE].value > S[INSERT].value)
497     return S[DELETE].value > S[MATCH].value ? DELETE : MATCH;
498   else
499     return S[INSERT].value > S[MATCH].value ? INSERT : MATCH;
500 }
501 
502 
503 
504 
scoreEdit(Score & curr,const long int del,const long int ins,const long int mat)505 static inline void scoreEdit
506      (Score & curr, const long int del, const long int ins, const long int mat)
507 
508      //  Assign current edit a maximal score using either del, ins or mat
509 
510 {
511   if ( del > ins ) {
512     if ( del > mat ) {
513       curr.value = del;
514       curr.used = DELETE;
515     } else {
516       curr.value = mat;
517       curr.used = MATCH;
518     }
519   } else if ( ins > mat ) {
520     curr.value = ins;
521     curr.used = INSERT;
522   }  else {
523     curr.value = mat;
524     curr.used = MATCH;
525   }
526 }
527 
528 } // namespace sw_align
529 } // namespace mummer
530