1 #include <math.h>
2 #include <limits>
3 #include <mummer/sw_align.hh>
4
5 namespace mummer {
6 namespace sw_align {
7
8 //-- Characters used in creating the alignment edit matrix, DO NOT ALTER!
9 static const int DELETE = 0;
10 static const int INSERT = 1;
11 static const int MATCH = 2;
12 static const int START = 3;
13 static const int NONE = 4;
14
15 //----------------------------------------- Private Function Declarations ----//
16 static void generateDelta
17 (const DiagonalMatrix& Diag, long int FinishCt, long int FinishCDi,
18 long int N, std::vector<long int> & Delta);
19
20
21 static inline int maxScore(Score S[3]);
22
23 static inline void scoreEdit
24 (Score & curr, const long int del, const long int ins, const long int mat);
25
26
27 //------------------------------------------ Private Function Definitions ----//
_alignEngine(const char * A0,long int Astart,long int & Aend,const char * B0,long int Bstart,long int & Bend,std::vector<long int> & Delta,unsigned int m_o,DiagonalMatrix & Diag) const28 bool aligner::_alignEngine
29 (const char* A0, long int Astart, long int & Aend,
30 const char* B0, long int Bstart, long int & Bend,
31 std::vector<long int>& Delta, unsigned int m_o,
32 DiagonalMatrix& Diag) const
33
34 // A0 is a sequence such that A [1...\0]
35 // B0 is a sequence such that B [1...\0]
36 // The alignment should use bases A [Astart...Aend] (inclusive)
37 // The alignment should use bases B [Bstart...Bend] (inclusive)
38 // or [Aend...Astart] etc. if BACKWARD_SEARCH
39 // Aend must never equal Astart, same goes for Bend and Bstart
40 // Delta is an integer vector, not necessarily empty
41 // m_o is the modus operandi of the function:
42 // FORWARD_ALIGN, FORWARD_SEARCH, BACKWARD_SEARCH
43 // Returns true on success (Aend & Bend reached) or false on failure
44
45 {
46 bool TargetReached; // the target was reached
47 const char *A, *B; // the sequence pointers to be used by this func
48
49 static const long int min_score = std::numeric_limits<long>::min(); // minimum possible score
50 long int high_score = min_score; // global maximum score
51 long int xhigh_score = min_score; // non-optimal high score
52 const long int max_diff = good_score() * _break_len; // max score difference
53
54 long int CDi; // conceptual diagonal index (not relating to mem)
55 long int Dct, Di; // diagonal counter, actual diagonal index
56 long int PDi, PPDi; // previous diagonal index and prev prev diag index
57 long int Ds, PDs, PPDs; // diagonal size, prev, prev prev diagonal size where 'size' = rbound - lbound + 1
58
59 long int Dl = 2; // current conceptual diagonal length
60 long int lbound = 0; // current diagonal left(lower) node bound index
61 long int rbound = 0; // current diagonal right(upper) node bound index
62 long int FinishCt = 0; // diagonal containing the high_score
63 long int FinishCDi = 0; // conceptual index of the high_score on FinishCt
64 long int xFinishCt = 0; // non-optimal ...
65 long int xFinishCDi = 0; // non-optimal ...
66 long int N, M, L; // maximum matrix dimensions... N rows, M columns
67
68 long int tlb, trb;
69 double Dmid = .5; // diag midpoint
70 double Dband = _banding/2.0; // diag banding
71
72 int Iadj, Dadj, Madj; // insert, delete and match adjust values
73
74 #ifdef _DEBUG_VERBOSE
75 long int MaxL = 0; // biggest diagonal seen
76 long int TrimCt = 0; // counter of nodes trimmed
77 long int CalcCt = 0; // counter of nodes calculated
78 #endif
79
80 //-- Set up character pointers for the appropriate m_o
81 if(m_o & DIRECTION_BIT) {
82 A = A0 + ( Astart - 1 );
83 B = B0 + ( Bstart - 1 );
84 N = Aend - Astart + 1;
85 M = Bend - Bstart + 1;
86 } else {
87 A = A0 + ( Astart + 1 );
88 B = B0 + ( Bstart + 1 );
89 N = Astart - Aend + 1;
90 M = Bstart - Bend + 1;
91 }
92
93 //-- Initialize position 0,0 in the matrices
94 Diag.clear(); // clear the list of diagonals to make up edit matrix
95 { auto& D0 = Diag[0];
96 D0.lbound = lbound;
97 D0.rbound = rbound ++;
98 auto& I0 = D0.I[0];
99 I0.S[DELETE].value = min_score;
100 I0.S[INSERT].value = min_score;
101 I0.S[MATCH].value = 0;
102 I0.max(MATCH);
103
104 I0.S[DELETE].used = NONE;
105 I0.S[INSERT].used = NONE;
106 I0.S[MATCH].used = START;
107 }
108
109 L = N < M ? N : M;
110
111 //-- **START** of diagonal processing loop
112 //-- Calculate the rest of the diagonals until goal reached or score worsens
113 for(Dct = 1; Dct <= N + M && (Dct - FinishCt) <= _break_len && lbound <= rbound; Dct++) {
114 auto& CurD = Diag[Dct];
115 CurD.lbound = lbound;
116 CurD.rbound = rbound;
117
118 //-- malloc space for the edit char and score nodes
119 Ds = rbound - lbound + 1;
120
121 #ifdef _DEBUG_VERBOSE
122 //-- Keep count of trimmed and calculated nodes
123 CalcCt += Ds;
124 TrimCt += Dl - Ds;
125 if(Ds > MaxL )
126 MaxL = Ds;
127 #endif
128
129 //-- Set diagonal index adjustment values
130 if(Dct <= N) {
131 Iadj = 0;
132 Madj = -1;
133 } else {
134 Iadj = 1;
135 Madj = Dct == N + 1 ? 0 : 1;
136 }
137 Dadj = Iadj - 1;
138
139 //-- Set parent diagonal values
140 const auto& PrevD = Diag[Dct - 1] ; // previous diagonal
141 PDs = PrevD.rbound - PrevD.lbound + 1;
142 PDi = lbound + Dadj;
143 PDi = PDi - PrevD.lbound;
144
145 //-- Set grandparent diagonal values
146 const long PPDct = Dct - 2; // prev prev diagonal
147 const Diagonal& PPrevD = Diag[std::max((long)0, PPDct)]; // if PPDct < 0, not PPrevD is not used
148 if(PPDct >= 0) {
149 PPDs = PPrevD.rbound - PPrevD.lbound + 1;
150 PPDi = lbound + Madj;
151 PPDi = PPDi - PPrevD.lbound;
152 } else
153 PPDi = PPDs = 0;
154
155 //-- If forced alignment, don't keep track of global max
156 if(m_o & FORCED_BIT )
157 high_score = min_score;
158
159 //-- **START** of internal node scoring loop
160 //-- Calculate scores for every node (within bounds) for diagonal Dct
161 for(CDi = lbound; CDi <= rbound; CDi ++) {
162 //-- Set the index (in memory) of current node and clear score
163 Di = CDi - CurD.lbound;
164 auto& CurDi = CurD.I[Di];
165
166 //-- Calculate DELETE score
167 if(PDi >= 0 && PDi < PDs ) {
168 const auto& PrevDi = PrevD.I[PDi];
169 scoreEdit(CurDi.S[DELETE],
170 PrevDi.S[DELETE].value + (PrevDi.S[DELETE].used == NONE ? 0 : CONT_GAP_SCORE [_matrix_type]),
171 PrevDi.S[INSERT].value + (PrevDi.S[INSERT].used == NONE ? 0 : OPEN_GAP_SCORE [_matrix_type]),
172 PrevDi.S[MATCH].value + (PrevDi.S[MATCH].used == NONE ? 0 : OPEN_GAP_SCORE [_matrix_type]));
173 } else {
174 CurDi.S[DELETE].value = min_score;
175 CurDi.S[DELETE].used = NONE;
176 }
177
178 PDi ++;
179
180 //-- Calculate INSERT score
181 if(PDi >= 0 && PDi < PDs ) {
182 const auto& PrevDi = PrevD.I[PDi];
183 scoreEdit(CurDi.S[INSERT],
184 PrevDi.S[DELETE].value + (PrevDi.S[DELETE].used == NONE ? 0 : OPEN_GAP_SCORE [_matrix_type]),
185 PrevDi.S[INSERT].value + (PrevDi.S[INSERT].used == NONE ? 0 : CONT_GAP_SCORE [_matrix_type]),
186 PrevDi.S[MATCH].value + (PrevDi.S[MATCH].used == NONE ? 0 : OPEN_GAP_SCORE [_matrix_type]));
187 } else {
188 CurDi.S[INSERT].value = min_score;
189 CurDi.S[INSERT].used = NONE;
190 }
191
192 //-- Calculate MATCH/MIS-MATCH score
193 if(PPDi >= 0 && PPDi < PPDs) {
194 const auto& PPrevDi = PPrevD.I[PPDi];
195 scoreEdit(CurDi.S[MATCH],
196 PPrevDi.S[DELETE].value,
197 PPrevDi.S[INSERT].value,
198 PPrevDi.S[MATCH].value);
199 CurDi.S[MATCH].value += scoreMatch(CurD, Dct, CDi, A, B, N, m_o);
200 } else {
201 CurDi.S[MATCH].value = min_score;
202 CurDi.S[MATCH].used = NONE;
203 }
204
205 PPDi ++;
206
207 CurDi.max(maxScore (CurDi.S));
208
209 //-- Reset high_score if new global max was found
210 if(CurDi.max().value >= high_score) {
211 high_score = CurDi.max().value;
212 FinishCt = Dct;
213 FinishCDi = CDi;
214 }
215 }
216 //-- **END** of internal node scoring loop
217
218
219 //-- Calculate max non-optimal score
220 if(m_o & SEQEND_BIT && Dct >= L) {
221 if(L == N) {
222 if(lbound == 0) {
223 if(CurD.I[0].max().value >= xhigh_score) {
224 xhigh_score = CurD.I[0].max().value;
225 xFinishCt = Dct;
226 xFinishCDi = 0;
227 }
228 }
229 } else { // L == M
230 if(rbound == M) {
231 if(CurD.I[M-CurD.lbound].max().value >= xhigh_score) {
232 xhigh_score = CurD.I[M-CurD.lbound].max().value;
233 xFinishCt = Dct;
234 xFinishCDi = M;
235 }
236 }
237 }
238 }
239
240
241 //-- If in extender modus operandi, free soon to be greatgrandparent diag
242 // if(m_o & SEARCH_BIT && Dct > 1 )
243 // free ( PPrevD->I );
244
245
246 //-- Trim hopeless diagonal nodes
247 const auto CurDi_start = CurD.I.cbegin();
248 const auto CurDi_end = CurDi_start + Ds;
249 for(auto it = CurDi_start ; it < CurDi_end; ++it) {
250 if(high_score - it->max().value > max_diff )
251 lbound ++;
252 else
253 break;
254 }
255 for(auto it = CurDi_end - 1; it >= CurDi_start; --it) {
256 if(high_score - it->max().value > max_diff )
257 rbound --;
258 else
259 break;
260 }
261
262 //-- Grow new diagonal and reset boundaries
263 if(Dct < N && Dct < M) {
264 Dl ++; rbound ++; Dmid = (Dct+1)/2.0;
265 } else if(Dct >= N && Dct >= M) {
266 Dl --; lbound --; Dmid = N - (Dct+1)/2.0;
267 } else if(Dct >= N) {
268 lbound --; Dmid = N - (Dct+1)/2.0;
269 } else {
270 rbound ++; Dmid = (Dct+1)/2.0;
271 }
272
273 //-- Trim at hard band
274 if(Dband > 0) {
275 tlb = (long int)ceil(Dmid - Dband);
276 if(lbound < tlb )
277 lbound = tlb;
278 trb = (long int)floor(Dmid + Dband);
279 if(rbound > trb )
280 rbound = trb;
281 }
282
283 if(lbound < 0 )
284 lbound = 0;
285 if(rbound >= Dl )
286 rbound = Dl - 1;
287 }
288 //-- **END** of diagonal processing loop
289 Dct --;
290
291 //-- Check if the target was reached
292 // If OPTIMAL, backtrack to last high_score to maximize alignment score
293 TargetReached = false;
294 if(Dct == N + M) {
295 if(~m_o & OPTIMAL_BIT || m_o & SEQEND_BIT) {
296 TargetReached = true;
297 FinishCt = N + M;
298 FinishCDi = 0;
299 } else if(FinishCt == Dct )
300 TargetReached = true;
301 } else if(m_o & SEQEND_BIT && xFinishCt != 0) {
302 //-- non-optimal, extend alignment to end of shortest seq if possible
303 FinishCt = xFinishCt;
304 FinishCDi = xFinishCDi;
305 }
306
307 //-- Set A/Bend to finish positions
308 long int Aadj = FinishCt <= N ? FinishCt - FinishCDi - 1 : N - FinishCDi - 1;
309 long int Badj = FinishCt <= N ? FinishCDi - 1 : FinishCt - N + FinishCDi - 1;
310 if(~m_o & DIRECTION_BIT) {
311 Aadj *= -1;
312 Badj *= -1;
313 }
314 Aend = Astart + Aadj;
315 Bend = Bstart + Badj;
316
317 #ifdef _DEBUG_VERBOSE
318 assert (FinishCt > 1);
319
320 //-- Ouput calculation statistics
321 if(TargetReached )
322 fprintf(stderr,"Finish score = %ld : %ld,%ld\n",
323 Diag[FinishCt].I[0].max().value, N, M);
324 else
325 fprintf(stderr,"High score = %ld : %ld,%ld\n", high_score,
326 labs(Aadj) + 1, labs(Badj) + 1);
327 fprintf(stderr, "%ld nodes calculated, %ld nodes trimmed\n", CalcCt, TrimCt);
328 if(m_o & DIRECTION_BIT )
329 fprintf(stderr, "%ld bytes used\n",
330 (long int)sizeof(Diagonal) * Dct + (long int)sizeof(Node) * CalcCt);
331 else
332 fprintf(stderr, "%ld bytes used\n",
333 ((long int)sizeof(Diagonal) + (long int)sizeof(Node) * MaxL) * 2);
334 #endif
335
336 //-- If in forward alignment m_o, create the Delta information
337 if(~m_o & SEARCH_BIT )
338 generateDelta(Diag, FinishCt, FinishCDi, N, Delta);
339
340 return TargetReached;
341 }
342
scoreMatch(const Diagonal & Diag,long int Dct,long int CDi,const char * A,const char * B,long int N,unsigned int m_o) const343 long int aligner::scoreMatch
344 (const Diagonal& Diag, long int Dct, long int CDi,
345 const char * A, const char * B, long int N, unsigned int m_o) const
346
347 // Diag is the single diagonal that contains the node to be scored
348 // Dct is Diag's diagonal index in the edit matrix
349 // CDi is the conceptual node to be scored in Diag
350 // A and B are the alignment sequences
351 // N is the alignment target index in A
352 // m_o is the modus operandi of the alignment:
353 // FORWARD_ALIGN, FORWARD_SEARCH, BACKWARD_SEARCH
354
355 {
356 int Dir;
357 char Ac, Bc;
358
359 //-- 1 for forward, -1 for reverse
360 Dir = m_o & DIRECTION_BIT ? 1 : -1;
361
362 //-- Locate the characters that need to be compared
363 if ( Dct <= N )
364 {
365 Ac = *( A + ( (Dct - CDi) * Dir ) );
366 Bc = *( B + ( (CDi) * Dir ) );
367 }
368 else
369 {
370 Ac = *( A + ( (N - CDi) * Dir ) );
371 Bc = *( B + ( (Dct - N + CDi) * Dir ) );
372 }
373
374 if ( ! isalpha(Ac) )
375 Ac = STOP_CHAR;
376 if ( ! isalpha(Bc) )
377 Bc = STOP_CHAR;
378
379 return MATCH_SCORE [_matrix_type] [toupper(Ac) - 'A'] [toupper(Bc) - 'A'];
380 }
381
382
383
generateDelta(const DiagonalMatrix & Diag,long int FinishCt,long int FinishCDi,long int N,std::vector<long int> & Delta)384 static void generateDelta
385 (const DiagonalMatrix& Diag, long int FinishCt, long int FinishCDi,
386 long int N, std::vector<long int> & Delta)
387
388 // Diag is the list of diagonals that compose the edit matrix
389 // FinishCt is the diagonal that contains the finishing node
390 // FinishCDi is the conceptual finishing node, in FinishCt, for the align
391 // N & M are the target positions for the alignment
392 // Delta is the vector in which to store the alignment data, new data
393 // will be appended onto any existing data.
394 // NOTE: there will be no zero at the end of the data, end of data
395 // is signaled by the end of the vector
396 // Return is void
397
398 {
399 //-- Function pre-conditions
400 #ifdef _DEBUG_ASSERT
401 assert ( Diag != NULL );
402 assert ( FinishCt > 1 );
403 #endif
404
405 long int Count; // delta counter
406 long int Dct = FinishCt; // diagonal index
407 long int CDi = FinishCDi; // conceptual node index
408 long int Di = 0; // actual node index
409 long int Pi = 0; // path index
410 long int PSize = 100; // capacity of the path space
411 char * Reverse_Path; // path space
412
413 Score curr_score;
414 int edit;
415
416 //-- malloc space for the edit path
417 Reverse_Path = (char *) Safe_malloc ( PSize * sizeof(char) );
418
419 //-- Which Score index is the maximum value in? Store in edit
420 Di = CDi - Diag[Dct] . lbound;
421 edit = Diag[Dct].I[Di].edit();
422
423
424 //-- Walk the path backwards through the edit space
425 while ( Dct >= 0 ) {
426 //-- remalloc path space if neccessary
427 if ( Pi >= PSize ) {
428 PSize *= 2;
429 Reverse_Path = (char *) Safe_realloc( Reverse_Path, sizeof(char) * PSize );
430 }
431
432 Di = CDi - Diag[Dct].lbound;
433 curr_score = Diag[Dct].I[Di].S[edit];
434
435 Reverse_Path[Pi ++] = edit;
436 switch ( edit ) {
437 case DELETE :
438 CDi = Dct -- <= N ? CDi - 1 : CDi;
439 break;
440 case INSERT :
441 CDi = Dct -- <= N ? CDi : CDi + 1;
442 break;
443 case MATCH :
444 CDi = Dct <= N ? CDi - 1 : ( Dct == N + 1 ? CDi : CDi + 1 );
445 Dct -= 2;
446 break;
447 case START :
448 Dct = -1;
449 break;
450 default :
451 fprintf(stderr,"\nERROR: Invalid edit matrix entry,\n"
452 " please file a bug report\n");
453 exit ( EXIT_FAILURE );
454 }
455
456 edit = curr_score.used;
457 }
458
459 //-- Generate the delta information
460 Count = 1;
461 for (Pi -= 2; Pi >= 0; Pi --) {
462 switch ( Reverse_Path[Pi] ) {
463 case DELETE :
464 Delta.push_back(-Count);
465 Count = 1;
466 break;
467 case INSERT :
468 Delta.push_back(Count);
469 Count = 1;
470 break;
471 case MATCH :
472 Count ++;
473 break;
474 case START :
475 break;
476 default :
477 fprintf(stderr,"\nERROR: Invalid path matrix entry,\n"
478 " please file a bug report\n");
479 exit ( EXIT_FAILURE );
480 }
481 }
482
483 free (Reverse_Path);
484
485 return;
486 }
487
488
489
490
maxScore(Score S[3])491 static inline int maxScore(Score S[3])
492
493 // Return a pointer to the maximum score in the score array
494
495 {
496 if(S[DELETE].value > S[INSERT].value)
497 return S[DELETE].value > S[MATCH].value ? DELETE : MATCH;
498 else
499 return S[INSERT].value > S[MATCH].value ? INSERT : MATCH;
500 }
501
502
503
504
scoreEdit(Score & curr,const long int del,const long int ins,const long int mat)505 static inline void scoreEdit
506 (Score & curr, const long int del, const long int ins, const long int mat)
507
508 // Assign current edit a maximal score using either del, ins or mat
509
510 {
511 if ( del > ins ) {
512 if ( del > mat ) {
513 curr.value = del;
514 curr.used = DELETE;
515 } else {
516 curr.value = mat;
517 curr.used = MATCH;
518 }
519 } else if ( ins > mat ) {
520 curr.value = ins;
521 curr.used = INSERT;
522 } else {
523 curr.value = mat;
524 curr.used = MATCH;
525 }
526 }
527
528 } // namespace sw_align
529 } // namespace mummer
530