1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "runtime.H"
19
20 #include "system.H"
21 #include "sequence.H"
22
23 #include "sqStore.H"
24 #include "sqCache.H"
25 #include "ovStore.H"
26
27 #include "edlib.H"
28
29 #include "overlapAlign-globalData.H"
30 #include "overlapAlign-threadData.H"
31 #include "overlapAlign-computation.H"
32
33
34
35 double localMatch = 1;
36 double localMismatch = -1;
37 double localGap = -1;
38
39 double
getScore(EdlibAlignResult result,int32 pp)40 getScore(EdlibAlignResult result, int32 pp) {
41
42 if ((pp < 0) ||
43 (pp >= result.alignmentLength))
44 return(localMatch);
45
46 switch (result.alignment[pp]) {
47 case EDLIB_EDOP_MATCH: return(localMatch); break;
48 case EDLIB_EDOP_INSERT: return(localGap); break;
49 case EDLIB_EDOP_DELETE: return(localGap); break;
50 case EDLIB_EDOP_MISMATCH: return(localMismatch); break;
51 default: return(0); break;
52 }
53 }
54
55
56 bool
testAlignment(char * aRead,int32 & abgn,int32 & aend,int32 alen,uint32 Aid,char * bRead,int32 & bbgn,int32 & bend,int32 blen,uint32 Bid,double maxAlignErate,double maxAcceptErate,double & erate)57 testAlignment(char *aRead, int32 &abgn, int32 &aend, int32 alen, uint32 Aid,
58 char *bRead, int32 &bbgn, int32 &bend, int32 blen, uint32 Bid,
59 double maxAlignErate,
60 double maxAcceptErate,
61 double &erate) {
62
63 assert(abgn >= 0);
64 assert(aend <= alen);
65 assert(bbgn >= 0);
66 assert(bend <= blen);
67
68 erate = 1.0; // Set the default return value, 100% error.
69
70 // Compute an alignment with the alignment path.
71
72 EdlibAlignResult result = edlibAlign(aRead + abgn, aend - abgn,
73 bRead + bbgn, bend - bbgn,
74 edlibNewAlignConfig((int32)ceil(1.1 * maxAlignErate * ((aend - abgn) + (bend - bbgn)) / 2.0),
75 EDLIB_MODE_HW,
76 EDLIB_TASK_PATH));
77
78 // If no result, free it and return that the alignment failed.
79
80 if (result.numLocations == 0) {
81 edlibFreeAlignResult(result);
82 return(false);
83 }
84
85 // Compute the (approximate) length of the alignment, for EDLIB_TASK_LOC
86 //result.alignmentLength = ((aend - abgn) + (result.endLocations[0] + 1 - result.startLocations[0]) + (result.editDistance)) / 2;
87
88 // Compute the global error rate. If it's terrible, return that the alignment failed.
89
90 erate = (double)result.editDistance / result.alignmentLength;
91
92 if (erate > maxAcceptErate) {
93 edlibFreeAlignResult(result);
94 return(false);
95 }
96
97 // Good quality. Save the positions.
98 // The A positions don't change (yet).
99
100 bend = bbgn + result.endLocations[0] + 1; // Edlib returns 0-based positions, add one to end to get space-based.
101 bbgn = bbgn + result.startLocations[0];
102
103 edlibFreeAlignResult(result);
104
105 return(true);
106 }
107
108
109
110
111
112
113
114 int32
extend5(char * aRead,int32 & abgn,int32 & aend,int32 alen,uint32 Aid,char * bRead,int32 & bbgn,int32 & bend,int32 blen,uint32 Bid,double maxAlignErate,double maxAcceptErate,double & erate)115 extend5(char *aRead, int32 &abgn, int32 &aend, int32 alen, uint32 Aid,
116 char *bRead, int32 &bbgn, int32 &bend, int32 blen, uint32 Bid,
117 double maxAlignErate,
118 double maxAcceptErate,
119 double &erate) {
120
121 assert(abgn >= 0);
122 assert(aend <= alen);
123 assert(bbgn >= 0);
124 assert(bend <= blen);
125
126 erate = 1.0; // Set the default return value, 100% error.
127
128 // Compute an alignment with the alignment path.
129
130 EdlibAlignResult result = edlibAlign(aRead + abgn, aend - abgn, // Query
131 bRead + bbgn, bend - bbgn, // Target
132 edlibNewAlignConfig((int32)ceil(1.1 * maxAlignErate * ((aend - abgn) + (bend - bbgn)) / 2.0),
133 EDLIB_MODE_HW,
134 EDLIB_TASK_PATH));
135
136 // If no result, free it and return that the alignment failed.
137
138 if (result.numLocations == 0) {
139 edlibFreeAlignResult(result);
140 return(-1);
141 }
142
143 #if 0
144 char *aAln = new char [result.alignmentLength + 1];
145 char *bAln = new char [result.alignmentLength + 1];
146
147 edlibAlignmentToStrings(result,
148 aRead + abgn, aend - abgn, // Query
149 bRead + bbgn, bend - bbgn, // Target
150 aAln,
151 bAln);
152
153 fprintf(stderr, "A %s\n", aAln);
154 fprintf(stderr, "B %s\n", bAln);
155
156 delete [] aAln;
157 delete [] bAln;
158 #endif
159
160
161 // Compute the (approximate) length of the alignment, for EDLIB_TASK_LOC
162 //result.alignmentLength = ((aend - abgn) + (result.endLocations[0] + 1 - result.startLocations[0]) + (result.editDistance)) / 2;
163
164 // Compute the global error rate. If it's terrible, return that the alignment failed.
165
166 erate = (double)result.editDistance / result.alignmentLength;
167
168 //if (erate > maxAcceptErate) {
169 // edlibFreeAlignResult(result);
170 // return(-1);
171 //}
172
173 // Good quality. Save the positions.
174 // The A positions don't change (yet).
175
176 bend = bbgn + result.endLocations[0] + 1; // Edlib returns 0-based positions, add one to end to get space-based.
177 bbgn = bbgn + result.startLocations[0];
178
179 // Analyze the alignment in detail. Trim back bad ends.
180 //
181 // We're trying to extend the 5' end of the A sequence.
182 // Scan the alignment backwards to check if we enter a region of bad alignment.
183
184 int32 localWindow = 50;
185 double localScore = localMatch * localWindow;
186
187 double localThreshold = 0.50;
188
189 int32 abgnN = aend;
190 int32 bbgnN = bend;
191 int32 ii = result.alignmentLength - 1; // Window start (left most position of window)
192
193 // Scan towards the 5' end until we hit a region of local garbage.
194
195 for (; ii >= 0; ii--) {
196 localScore += getScore(result, ii);
197 localScore -= getScore(result, ii + localWindow);
198
199 abgnN -= (result.alignment[ii] != EDLIB_EDOP_DELETE); // Move left if not gap in B (delete in B)?
200 bbgnN -= (result.alignment[ii] != EDLIB_EDOP_INSERT); // Move left if not gap in A (insert in B)?
201
202 assert(abgnN >= 0);
203 assert(bbgnN >= 0);
204 //fprintf(stderr, "NEW %d %d after align %d\n", abgnN, bbgnN, result.alignment[ii]);
205
206 //fprintf(stderr, "extend5 at ii %d = %f A %d-%d B %d-%d\n",
207 // ii, localScore / localWindow, abgnN, aend, bbgnN, bend);
208
209 if (localScore / localWindow < localThreshold) {
210 //fprintf(stderr, "extend5 fail at ii %d = %f A %d-%d B %d-%d\n",
211 // ii, localScore / localWindow, abgnN, aend, bbgnN, bend);
212 break;
213 }
214 }
215
216 // If the score is good, the whole extension is good.
217
218 if (localScore / localWindow >= localThreshold) {
219 edlibFreeAlignResult(result);
220 return(0);
221 }
222
223 // Otherwise, scan the window to find the maximal score and trim to there.
224
225 double maxWindowScore = localScore;
226 int32 ww = ii;
227
228 for (int32 kk=0; kk<localWindow; kk++) {
229 localScore -= getScore(result, ii + kk);
230 localScore += getScore(result, ii + kk + localWindow);
231
232 if (maxWindowScore < localScore) {
233 maxWindowScore = localScore;
234 ww = ii + kk;
235 }
236 }
237
238 // Now that we know where we should terminate the alignment, back up to that spot,
239 // adjusting the coordinates.
240
241 for (int32 kk=ii; kk<=ww; kk++) {
242 abgnN += (result.alignment[kk] != EDLIB_EDOP_DELETE); // Delete in B?
243 bbgnN += (result.alignment[kk] != EDLIB_EDOP_INSERT); // Insert in B?
244 }
245
246 //fprintf(stderr, "extend5 fail at ii %d = %f abgn %d bbgn %d\n",
247 // ii, localScore / localWindow, abgnN, bbgnN);
248
249 abgn = abgnN;
250 bbgn = bbgnN;
251
252 edlibFreeAlignResult(result);
253 return(1);
254 }
255
256
257
258
259
260
261
262
263
264
265
266 int32
extend3(char * aRead,int32 & abgn,int32 & aend,int32 alen,uint32 Aid,char * bRead,int32 & bbgn,int32 & bend,int32 blen,uint32 Bid,double maxAlignErate,double maxAcceptErate,double & erate)267 extend3(char *aRead, int32 &abgn, int32 &aend, int32 alen, uint32 Aid,
268 char *bRead, int32 &bbgn, int32 &bend, int32 blen, uint32 Bid,
269 double maxAlignErate,
270 double maxAcceptErate,
271 double &erate) {
272
273 assert(abgn >= 0);
274 assert(aend <= alen);
275 assert(bbgn >= 0);
276 assert(bend <= blen);
277
278 erate = 1.0; // Set the default return value, 100% error.
279
280 // Compute an alignment with the alignment path.
281
282 EdlibAlignResult result = edlibAlign(aRead + abgn, aend - abgn,
283 bRead + bbgn, bend - bbgn,
284 edlibNewAlignConfig((int32)ceil(1.1 * maxAlignErate * ((aend - abgn) + (bend - bbgn)) / 2.0),
285 EDLIB_MODE_HW,
286 EDLIB_TASK_PATH));
287
288 // If no result, free it and return that the alignment failed.
289
290 if (result.numLocations == 0) {
291 edlibFreeAlignResult(result);
292 return(-1);
293 }
294
295 #if 0
296 char *aAln = new char [result.alignmentLength + 1];
297 char *bAln = new char [result.alignmentLength + 1];
298
299 edlibAlignmentToStrings(result,
300 aRead + abgn, aend - abgn, // Query
301 bRead + bbgn, bend - bbgn, // Target
302 aAln,
303 bAln);
304
305 fprintf(stderr, "A %s\n", aAln);
306 fprintf(stderr, "B %s\n", bAln);
307
308 delete [] aAln;
309 delete [] bAln;
310 #endif
311
312 // Compute the (approximate) length of the alignment, for EDLIB_TASK_LOC
313 //result.alignmentLength = ((aend - abgn) + (result.endLocations[0] + 1 - result.startLocations[0]) + (result.editDistance)) / 2;
314
315 // Compute the global error rate. If it's terrible, return that the alignment failed.
316
317 erate = (double)result.editDistance / result.alignmentLength;
318
319 //if (erate > maxAcceptErate) {
320 // edlibFreeAlignResult(result);
321 // return(-1);
322 //}
323
324 // Good quality. Save the positions.
325 // The A positions don't change (yet).
326
327 bend = bbgn + result.endLocations[0] + 1; // Edlib returns 0-based positions, add one to end to get space-based.
328 bbgn = bbgn + result.startLocations[0];
329
330 // Analyze the alignment in detail. Trim back bad ends.
331 //
332 // We're trying to extend the 5' end of the A sequence.
333 // Scan the alignment backwards to check if we enter a region of bad alignment.
334
335 int32 localWindow = 50;
336 double localScore = localMatch * localWindow;
337
338 double localThreshold = 0.50;
339
340 int32 aendN = abgn;
341 int32 bendN = bbgn;
342 int32 ii = 0; // Window end (right most position of window)
343
344 // Scan towards the 3' end until we hit a region of local garbage.
345
346 for (; ii < result.alignmentLength; ii++) {
347 localScore += getScore(result, ii);
348 localScore -= getScore(result, ii - localWindow);
349
350 aendN += (result.alignment[ii] != EDLIB_EDOP_DELETE); // Delete in B?
351 bendN += (result.alignment[ii] != EDLIB_EDOP_INSERT); // Insert in B?
352
353 assert(aendN <= alen);
354 assert(bendN <= blen);
355
356 if (localScore / localWindow < localThreshold) {
357 //fprintf(stderr, "extend3 fail at ii %d = %f\n", ii, localScore / localWindow);
358 break;
359 }
360 }
361
362 // If the score is good, the whole extension is good.
363
364 if (localScore / localWindow >= localThreshold) {
365 edlibFreeAlignResult(result);
366 //fprintf(stderr, "extend3 all good!\n");
367 return(0);
368 }
369
370 // Otherwise, scan the window to find the maximal score and trim to there.
371
372 double maxWindowScore = localScore;
373 int32 ww = ii;
374
375 for (int32 kk=0; kk<localWindow; kk++) {
376 localScore -= getScore(result, ii - kk);
377 localScore += getScore(result, ii - kk - localWindow);
378
379 if (maxWindowScore < localScore) {
380 maxWindowScore = localScore;
381 ww = ii - kk;
382 }
383 }
384
385 // Now that we know where we should terminate the alignment, back up to that spot,
386 // adjusting the coordinates.
387
388 for (int32 kk=ii; kk > ww; kk--) {
389 aendN -= (result.alignment[kk] != EDLIB_EDOP_DELETE); // Delete in B?
390 bendN -= (result.alignment[kk] != EDLIB_EDOP_INSERT); // Insert in B?
391 }
392
393 aend = aendN;
394 bend = bendN;
395
396 edlibFreeAlignResult(result);
397 return(1);
398 }
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414 // Compute an alignment between A and B,
415 // requring all of A to be in the alignment,
416 // returning new end points on B, allowing free gaps on the ends.
417 // The editDist and alignLen of the alignment are returned.
418 //
419 bool
computeAlignment(char * aRead,int32 abgn,int32 aend,int32 UNUSED (alen),char const * Alabel,uint32 Aid,char * bRead,int32 & bbgn,int32 & bend,int32 blen,char const * Blabel,uint32 Bid,double maxErate,int32 & editDist,int32 & alignLen,uint32 verbose)420 computeAlignment(char *aRead, int32 abgn, int32 aend, int32 UNUSED(alen), char const *Alabel, uint32 Aid,
421 char *bRead, int32 &bbgn, int32 &bend, int32 blen, char const *Blabel, uint32 Bid,
422 double maxErate,
423 int32 &editDist,
424 int32 &alignLen,
425 uint32 verbose) {
426 bool success = false;
427
428 editDist = 0;
429 alignLen = 0;
430
431 if (verbose > 0) {
432 fprintf(stderr, "computeAlignment()-- align %s %6u %6d-%-6d %.2f%% error\n", Alabel, Aid, abgn, aend, 100.0 * maxErate);
433 fprintf(stderr, "computeAlignment()-- vs %s %6u %6d-%-6d\n", Blabel, Bid, bbgn, bend);
434 }
435
436 EdlibAlignResult result = edlibAlign(aRead + abgn, aend - abgn,
437 bRead + bbgn, bend - bbgn,
438 edlibNewAlignConfig((int32)ceil(1.1 * maxErate * ((aend - abgn) + (bend - bbgn)) / 2.0),
439 EDLIB_MODE_HW,
440 EDLIB_TASK_LOC));
441
442 // If there is a result, compute the (approximate) length of the alignment.
443 // Edlib mode TASK_LOC doesn't populate this field.
444
445 if (result.numLocations > 0)
446 result.alignmentLength = ((aend - abgn) + (result.endLocations[0] + 1 - result.startLocations[0]) + (result.editDistance)) / 2;
447
448 // Save the result if it is of acceptable quality.
449
450 //fprintf(stderr, "XX numLocations %d editDist %d maxErate %f alignLength %d\n",
451 // result.numLocations, result.editDistance, maxErate, result.alignmentLength);
452
453 if ((result.numLocations > 0) &&
454 (result.editDistance <= maxErate * result.alignmentLength)) {
455 success = true;
456
457 bend = bbgn + result.endLocations[0] + 1; // Edlib returns 0-based positions, add one to end to get space-based.
458 bbgn = bbgn + result.startLocations[0];
459
460 editDist = result.editDistance;
461 alignLen = result.alignmentLength;
462
463 if (verbose > 0) {
464 fprintf(stderr, "computeAlignment()-- PASS %s %6u %6d-%-6d editDist %5d alignLen %6d error %6.4f\n",
465 Blabel, Bid, bbgn, bend,
466 result.editDistance,
467 result.alignmentLength,
468 result.editDistance / (double)result.alignmentLength);
469 }
470 }
471
472 else {
473 if ((verbose > 0) && (result.numLocations > 0)) {
474 fprintf(stderr, "computeAlignment()-- FAIL %s %6u %6d-%-6d editDist %5d alignLen %6d error %6.4f\n",
475 Blabel, Bid, bbgn + result.endLocations[0] + 1, bbgn + result.startLocations[0],
476 result.editDistance,
477 result.alignmentLength,
478 result.editDistance / (double)result.alignmentLength);
479 }
480
481 if ((verbose > 0) && (result.numLocations == 0)) {
482 fprintf(stderr, "computeAlignment()-- FAIL %s %6u %6d-%-6d no alignment\n",
483 Blabel, Bid, bbgn, bend);
484 }
485 }
486
487 edlibFreeAlignResult(result);
488
489 return(success);
490 }
491
492
493
494
495 // Our goal is to convert an alignment-free overlap with imprecise edges into
496 // an alignment-based overlap with precise edges.
497 //
498 // The strategy is to, for each end of the overlap, extend the alignment to the end
499 // of the sequence with the shorter hang, then compute the alignment of
500 // that to the other read allowing free gaps.
501 //
502 // { alignment-free overlap }
503 // ---(a5----{-----a3)----------------}------
504 // (b5--{---b3)------------------}-------------------
505 //
506 // Since the 5' hang on the B read is smaller, we use b5 to compute the other values.
507 // b5 = known from alignment-free overlap
508 // b3 = max(4*b5, 2*maxRepeatLength)
509 // a5 = b5 * (1 + maxErate) + SLOP
510 // a3 = b3 * (1 + maxErate) + SLOP
511 //
512 // For b3: this is our anchor in assumed good sequence. it needs to be larger than
513 // a repeat, and should be larger than the unknown sequence
514 //
515 // For a5: we need to allow space for all of b5 to align, plus any errors. The
516 // SLOP adjustment allows for shifting of the imprecise overlap edge.
517 //
518 // Similar for a3.
519 //
520 // Output of the alignment will be updated coordinates for a5 and a3,
521 // which we use to update this end of the overlap.
522 //
523 void
computeOverlapAlignment(uint32 ovlid,uint32 minOverlapLength,double maxErate,uint32 overlapSlop,uint32 maxRepeat)524 maComputation::computeOverlapAlignment(uint32 ovlid,
525 uint32 minOverlapLength,
526 double maxErate,
527 uint32 overlapSlop,
528 uint32 maxRepeat) {
529 ovOverlap *ovl = &_overlaps[ovlid]; // Convenience pointer to the overlap.
530 ovOverlap ori = _overlaps[ovlid]; // Copy of the original overlap.
531
532 int32 alen = _readData[_aID].trimmedLength;
533 int32 abgn = (int32) ovl->dat.ovl.ahg5;
534 int32 aend = (int32)alen - ovl->dat.ovl.ahg3;
535
536 int32 blen = _readData[_bID].trimmedLength;
537 int32 bbgn = (int32) ovl->dat.ovl.bhg5;
538 int32 bend = (int32)blen - ovl->dat.ovl.bhg3;
539
540 int32 alignLen = 0;
541 int32 editDist = INT32_MAX;
542
543 EdlibAlignResult result = { 0, NULL, NULL, 0, NULL, 0, 0 };
544
545 if (_verboseAlign > 0) {
546 fprintf(stderr, "computeOverlapAlignment()-- A %8u %6d-%-6d %d\n", _aID, abgn, aend, alen);
547 fprintf(stderr, "computeOverlapAlignment()-- B %8u %6d-%-6d %d\n", _bID, bbgn, bend, blen);
548 }
549
550 // A ------------{------...
551 // B ------{------...
552 if (ovl->dat.ovl.bhg5 < ovl->dat.ovl.ahg5) {
553 int32 ahg5 = ovl->dat.ovl.ahg5;
554 int32 ahg3 = ovl->dat.ovl.ahg3;
555 int32 bhg5 = ovl->dat.ovl.bhg5;
556 int32 bhg3 = ovl->dat.ovl.bhg3;
557
558 int32 b5 = bhg5;
559 int32 b3 = std::max(4 * bhg5, (int32)(2 * maxRepeat));
560
561 int32 a5 = b5 * (1 + maxErate) + overlapSlop;
562 int32 a3 = b3 * (1 + maxErate) + overlapSlop;
563
564 int32 bbgn = std::max(0, bhg5 - b5); // Now zero.
565 int32 bend = std::min(blen, bhg5 + b3);
566
567 int32 abgn = std::max(0, ahg5 - a5);
568 int32 aend = std::min(alen, ahg5 + a3);
569
570 if (_verboseAlign > 0)
571 fprintf(stderr, "computeOverlapAlignment()-- bhg5: B %d-%d onto A %d-%d\n", bbgn, bend, abgn, aend);
572
573 if (computeAlignment(_bRead, bbgn, bend, blen, "B", _bID, // Align all of sequence B into
574 _aRead, abgn, aend, alen, "A", _aID, // sequence A with free ends.
575 maxErate,
576 editDist,
577 alignLen, _verboseAlign) == true) {
578 if (_verboseAlign > 0)
579 fprintf(stderr, "computeOverlapAlignment()-- B %d-%d onto A %d-%d\n", bbgn, bend, abgn, aend);
580
581 ovl->dat.ovl.ahg5 = abgn;
582 ovl->dat.ovl.bhg5 = bbgn;
583
584 if ((bend == blen) || // Update the other end if the B read is contained, or
585 (alen - ovl->dat.ovl.ahg3 < aend)) { // if the alignment extends past our existing end.
586 ovl->dat.ovl.ahg3 = alen - aend;
587 ovl->dat.ovl.bhg3 = blen - bend;
588 }
589 }
590
591 else {
592 }
593 }
594
595 // A ------{------...
596 // B ------------{------...
597 else {
598 int32 ahg5 = ovl->dat.ovl.ahg5;
599 int32 ahg3 = ovl->dat.ovl.ahg3;
600 int32 bhg5 = ovl->dat.ovl.bhg5;
601 int32 bhg3 = ovl->dat.ovl.bhg3;
602
603 int32 a5 = ahg5;
604 int32 a3 = std::max(4 * ahg5, (int32)(2 * maxRepeat));
605
606 int32 b5 = a5 * (1 + maxErate) + overlapSlop;
607 int32 b3 = a3 * (1 + maxErate) + overlapSlop;
608
609 int32 bbgn = std::max(0, bhg5 - b5);
610 int32 bend = std::min(blen, bhg5 + b3);
611
612 int32 abgn = std::max(0, ahg5 - a5); // Now zero.
613 int32 aend = std::min(alen, ahg5 + a3);
614
615 if (_verboseAlign > 0)
616 fprintf(stderr, "computeOverlapAlignment()-- ahg5: A %d-%d onto B %d-%d\n", abgn, aend, bbgn, bend);
617
618 if (computeAlignment(_aRead, abgn, aend, alen, "A", _aID, // Align all of sequence A into
619 _bRead, bbgn, bend, blen, "B", _bID, // sequence B with free ends.
620 maxErate,
621 editDist,
622 alignLen, _verboseAlign) == true) {
623 if (_verboseAlign > 0)
624 fprintf(stderr, "computeOverlapAlignment()-- A %d-%d onto B %d-%d\n", abgn, aend, bbgn, bend);
625
626 ovl->dat.ovl.ahg5 = abgn;
627 ovl->dat.ovl.bhg5 = bbgn;
628
629 if ((aend == alen) || // Update the other end if the A read is contained, or
630 (blen - ovl->dat.ovl.bhg3 < bend)) { // if the alignment extends past our existing end.
631 ovl->dat.ovl.ahg3 = alen - aend;
632 ovl->dat.ovl.bhg3 = blen - bend;
633 }
634 }
635
636 else {
637 }
638 }
639
640
641
642
643 // A ...------}------------
644 // B ...------}------
645 if (ovl->dat.ovl.bhg3 < ovl->dat.ovl.ahg3) {
646 int32 ahg5 = ovl->dat.ovl.ahg5;
647 int32 ahg3 = ovl->dat.ovl.ahg3;
648 int32 bhg5 = ovl->dat.ovl.bhg5;
649 int32 bhg3 = ovl->dat.ovl.bhg3;
650
651 int32 b5 = std::max(4 * bhg3, (int32)(2 * maxRepeat));
652 int32 b3 = bhg3;
653
654 int32 a5 = b5 * (1 + maxErate) + overlapSlop;
655 int32 a3 = b3 * (1 + maxErate) + overlapSlop;
656
657 int32 bbgn = std::max(0, blen - bhg3 - b5);
658 int32 bend = std::min(blen, blen - bhg3 + b3); // Now blen.
659
660 int32 abgn = std::max(0, alen - ahg3 - a5);
661 int32 aend = std::min(alen, alen - ahg3 + a3);
662
663 if (_verboseAlign > 0)
664 fprintf(stderr, "computeOverlapAlignment()-- bhg3: B %d-%d onto A %d-%d\n", bbgn, bend, abgn, aend);
665
666 if (computeAlignment(_bRead, bbgn, bend, blen, "B", _bID, // Align all of sequence B into
667 _aRead, abgn, aend, alen, "A", _aID, // sequence A with free ends.
668 maxErate,
669 editDist,
670 alignLen, _verboseAlign) == true) {
671 if (_verboseAlign > 0)
672 fprintf(stderr, "computeOverlapAlignment()-- B %d-%d onto A %d-%d\n", bbgn, bend, abgn, aend);
673
674 if ((bbgn == 0) || // Update the other end if the B read is contained, or
675 (abgn < ovl->dat.ovl.ahg5)) { // if the alignment extends past our existing end.
676 ovl->dat.ovl.ahg5 = abgn;
677 ovl->dat.ovl.bhg5 = bbgn;
678 }
679
680 ovl->dat.ovl.ahg3 = alen - aend;
681 ovl->dat.ovl.bhg3 = blen - bend;
682 }
683
684 else {
685 }
686 }
687
688 // A ...------}------
689 // B ...------}------------
690 else {
691 int32 ahg5 = ovl->dat.ovl.ahg5;
692 int32 ahg3 = ovl->dat.ovl.ahg3;
693 int32 bhg5 = ovl->dat.ovl.bhg5;
694 int32 bhg3 = ovl->dat.ovl.bhg3;
695
696 int32 a5 = std::max(4 * ahg3, (int32)(2 * maxRepeat));
697 int32 a3 = ahg3;
698
699 int32 b5 = a5 * (1 + maxErate) + overlapSlop;
700 int32 b3 = a3 * (1 + maxErate) + overlapSlop;
701
702 int32 bbgn = std::max(0, blen - bhg3 - b5);
703 int32 bend = std::min(blen, blen - bhg3 + b3);
704
705 int32 abgn = std::max(0, alen - ahg3 - a5); // Now alen.
706 int32 aend = std::min(alen, alen - ahg3 + a3);
707
708 if (_verboseAlign > 0)
709 fprintf(stderr, "computeOverlapAlignment()-- ahg3: A %d-%d onto B %d-%d\n", abgn, aend, bbgn, bend);
710
711 if (computeAlignment(_aRead, abgn, aend, alen, "A", _aID, // Align all of sequence A into
712 _bRead, bbgn, bend, blen, "B", _bID, // sequence B with free ends.
713 maxErate,
714 editDist,
715 alignLen, _verboseAlign) == true) {
716 if (_verboseAlign > 0)
717 fprintf(stderr, "computeOverlapAlignment()-- A %d-%d onto B %d-%d\n", abgn, aend, bbgn, bend);
718
719 if ((abgn == 0) || // Update the other end if the A read is contained, or
720 (bbgn < ovl->dat.ovl.bhg5)) { // if the alignment extends past our existing end.
721 ovl->dat.ovl.ahg5 = abgn;
722 ovl->dat.ovl.bhg5 = bbgn;
723 }
724
725 ovl->dat.ovl.ahg3 = alen - aend;
726 ovl->dat.ovl.bhg3 = blen - bend;
727 }
728
729 else {
730 }
731 }
732
733 //
734 // Compute the final alignment, then copy it to our output buffer if it is
735 // of good quality.
736 //
737
738 if (1) {
739 int32 abgn = (int32) ovl->dat.ovl.ahg5;
740 int32 aend = (int32)alen - ovl->dat.ovl.ahg3;
741
742 int32 bbgn = (int32) ovl->dat.ovl.bhg5;
743 int32 bend = (int32)blen - ovl->dat.ovl.bhg3;
744
745 if (_verboseAlign > 0)
746 fprintf(stderr, "computeOverlapAlignment()-- final: A %d-%d vs B %d-%d\n", abgn, aend, bbgn, bend);
747
748 EdlibAlignResult result = edlibAlign(_aRead + abgn, aend - abgn,
749 _bRead + bbgn, bend - bbgn,
750 edlibNewAlignConfig((int32)ceil(1.1 * maxErate * ((aend - abgn) + (bend - bbgn)) / 2.0),
751 EDLIB_MODE_NW,
752 EDLIB_TASK_PATH));
753
754 // Decide, based on the edit distance and alignment length, if we should
755 // retain or discard the overlap.
756
757 if ((result.alignmentLength < minOverlapLength) || // Alignment is too short, or
758 (result.editDistance > maxErate * result.alignmentLength)) { // too noisy.
759 ovl->dat.ovl.forOBT = false;
760 ovl->dat.ovl.forDUP = false;
761 ovl->dat.ovl.forUTG = false;
762
763 ovl->evalue(AS_MAX_EVALUE);
764 }
765
766 else {
767 ovl->dat.ovl.forOBT = false; // Flag as useful for contigging
768 ovl->dat.ovl.forDUP = false; // if it is a dovetail overlap.
769 ovl->dat.ovl.forUTG = (ovl->overlapIsDovetail() == true);
770
771 ovl->erate(editDist / (double)alignLen);
772
773 _alignsA [ovlid] = new char [alen + 1]; // Allocate space for the alignment output.
774 _alignsB [ovlid] = new char [alen + 1];
775
776 for (uint32 ii=0; ii<alen; ii++) {
777 _alignsA[ovlid][ii] = '-';
778 _alignsB[ovlid][ii] = '-';
779 }
780
781 _alignsA[ovlid][alen] = 0;
782 _alignsB[ovlid][alen] = 0;
783
784 char *aaln = new char [result.alignmentLength + 1]; // Convert the alignment to a string.
785 char *baln = new char [result.alignmentLength + 1];
786
787 memset(aaln, 0, sizeof(char) * (result.alignmentLength + 1));
788 memset(baln, 0, sizeof(char) * (result.alignmentLength + 1));
789
790 edlibAlignmentToStrings(result.alignment, // Alignment
791 result.alignmentLength, // and length
792 result.startLocations[0], // tgtStart (_bRead)
793 result.endLocations[0]+1, // tgtEnd
794 0, // qryStart (_aRead)
795 alen, // qryEnd
796 _bRead + bbgn, // tgt sequence (_bRead)
797 _aRead + abgn, // qry sequence (_aRead)
798 baln, // output tgt alignment string
799 aaln); // output qry alignment string
800
801 // Dump
802
803 //fprintf(stdout, "B %u abgn %d aend %d ahg5 %d ahg5 %d alen %d\n", _bID, abgn, aend, (int32)ovl->dat.ovl.ahg5, (int32)ovl->dat.ovl.ahg3, alen);
804 //fprintf(stdout, "aaln %s\n", aaln);
805 //fprintf(stdout, "baln %s\n", baln);
806
807 // Copy it into the output space.
808
809 uint32 pp = 0; // Position in the emitted alignment string.
810 uint32 aa = 0; // Position in sequence A.
811
812 for (uint32 ii=0; ii<ovl->dat.ovl.ahg5; ii++, pp++) {
813 _alignsA[ovlid][pp] = _aRead[aa];
814 _alignsB[ovlid][pp] = '-';
815 aa++;
816 }
817
818 for (uint32 ii=0; ii<result.alignmentLength; ii++) {
819 if (aaln[ii] != '-') {
820 _alignsA[ovlid][pp] = aaln[ii];
821 _alignsB[ovlid][pp] = baln[ii];
822 aa++;
823 pp++;
824 }
825 }
826
827 for (uint32 ii=0; ii<ovl->dat.ovl.ahg3; ii++, pp++) {
828 _alignsA[ovlid][pp] = _aRead[aa];
829 _alignsB[ovlid][pp] = '-';
830 aa++;
831 }
832
833 assert(pp == alen); // If correct, we should have walked over the
834 assert(aa == alen); // full trimmed read with both indices.
835
836 delete [] aaln;
837 delete [] baln;
838
839 _alignsA[ovlid][alen] = 0;
840 _alignsB[ovlid][alen] = 0;
841 }
842
843 // Trash the alignment results.
844
845 edlibFreeAlignResult(result);
846 }
847
848 // More logging.
849
850 if (_verboseAlign > 0) {
851 fprintf(stderr, "computeOverlapAlignment()-- A %7u %6d-%-6d -> %6d-%-6d\n", _aID, ori.a_bgn(), ori.a_end(), ovl->a_bgn(), ovl->a_end());
852 fprintf(stderr, "computeOverlapAlignment()-- B %7u %6d-%-6d -> %6d-%-6d %5.4f\n", _bID, ori.b_bgn(), ori.b_end(), ovl->b_bgn(), ovl->b_end(), ovl->erate());
853 }
854 }
855
856
857