1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "unitigConsensus.H"
19
20 #include "bits.H"
21
22 #include "Alignment.H"
23 #include "AlnGraphBoost.H"
24 #include "edlib.H"
25 #include "align-parasail-driver.H"
26
27 #include <string>
28
29
30 #define ERROR_RATE_FACTOR 4
31 #define NUM_BANDS 2
32 #define MAX_RETRIES ERROR_RATE_FACTOR * NUM_BANDS
33
abSequence(uint32 readID,uint32 length,char * seq,uint32 complemented)34 abSequence::abSequence(uint32 readID,
35 uint32 length,
36 char *seq,
37 uint32 complemented) {
38 _iid = readID;
39 _length = length;
40 _complement = complemented;
41
42 _bases = new char [_length + 1];
43
44 // Make a complement table
45
46 char inv[256] = {0};
47
48 inv['a'] = 't'; inv['A'] = 'T';
49 inv['c'] = 'g'; inv['C'] = 'G';
50 inv['g'] = 'c'; inv['G'] = 'C';
51 inv['t'] = 'a'; inv['T'] = 'A';
52 inv['n'] = 'n'; inv['N'] = 'N';
53 inv['-'] = '-';
54
55 // Stash the bases/quals
56
57 for (uint32 ii=0; ii<_length; ii++)
58 assert((seq[ii] == 'A') ||
59 (seq[ii] == 'C') ||
60 (seq[ii] == 'G') ||
61 (seq[ii] == 'T') ||
62 (seq[ii] == 'N'));
63
64 if (complemented == false)
65 for (uint32 ii=0, pp=0; ii<_length; ii++, pp++)
66 _bases[pp] = seq[ii];
67
68 else
69 for (uint32 ii=_length, pp=0; ii-->0; pp++)
70 _bases[pp] = inv[ seq[ii] ];
71
72 _bases[_length] = 0; // NUL terminate the strings so we can use them in aligners.
73 };
74
75
76
unitigConsensus(sqStore * seqStore_,double errorRate_,double errorRateMax_,uint32 minOverlap_)77 unitigConsensus::unitigConsensus(sqStore *seqStore_,
78 double errorRate_,
79 double errorRateMax_,
80 uint32 minOverlap_) {
81
82 _seqStore = seqStore_;
83
84 _tig = NULL;
85 _numReads = 0;
86
87 _sequencesMax = 0;
88 _sequencesLen = 0;
89 _sequences = NULL;
90
91 _utgpos = NULL;
92 _cnspos = NULL;
93
94 _minOverlap = minOverlap_;
95 _errorRate = errorRate_;
96 _errorRateMax = errorRateMax_;
97 }
98
99
~unitigConsensus()100 unitigConsensus::~unitigConsensus() {
101
102 for (uint32 ss=0; ss<_sequencesLen; ss++)
103 delete _sequences[ss];
104
105 delete [] _sequences;
106 delete [] _utgpos;
107 delete [] _cnspos;
108 }
109
110
111
112 void
addRead(uint32 readID,uint32 askip,uint32 bskip,bool complemented,std::map<uint32,sqRead * > * inPackageRead)113 unitigConsensus::addRead(uint32 readID,
114 uint32 askip, uint32 bskip,
115 bool complemented,
116 std::map<uint32, sqRead *> *inPackageRead) {
117
118 // Grab the read. If there is no package, load the read from the store. Otherwise, load the
119 // read from the package. This REQUIRES that the package be in-sync with the unitig. We fail
120 // otherwise. Hey, it's used for debugging only...
121
122 sqRead *readToDelete = NULL;
123 sqRead *read = NULL;
124
125 if (inPackageRead == NULL) {
126 readToDelete = new sqRead;
127 read = _seqStore->sqStore_getRead(readID, readToDelete);
128 }
129
130 else {
131 read = (*inPackageRead)[readID];
132 }
133
134 if (read == NULL)
135 fprintf(stderr, "Failed to load read %u\n", readID);
136 assert(read != NULL);
137
138 // Grab seq/qlt from the read, offset to the proper begin and length.
139
140 uint32 seqLen = read->sqRead_length() - askip - bskip;
141 char *seq = read->sqRead_sequence() + ((complemented == false) ? askip : bskip);
142
143 // Add it to our list.
144
145 increaseArray(_sequences, _sequencesLen, _sequencesMax, 1);
146
147 _sequences[_sequencesLen++] = new abSequence(readID, seqLen, seq, complemented);
148
149 delete readToDelete;
150 }
151
152
153
154 bool
initialize(std::map<uint32,sqRead * > * reads)155 unitigConsensus::initialize(std::map<uint32, sqRead *> *reads) {
156
157 if (_numReads == 0) {
158 fprintf(stderr, "utgCns::initialize()-- unitig has no children.\n");
159 return(false);
160 }
161
162 _utgpos = new tgPosition [_numReads];
163 _cnspos = new tgPosition [_numReads];
164
165 memcpy(_utgpos, _tig->getChild(0), sizeof(tgPosition) * _numReads);
166 memcpy(_cnspos, _tig->getChild(0), sizeof(tgPosition) * _numReads);
167
168 // Clear the cnspos position. We use this to show it's been placed by consensus.
169 // Guess the number of columns we'll end up with.
170 // Initialize abacus with the reads.
171
172 for (int32 i=0; i<_numReads; i++) {
173 _cnspos[i].setMinMax(0, 0);
174
175 addRead(_utgpos[i].ident(),
176 _utgpos[i]._askip, _utgpos[i]._bskip,
177 _utgpos[i].isReverse(),
178 reads);
179 }
180
181 // Check for duplicate reads
182
183 {
184 std::set<uint32> dupFrag;
185
186 for (uint32 i=0; i<_numReads; i++) {
187 if (_utgpos[i].isRead() == false) {
188 fprintf(stderr, "unitigConsensus()-- Unitig %d FAILED. Child %d is not a read.\n",
189 _tig->tigID(), _utgpos[i].ident());
190 return(false);
191 }
192
193 if (dupFrag.find(_utgpos[i].ident()) != dupFrag.end()) {
194 fprintf(stderr, "unitigConsensus()-- Unitig %d FAILED. Child %d is a duplicate.\n",
195 _tig->tigID(), _utgpos[i].ident());
196 return(false);
197 }
198
199 dupFrag.insert(_utgpos[i].ident());
200 }
201 }
202
203 return(true);
204 }
205
switchToUncompressedCoordinates(void)206 void unitigConsensus::switchToUncompressedCoordinates(void) {
207 // update coordinates of the tig when needed (when it was assembled in compressed space, in normal space this will be skiped)
208 // we do this by tracking the read reaching furthest to the right and keeping its offset + homopolymer coordinate translation
209 // the read that overlaps it is then updated to start at that reads uncompressed offset + uncompressed bases based on the overlapping coordinate positions
210 //
211
212 // check that we need to do something first
213 // just rely on first read
214 if ((double)getSequence(0)->length() / (_utgpos[0].max()-_utgpos[0].min()) <= 1.2)
215 return;
216
217 uint32 compressedOffset = 0;
218 uint32 uncompressedOffset = 0;
219 uint32 currentEnd = _utgpos[0].max();
220 uint32 layoutLen = 0;
221 uint32 nlen = 0;
222 uint32* ntoc = NULL;
223
224 for (uint32 child = 0; child < _numReads; child++) {
225
226 if (compressedOffset > _utgpos[child].min())
227 fprintf(stderr, "switchToUncompressedCoordinates()-- ERROR1 in gap in positioning, last read ends at %d; next read starts at %d\n",
228 compressedOffset, _utgpos[child].min());
229 assert(_utgpos[child].min() >= compressedOffset);
230
231 uint32 readCompressedPosition = _utgpos[child].min() - compressedOffset;
232 uint32 compressedEnd = _utgpos[child].max();
233 uint32 compressedStart = _utgpos[child].min();
234
235 // find the start position in normal read based on position in compressed read
236 uint32 i = 0;
237 while (i < nlen && (ntoc[i] < readCompressedPosition))
238 i++;
239
240 //if (showAlgorithm())
241 // fprintf(stderr, "switchToUncompressedCoordinates()-- I'm trying to find start of child %d compressed %d (dist from guide read is %d and in uncompressed in becomes %d)\n", _utgpos[child].ident(), _utgpos[child].min(), readCompressedPosition, i);
242
243 _utgpos[child].setMinMax(i+uncompressedOffset, i+uncompressedOffset+getSequence(child)->length());
244
245 //if (showAlgorithm())
246 // fprintf(stderr, "switchToUncompressedCoordinates() --Updated read %d which has length %d to be from %d - %d\n", _utgpos[child].ident(), getSequence(child)->length(), _utgpos[child].min(), _utgpos[child].max());
247
248 // update best end if needed
249 if (ntoc == NULL || compressedEnd > currentEnd) {
250 nlen = getSequence(child)->length();
251 delete[] ntoc;
252 ntoc = new uint32 [ nlen + 1 ];
253 uint32 clen = homopolyCompress(getSequence(child)->getBases(), nlen, NULL, ntoc);
254
255 currentEnd = compressedEnd;
256 compressedOffset = compressedStart;
257 uncompressedOffset = _utgpos[child].min();
258
259 //if (showAlgorithm())
260 // fprintf(stderr, "switchToUncompressedCoordinates()-- Updating guide read to be %d which ends at %d. Best before ended at %d. Now my guide is at %d (%d uncompressed)\n", _utgpos[child].ident(), compressedEnd, currentEnd, compressedOffset, uncompressedOffset);
261 }
262
263 if (_utgpos[child].max() > layoutLen)
264 layoutLen = _utgpos[child].max();
265 }
266 delete[] ntoc;
267 _tig->_layoutLen = layoutLen;
268 }
269
270
271 // we assume some reads have good positions (recorded in cnspos) but most do
272 // not use those reads with good positions as anchors to re-compute everyone
273 // else's positions too this is very similar to the strategy in the above
274 // function to convert between compressed and uncompressed coordinates and it
275 // would be nice to unify the logic
276 //
updateReadPositions(void)277 void unitigConsensus::updateReadPositions(void) {
278 uint32 newOffset = 0;
279 uint32 oldOffset = 0;
280
281 for (uint32 child = 0; child < _numReads; child++) {
282 if (oldOffset > _utgpos[child].min())
283 fprintf(stderr, "switchToUncompressedCoordinates()-- ERROR1 in gap in positioning, last read ends at %d; next read starts at %d\n",
284 oldOffset, _utgpos[child].min());
285 assert(_utgpos[child].min() >= oldOffset);
286
287 // update best end if needed
288 if (_cnspos[child].max() != 0) {
289 newOffset = _cnspos[child].min();
290 oldOffset = _utgpos[child].min();
291
292 //if (showAlgorithm())
293 // fprintf(stderr, "updatePostTemplate()-- Updating guide read to be %d which ends at %d. Now my guide originally was at %d now is at %d\n", _utgpos[child].ident(), _utgpos[child].max(), oldOffset, newOffset);
294 }
295
296 uint32 readPosition = _utgpos[child].min() - oldOffset;
297
298 //if (showAlgorithm())
299 // fprintf(stderr, "updatePostTemplate()-- I'm trying to find start of child %d (dist from guide read is %d) currently from %d-%d\n", _utgpos[child].ident(), readPosition, _utgpos[child].min(), _utgpos[child].max());
300
301 _utgpos[child].setMinMax(readPosition+newOffset, readPosition+newOffset+getSequence(child)->length());
302
303 //if (showAlgorithm())
304 // fprintf(stderr, "updatePostTemplate() --Updated read %d which has length %d to be from %d - %d\n", _utgpos[child].ident(), getSequence(child)->length(), _utgpos[child].min(), _utgpos[child].max());
305 }
306 }
307
308 char *
generateTemplateStitch(void)309 unitigConsensus::generateTemplateStitch(void) {
310 uint32 minOlap = _minOverlap;
311
312 // Initialize, copy the first read.
313
314 uint32 rid = 0;
315
316 abSequence *seq = getSequence(rid);
317 char *fragment = seq->getBases();
318 uint32 readLen = seq->length();
319
320 uint32 tigmax = AS_MAX_READLEN; // Must be at least AS_MAX_READLEN, else resizeArray() could fail
321 uint32 tiglen = 0;
322 char *tigseq = NULL;
323
324 allocateArray(tigseq, tigmax, _raAct::clearNew);
325
326 if (showAlgorithm()) {
327 fprintf(stderr, "\n");
328 fprintf(stderr, "generateTemplateStitch()-- COPY READ read #%d %d (len=%d to %d-%d)\n",
329 0, _utgpos[0].ident(), readLen, _utgpos[0].min(), _utgpos[0].max());
330 }
331
332 for (uint32 ii=0; ii<readLen; ii++)
333 tigseq[tiglen++] = fragment[ii];
334
335 tigseq[tiglen] = 0;
336
337 uint32 ePos = _utgpos[0].max(); // Expected end of template, from bogart supplied positions.
338
339
340 // Find the next read that has some minimum overlap and a large extension, copy that into the template.
341
342 // Align read to template. Expecting to find alignment:
343 //
344 // template ---------------
345 // read ---------------
346 // ^
347 //
348 // All we need to find is where the template ends on the read, the ^ above. We know the
349 // expected size of the overlap, and can extract those bases from the template and look for a
350 // full alignment to the read.
351 //
352 // We'll align 80% of the expected overlap to the read, allowing a 20% buffer on either end.
353 //
354 // | +-80% expected overlap size
355 // | | +-fPos
356 // v v v
357 // template ----------(-----)
358 // read (-----------)------
359 //
360
361 while (rid < _numReads) {
362 uint32 nr = 0; // Next read
363 uint32 nm = 0; // Next read maximum position
364
365 // Pick the next read as the one with the longest extension from all with some minimum overlap
366 // to the template
367
368 if (showAlgorithm())
369 fprintf(stderr, "\n");
370
371 for (uint32 ii=rid+1; ii < _numReads; ii++) {
372
373 // If contained, move to the next read. (Not terribly useful to log, so we don't)
374
375 if (_utgpos[ii].max() < ePos)
376 continue;
377
378 // If a bigger end position, save the overlap. One quirk: if we've already saved an overlap, and this
379 // overlap is thin, don't save the thin overlap.
380
381 bool thick = (_utgpos[ii].min() + minOlap < ePos);
382 bool first = (nm == 0);
383 bool save = false;
384
385 if ((nm < _utgpos[ii].max()) && (thick || first)) {
386 save = true;
387 nr = ii;
388 nm = _utgpos[ii].max();
389 }
390
391 if (showAlgorithm())
392 fprintf(stderr, "generateTemplateStitch()-- read #%d/%d ident %d position %d-%d%s%s%s\n",
393 ii, _numReads, _utgpos[ii].ident(), _utgpos[ii].min(), _utgpos[ii].max(),
394 (save == true) ? " SAVE" : "",
395 (thick == false) ? " THIN" : "",
396 (first == true) ? " FIRST" : "");
397
398
399 // If this read has an overlap smaller than we want, stop searching.
400
401 if (thick == false)
402 break;
403 }
404
405 if (nr == 0) {
406 if (showAlgorithm())
407 fprintf(stderr, "generateTemplateStitch()-- NO MORE READS TO ALIGN\n");
408 break;
409 }
410
411 assert(nr != 0);
412
413 rid = nr; // We'll place read 'nr' in the template.
414
415 seq = getSequence(rid);
416 fragment = seq->getBases();
417 readLen = seq->length();
418
419 int32 readBgn;
420 int32 readEnd;
421
422 EdlibAlignResult result;
423 bool aligned = false;
424
425 double templateSize = 0.90;
426 double extensionSize = 0.10;
427 double bandErrRate = _errorRate / ERROR_RATE_FACTOR;
428
429 int32 olapLen = ePos - _utgpos[nr].min(); // The expected size of the overlap
430
431 if (olapLen < minOlap) {
432 if (showAlgorithm())
433 fprintf(stderr, "generateTemplateStitch()-- WARNING, increasing min overlap from %d to %u for read %u (%d - %d)\n",
434 olapLen, std::min(ePos, minOlap), nr, _utgpos[nr].min(), _utgpos[nr].max());
435 olapLen = std::min(ePos, minOlap);
436 }
437
438 int32 templateLen = 0;
439 int32 extensionLen = 0;
440
441 alignAgain:
442 templateLen = (int32)ceil(olapLen * templateSize); // Extract 80% of the expected overlap size
443 extensionLen = (int32)ceil(olapLen * extensionSize); // Extend read by 20% of the expected overlap size
444
445 readBgn = 0;
446 readEnd = olapLen + extensionLen;
447
448 if (readEnd > readLen)
449 readEnd = readLen;
450 // enforce minimum template length
451 if (templateLen <= 1)
452 templateLen ++;
453
454 if (showAlgorithm()) {
455 fprintf(stderr, "\n");
456 fprintf(stderr, "generateTemplateStitch()-- ALIGN template %d-%d (len=%d) to read #%d %d %d-%d (len=%d actual=%d at %d-%d) expecting olap of %d\n",
457 tiglen - templateLen, tiglen, templateLen,
458 nr, _utgpos[nr].ident(), readBgn, readEnd, readEnd - readBgn, readLen,
459 _utgpos[nr].min(), _utgpos[nr].max(),
460 olapLen);
461 }
462
463 result = edlibAlign(tigseq + tiglen - templateLen, templateLen,
464 fragment, readEnd - readBgn,
465 edlibNewAlignConfig(olapLen * bandErrRate, EDLIB_MODE_HW, EDLIB_TASK_PATH));
466
467 // We're expecting the template to align inside the read.
468 //
469 // v- always the end
470 // TEMPLATE --------------------------[---------------]
471 // READ [------------------------------]---------
472 // always the start -^
473 //
474 // If we don't find an alignment at all, we move the template start point to the right (making
475 // the template smaller) and also move the read end point to the right (making the read
476 // bigger).
477
478 bool tryAgain = false;
479
480 bool noResult = (result.numLocations == 0);
481 bool gotResult = (result.numLocations > 0);
482
483 bool hitTheStart = (gotResult) && (result.startLocations[0] == 0);
484
485 bool hitTheEnd = (gotResult) && (result.endLocations[0] + 1 == readEnd - readBgn);
486 bool moreToExtend = (readEnd < readLen);
487
488
489 int32 maxDifference = std::min(2500, (int32)ceil(0.30*olapLen));
490 // Reset if the edit distance is waay more than our error rate allows or it's very short and we haven't topped out on error. This seems to be a quirk with
491 // edlib when aligning to N's - I got startLocation = endLocation = 0 and editDistance = alignmentLength.
492 if ((double)result.editDistance / result.alignmentLength > bandErrRate ||
493 (abs(olapLen-result.alignmentLength) > maxDifference && bandErrRate < _errorRate)) {
494 noResult = true;
495 gotResult = false;
496 hitTheStart = false;
497 hitTheEnd = false;
498 }
499
500 // HOWEVER, if we get a result and it's near perfect, declare success even if we hit the start.
501 // These are simple repeats that will align with any overlap. The one BPW debugged was 99+% A.
502
503 if ((gotResult == true) &&
504 (hitTheStart == true) &&
505 ((double)result.editDistance / result.alignmentLength < 0.1)) {
506 hitTheStart = false;
507 }
508
509 // NOTE that if we hit the end with the same conditions, we should try again, unless there
510 // isn't anything left. In that case, we don't extend the template.
511
512 if ((gotResult == true) &&
513 (hitTheEnd == true) &&
514 (moreToExtend == false) &&
515 ((double)result.editDistance / result.alignmentLength < 0.1)) {
516 hitTheEnd = false;
517 }
518
519 // Now, report what happened, and maybe try again.
520
521 if ((showAlgorithm()) && (noResult == true))
522 fprintf(stderr, "generateTemplateStitch()-- FAILED to align - no result\n");
523
524 if ((showAlgorithm()) && (noResult == false))
525 fprintf(stderr, "generateTemplateStitch()-- FOUND alignment at %d-%d editDist %d alignLen %d %.f%% expected %d\n",
526 result.startLocations[0], result.endLocations[0]+1,
527 result.editDistance,
528 result.alignmentLength,
529 100.0 * result.editDistance / result.alignmentLength, olapLen);
530
531 if ((noResult) || (hitTheStart)) {
532 if (showAlgorithm())
533 fprintf(stderr, "generateTemplateStitch()-- FAILED to align - %s - decrease template size by 10%%\n",
534 (noResult == true) ? "no result" : "hit the start");
535 tryAgain = true;
536 templateSize -= 0.10;
537 }
538
539 if ((noResult) || (hitTheEnd && moreToExtend)) {
540 if (showAlgorithm())
541 fprintf(stderr, "generateTemplateStitch()-- FAILED to align - %s - increase read size by 10%%\n",
542 (noResult == true) ? "no result" : "hit the end");
543 tryAgain = true;
544 extensionSize += 0.10;
545 }
546
547 if (templateSize < 0.01) {
548 if (showAlgorithm())
549 fprintf(stderr, "generateTemplateStitch()-- FAILED to align - no more template to remove! Fail!\n");
550 if (bandErrRate + _errorRate / ERROR_RATE_FACTOR > _errorRate)
551 tryAgain = false;
552 else {
553 if (showAlgorithm())
554 fprintf(stderr, "generateTemplateStitch()-- FAILED to align at %.2f error rate, increasing to %.2f\n", bandErrRate, bandErrRate + _errorRate/ERROR_RATE_FACTOR);
555 tryAgain = true;
556 templateSize = 0.90;
557 extensionSize = 0.10;
558 bandErrRate += _errorRate / ERROR_RATE_FACTOR;
559 }
560 }
561
562 if (tryAgain) {
563 edlibFreeAlignResult(result);
564 goto alignAgain;
565 }
566
567 // Use the alignment (or the overlap) to figure out what bases in the read
568 // need to be appended to the template.
569
570 if (noResult == false) {
571 readBgn = result.startLocations[0]; // Expected to be zero
572 readEnd = result.endLocations[0] + 1; // Where we need to start copying the read
573
574 // record updated read coordinates which we will use later as anchors to re-computed everyone else
575 _cnspos[nr].setMinMax(tiglen-readEnd, tiglen + readLen - readEnd);
576
577 if (showAlgorithm()) {
578 fprintf(stderr, "generateTemplateStitch()--\n");
579 fprintf(stderr, "generateTemplateStitch()-- Aligned template %d-%d to read %u %d-%d; copy read %d-%d to template.\n",
580 tiglen - templateLen, tiglen, nr, readBgn, readEnd, readEnd, readLen);
581 fprintf(stderr, "generateTemplateStitch()-- New position for read %d is %d-%d\n", _utgpos[nr].ident(), _cnspos[nr].min(), _cnspos[nr].max());
582 }
583 } else {
584 readBgn = 0;
585 readEnd = olapLen;
586
587 if (showAlgorithm()) {
588 fprintf(stderr, "generateTemplateStitch()--\n");
589 fprintf(stderr, "generateTemplateStitch()-- Alignment failed, use original overlap; copy read %d-%d to template.\n",
590 readEnd, readLen);
591 }
592 }
593
594 edlibFreeAlignResult(result);
595
596
597 resizeArray(tigseq, tiglen, tigmax, tiglen + readLen - readEnd + 1);
598
599 // Append the read bases to the template.
600 //
601 // When the read fails to align, we used to go back in the overlap region and fill in any N
602 // bases, expected to be large blocks of Ns from shredded scaffolds, with bases from the read.
603 // As of March 2019, this isn't really needed, for two reasons: (1) sqStore is trimming N's
604 // from the end of a read, and (2) edlib is allowing matches to Ns.
605
606 if (showAlgorithm())
607 fprintf(stderr, "generateTemplateStitch()-- Append read from %u to %u, starting at tiglen %u\n", readEnd, readLen, tiglen);
608
609 for (uint32 ii=readEnd; ii<readLen; ii++)
610 tigseq[tiglen++] = fragment[ii];
611
612 tigseq[tiglen] = 0;
613
614 assert(tiglen < tigmax);
615
616 ePos = _utgpos[rid].max();
617
618 if (showAlgorithm())
619 fprintf(stderr, "generateTemplateStitch()-- Template now length %d, expected %d, difference %7.4f%%\n",
620 tiglen, ePos, 200.0 * ((int32)tiglen - (int32)ePos) / ((int32)tiglen + (int32)ePos));
621 }
622
623 // Report the expected and final size. We used to guard against long tigs getting chopped, but
624 // the new template construction (early 2019) will simply append reads that fail to extend the
625 // template - so long tigs never should get chopped.
626
627 double pd = 200.0 * ((int32)tiglen - (int32)ePos) / ((int32)tiglen + (int32)ePos);
628
629 if (showAlgorithm()) {
630 fprintf(stderr, "\n");
631 fprintf(stderr, "generateTemplateStitch()-- generated template of length %d, expected length %d, %7.4f%% difference.\n",
632 tiglen, ePos, pd);
633 }
634
635 updateReadPositions();
636
637 return(tigseq);
638 }
639
640
641
642 bool
alignEdLib(dagAlignment & aln,tgPosition & utgpos,char * fragment,uint32 fragmentLength,char * tigseq,uint32 tiglen,double errorRate,bool verbose)643 alignEdLib(dagAlignment &aln,
644 tgPosition &utgpos,
645 char *fragment,
646 uint32 fragmentLength,
647 char *tigseq,
648 uint32 tiglen,
649 double errorRate,
650 bool verbose) {
651
652 EdlibAlignResult align;
653
654 int32 padding = std::min(250, (int32)ceil(fragmentLength * 0.05));
655 double bandErrRate = errorRate / ERROR_RATE_FACTOR;
656 bool aligned = false;
657 double alignedErrRate = 0.0;
658
659 // Decide on where to align this read.
660
661 int32 tigbgn = std::max((int32)0, (int32)floor(utgpos.min() - padding));
662 int32 tigend = std::min((int32)tiglen, (int32)floor(utgpos.max() + padding));
663
664 if (verbose)
665 fprintf(stderr, "alignEdLib()-- align read %7u eRate %.4f at %9d-%-9d", utgpos.ident(), bandErrRate, tigbgn, tigend);
666
667 if (tigend < tigbgn) {
668 fprintf(stderr, "alignEdLib()-- WARNING: tigbgn %d > tigend %d - tiglen %d utgpos %d-%d padding %d\n",
669 tigbgn, tigend, tiglen, utgpos.min(), utgpos.max(), padding);
670 // try to align it to full
671 tigbgn = 0;
672 tigend = (utgpos.max() < 0 ? tiglen : utgpos.max());
673 fprintf(stderr, "alignEdLib()-- WARNING: updated tigbgn %d > tigend %d - tiglen %d utgpos %d-%d padding %d\n",
674 tigbgn, tigend, tiglen, utgpos.min(), utgpos.max(), padding);
675 }
676 assert(tigend > tigbgn);
677
678 // Align! If there is an alignment, compute error rate and declare success if acceptable.
679
680 align = edlibAlign(fragment, fragmentLength,
681 tigseq + tigbgn, tigend - tigbgn,
682 edlibNewAlignConfig(bandErrRate * fragmentLength, EDLIB_MODE_HW, EDLIB_TASK_PATH));
683
684 if (align.alignmentLength > 0) {
685 alignedErrRate = (double)align.editDistance / align.alignmentLength;
686 aligned = (alignedErrRate <= bandErrRate);
687 if (verbose)
688 fprintf(stderr, " - ALIGNED %.4f at %9d-%-9d\n", alignedErrRate, tigbgn + align.startLocations[0], tigbgn + align.endLocations[0]+1);
689 } else {
690 if (verbose)
691 fprintf(stderr, "\n");
692 }
693
694 for (uint32 ii=1 /*the 0ths was above*/; ((ii < MAX_RETRIES) && (aligned == false)); ii++) {
695 tigbgn = std::max((int32)0, tigbgn - 5 * padding);
696 tigend = std::min((int32)tiglen, tigend + 5 * padding);
697 // last attempt make a very wide band
698 if ((ii+1) % NUM_BANDS == 0) {
699 tigbgn = std::max((int32)0, tigbgn - 25 * padding);
700 tigend = std::min((int32)tiglen, tigend + 25 * padding);
701 }
702
703 // let the band increase without increasing error rate for a while, if we give up increase error rate
704 // and reset bnad
705 if (ii != 0 && ii % NUM_BANDS == 0) {
706 bandErrRate += errorRate / ERROR_RATE_FACTOR;
707 tigbgn = std::max((int32)0, (int32)floor(utgpos.min() - padding));
708 tigend = std::min((int32)tiglen, (int32)floor(utgpos.max() + padding));
709 }
710
711 if (tigend < tigbgn) {
712 fprintf(stderr, "alignEdLib()-- WARNING: tigbgn %d > tigend %d - tiglen %d utgpos %d-%d padding %d\n",
713 tigbgn, tigend, tiglen, utgpos.min(), utgpos.max(), padding);
714 // try to align it to full
715 tigbgn = 0;
716 tigend = (utgpos.max() < 0 ? tiglen : utgpos.max());
717 fprintf(stderr, "alignEdLib()-- WARNING: updated tigbgn %d > tigend %d - tiglen %d utgpos %d-%d padding %d\n",
718 tigbgn, tigend, tiglen, utgpos.min(), utgpos.max(), padding);
719 }
720 assert(tigend > tigbgn);
721
722 edlibFreeAlignResult(align);
723
724 if (verbose)
725 fprintf(stderr, "alignEdLib()-- eRate %.4f at %9d-%-9d", bandErrRate, tigbgn, tigend);
726
727 align = edlibAlign(fragment, strlen(fragment),
728 tigseq + tigbgn, tigend - tigbgn,
729 edlibNewAlignConfig(bandErrRate * fragmentLength, EDLIB_MODE_HW, EDLIB_TASK_PATH));
730
731 if (align.alignmentLength > 0) {
732 alignedErrRate = (double)align.editDistance / align.alignmentLength;
733 aligned = (alignedErrRate <= bandErrRate);
734 if (verbose)
735 fprintf(stderr, " - ALIGNED %.4f at %9d-%-9d\n", alignedErrRate, tigbgn + align.startLocations[0], tigbgn + align.endLocations[0]+1);
736 } else {
737 if (verbose)
738 fprintf(stderr, "\n");
739 }
740 }
741
742 if (aligned == false) {
743 edlibFreeAlignResult(align);
744 return(false);
745 }
746
747 char *tgtaln = new char [align.alignmentLength+1];
748 char *qryaln = new char [align.alignmentLength+1];
749
750 memset(tgtaln, 0, sizeof(char) * (align.alignmentLength+1));
751 memset(qryaln, 0, sizeof(char) * (align.alignmentLength+1));
752
753 edlibAlignmentToStrings(align.alignment, // Alignment
754 align.alignmentLength, // and length
755 align.startLocations[0], // tgtStart
756 align.endLocations[0]+1, // tgtEnd
757 0, // qryStart
758 fragmentLength, // qryEnd
759 tigseq + tigbgn, // tgt sequence
760 fragment, // qry sequence
761 tgtaln, // output tgt alignment string
762 qryaln); // output qry alignment string
763
764 // Populate the output. AlnGraphBoost does not handle mismatch alignments, at all, so convert
765 // them to a pair of indel.
766
767 uint32 nMatch = 0;
768
769 for (uint32 ii=0; ii<align.alignmentLength; ii++) // Edlib guarantees aln[alignmentLength] == 0.
770 if ((tgtaln[ii] != '-') &&
771 (qryaln[ii] != '-') &&
772 (tgtaln[ii] != qryaln[ii]))
773 nMatch++;
774
775 aln.start = tigbgn + align.startLocations[0] + 1; // AlnGraphBoost expects 1-based positions.
776 aln.end = tigbgn + align.endLocations[0] + 1; // EdLib returns 0-based positions.
777
778 aln.qstr = new char [align.alignmentLength + nMatch + 1];
779 aln.tstr = new char [align.alignmentLength + nMatch + 1];
780
781 for (uint32 ii=0, jj=0; ii<align.alignmentLength; ii++) {
782 char tc = tgtaln[ii];
783 char qc = qryaln[ii];
784
785 if ((tc != '-') &&
786 (qc != '-') &&
787 (tc != qc)) {
788 aln.tstr[jj] = '-'; aln.qstr[jj] = qc; jj++;
789 aln.tstr[jj] = tc; aln.qstr[jj] = '-'; jj++;
790 } else {
791 aln.tstr[jj] = tc; aln.qstr[jj] = qc; jj++;
792 }
793
794 aln.length = jj;
795 }
796
797 aln.qstr[aln.length] = 0;
798 aln.tstr[aln.length] = 0;
799
800 delete [] tgtaln;
801 delete [] qryaln;
802
803 edlibFreeAlignResult(align);
804
805 if (aln.end > tiglen)
806 fprintf(stderr, "ERROR: alignment from %d to %d, but tiglen is only %d\n", aln.start, aln.end, tiglen);
807 assert(aln.end <= tiglen);
808
809 return(true);
810 }
811
812
813
814 bool
initializeGenerate(tgTig * tig_,std::map<uint32,sqRead * > * reads_)815 unitigConsensus::initializeGenerate(tgTig *tig_,
816 std::map<uint32, sqRead *> *reads_) {
817
818 _tig = tig_;
819 _numReads = _tig->numberOfChildren();
820
821 if (initialize(reads_) == false) {
822 fprintf(stderr, "Failed to initialize for tig %u with %u children\n", _tig->tigID(), _tig->numberOfChildren());
823 return(false);
824 }
825
826 switchToUncompressedCoordinates();
827
828 return(true);
829 }
830
831
832
833 bool
generatePBDAG(tgTig * tig_,char aligner_,std::map<uint32,sqRead * > * reads_)834 unitigConsensus::generatePBDAG(tgTig *tig_,
835 char aligner_,
836 std::map<uint32, sqRead *> *reads_) {
837
838 if (initializeGenerate(tig_, reads_) == false)
839 return(false);
840
841 // Build a quick consensus to align to.
842
843 char *tigseq = generateTemplateStitch();
844 uint32 tiglen = strlen(tigseq);
845
846 if (showAlgorithm())
847 fprintf(stderr, "Generated template of length %d\n", tiglen);
848
849 // Compute alignments of each sequence in parallel
850
851 if (showAlgorithm())
852 fprintf(stderr, "Aligning reads.\n");
853
854 dagAlignment *aligns = new dagAlignment [_numReads];
855 uint32 pass = 0;
856 uint32 fail = 0;
857
858 #pragma omp parallel for schedule(dynamic)
859 for (uint32 ii=0; ii<_numReads; ii++) {
860 abSequence *seq = getSequence(ii);
861 bool aligned = false;
862
863 assert(aligner_ == 'E'); // Maybe later we'll have more than one aligner again.
864
865 aligned = alignEdLib(aligns[ii],
866 _utgpos[ii],
867 seq->getBases(), seq->length(),
868 tigseq, tiglen,
869 _errorRate,
870 showAlgorithm());
871
872 if (aligned == false) {
873 if (showAlgorithm())
874 fprintf(stderr, "generatePBDAG()-- read %7u FAILED\n", _utgpos[ii].ident());
875
876 fail++;
877
878 continue;
879 }
880
881 pass++;
882 }
883
884 if (showAlgorithm())
885 fprintf(stderr, "generatePBDAG()-- read alignment: %d failed, %d passed.\n", fail, pass);
886
887 // Construct the graph from the alignments. This is not thread safe.
888
889 if (showAlgorithm())
890 fprintf(stderr, "Constructing graph\n");
891
892 AlnGraphBoost ag(std::string(tigseq, tiglen));
893
894 for (uint32 ii=0; ii<_numReads; ii++) {
895 _cnspos[ii].setMinMax(aligns[ii].start, aligns[ii].end);
896
897 if ((aligns[ii].start == 0) &&
898 (aligns[ii].end == 0))
899 continue;
900
901 ag.addAln(aligns[ii]);
902
903 aligns[ii].clear();
904 }
905
906 delete [] aligns;
907
908 if (showAlgorithm())
909 fprintf(stderr, "Merging graph\n");
910
911 // Merge the nodes and call consensus
912 ag.mergeNodes();
913
914 if (showAlgorithm())
915 fprintf(stderr, "Calling consensus\n");
916
917 //FIXME why do we have 0weight nodes (template seq w/o support even from the read that generated them)?
918 std::string cns = ag.consensus(0);
919
920 delete [] tigseq;
921
922 // Save consensus
923
924 resizeArrayPair(_tig->_bases, _tig->_quals, 0, _tig->_basesMax, (uint32) cns.length() + 1, _raAct::doNothing);
925
926 std::string::size_type len = 0;
927
928 for (len=0; len<cns.size(); len++) {
929 _tig->_bases[len] = cns[len];
930 _tig->_quals[len] = CNS_MIN_QV;
931 }
932
933 // Terminate the string.
934
935 _tig->_bases[len] = 0;
936 _tig->_quals[len] = 0;
937 _tig->_basesLen = len;
938 _tig->_layoutLen = len;
939
940 assert(len < _tig->_basesMax);
941
942 return(true);
943 }
944
945
946
947 bool
generateQuick(tgTig * tig_,std::map<uint32,sqRead * > * reads_)948 unitigConsensus::generateQuick(tgTig *tig_,
949 std::map<uint32, sqRead *> *reads_) {
950
951 if (initializeGenerate(tig_, reads_) == false)
952 return(false);
953
954 // Quick is just the template sequence, so one and done!
955
956 char *tigseq = generateTemplateStitch();
957 uint32 tiglen = strlen(tigseq);
958
959 // Save consensus
960
961 resizeArrayPair(_tig->_bases, _tig->_quals, 0, _tig->_basesMax, tiglen + 1, _raAct::doNothing);
962
963 for (uint32 ii=0; ii<tiglen; ii++) {
964 _tig->_bases[ii] = tigseq[ii];
965 _tig->_quals[ii] = CNS_MIN_QV;
966 }
967
968 // Set positions of all the reads. We don't know anything and default to the incoming positions.
969
970 for (uint32 ii=0; ii<_numReads; ii++)
971 _cnspos[ii] = _utgpos[ii];
972
973 // Terminate the string.
974
975 _tig->_bases[tiglen] = 0;
976 _tig->_quals[tiglen] = 0;
977 _tig->_basesLen = tiglen;
978 _tig->_layoutLen = tiglen;
979
980 delete [] tigseq;
981
982 return(true);
983 }
984
985
986
987 bool
generateSingleton(tgTig * tig_,std::map<uint32,sqRead * > * reads_)988 unitigConsensus::generateSingleton(tgTig *tig_,
989 std::map<uint32, sqRead *> *reads_) {
990
991 if (initializeGenerate(tig_, reads_) == false)
992 return(false);
993
994 assert(_numReads == 1);
995
996 // Copy the single read to the tig sequence.
997
998 abSequence *seq = getSequence(0);
999 char *fragment = seq->getBases();
1000 uint32 readLen = seq->length();
1001
1002 resizeArrayPair(_tig->_bases, _tig->_quals, 0, _tig->_basesMax, readLen + 1, _raAct::doNothing);
1003
1004 for (uint32 ii=0; ii<readLen; ii++) {
1005 _tig->_bases[ii] = fragment[ii];
1006 _tig->_quals[ii] = CNS_MIN_QV;
1007 }
1008
1009 // Set positions of all the reads.
1010
1011 _cnspos[0].setMinMax(0, readLen);
1012
1013 // Terminate the string.
1014
1015 _tig->_bases[readLen] = 0;
1016 _tig->_quals[readLen] = 0;
1017 _tig->_basesLen = readLen;
1018 _tig->_layoutLen = readLen;
1019
1020 return(true);
1021 }
1022
1023
1024
1025
1026 // This has more canu dependency than edlib dependency, so it's here instead of in edlib.c
1027 //
1028 uint32
edlibAlignmentToCanu(stuffedBits * align,const unsigned char * alignment,int alignmentLength,int tgtStart,int tgtEnd,int qryStart,int qryEnd)1029 edlibAlignmentToCanu(stuffedBits *align,
1030 const unsigned char* alignment,
1031 int alignmentLength,
1032 int tgtStart, int tgtEnd,
1033 int qryStart, int qryEnd) {
1034 int32 qryPos = qryStart;
1035 int32 tgtPos = tgtStart;
1036
1037 // Count the length of the alignment.
1038
1039 int32 nMatch = 0; // Counting the number of match/mismatch in a delta block.
1040 int32 nBlocks = 0; // Number of blocks in the delta encoding.
1041
1042 // Code the alignment.
1043
1044 for (int32 a=0; a<alignmentLength; a++) {
1045 assert(qryPos <= qryEnd);
1046 assert(tgtPos <= tgtEnd);
1047
1048 // Match or mismatch.
1049 if ((alignment[a] == EDLIB_EDOP_MATCH) ||
1050 (alignment[a] == EDLIB_EDOP_MISMATCH)) {
1051 nMatch++;
1052
1053 qryPos++;
1054 tgtPos++;
1055 }
1056
1057 // Insertion in target.
1058 else if (alignment[a] == EDLIB_EDOP_INSERT) {
1059 align->setEliasDelta(nMatch + 1);
1060 align->setBit(0);
1061
1062 nMatch = 0;
1063 nBlocks++;
1064
1065 qryPos++;
1066 }
1067
1068 // Insertion in query.
1069 else if (alignment[a] == EDLIB_EDOP_DELETE) {
1070 align->setEliasDelta(nMatch + 1);
1071 align->setBit(1);
1072
1073 nMatch = 0;
1074 nBlocks++;
1075
1076 tgtPos++;
1077 }
1078 }
1079
1080 // Don't forget the last match/mismatch block.
1081 //align->setEliasDelta(nMatch + 1);
1082 //align->setBit(1);
1083
1084 //nMatch = 0;
1085 //nBlocks++;
1086
1087 //fprintf(stderr, "Align of length %d with %d gaps stored in %lu bits.\n", alignmentLength, nBlocks, align->getLength());
1088
1089 return(nBlocks);
1090 }
1091
1092
1093
1094 // Align each read to the region of the consensus sequence the read claims
1095 // to be from, extended by 5% of the read length on either end. If it fails
1096 // to align full length, make the extensions larger and/or error rate more
1097 // tolerant until it aligns.
1098 //
1099 // Reads that fail to align have their cnspos set to 0.
1100 //
1101 void
findCoordinates(void)1102 unitigConsensus::findCoordinates(void) {
1103
1104 if (showPlacement()) {
1105 fprintf(stderr, "\n");
1106 fprintf(stderr, "TIG %u length %u\n", _tig->tigID(), _tig->length());
1107 fprintf(stderr, "\n");
1108 }
1109
1110 _tig->_childDeltaBitsLen = 1;
1111 _tig->_childDeltaBits = new stuffedBits();
1112
1113 int32 alignShift = 0;
1114
1115 #pragma omp parallel for schedule(dynamic)
1116 for (uint32 ii=0; ii<_numReads; ii++) {
1117 abSequence *read = getSequence(ii);
1118 char *readSeq = read->getBases();
1119 uint32 readLen = read->length();
1120
1121 int32 origbgn = _cnspos[ii].min();
1122 int32 origend = _cnspos[ii].max();
1123 int32 origlen = origend - origbgn;
1124
1125 int32 ext5 = readLen * 0.01; // Try an initial alignment nice and tight. This is
1126 int32 ext3 = readLen * 0.01; // important for the first/last reads so we get them
1127 double era = _errorRate; // aligned at the end of the tig.
1128
1129 if (showPlacement()) {
1130 fprintf(stderr, "\n");
1131 fprintf(stderr, "ALIGN read #%d %u length %u cnspos %d %d\n",
1132 ii, read->seqIdent(), readLen, _cnspos[ii].min(), _cnspos[ii].max());
1133 }
1134
1135 _cnspos[ii].setMinMax(0, 0); // When the read aligns, we set the true position.
1136
1137 if ((origbgn == 0) &&
1138 (origend == 0)) {
1139 if (showPlacement())
1140 fprintf(stderr, "skip, failed to align to template initially.\n");
1141 continue;
1142 }
1143
1144 while ((ext5 < readLen * 1.5) &&
1145 (ext3 < readLen * 1.5) &&
1146 (era < 4 * _errorRate)) {
1147 int32 bgn = std::max(0, origbgn + alignShift - ext5); // First base in tig we'll align to.
1148 int32 end = std::min( origend + alignShift + ext3, (int32)_tig->length()); // Last base
1149 int32 len = end - bgn; // WAS: +1
1150
1151 // If the length of the region is (wildly) less than expected, reset
1152 // bgn to give the alignment a fighting chance. BPW has only seen
1153 // this when running consensus on 10% error reads, and only on reads
1154 // near the end of the tig -- that is, when end (above) is reset to
1155 // tig->length().
1156
1157 if ((bgn == 0) &&
1158 (len < ext5 + origlen + ext3)) {
1159 int32 endnew = bgn + (origend - origbgn) + ext5 + ext3;
1160
1161 if (showPlacement())
1162 fprintf(stderr, "reset position from %d-%d to %d-%d based on len=%d ext5=%d origlen=%d ext3=%d\n",
1163 bgn, end,
1164 bgn, endnew,
1165 len, ext5, origlen, ext3);
1166
1167 end = (endnew < (int32)_tig->length()) ? endnew : _tig->length();
1168 len = end - bgn;
1169 }
1170
1171 if ((end == _tig->length()) &&
1172 (len < ext5 + origlen + ext3)) {
1173 int32 bgnnew = end - (origend - origbgn) - ext5 - ext3;
1174
1175 if (showPlacement())
1176 fprintf(stderr, "reset position from %d-%d to %d-%d based on len=%d ext5=%d origlen=%d ext3=%d\n",
1177 bgn, end,
1178 bgnnew, end,
1179 len, ext5, origlen, ext3);
1180
1181 bgn = (bgnnew < 0) ? 0 : bgnnew;
1182 len = end - bgn;
1183 }
1184
1185 // Some logging.
1186
1187 if (showPlacement())
1188 fprintf(stderr, "align read #%u length %u to %d-%d - shift %d extension %d %d error rate %.3f\n",
1189 ii, readLen, bgn, end, alignShift, ext5, ext3, era);
1190
1191 // More logging.
1192
1193 if (showAlignments()) {
1194 char save = _tig->bases()[bgn + len];
1195
1196 _tig->bases()[bgn + len] = 0;
1197
1198 fprintf(stderr, "READ: %s\n", readSeq);
1199 fprintf(stderr, "TIG: %s\n", _tig->bases() + bgn);
1200
1201 _tig->bases()[bgn + len] = save;
1202 }
1203
1204 // Align the entire read into a subsequence of the tig.
1205
1206 assert(bgn < end);
1207
1208 EdlibAlignResult align = edlibAlign(readSeq, readLen,
1209 _tig->bases() + bgn, len,
1210 edlibNewAlignConfig(readLen * era * 2, EDLIB_MODE_HW, EDLIB_TASK_PATH));
1211
1212 // If nothing aligned, make the tig subsequence bigger and allow more errors.
1213
1214 if (align.numLocations == 0) {
1215 ext5 += readLen * 0.05;
1216 ext3 += readLen * 0.05;
1217 era += 0.025;
1218
1219 if (showPlacement())
1220 fprintf(stderr, " NO ALIGNMENT - Increase extension to %d / %d and error rate to %.3f\n", ext5, ext3, era);
1221
1222 edlibFreeAlignResult(align);
1223
1224 continue;
1225 }
1226
1227 // Something aligned. Find how much of the tig subsequence was not used in the alignment.
1228 // If we hit either end - the unaligned bit is length 0 - increase that extension and retry.
1229
1230 int32 unaligned5 = align.startLocations[0];
1231 int32 unaligned3 = len - (align.endLocations[0] + 1); // 0-based position of last character in alignment.
1232
1233 if (showPlacement())
1234 fprintf(stderr, " - read %4u original %9u-%9u claimed %9u-%9u aligned %9u-%9u unaligned %d %d\n",
1235 ii,
1236 _utgpos[ii].min(), _utgpos[ii].max(),
1237 origbgn, origend,
1238 bgn, end,
1239 unaligned5, unaligned3);
1240
1241 // If bump the start, make bigger.
1242
1243 if ((bgn > 0) && (unaligned5 <= 0)) {
1244 ext5 += readLen * 0.05;
1245
1246 if (showPlacement())
1247 fprintf(stderr, " BUMPED START - unaligned hangs %d %d - increase 5' extension to %d\n",
1248 unaligned5, unaligned3, ext5);
1249
1250 edlibFreeAlignResult(align);
1251
1252 continue;
1253 }
1254
1255 // If bump the end, make bigger.
1256
1257 if ((end < _tig->length()) && (unaligned3 <= 0)) {
1258 ext3 += readLen * 0.05;
1259
1260 if (showPlacement())
1261 fprintf(stderr, " BUMPED END - unaligned hangs %d %d - increase 3' extension to %d\n",
1262 unaligned5, unaligned3, ext3);
1263
1264 edlibFreeAlignResult(align);
1265
1266 continue;
1267 }
1268
1269 // Otherwise, SUCCESS!
1270 //
1271 // Save, for the next alignment, the difference between where the read though it was placed,
1272 // and where we actually aligned it to. This will adjust the next subsequence to (hopefully)
1273 // account for any expansion/collapses in the consensus sequence.
1274
1275 int32 abgn = bgn + align.startLocations[0];
1276 int32 aend = bgn + align.endLocations[0] + 1;
1277
1278 alignShift = ((abgn - origbgn) +
1279 (aend - origend)) / 2;
1280
1281 _cnspos[ii].setMinMax(abgn, aend);
1282
1283 _tig->getChild(ii)->setMinMax(abgn, aend);
1284
1285 if (showPlacement())
1286 fprintf(stderr, " SUCCESS aligned to %d %d at %f\n", abgn, aend, 100.0 * align.editDistance / align.alignmentLength);
1287
1288 #pragma omp critical (tgTigLoadAlign)
1289 {
1290 _tig->getChild(ii)->_deltaOffset = _tig->_childDeltaBits->getPosition();
1291 _tig->getChild(ii)->_deltaLen = edlibAlignmentToCanu(_tig->_childDeltaBits,
1292 align.alignment,
1293 align.alignmentLength,
1294 align.startLocations[0],
1295 align.endLocations[0] + 1,
1296 0,
1297 readLen);
1298 }
1299
1300 edlibFreeAlignResult(align);
1301
1302 break; // Stop looping over extension and error rate.
1303 } // Looping over extension and error rate.
1304 } // Looping over reads.
1305 }
1306
1307
1308
1309 void
findRawAlignments(void)1310 unitigConsensus::findRawAlignments(void) {
1311
1312 #if 0
1313 for (uint32 ii=0; ii<_numReads; ii++) {
1314 fprintf(stderr, "read %4u original %9u-%9u aligned %9u-%9u\n",
1315 ii,
1316 _utgpos[ii].min(), _utgpos[ii].max(),
1317 _cnspos[ii].min(), _cnspos[ii].max());
1318 }
1319 #endif
1320 }
1321
1322
1323
1324 void
trimCircular(void)1325 unitigConsensus::trimCircular(void) {
1326 char *bases = _tig->bases();
1327 uint32 length = _tig->length();
1328
1329 if (_tig->_suggestCircular == false)
1330 return;
1331
1332 if (_tig->_circularLength == 0)
1333 return;
1334
1335 if (showAlgorithm()) {
1336 fprintf(stderr, "\n");
1337 fprintf(stderr, "Circularizing tig %u with expected overlap %u bases.\n", _tig->tigID(), _tig->_circularLength);
1338 }
1339
1340 // Try alignments of various lengths until we find a proper overlap
1341 // between the end and the start at acceptable quality, or until we get
1342 // longer than expected.
1343 //
1344 // Stop searching if we found a proper overlap.
1345 //
1346 // bgnA -v v- end of the tig
1347 // A: ....---------------
1348 // B: ----------------......
1349 // beginning of the tig -^ ^- endB
1350
1351 bool found = false;
1352 parasailLib align(1, -4, 5, 3);
1353
1354 for (double scale = 1.05; (found == false) && (scale < 2.10); scale += 0.1) {
1355 uint32 trylen = std::min(length, (uint32)(_tig->_circularLength * scale));
1356
1357 if (showPlacement())
1358 fprintf(stderr, " Align A %6u-%6u against B %6u-%6u\n",
1359 length-trylen, length,
1360 0, trylen);
1361
1362 align.alignDovetail(bases, length, length-trylen, length, // A: the end of the tig
1363 bases, length, 0, trylen, false); // B: the start of the tig
1364
1365 if ((align.errorRate() < 0.05) && // "Found" if the error rate is reasonable, and
1366 (align.alignmentLength() >= _minOverlap) && // and length is reasonable
1367 (align.bgnA() > length - trylen) && // A has unaligned stuff at the start, and
1368 (align.endB() < trylen)) // B has unaligned stuff at the end.
1369 found = true;
1370
1371 if ((found == false) && (showPlacement()))
1372 fprintf(stderr, " Aligned A %6u-%6u against B %6u-%6u at %7.3f%%\n",
1373 align.bgnA(), align.endA(), align.bgnB(), align.endB(), align.percentIdentity());
1374 }
1375
1376 if (found == false) // No overlap found, probably not circular.
1377 return;
1378
1379 uint32 headBgn = align.bgnB(); // Should be zero!
1380 uint32 headEnd = align.endB(); //
1381
1382 uint32 tailBgn = align.bgnA(); //
1383 uint32 tailEnd = align.endA(); // Should be 'length'.
1384
1385 uint32 keepBgn = 0; // Default to trimming the
1386 uint32 keepEnd = tailBgn; // end part off.
1387
1388 if (showAlgorithm())
1389 fprintf(stderr, " Aligned A %6u-%6u against B %6u-%6u at %7.3f%%\n",
1390 tailBgn, tailEnd, headBgn, headEnd, align.percentIdentity());
1391
1392 if (showAlignments())
1393 align.display(20);
1394
1395 // Find the biggest match block, and use the middle of that for the trim point.
1396
1397 uint32 CC = UINT32_MAX;
1398
1399 for (uint32 cc=0; cc<align.cigarLength(); cc++) {
1400 if (align.cigarCode(cc) != '=')
1401 continue;
1402
1403 if ((CC == UINT32_MAX) || // If not a valid biggest block,
1404 (align.cigarValu(CC) < align.cigarValu(cc))) // or the current block is bigger,
1405 CC = cc; // reset the biggest block.
1406 }
1407
1408 if (CC != UINT32_MAX) {
1409 uint32 mid = align.cigarToMapBgn(CC) + (align.cigarToMapEnd(CC) - align.cigarToMapBgn(CC)) / 2;
1410
1411 keepBgn = align.alignMapB(mid); // B is the end of the tig, remember?
1412 keepEnd = align.alignMapA(mid); // A is the start of the tig.
1413 }
1414
1415 if (showAlgorithm())
1416 fprintf(stderr, " Trim to %u-%u\n", keepBgn, keepEnd);
1417
1418 // Test the trimming.
1419 //
1420 // Align the begin 0..keepBgn to the end of the tig buffer+keepEnd..len
1421 // Align the end keepEnd..len to the begin of the tig len..keepBgn+buffer
1422
1423 bool headTest = false;
1424 bool tailTest = false;
1425
1426 // Align trimmed-off start to end of tig. EDLIB_MODE_HW allows free gaps
1427 // at the start/end of the second sequence.
1428 {
1429 int32 aBgn = 0;
1430 int32 aEnd = keepBgn;
1431 int32 aLen = aEnd - aBgn;
1432
1433 int32 bBgn = (tailBgn >= 222) ? (tailBgn - 222) : (0);
1434 int32 bEnd = tailEnd;
1435 int32 bLen = bEnd - bBgn;
1436
1437 if (showAlgorithm())
1438 fprintf(stderr, " Test trimmed head %6d-%6d against kept tail %6d-%6d\n",
1439 aBgn, aEnd, bBgn, bEnd);
1440
1441 EdlibAlignResult result;
1442 result = edlibAlign(bases + aBgn, aLen, // Piece of the start we trim off
1443 bases + bBgn, bLen, // Should align into the tail we keep
1444 edlibNewAlignConfig(0.5 * aLen, EDLIB_MODE_HW, EDLIB_TASK_PATH));
1445
1446 if (bBgn + result.endLocations[0] + 1 == keepEnd)
1447 headTest = true;
1448
1449 if ((headTest == false) && (showAlgorithm()))
1450 fprintf(stderr, " Tested trimmed head %6d-%6d aligns to kept tail %6d-%6d%s\n",
1451 aBgn, aEnd, bBgn + result.startLocations[0], bBgn + result.endLocations[0] + 1, (headTest) ? "" : " - FAIL");
1452
1453 edlibFreeAlignResult(result);
1454 }
1455
1456 // Align trimmed-off end to start of tig.
1457 {
1458 int32 aBgn = keepEnd;
1459 int32 aEnd = length;
1460 int32 aLen = aEnd - aBgn;
1461
1462 int32 bBgn = headBgn;
1463 int32 bEnd = (headEnd + 222 <= length) ? (headEnd + 222) : (length);
1464 int32 bLen = bEnd - bBgn;
1465
1466 if (showAlgorithm())
1467 fprintf(stderr, " Test trimmed tail %6d-%6d against kept head %6d-%6d\n",
1468 aBgn, aEnd, bBgn, bEnd);
1469
1470 EdlibAlignResult result;
1471 result = edlibAlign(bases + aBgn, aLen, // Piece of the end we trim off
1472 bases + bBgn, bLen, // Should align into the head we keep
1473 edlibNewAlignConfig(0.5 * aLen, EDLIB_MODE_HW, EDLIB_TASK_PATH));
1474
1475 if (bBgn + result.startLocations[0] == keepBgn)
1476 tailTest = true;
1477
1478 if (showAlgorithm())
1479 fprintf(stderr, " Tested trimmed head %6d-%6d aligns to kept tail %6d-%6d%s\n",
1480 aBgn, aEnd, bBgn + result.startLocations[0], bBgn + result.endLocations[0] + 1, (tailTest) ? "" : " - FAIL");
1481
1482 edlibFreeAlignResult(result);
1483 }
1484
1485 if ((headTest == true) &&
1486 (tailTest == true)) {
1487 if (showAlgorithm())
1488 fprintf(stderr, " Tests pass.\n");
1489
1490 _tig->_trimBgn = keepBgn;
1491 _tig->_trimEnd = keepEnd;
1492 }
1493 }
1494
1495
1496
1497 bool
generate(tgTig * tig_,char algorithm_,char aligner_,std::map<uint32,sqRead * > * reads_)1498 unitigConsensus::generate(tgTig *tig_,
1499 char algorithm_,
1500 char aligner_,
1501 std::map<uint32, sqRead *> *reads_) {
1502 bool success = false;
1503
1504 if (tig_->numberOfChildren() == 1) {
1505 success = generateSingleton(tig_, reads_);
1506 }
1507
1508 else if (algorithm_ == 'Q') {
1509 success = generateQuick(tig_, reads_);
1510 }
1511
1512 else if ((algorithm_ == 'P') || // Normal utgcns.
1513 (algorithm_ == 'p')) { // 'norealign'
1514 success = generatePBDAG(tig_, aligner_, reads_);
1515 }
1516
1517 if (success) {
1518 _tig->_trimBgn = 0;
1519 _tig->_trimEnd = _tig->length();
1520
1521 if (algorithm_ == 'P') {
1522 findCoordinates();
1523 findRawAlignments();
1524 }
1525
1526 trimCircular();
1527 }
1528
1529 return(success);
1530 }
1531