1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "unitigConsensus.H"
19 
20 #include "bits.H"
21 
22 #include "Alignment.H"
23 #include "AlnGraphBoost.H"
24 #include "edlib.H"
25 #include "align-parasail-driver.H"
26 
27 #include <string>
28 
29 
30 #define ERROR_RATE_FACTOR 4
31 #define NUM_BANDS         2
32 #define MAX_RETRIES       ERROR_RATE_FACTOR * NUM_BANDS
33 
abSequence(uint32 readID,uint32 length,char * seq,uint32 complemented)34 abSequence::abSequence(uint32  readID,
35                        uint32  length,
36                        char   *seq,
37                        uint32  complemented) {
38   _iid              = readID;
39   _length           = length;
40   _complement       = complemented;
41 
42   _bases            = new char  [_length + 1];
43 
44   //  Make a complement table
45 
46   char inv[256] = {0};
47 
48   inv['a'] = 't';  inv['A'] = 'T';
49   inv['c'] = 'g';  inv['C'] = 'G';
50   inv['g'] = 'c';  inv['G'] = 'C';
51   inv['t'] = 'a';  inv['T'] = 'A';
52   inv['n'] = 'n';  inv['N'] = 'N';
53   inv['-'] = '-';
54 
55   //  Stash the bases/quals
56 
57   for (uint32 ii=0; ii<_length; ii++)
58     assert((seq[ii] == 'A') ||
59            (seq[ii] == 'C') ||
60            (seq[ii] == 'G') ||
61            (seq[ii] == 'T') ||
62            (seq[ii] == 'N'));
63 
64   if (complemented == false)
65     for (uint32 ii=0, pp=0; ii<_length; ii++, pp++)
66       _bases[pp] = seq[ii];
67 
68   else
69     for (uint32 ii=_length, pp=0; ii-->0; pp++)
70       _bases[pp] = inv[ seq[ii] ];
71 
72   _bases[_length] = 0;  //  NUL terminate the strings so we can use them in aligners.
73 };
74 
75 
76 
unitigConsensus(sqStore * seqStore_,double errorRate_,double errorRateMax_,uint32 minOverlap_)77 unitigConsensus::unitigConsensus(sqStore  *seqStore_,
78                                  double    errorRate_,
79                                  double    errorRateMax_,
80                                  uint32    minOverlap_) {
81 
82   _seqStore        = seqStore_;
83 
84   _tig             = NULL;
85   _numReads        = 0;
86 
87   _sequencesMax   = 0;
88   _sequencesLen   = 0;
89   _sequences      = NULL;
90 
91   _utgpos          = NULL;
92   _cnspos          = NULL;
93 
94   _minOverlap      = minOverlap_;
95   _errorRate       = errorRate_;
96   _errorRateMax    = errorRateMax_;
97 }
98 
99 
~unitigConsensus()100 unitigConsensus::~unitigConsensus() {
101 
102   for (uint32 ss=0; ss<_sequencesLen; ss++)
103     delete _sequences[ss];
104 
105   delete [] _sequences;
106   delete [] _utgpos;
107   delete [] _cnspos;
108 }
109 
110 
111 
112 void
addRead(uint32 readID,uint32 askip,uint32 bskip,bool complemented,std::map<uint32,sqRead * > * inPackageRead)113 unitigConsensus::addRead(uint32   readID,
114                          uint32   askip, uint32 bskip,
115                          bool     complemented,
116                          std::map<uint32, sqRead *>     *inPackageRead) {
117 
118   //  Grab the read.  If there is no package, load the read from the store.  Otherwise, load the
119   //  read from the package.  This REQUIRES that the package be in-sync with the unitig.  We fail
120   //  otherwise.  Hey, it's used for debugging only...
121 
122   sqRead      *readToDelete = NULL;
123   sqRead      *read         = NULL;
124 
125   if (inPackageRead == NULL) {
126     readToDelete = new sqRead;
127     read         = _seqStore->sqStore_getRead(readID, readToDelete);
128   }
129 
130   else {
131     read         = (*inPackageRead)[readID];
132   }
133 
134   if (read == NULL)
135     fprintf(stderr, "Failed to load read %u\n", readID);
136   assert(read != NULL);
137 
138   //  Grab seq/qlt from the read, offset to the proper begin and length.
139 
140   uint32  seqLen = read->sqRead_length() - askip - bskip;
141   char   *seq    = read->sqRead_sequence()  + ((complemented == false) ? askip : bskip);
142 
143   //  Add it to our list.
144 
145   increaseArray(_sequences, _sequencesLen, _sequencesMax, 1);
146 
147   _sequences[_sequencesLen++] = new abSequence(readID, seqLen, seq, complemented);
148 
149   delete readToDelete;
150 }
151 
152 
153 
154 bool
initialize(std::map<uint32,sqRead * > * reads)155 unitigConsensus::initialize(std::map<uint32, sqRead *>     *reads) {
156 
157   if (_numReads == 0) {
158     fprintf(stderr, "utgCns::initialize()-- unitig has no children.\n");
159     return(false);
160   }
161 
162   _utgpos = new tgPosition [_numReads];
163   _cnspos = new tgPosition [_numReads];
164 
165   memcpy(_utgpos, _tig->getChild(0), sizeof(tgPosition) * _numReads);
166   memcpy(_cnspos, _tig->getChild(0), sizeof(tgPosition) * _numReads);
167 
168   //  Clear the cnspos position.  We use this to show it's been placed by consensus.
169   //  Guess the number of columns we'll end up with.
170   //  Initialize abacus with the reads.
171 
172   for (int32 i=0; i<_numReads; i++) {
173     _cnspos[i].setMinMax(0, 0);
174 
175     addRead(_utgpos[i].ident(),
176             _utgpos[i]._askip, _utgpos[i]._bskip,
177             _utgpos[i].isReverse(),
178             reads);
179   }
180 
181   //  Check for duplicate reads
182 
183   {
184     std::set<uint32>  dupFrag;
185 
186     for (uint32 i=0; i<_numReads; i++) {
187       if (_utgpos[i].isRead() == false) {
188         fprintf(stderr, "unitigConsensus()-- Unitig %d FAILED.  Child %d is not a read.\n",
189                 _tig->tigID(), _utgpos[i].ident());
190         return(false);
191       }
192 
193       if (dupFrag.find(_utgpos[i].ident()) != dupFrag.end()) {
194         fprintf(stderr, "unitigConsensus()-- Unitig %d FAILED.  Child %d is a duplicate.\n",
195                 _tig->tigID(), _utgpos[i].ident());
196         return(false);
197       }
198 
199       dupFrag.insert(_utgpos[i].ident());
200     }
201   }
202 
203   return(true);
204 }
205 
switchToUncompressedCoordinates(void)206 void unitigConsensus::switchToUncompressedCoordinates(void) {
207     // update coordinates of the tig when needed (when it was assembled in compressed space, in normal space this will be skiped)
208     // we do this by tracking the read reaching furthest to the right and keeping its offset + homopolymer coordinate translation
209     // the read that overlaps it is then updated to start at that reads uncompressed offset + uncompressed bases based on the overlapping coordinate positions
210     //
211 
212     // check that we need to do something first
213     // just rely on first read
214     if ((double)getSequence(0)->length() / (_utgpos[0].max()-_utgpos[0].min()) <= 1.2)
215        return;
216 
217     uint32 compressedOffset   = 0;
218     uint32 uncompressedOffset = 0;
219     uint32 currentEnd         = _utgpos[0].max();
220     uint32 layoutLen          = 0;
221     uint32 nlen               = 0;
222     uint32* ntoc              = NULL;
223 
224     for (uint32 child = 0; child < _numReads; child++) {
225 
226       if (compressedOffset > _utgpos[child].min())
227         fprintf(stderr, "switchToUncompressedCoordinates()-- ERROR1 in gap in positioning, last read ends at %d; next read starts at %d\n",
228                 compressedOffset, _utgpos[child].min());
229       assert(_utgpos[child].min() >= compressedOffset);
230 
231       uint32 readCompressedPosition = _utgpos[child].min() - compressedOffset;
232       uint32 compressedEnd   = _utgpos[child].max();
233       uint32 compressedStart = _utgpos[child].min();
234 
235       // find the start position in normal read based on position in compressed read
236       uint32 i = 0;
237       while (i < nlen && (ntoc[i] < readCompressedPosition))
238          i++;
239 
240       //if (showAlgorithm())
241       //  fprintf(stderr, "switchToUncompressedCoordinates()-- I'm trying to find start of child %d compressed %d (dist from guide read is %d and in uncompressed in becomes %d)\n", _utgpos[child].ident(), _utgpos[child].min(), readCompressedPosition, i);
242 
243       _utgpos[child].setMinMax(i+uncompressedOffset, i+uncompressedOffset+getSequence(child)->length());
244 
245       //if (showAlgorithm())
246       //  fprintf(stderr, "switchToUncompressedCoordinates() --Updated read %d which has length %d to be from %d - %d\n", _utgpos[child].ident(), getSequence(child)->length(), _utgpos[child].min(), _utgpos[child].max());
247 
248        // update best end if needed
249        if (ntoc == NULL || compressedEnd > currentEnd) {
250           nlen  = getSequence(child)->length();
251           delete[] ntoc;
252           ntoc  = new uint32 [ nlen + 1 ];
253           uint32 clen  = homopolyCompress(getSequence(child)->getBases(), nlen, NULL, ntoc);
254 
255           currentEnd         = compressedEnd;
256           compressedOffset   = compressedStart;
257           uncompressedOffset = _utgpos[child].min();
258 
259           //if (showAlgorithm())
260           //  fprintf(stderr, "switchToUncompressedCoordinates()-- Updating guide read to be %d which ends at %d. Best before ended at %d. Now my guide is at %d (%d uncompressed)\n", _utgpos[child].ident(), compressedEnd, currentEnd, compressedOffset, uncompressedOffset);
261        }
262 
263       if (_utgpos[child].max() > layoutLen)
264         layoutLen = _utgpos[child].max();
265     }
266     delete[] ntoc;
267     _tig->_layoutLen = layoutLen;
268 }
269 
270 
271 // we assume some reads have good positions (recorded in cnspos) but most do
272 // not use those reads with good positions as anchors to re-compute everyone
273 // else's positions too this is very similar to the strategy in the above
274 // function to convert between compressed and uncompressed coordinates and it
275 // would be nice to unify the logic
276 //
updateReadPositions(void)277 void unitigConsensus::updateReadPositions(void) {
278   uint32 newOffset = 0;
279   uint32 oldOffset = 0;
280 
281   for (uint32 child = 0; child < _numReads; child++) {
282     if (oldOffset > _utgpos[child].min())
283       fprintf(stderr, "switchToUncompressedCoordinates()-- ERROR1 in gap in positioning, last read ends at %d; next read starts at %d\n",
284               oldOffset, _utgpos[child].min());
285     assert(_utgpos[child].min() >= oldOffset);
286 
287     // update best end if needed
288     if (_cnspos[child].max() != 0) {
289       newOffset = _cnspos[child].min();
290       oldOffset = _utgpos[child].min();
291 
292       //if (showAlgorithm())
293       // fprintf(stderr, "updatePostTemplate()-- Updating guide read to be %d which ends at %d. Now my guide originally was at %d now is at %d\n", _utgpos[child].ident(), _utgpos[child].max(), oldOffset, newOffset);
294     }
295 
296     uint32 readPosition = _utgpos[child].min() - oldOffset;
297 
298     //if (showAlgorithm())
299     //  fprintf(stderr, "updatePostTemplate()-- I'm trying to find start of child %d (dist from guide read is %d) currently from %d-%d\n", _utgpos[child].ident(), readPosition, _utgpos[child].min(), _utgpos[child].max());
300 
301     _utgpos[child].setMinMax(readPosition+newOffset, readPosition+newOffset+getSequence(child)->length());
302 
303     //if (showAlgorithm())
304     //  fprintf(stderr, "updatePostTemplate() --Updated read %d which has length %d to be from %d - %d\n", _utgpos[child].ident(), getSequence(child)->length(), _utgpos[child].min(), _utgpos[child].max());
305   }
306 }
307 
308 char *
generateTemplateStitch(void)309 unitigConsensus::generateTemplateStitch(void) {
310   uint32   minOlap  = _minOverlap;
311 
312   //  Initialize, copy the first read.
313 
314   uint32       rid      = 0;
315 
316   abSequence  *seq      = getSequence(rid);
317   char        *fragment = seq->getBases();
318   uint32       readLen  = seq->length();
319 
320   uint32       tigmax = AS_MAX_READLEN;  //  Must be at least AS_MAX_READLEN, else resizeArray() could fail
321   uint32       tiglen = 0;
322   char        *tigseq = NULL;
323 
324   allocateArray(tigseq, tigmax, _raAct::clearNew);
325 
326   if (showAlgorithm()) {
327     fprintf(stderr, "\n");
328     fprintf(stderr, "generateTemplateStitch()-- COPY READ read #%d %d (len=%d to %d-%d)\n",
329             0, _utgpos[0].ident(), readLen, _utgpos[0].min(), _utgpos[0].max());
330   }
331 
332   for (uint32 ii=0; ii<readLen; ii++)
333     tigseq[tiglen++] = fragment[ii];
334 
335   tigseq[tiglen] = 0;
336 
337   uint32       ePos = _utgpos[0].max();   //  Expected end of template, from bogart supplied positions.
338 
339 
340   //  Find the next read that has some minimum overlap and a large extension, copy that into the template.
341 
342   //  Align read to template.  Expecting to find alignment:
343   //
344   //        template ---------------
345   //        read             ---------------
346   //                               ^
347   //
348   //  All we need to find is where the template ends on the read, the ^ above.  We know the
349   //  expected size of the overlap, and can extract those bases from the template and look for a
350   //  full alignment to the read.
351   //
352   //  We'll align 80% of the expected overlap to the read, allowing a 20% buffer on either end.
353   //
354   //                        |  +-80% expected overlap size
355   //                        |  |     +-fPos
356   //                        v  v     v
357   //        template ----------(-----)
358   //        read            (-----------)------
359   //
360 
361   while (rid < _numReads) {
362     uint32 nr = 0;  //  Next read
363     uint32 nm = 0;  //  Next read maximum position
364 
365     //  Pick the next read as the one with the longest extension from all with some minimum overlap
366     //  to the template
367 
368     if (showAlgorithm())
369       fprintf(stderr, "\n");
370 
371     for (uint32 ii=rid+1; ii < _numReads; ii++) {
372 
373       //  If contained, move to the next read.  (Not terribly useful to log, so we don't)
374 
375       if (_utgpos[ii].max() < ePos)
376         continue;
377 
378       //  If a bigger end position, save the overlap.  One quirk: if we've already saved an overlap, and this
379       //  overlap is thin, don't save the thin overlap.
380 
381       bool   thick = (_utgpos[ii].min() + minOlap < ePos);
382       bool   first = (nm == 0);
383       bool   save  = false;
384 
385       if ((nm < _utgpos[ii].max()) && (thick || first)) {
386         save = true;
387         nr   = ii;
388         nm   = _utgpos[ii].max();
389       }
390 
391       if (showAlgorithm())
392         fprintf(stderr, "generateTemplateStitch()-- read #%d/%d ident %d position %d-%d%s%s%s\n",
393                 ii, _numReads, _utgpos[ii].ident(), _utgpos[ii].min(), _utgpos[ii].max(),
394                 (save  == true)  ? " SAVE"  : "",
395                 (thick == false) ? " THIN"  : "",
396                 (first == true)  ? " FIRST" : "");
397 
398 
399       //  If this read has an overlap smaller than we want, stop searching.
400 
401       if (thick == false)
402         break;
403     }
404 
405     if (nr == 0) {
406       if (showAlgorithm())
407         fprintf(stderr, "generateTemplateStitch()-- NO MORE READS TO ALIGN\n");
408       break;
409     }
410 
411     assert(nr != 0);
412 
413     rid      = nr;        //  We'll place read 'nr' in the template.
414 
415     seq      = getSequence(rid);
416     fragment = seq->getBases();
417     readLen  = seq->length();
418 
419     int32  readBgn;
420     int32  readEnd;
421 
422     EdlibAlignResult result;
423     bool             aligned       = false;
424 
425     double           templateSize  = 0.90;
426     double           extensionSize = 0.10;
427     double           bandErrRate   = _errorRate / ERROR_RATE_FACTOR;
428 
429     int32            olapLen       = ePos - _utgpos[nr].min();  //  The expected size of the overlap
430 
431     if (olapLen < minOlap) {
432       if (showAlgorithm())
433         fprintf(stderr, "generateTemplateStitch()-- WARNING, increasing min overlap from %d to %u for read %u (%d - %d)\n",
434                 olapLen, std::min(ePos, minOlap), nr, _utgpos[nr].min(), _utgpos[nr].max());
435       olapLen = std::min(ePos, minOlap);
436     }
437 
438     int32            templateLen  = 0;
439     int32            extensionLen = 0;
440 
441   alignAgain:
442     templateLen  = (int32)ceil(olapLen * templateSize);    //  Extract 80% of the expected overlap size
443     extensionLen = (int32)ceil(olapLen * extensionSize);   //  Extend read by 20% of the expected overlap size
444 
445     readBgn = 0;
446     readEnd = olapLen + extensionLen;
447 
448     if (readEnd > readLen)
449       readEnd = readLen;
450     // enforce minimum template length
451     if (templateLen <= 1)
452        templateLen ++;
453 
454     if (showAlgorithm()) {
455       fprintf(stderr, "\n");
456       fprintf(stderr, "generateTemplateStitch()-- ALIGN template %d-%d (len=%d) to read #%d %d %d-%d (len=%d actual=%d at %d-%d)  expecting olap of %d\n",
457               tiglen - templateLen, tiglen, templateLen,
458               nr, _utgpos[nr].ident(), readBgn, readEnd, readEnd - readBgn, readLen,
459               _utgpos[nr].min(), _utgpos[nr].max(),
460               olapLen);
461     }
462 
463     result = edlibAlign(tigseq + tiglen - templateLen, templateLen,
464                         fragment, readEnd - readBgn,
465                         edlibNewAlignConfig(olapLen * bandErrRate, EDLIB_MODE_HW, EDLIB_TASK_PATH));
466 
467     //  We're expecting the template to align inside the read.
468     //
469     //                                                        v- always the end
470     //    TEMPLATE  --------------------------[---------------]
471     //    READ                          [------------------------------]---------
472     //                always the start -^
473     //
474     //  If we don't find an alignment at all, we move the template start point to the right (making
475     //  the template smaller) and also move the read end point to the right (making the read
476     //  bigger).
477 
478     bool   tryAgain = false;
479 
480     bool   noResult      = (result.numLocations == 0);
481     bool   gotResult     = (result.numLocations  > 0);
482 
483     bool   hitTheStart   = (gotResult) && (result.startLocations[0] == 0);
484 
485     bool   hitTheEnd     = (gotResult) && (result.endLocations[0] + 1 == readEnd - readBgn);
486     bool   moreToExtend  = (readEnd < readLen);
487 
488 
489     int32 maxDifference = std::min(2500, (int32)ceil(0.30*olapLen));
490     //  Reset if the edit distance is waay more than our error rate allows or it's very short and we haven't topped out on error.  This seems to be a quirk with
491     //  edlib when aligning to N's - I got startLocation = endLocation = 0 and editDistance = alignmentLength.
492     if ((double)result.editDistance / result.alignmentLength > bandErrRate ||
493         (abs(olapLen-result.alignmentLength) > maxDifference && bandErrRate < _errorRate)) {
494       noResult    = true;
495       gotResult   = false;
496       hitTheStart = false;
497       hitTheEnd   = false;
498     }
499 
500     //  HOWEVER, if we get a result and it's near perfect, declare success even if we hit the start.
501     //  These are simple repeats that will align with any overlap.  The one BPW debugged was 99+% A.
502 
503     if ((gotResult == true) &&
504         (hitTheStart == true) &&
505         ((double)result.editDistance / result.alignmentLength < 0.1)) {
506       hitTheStart = false;
507     }
508 
509     //  NOTE that if we hit the end with the same conditions, we should try again, unless there
510     //  isn't anything left.  In that case, we don't extend the template.
511 
512     if ((gotResult == true) &&
513         (hitTheEnd == true) &&
514         (moreToExtend == false) &&
515         ((double)result.editDistance / result.alignmentLength < 0.1)) {
516       hitTheEnd = false;
517     }
518 
519     //  Now, report what happened, and maybe try again.
520 
521     if ((showAlgorithm()) && (noResult == true))
522       fprintf(stderr, "generateTemplateStitch()-- FAILED to align - no result\n");
523 
524     if ((showAlgorithm()) && (noResult == false))
525       fprintf(stderr, "generateTemplateStitch()-- FOUND alignment at %d-%d editDist %d alignLen %d %.f%% expected %d\n",
526               result.startLocations[0], result.endLocations[0]+1,
527               result.editDistance,
528               result.alignmentLength,
529               100.0 * result.editDistance / result.alignmentLength, olapLen);
530 
531     if ((noResult) || (hitTheStart)) {
532       if (showAlgorithm())
533         fprintf(stderr, "generateTemplateStitch()-- FAILED to align - %s - decrease template size by 10%%\n",
534                 (noResult == true) ? "no result" : "hit the start");
535       tryAgain = true;
536       templateSize -= 0.10;
537     }
538 
539     if ((noResult) || (hitTheEnd && moreToExtend)) {
540       if (showAlgorithm())
541         fprintf(stderr, "generateTemplateStitch()-- FAILED to align - %s - increase read size by 10%%\n",
542                 (noResult == true) ? "no result" : "hit the end");
543       tryAgain = true;
544       extensionSize += 0.10;
545     }
546 
547     if (templateSize < 0.01) {
548       if (showAlgorithm())
549         fprintf(stderr, "generateTemplateStitch()-- FAILED to align - no more template to remove!  Fail!\n");
550       if (bandErrRate + _errorRate / ERROR_RATE_FACTOR > _errorRate)
551          tryAgain = false;
552       else {
553         if (showAlgorithm())
554           fprintf(stderr, "generateTemplateStitch()-- FAILED to align at %.2f error rate, increasing to %.2f\n", bandErrRate, bandErrRate + _errorRate/ERROR_RATE_FACTOR);
555         tryAgain = true;
556         templateSize  = 0.90;
557         extensionSize = 0.10;
558         bandErrRate  += _errorRate / ERROR_RATE_FACTOR;
559       }
560     }
561 
562     if (tryAgain) {
563       edlibFreeAlignResult(result);
564       goto alignAgain;
565     }
566 
567     //  Use the alignment (or the overlap) to figure out what bases in the read
568     //  need to be appended to the template.
569 
570     if (noResult == false) {
571       readBgn = result.startLocations[0];     //  Expected to be zero
572       readEnd = result.endLocations[0] + 1;   //  Where we need to start copying the read
573 
574       // record updated read coordinates which we will use later as anchors to re-computed everyone else
575       _cnspos[nr].setMinMax(tiglen-readEnd, tiglen + readLen - readEnd);
576 
577       if (showAlgorithm()) {
578         fprintf(stderr, "generateTemplateStitch()--\n");
579         fprintf(stderr, "generateTemplateStitch()-- Aligned template %d-%d to read %u %d-%d; copy read %d-%d to template.\n",
580                 tiglen - templateLen, tiglen, nr, readBgn, readEnd, readEnd, readLen);
581         fprintf(stderr, "generateTemplateStitch()-- New position for read %d is  %d-%d\n", _utgpos[nr].ident(), _cnspos[nr].min(), _cnspos[nr].max());
582       }
583     } else {
584       readBgn = 0;
585       readEnd = olapLen;
586 
587       if (showAlgorithm()) {
588         fprintf(stderr, "generateTemplateStitch()--\n");
589         fprintf(stderr, "generateTemplateStitch()-- Alignment failed, use original overlap; copy read %d-%d to template.\n",
590                 readEnd, readLen);
591       }
592     }
593 
594     edlibFreeAlignResult(result);
595 
596 
597     resizeArray(tigseq, tiglen, tigmax, tiglen + readLen - readEnd + 1);
598 
599     //  Append the read bases to the template.
600     //
601     //  When the read fails to align, we used to go back in the overlap region and fill in any N
602     //  bases, expected to be large blocks of Ns from shredded scaffolds, with bases from the read.
603     //  As of March 2019, this isn't really needed, for two reasons: (1) sqStore is trimming N's
604     //  from the end of a read, and (2) edlib is allowing matches to Ns.
605 
606     if (showAlgorithm())
607       fprintf(stderr, "generateTemplateStitch()-- Append read from %u to %u, starting at tiglen %u\n", readEnd, readLen, tiglen);
608 
609     for (uint32 ii=readEnd; ii<readLen; ii++)
610       tigseq[tiglen++] = fragment[ii];
611 
612     tigseq[tiglen] = 0;
613 
614     assert(tiglen < tigmax);
615 
616     ePos = _utgpos[rid].max();
617 
618     if (showAlgorithm())
619       fprintf(stderr, "generateTemplateStitch()-- Template now length %d, expected %d, difference %7.4f%%\n",
620               tiglen, ePos, 200.0 * ((int32)tiglen - (int32)ePos) / ((int32)tiglen + (int32)ePos));
621   }
622 
623   //  Report the expected and final size.  We used to guard against long tigs getting chopped, but
624   //  the new template construction (early 2019) will simply append reads that fail to extend the
625   //  template - so long tigs never should get chopped.
626 
627   double  pd = 200.0 * ((int32)tiglen - (int32)ePos) / ((int32)tiglen + (int32)ePos);
628 
629   if (showAlgorithm()) {
630     fprintf(stderr, "\n");
631     fprintf(stderr, "generateTemplateStitch()-- generated template of length %d, expected length %d, %7.4f%% difference.\n",
632             tiglen, ePos, pd);
633   }
634 
635   updateReadPositions();
636 
637   return(tigseq);
638 }
639 
640 
641 
642 bool
alignEdLib(dagAlignment & aln,tgPosition & utgpos,char * fragment,uint32 fragmentLength,char * tigseq,uint32 tiglen,double errorRate,bool verbose)643 alignEdLib(dagAlignment      &aln,
644            tgPosition        &utgpos,
645            char              *fragment,
646            uint32             fragmentLength,
647            char              *tigseq,
648            uint32             tiglen,
649            double             errorRate,
650            bool               verbose) {
651 
652   EdlibAlignResult align;
653 
654   int32   padding        = std::min(250, (int32)ceil(fragmentLength * 0.05));
655   double  bandErrRate    = errorRate / ERROR_RATE_FACTOR;
656   bool    aligned        = false;
657   double  alignedErrRate = 0.0;
658 
659   //  Decide on where to align this read.
660 
661   int32  tigbgn = std::max((int32)0,      (int32)floor(utgpos.min() - padding));
662   int32  tigend = std::min((int32)tiglen, (int32)floor(utgpos.max() + padding));
663 
664   if (verbose)
665     fprintf(stderr, "alignEdLib()-- align read %7u eRate %.4f at %9d-%-9d", utgpos.ident(), bandErrRate, tigbgn, tigend);
666 
667   if (tigend < tigbgn) {
668     fprintf(stderr, "alignEdLib()-- WARNING: tigbgn %d > tigend %d - tiglen %d utgpos %d-%d padding %d\n",
669             tigbgn, tigend, tiglen, utgpos.min(), utgpos.max(), padding);
670     // try to align it to full
671     tigbgn = 0;
672     tigend = (utgpos.max() < 0 ? tiglen : utgpos.max());
673     fprintf(stderr, "alignEdLib()-- WARNING: updated tigbgn %d > tigend %d - tiglen %d utgpos %d-%d padding %d\n",
674             tigbgn, tigend, tiglen, utgpos.min(), utgpos.max(), padding);
675   }
676   assert(tigend > tigbgn);
677 
678   //  Align!  If there is an alignment, compute error rate and declare success if acceptable.
679 
680   align = edlibAlign(fragment, fragmentLength,
681                      tigseq + tigbgn, tigend - tigbgn,
682                      edlibNewAlignConfig(bandErrRate * fragmentLength, EDLIB_MODE_HW, EDLIB_TASK_PATH));
683 
684   if (align.alignmentLength > 0) {
685     alignedErrRate = (double)align.editDistance / align.alignmentLength;
686     aligned        = (alignedErrRate <= bandErrRate);
687     if (verbose)
688       fprintf(stderr, " - ALIGNED %.4f at %9d-%-9d\n", alignedErrRate, tigbgn + align.startLocations[0], tigbgn + align.endLocations[0]+1);
689   } else {
690     if (verbose)
691       fprintf(stderr, "\n");
692   }
693 
694   for (uint32 ii=1 /*the 0ths was above*/; ((ii < MAX_RETRIES) && (aligned == false)); ii++) {
695     tigbgn = std::max((int32)0,      tigbgn - 5 * padding);
696     tigend = std::min((int32)tiglen, tigend + 5 * padding);
697     // last attempt make a very wide band
698     if ((ii+1) % NUM_BANDS == 0) {
699        tigbgn = std::max((int32)0,      tigbgn - 25 * padding);
700        tigend = std::min((int32)tiglen, tigend + 25 * padding);
701     }
702 
703     // let the band increase without increasing error rate for a while, if we give up increase error rate
704     // and reset bnad
705     if (ii != 0 && ii % NUM_BANDS == 0) {
706        bandErrRate += errorRate / ERROR_RATE_FACTOR;
707        tigbgn = std::max((int32)0,      (int32)floor(utgpos.min() - padding));
708        tigend = std::min((int32)tiglen, (int32)floor(utgpos.max() + padding));
709     }
710 
711     if (tigend < tigbgn) {
712        fprintf(stderr, "alignEdLib()-- WARNING: tigbgn %d > tigend %d - tiglen %d utgpos %d-%d padding %d\n",
713                tigbgn, tigend, tiglen, utgpos.min(), utgpos.max(), padding);
714        // try to align it to full
715        tigbgn = 0;
716        tigend = (utgpos.max() < 0 ? tiglen : utgpos.max());
717        fprintf(stderr, "alignEdLib()-- WARNING: updated tigbgn %d > tigend %d - tiglen %d utgpos %d-%d padding %d\n",
718                tigbgn, tigend, tiglen, utgpos.min(), utgpos.max(), padding);
719     }
720     assert(tigend > tigbgn);
721 
722     edlibFreeAlignResult(align);
723 
724     if (verbose)
725       fprintf(stderr, "alignEdLib()--                    eRate %.4f at %9d-%-9d", bandErrRate, tigbgn, tigend);
726 
727     align = edlibAlign(fragment, strlen(fragment),
728                        tigseq + tigbgn, tigend - tigbgn,
729                        edlibNewAlignConfig(bandErrRate * fragmentLength, EDLIB_MODE_HW, EDLIB_TASK_PATH));
730 
731     if (align.alignmentLength > 0) {
732       alignedErrRate = (double)align.editDistance / align.alignmentLength;
733       aligned        = (alignedErrRate <= bandErrRate);
734       if (verbose)
735         fprintf(stderr, " - ALIGNED %.4f at %9d-%-9d\n", alignedErrRate, tigbgn + align.startLocations[0], tigbgn + align.endLocations[0]+1);
736     } else {
737       if (verbose)
738         fprintf(stderr, "\n");
739     }
740   }
741 
742   if (aligned == false) {
743     edlibFreeAlignResult(align);
744     return(false);
745   }
746 
747   char *tgtaln = new char [align.alignmentLength+1];
748   char *qryaln = new char [align.alignmentLength+1];
749 
750   memset(tgtaln, 0, sizeof(char) * (align.alignmentLength+1));
751   memset(qryaln, 0, sizeof(char) * (align.alignmentLength+1));
752 
753   edlibAlignmentToStrings(align.alignment,               //  Alignment
754                           align.alignmentLength,         //    and length
755                           align.startLocations[0],       //  tgtStart
756                           align.endLocations[0]+1,       //  tgtEnd
757                           0,                             //  qryStart
758                           fragmentLength,                //  qryEnd
759                           tigseq + tigbgn,               //  tgt sequence
760                           fragment,                      //  qry sequence
761                           tgtaln,                   //  output tgt alignment string
762                           qryaln);                  //  output qry alignment string
763 
764   //  Populate the output.  AlnGraphBoost does not handle mismatch alignments, at all, so convert
765   //  them to a pair of indel.
766 
767   uint32 nMatch = 0;
768 
769   for (uint32 ii=0; ii<align.alignmentLength; ii++)   //  Edlib guarantees aln[alignmentLength] == 0.
770     if ((tgtaln[ii] != '-') &&
771         (qryaln[ii] != '-') &&
772         (tgtaln[ii] != qryaln[ii]))
773       nMatch++;
774 
775   aln.start  = tigbgn + align.startLocations[0] + 1;   //  AlnGraphBoost expects 1-based positions.
776   aln.end    = tigbgn + align.endLocations[0] + 1;     //  EdLib returns 0-based positions.
777 
778   aln.qstr   = new char [align.alignmentLength + nMatch + 1];
779   aln.tstr   = new char [align.alignmentLength + nMatch + 1];
780 
781   for (uint32 ii=0, jj=0; ii<align.alignmentLength; ii++) {
782     char  tc = tgtaln[ii];
783     char  qc = qryaln[ii];
784 
785     if ((tc != '-') &&
786         (qc != '-') &&
787         (tc != qc)) {
788       aln.tstr[jj] = '-';   aln.qstr[jj] = qc;    jj++;
789       aln.tstr[jj] = tc;    aln.qstr[jj] = '-';   jj++;
790     } else {
791       aln.tstr[jj] = tc;    aln.qstr[jj] = qc;    jj++;
792     }
793 
794     aln.length = jj;
795   }
796 
797   aln.qstr[aln.length] = 0;
798   aln.tstr[aln.length] = 0;
799 
800   delete [] tgtaln;
801   delete [] qryaln;
802 
803   edlibFreeAlignResult(align);
804 
805   if (aln.end > tiglen)
806     fprintf(stderr, "ERROR:  alignment from %d to %d, but tiglen is only %d\n", aln.start, aln.end, tiglen);
807   assert(aln.end <= tiglen);
808 
809   return(true);
810 }
811 
812 
813 
814 bool
initializeGenerate(tgTig * tig_,std::map<uint32,sqRead * > * reads_)815 unitigConsensus::initializeGenerate(tgTig                       *tig_,
816                                     std::map<uint32, sqRead *>  *reads_) {
817 
818   _tig      = tig_;
819   _numReads = _tig->numberOfChildren();
820 
821   if (initialize(reads_) == false) {
822     fprintf(stderr, "Failed to initialize for tig %u with %u children\n", _tig->tigID(), _tig->numberOfChildren());
823     return(false);
824   }
825 
826   switchToUncompressedCoordinates();
827 
828   return(true);
829 }
830 
831 
832 
833 bool
generatePBDAG(tgTig * tig_,char aligner_,std::map<uint32,sqRead * > * reads_)834 unitigConsensus::generatePBDAG(tgTig                       *tig_,
835                                char                         aligner_,
836                                std::map<uint32, sqRead *>  *reads_) {
837 
838   if (initializeGenerate(tig_, reads_) == false)
839     return(false);
840 
841   //  Build a quick consensus to align to.
842 
843   char   *tigseq = generateTemplateStitch();
844   uint32  tiglen = strlen(tigseq);
845 
846   if (showAlgorithm())
847     fprintf(stderr, "Generated template of length %d\n", tiglen);
848 
849   //  Compute alignments of each sequence in parallel
850 
851   if (showAlgorithm())
852     fprintf(stderr, "Aligning reads.\n");
853 
854   dagAlignment *aligns = new dagAlignment [_numReads];
855   uint32        pass = 0;
856   uint32        fail = 0;
857 
858 #pragma omp parallel for schedule(dynamic)
859   for (uint32 ii=0; ii<_numReads; ii++) {
860     abSequence  *seq      = getSequence(ii);
861     bool         aligned  = false;
862 
863     assert(aligner_ == 'E');  //  Maybe later we'll have more than one aligner again.
864 
865     aligned = alignEdLib(aligns[ii],
866                          _utgpos[ii],
867                          seq->getBases(), seq->length(),
868                          tigseq, tiglen,
869                          _errorRate,
870                          showAlgorithm());
871 
872     if (aligned == false) {
873       if (showAlgorithm())
874         fprintf(stderr, "generatePBDAG()--    read %7u FAILED\n", _utgpos[ii].ident());
875 
876       fail++;
877 
878       continue;
879     }
880 
881     pass++;
882   }
883 
884   if (showAlgorithm())
885     fprintf(stderr, "generatePBDAG()--    read alignment: %d failed, %d passed.\n", fail, pass);
886 
887   //  Construct the graph from the alignments.  This is not thread safe.
888 
889   if (showAlgorithm())
890     fprintf(stderr, "Constructing graph\n");
891 
892   AlnGraphBoost ag(std::string(tigseq, tiglen));
893 
894   for (uint32 ii=0; ii<_numReads; ii++) {
895     _cnspos[ii].setMinMax(aligns[ii].start, aligns[ii].end);
896 
897     if ((aligns[ii].start == 0) &&
898         (aligns[ii].end   == 0))
899       continue;
900 
901     ag.addAln(aligns[ii]);
902 
903     aligns[ii].clear();
904   }
905 
906   delete [] aligns;
907 
908   if (showAlgorithm())
909     fprintf(stderr, "Merging graph\n");
910 
911   //  Merge the nodes and call consensus
912   ag.mergeNodes();
913 
914   if (showAlgorithm())
915     fprintf(stderr, "Calling consensus\n");
916 
917   //FIXME why do we have 0weight nodes (template seq w/o support even from the read that generated them)?
918   std::string cns = ag.consensus(0);
919 
920   delete [] tigseq;
921 
922   //  Save consensus
923 
924   resizeArrayPair(_tig->_bases, _tig->_quals, 0, _tig->_basesMax, (uint32) cns.length() + 1, _raAct::doNothing);
925 
926   std::string::size_type len = 0;
927 
928   for (len=0; len<cns.size(); len++) {
929     _tig->_bases[len] = cns[len];
930     _tig->_quals[len] = CNS_MIN_QV;
931   }
932 
933   //  Terminate the string.
934 
935   _tig->_bases[len] = 0;
936   _tig->_quals[len] = 0;
937   _tig->_basesLen   = len;
938   _tig->_layoutLen  = len;
939 
940   assert(len < _tig->_basesMax);
941 
942   return(true);
943 }
944 
945 
946 
947 bool
generateQuick(tgTig * tig_,std::map<uint32,sqRead * > * reads_)948 unitigConsensus::generateQuick(tgTig                       *tig_,
949                                std::map<uint32, sqRead *>  *reads_) {
950 
951   if (initializeGenerate(tig_, reads_) == false)
952     return(false);
953 
954   //  Quick is just the template sequence, so one and done!
955 
956   char   *tigseq = generateTemplateStitch();
957   uint32  tiglen = strlen(tigseq);
958 
959   //  Save consensus
960 
961   resizeArrayPair(_tig->_bases, _tig->_quals, 0, _tig->_basesMax, tiglen + 1, _raAct::doNothing);
962 
963   for (uint32 ii=0; ii<tiglen; ii++) {
964     _tig->_bases[ii] = tigseq[ii];
965     _tig->_quals[ii] = CNS_MIN_QV;
966   }
967 
968   //  Set positions of all the reads.  We don't know anything and default to the incoming positions.
969 
970   for (uint32 ii=0; ii<_numReads; ii++)
971     _cnspos[ii] = _utgpos[ii];
972 
973   //  Terminate the string.
974 
975   _tig->_bases[tiglen] = 0;
976   _tig->_quals[tiglen] = 0;
977   _tig->_basesLen      = tiglen;
978   _tig->_layoutLen     = tiglen;
979 
980   delete [] tigseq;
981 
982   return(true);
983 }
984 
985 
986 
987 bool
generateSingleton(tgTig * tig_,std::map<uint32,sqRead * > * reads_)988 unitigConsensus::generateSingleton(tgTig                       *tig_,
989                                    std::map<uint32, sqRead *>  *reads_) {
990 
991   if (initializeGenerate(tig_, reads_) == false)
992     return(false);
993 
994   assert(_numReads == 1);
995 
996   //  Copy the single read to the tig sequence.
997 
998   abSequence  *seq      = getSequence(0);
999   char        *fragment = seq->getBases();
1000   uint32       readLen  = seq->length();
1001 
1002   resizeArrayPair(_tig->_bases, _tig->_quals, 0, _tig->_basesMax, readLen + 1, _raAct::doNothing);
1003 
1004   for (uint32 ii=0; ii<readLen; ii++) {
1005     _tig->_bases[ii] = fragment[ii];
1006     _tig->_quals[ii] = CNS_MIN_QV;
1007   }
1008 
1009   //  Set positions of all the reads.
1010 
1011   _cnspos[0].setMinMax(0, readLen);
1012 
1013   //  Terminate the string.
1014 
1015   _tig->_bases[readLen] = 0;
1016   _tig->_quals[readLen] = 0;
1017   _tig->_basesLen       = readLen;
1018   _tig->_layoutLen      = readLen;
1019 
1020   return(true);
1021 }
1022 
1023 
1024 
1025 
1026 //  This has more canu dependency than edlib dependency, so it's here instead of in edlib.c
1027 //
1028 uint32
edlibAlignmentToCanu(stuffedBits * align,const unsigned char * alignment,int alignmentLength,int tgtStart,int tgtEnd,int qryStart,int qryEnd)1029 edlibAlignmentToCanu(stuffedBits *align,
1030                      const unsigned char* alignment,
1031                      int alignmentLength,
1032                      int tgtStart, int tgtEnd,
1033                      int qryStart, int qryEnd) {
1034   int32  qryPos = qryStart;
1035   int32  tgtPos = tgtStart;
1036 
1037   //  Count the length of the alignment.
1038 
1039   int32  nMatch  = 0;   //  Counting the number of match/mismatch in a delta block.
1040   int32  nBlocks = 0;   //  Number of blocks in the delta encoding.
1041 
1042   //  Code the alignment.
1043 
1044   for (int32 a=0; a<alignmentLength; a++) {
1045     assert(qryPos <= qryEnd);
1046     assert(tgtPos <= tgtEnd);
1047 
1048     //  Match or mismatch.
1049     if ((alignment[a] == EDLIB_EDOP_MATCH) ||
1050         (alignment[a] == EDLIB_EDOP_MISMATCH)) {
1051       nMatch++;
1052 
1053       qryPos++;
1054       tgtPos++;
1055     }
1056 
1057     //  Insertion in target.
1058     else if (alignment[a] == EDLIB_EDOP_INSERT) {
1059       align->setEliasDelta(nMatch + 1);
1060       align->setBit(0);
1061 
1062       nMatch = 0;
1063       nBlocks++;
1064 
1065       qryPos++;
1066     }
1067 
1068     //  Insertion in query.
1069     else if (alignment[a] == EDLIB_EDOP_DELETE) {
1070       align->setEliasDelta(nMatch + 1);
1071       align->setBit(1);
1072 
1073       nMatch = 0;
1074       nBlocks++;
1075 
1076       tgtPos++;
1077     }
1078   }
1079 
1080   //  Don't forget the last match/mismatch block.
1081   //align->setEliasDelta(nMatch + 1);
1082   //align->setBit(1);
1083 
1084   //nMatch = 0;
1085   //nBlocks++;
1086 
1087   //fprintf(stderr, "Align of length %d with %d gaps stored in %lu bits.\n", alignmentLength, nBlocks, align->getLength());
1088 
1089   return(nBlocks);
1090 }
1091 
1092 
1093 
1094 //  Align each read to the region of the consensus sequence the read claims
1095 //  to be from, extended by 5% of the read length on either end.  If it fails
1096 //  to align full length, make the extensions larger and/or error rate more
1097 //  tolerant until it aligns.
1098 //
1099 //  Reads that fail to align have their cnspos set to 0.
1100 //
1101 void
findCoordinates(void)1102 unitigConsensus::findCoordinates(void) {
1103 
1104   if (showPlacement()) {
1105     fprintf(stderr, "\n");
1106     fprintf(stderr, "TIG %u length %u\n", _tig->tigID(), _tig->length());
1107     fprintf(stderr, "\n");
1108   }
1109 
1110   _tig->_childDeltaBitsLen = 1;
1111   _tig->_childDeltaBits    = new stuffedBits();
1112 
1113   int32         alignShift = 0;
1114 
1115   #pragma omp parallel for schedule(dynamic)
1116   for (uint32 ii=0; ii<_numReads; ii++) {
1117     abSequence   *read    = getSequence(ii);
1118     char         *readSeq = read->getBases();
1119     uint32        readLen = read->length();
1120 
1121     int32         origbgn = _cnspos[ii].min();
1122     int32         origend = _cnspos[ii].max();
1123     int32         origlen = origend - origbgn;
1124 
1125     int32         ext5    = readLen * 0.01;  //  Try an initial alignment nice and tight.  This is
1126     int32         ext3    = readLen * 0.01;  //  important for the first/last reads so we get them
1127     double        era     = _errorRate;      //  aligned at the end of the tig.
1128 
1129     if (showPlacement()) {
1130       fprintf(stderr, "\n");
1131       fprintf(stderr, "ALIGN read #%d %u length %u cnspos %d %d\n",
1132               ii, read->seqIdent(), readLen, _cnspos[ii].min(), _cnspos[ii].max());
1133     }
1134 
1135     _cnspos[ii].setMinMax(0, 0);  //  When the read aligns, we set the true position.
1136 
1137     if ((origbgn == 0) &&
1138         (origend == 0)) {
1139       if (showPlacement())
1140         fprintf(stderr, "skip, failed to align to template initially.\n");
1141       continue;
1142     }
1143 
1144     while ((ext5 < readLen * 1.5) &&
1145            (ext3 < readLen * 1.5) &&
1146            (era  < 4 * _errorRate)) {
1147       int32 bgn = std::max(0, origbgn + alignShift - ext5);                          //  First base in tig we'll align to.
1148       int32 end = std::min(   origend + alignShift + ext3, (int32)_tig->length());   //  Last base
1149       int32 len = end - bgn;   //  WAS: +1
1150 
1151       //  If the length of the region is (wildly) less than expected, reset
1152       //  bgn to give the alignment a fighting chance.  BPW has only seen
1153       //  this when running consensus on 10% error reads, and only on reads
1154       //  near the end of the tig -- that is, when end (above) is reset to
1155       //  tig->length().
1156 
1157       if ((bgn == 0) &&
1158           (len < ext5 + origlen + ext3)) {
1159         int32 endnew = bgn + (origend - origbgn) + ext5 + ext3;
1160 
1161         if (showPlacement())
1162           fprintf(stderr, "reset position from %d-%d to %d-%d based on len=%d ext5=%d origlen=%d ext3=%d\n",
1163                   bgn, end,
1164                   bgn, endnew,
1165                   len, ext5, origlen, ext3);
1166 
1167         end = (endnew < (int32)_tig->length()) ? endnew : _tig->length();
1168         len = end - bgn;
1169       }
1170 
1171       if ((end == _tig->length()) &&
1172           (len < ext5 + origlen + ext3)) {
1173         int32 bgnnew = end - (origend - origbgn) - ext5 - ext3;
1174 
1175         if (showPlacement())
1176           fprintf(stderr, "reset position from %d-%d to %d-%d based on len=%d ext5=%d origlen=%d ext3=%d\n",
1177                   bgn,    end,
1178                   bgnnew, end,
1179                   len, ext5, origlen, ext3);
1180 
1181         bgn = (bgnnew < 0) ? 0 : bgnnew;
1182         len = end - bgn;
1183       }
1184 
1185       //  Some logging.
1186 
1187       if (showPlacement())
1188         fprintf(stderr, "align read #%u length %u to %d-%d - shift %d extension %d %d error rate %.3f\n",
1189                 ii, readLen, bgn, end, alignShift, ext5, ext3, era);
1190 
1191       //  More logging.
1192 
1193       if (showAlignments()) {
1194         char  save = _tig->bases()[bgn + len];
1195 
1196         _tig->bases()[bgn + len] = 0;
1197 
1198         fprintf(stderr, "READ: %s\n", readSeq);
1199         fprintf(stderr, "TIG:  %s\n", _tig->bases() + bgn);
1200 
1201         _tig->bases()[bgn + len] = save;
1202       }
1203 
1204       // Align the entire read into a subsequence of the tig.
1205 
1206       assert(bgn < end);
1207 
1208       EdlibAlignResult align = edlibAlign(readSeq, readLen,
1209                                           _tig->bases() + bgn, len,
1210                                           edlibNewAlignConfig(readLen * era * 2, EDLIB_MODE_HW, EDLIB_TASK_PATH));
1211 
1212       //  If nothing aligned, make the tig subsequence bigger and allow more errors.
1213 
1214       if (align.numLocations == 0) {
1215         ext5 += readLen * 0.05;
1216         ext3 += readLen * 0.05;
1217         era  += 0.025;
1218 
1219         if (showPlacement())
1220           fprintf(stderr, "  NO ALIGNMENT - Increase extension to %d / %d and error rate to %.3f\n", ext5, ext3, era);
1221 
1222         edlibFreeAlignResult(align);
1223 
1224         continue;
1225       }
1226 
1227       //  Something aligned.  Find how much of the tig subsequence was not used in the alignment.
1228       //  If we hit either end - the unaligned bit is length 0 - increase that extension and retry.
1229 
1230       int32 unaligned5 = align.startLocations[0];
1231       int32 unaligned3 = len - (align.endLocations[0] + 1);   //  0-based position of last character in alignment.
1232 
1233       if (showPlacement())
1234         fprintf(stderr, "               - read %4u original %9u-%9u claimed %9u-%9u aligned %9u-%9u unaligned %d %d\n",
1235                 ii,
1236                 _utgpos[ii].min(), _utgpos[ii].max(),
1237                 origbgn, origend,
1238                 bgn, end,
1239                 unaligned5, unaligned3);
1240 
1241       //  If bump the start, make bigger.
1242 
1243       if ((bgn > 0) && (unaligned5 <= 0)) {
1244         ext5 += readLen * 0.05;
1245 
1246         if (showPlacement())
1247           fprintf(stderr, "  BUMPED START - unaligned hangs %d %d - increase 5' extension to %d\n",
1248                   unaligned5, unaligned3, ext5);
1249 
1250         edlibFreeAlignResult(align);
1251 
1252         continue;
1253       }
1254 
1255       //  If bump the end, make bigger.
1256 
1257       if ((end < _tig->length()) && (unaligned3 <= 0)) {
1258         ext3 += readLen * 0.05;
1259 
1260         if (showPlacement())
1261           fprintf(stderr, "  BUMPED END   - unaligned hangs %d %d - increase 3' extension to %d\n",
1262                   unaligned5, unaligned3, ext3);
1263 
1264         edlibFreeAlignResult(align);
1265 
1266         continue;
1267       }
1268 
1269       //  Otherwise, SUCCESS!
1270       //
1271       //  Save, for the next alignment, the difference between where the read though it was placed,
1272       //  and where we actually aligned it to.  This will adjust the next subsequence to (hopefully)
1273       //  account for any expansion/collapses in the consensus sequence.
1274 
1275       int32   abgn = bgn + align.startLocations[0];
1276       int32   aend = bgn + align.endLocations[0] + 1;
1277 
1278       alignShift = ((abgn - origbgn) +
1279                     (aend - origend)) / 2;
1280 
1281       _cnspos[ii].setMinMax(abgn, aend);
1282 
1283       _tig->getChild(ii)->setMinMax(abgn, aend);
1284 
1285       if (showPlacement())
1286         fprintf(stderr, "  SUCCESS aligned to %d %d at %f\n", abgn, aend, 100.0 * align.editDistance / align.alignmentLength);
1287 
1288 #pragma omp critical (tgTigLoadAlign)
1289       {
1290         _tig->getChild(ii)->_deltaOffset = _tig->_childDeltaBits->getPosition();
1291         _tig->getChild(ii)->_deltaLen    = edlibAlignmentToCanu(_tig->_childDeltaBits,
1292                                                                 align.alignment,
1293                                                                 align.alignmentLength,
1294                                                                 align.startLocations[0],
1295                                                                 align.endLocations[0] + 1,
1296                                                                 0,
1297                                                                 readLen);
1298       }
1299 
1300       edlibFreeAlignResult(align);
1301 
1302       break;   //  Stop looping over extension and error rate.
1303     }  //  Looping over extension and error rate.
1304   }    //  Looping over reads.
1305 }
1306 
1307 
1308 
1309 void
findRawAlignments(void)1310 unitigConsensus::findRawAlignments(void) {
1311 
1312 #if 0
1313   for (uint32 ii=0; ii<_numReads; ii++) {
1314     fprintf(stderr, "read %4u original %9u-%9u aligned %9u-%9u\n",
1315             ii,
1316             _utgpos[ii].min(), _utgpos[ii].max(),
1317             _cnspos[ii].min(), _cnspos[ii].max());
1318   }
1319 #endif
1320 }
1321 
1322 
1323 
1324 void
trimCircular(void)1325 unitigConsensus::trimCircular(void) {
1326   char    *bases  = _tig->bases();
1327   uint32   length = _tig->length();
1328 
1329   if (_tig->_suggestCircular == false)
1330     return;
1331 
1332   if (_tig->_circularLength == 0)
1333     return;
1334 
1335   if (showAlgorithm()) {
1336     fprintf(stderr, "\n");
1337     fprintf(stderr, "Circularizing tig %u with expected overlap %u bases.\n", _tig->tigID(), _tig->_circularLength);
1338   }
1339 
1340   //  Try alignments of various lengths until we find a proper overlap
1341   //  between the end and the start at acceptable quality, or until we get
1342   //  longer than expected.
1343   //
1344   //  Stop searching if we found a proper overlap.
1345   //
1346   //                   bgnA -v        v- end of the tig
1347   //            A:  ....---------------
1348   //            B:           ----------------......
1349   //   beginning of the tig -^        ^- endB
1350 
1351   bool          found = false;
1352   parasailLib   align(1, -4, 5, 3);
1353 
1354   for (double scale = 1.05; (found == false) && (scale < 2.10); scale += 0.1) {
1355     uint32 trylen = std::min(length, (uint32)(_tig->_circularLength * scale));
1356 
1357     if (showPlacement())
1358       fprintf(stderr, "  Align   A %6u-%6u against B %6u-%6u\n",
1359               length-trylen, length,
1360               0, trylen);
1361 
1362     align.alignDovetail(bases, length, length-trylen, length,           //  A:  the end of the tig
1363                         bases, length, 0,             trylen, false);   //  B:  the start of the tig
1364 
1365     if ((align.errorRate() < 0.05) &&                //  "Found" if the error rate is reasonable, and
1366         (align.alignmentLength() >= _minOverlap) &&  //  and length is reasonable
1367         (align.bgnA() > length - trylen) &&          //  A has unaligned stuff at the start, and
1368         (align.endB() < trylen))                     //  B has unaligned stuff at the end.
1369       found = true;
1370 
1371     if ((found == false) && (showPlacement()))
1372       fprintf(stderr, "  Aligned A %6u-%6u against B %6u-%6u at %7.3f%%\n",
1373               align.bgnA(), align.endA(), align.bgnB(), align.endB(), align.percentIdentity());
1374   }
1375 
1376   if (found == false)   //  No overlap found, probably not circular.
1377     return;
1378 
1379   uint32   headBgn = align.bgnB();   //  Should be zero!
1380   uint32   headEnd = align.endB();   //
1381 
1382   uint32   tailBgn = align.bgnA();   //
1383   uint32   tailEnd = align.endA();   //  Should be 'length'.
1384 
1385   uint32   keepBgn = 0;              //  Default to trimming the
1386   uint32   keepEnd = tailBgn;        //  end part off.
1387 
1388   if (showAlgorithm())
1389     fprintf(stderr, "  Aligned A %6u-%6u against B %6u-%6u at %7.3f%%\n",
1390             tailBgn, tailEnd, headBgn, headEnd, align.percentIdentity());
1391 
1392   if (showAlignments())
1393     align.display(20);
1394 
1395   //  Find the biggest match block, and use the middle of that for the trim point.
1396 
1397   uint32  CC = UINT32_MAX;
1398 
1399   for (uint32 cc=0; cc<align.cigarLength(); cc++) {
1400     if (align.cigarCode(cc) != '=')
1401       continue;
1402 
1403     if ((CC == UINT32_MAX) ||                           //  If not a valid biggest block,
1404         (align.cigarValu(CC) <  align.cigarValu(cc)))   //  or the current block is bigger,
1405       CC = cc;                                          //  reset the biggest block.
1406   }
1407 
1408   if (CC != UINT32_MAX) {
1409     uint32  mid = align.cigarToMapBgn(CC) + (align.cigarToMapEnd(CC) - align.cigarToMapBgn(CC)) / 2;
1410 
1411     keepBgn = align.alignMapB(mid);   //  B is the end of the tig, remember?
1412     keepEnd = align.alignMapA(mid);   //  A is the start of the tig.
1413   }
1414 
1415   if (showAlgorithm())
1416     fprintf(stderr, "  Trim to %u-%u\n", keepBgn, keepEnd);
1417 
1418   //  Test the trimming.
1419   //
1420   //  Align the begin 0..keepBgn   to the end   of the tig buffer+keepEnd..len
1421   //  Align the end   keepEnd..len to the begin of the tig            len..keepBgn+buffer
1422 
1423   bool  headTest = false;
1424   bool  tailTest = false;
1425 
1426   //  Align trimmed-off start to end of tig.  EDLIB_MODE_HW allows free gaps
1427   //  at the start/end of the second sequence.
1428   {
1429     int32  aBgn = 0;
1430     int32  aEnd = keepBgn;
1431     int32  aLen = aEnd - aBgn;
1432 
1433     int32  bBgn = (tailBgn >= 222) ? (tailBgn - 222) : (0);
1434     int32  bEnd = tailEnd;
1435     int32  bLen = bEnd - bBgn;
1436 
1437     if (showAlgorithm())
1438       fprintf(stderr, "  Test   trimmed head %6d-%6d against   kept tail %6d-%6d\n",
1439               aBgn, aEnd, bBgn, bEnd);
1440 
1441     EdlibAlignResult  result;
1442     result = edlibAlign(bases + aBgn, aLen,   //  Piece of the start we trim off
1443                         bases + bBgn, bLen,   //  Should align into the tail we keep
1444                         edlibNewAlignConfig(0.5 * aLen, EDLIB_MODE_HW, EDLIB_TASK_PATH));
1445 
1446     if (bBgn + result.endLocations[0] + 1 == keepEnd)
1447       headTest = true;
1448 
1449     if ((headTest == false) && (showAlgorithm()))
1450       fprintf(stderr, "  Tested trimmed head %6d-%6d aligns to kept tail %6d-%6d%s\n",
1451               aBgn, aEnd, bBgn + result.startLocations[0], bBgn + result.endLocations[0] + 1, (headTest) ? "" : " - FAIL");
1452 
1453     edlibFreeAlignResult(result);
1454   }
1455 
1456   //  Align trimmed-off end to start of tig.
1457   {
1458     int32  aBgn = keepEnd;
1459     int32  aEnd = length;
1460     int32  aLen = aEnd - aBgn;
1461 
1462     int32  bBgn = headBgn;
1463     int32  bEnd = (headEnd + 222 <= length) ? (headEnd + 222) : (length);
1464     int32  bLen = bEnd - bBgn;
1465 
1466     if (showAlgorithm())
1467       fprintf(stderr, "  Test   trimmed tail %6d-%6d against   kept head %6d-%6d\n",
1468               aBgn, aEnd, bBgn, bEnd);
1469 
1470     EdlibAlignResult  result;
1471     result = edlibAlign(bases + aBgn, aLen,   //  Piece of the end we trim off
1472                         bases + bBgn, bLen,   //  Should align into the head we keep
1473                         edlibNewAlignConfig(0.5 * aLen, EDLIB_MODE_HW, EDLIB_TASK_PATH));
1474 
1475     if (bBgn + result.startLocations[0] == keepBgn)
1476       tailTest = true;
1477 
1478     if (showAlgorithm())
1479       fprintf(stderr, "  Tested trimmed head %6d-%6d aligns to kept tail %6d-%6d%s\n",
1480               aBgn, aEnd, bBgn + result.startLocations[0], bBgn + result.endLocations[0] + 1, (tailTest) ? "" : " - FAIL");
1481 
1482     edlibFreeAlignResult(result);
1483   }
1484 
1485   if ((headTest == true) &&
1486       (tailTest == true)) {
1487     if (showAlgorithm())
1488       fprintf(stderr, "  Tests pass.\n");
1489 
1490     _tig->_trimBgn = keepBgn;
1491     _tig->_trimEnd = keepEnd;
1492   }
1493 }
1494 
1495 
1496 
1497 bool
generate(tgTig * tig_,char algorithm_,char aligner_,std::map<uint32,sqRead * > * reads_)1498 unitigConsensus::generate(tgTig                       *tig_,
1499                           char                         algorithm_,
1500                           char                         aligner_,
1501                           std::map<uint32, sqRead *>  *reads_) {
1502   bool  success = false;
1503 
1504   if      (tig_->numberOfChildren() == 1) {
1505     success = generateSingleton(tig_, reads_);
1506   }
1507 
1508   else if (algorithm_ == 'Q') {
1509     success = generateQuick(tig_, reads_);
1510   }
1511 
1512   else if ((algorithm_ == 'P') ||   //  Normal utgcns.
1513            (algorithm_ == 'p')) {   //  'norealign'
1514     success = generatePBDAG(tig_, aligner_, reads_);
1515   }
1516 
1517   if (success) {
1518     _tig->_trimBgn = 0;
1519     _tig->_trimEnd = _tig->length();
1520 
1521     if (algorithm_ == 'P') {
1522       findCoordinates();
1523       findRawAlignments();
1524     }
1525 
1526     trimCircular();
1527   }
1528 
1529   return(success);
1530 }
1531