
/******************************************************************************
 *
 *  This file is part of canu, a software program that assembles whole-genome
 *  sequencing reads into contigs.
 *
 *  This software is based on:
 *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
 *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
 *
 *  Except as indicated otherwise, this is a 'United States Government Work',
 *  and is released in the public domain.
 *
 *  File 'README.licenses' in the root directory of this distribution
 *  contains full conditions and disclaimers.
 */

#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_OverlapCache.H"
#include "AS_BAT_BestOverlapGraph.H"  //  sizeof(BestEdgeOverlap)
#include "AS_BAT_Unitig.H"            //  sizeof(ufNode)
#include "AS_BAT_Logging.H"

#include "system.H"
#include <tuple>

uint64  ovlCacheMagic = 0x65686361436c766fLLU;  //  'ovlCache' in little-endian byte order.


#undef TEST_LINEAR_SEARCH


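//  Bit masks for the packed 64-bit overlap scores built by ovlSco() below.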
#define  ERR_MASK   (((uint64)1 << AS_MAX_EVALUE_BITS) - 1)

#define  SALT_BITS  (64 - AS_MAX_READLEN_BITS - AS_MAX_EVALUE_BITS)
#define  SALT_MASK  (((uint64)1 << SALT_BITS) - 1)


OverlapCache::OverlapCache(const char *ovlStorePath,
                           const char *prefix,
                           double maxErate,
                           uint32 minOverlap,
                           uint64 memlimit,
                           uint64 genomeSize,
                           bool symmetrize) {

  _prefix = prefix;

  writeStatus("\n");

  if (memlimit == UINT64_MAX) {
    _memLimit = getPhysicalMemorySize();
    writeStatus("OverlapCache()-- limited to " F_U64 "MB memory (total physical memory).\n", _memLimit >> 20);
    writeStatus("\n");
  }

  else if (memlimit > 0) {
    _memLimit = memlimit;
    writeStatus("OverlapCache()-- limited to " F_U64 "MB memory (user supplied).\n", _memLimit >> 20);
    writeStatus("\n");
  }

  else {
    _memLimit = UINT64_MAX;
    writeStatus("OverlapCache()-- using unlimited memory (-M 0).\n");
    writeStatus("\n");
  }

  //  Account for memory used by read data, best overlaps, and tigs.
  //  The chunk graph is temporary, and should be less than the size of the tigs.
  //  Likewise, the buffers used for loading and scoring overlaps aren't accounted for.
  //
  //  NOTES:
  //
  //  memFI - read length,
  //
  //  memUT - worst case, we have one unitig per read.  also, maps of read-to-unitig and read-to-vector-position.
  //
  //  memEP - each read adds two epValue points, the open and close points, and two uint32 pointers
  //  to the data.
  //
  //  memEO - overlaps for computing error profiles.  this is definitely a hack, but I can't think of
  //  any reasonable estimates.  just reserve 25% of memory, which then dominates our accounting.
  //
  //  memOS - make sure we're this much below using all the memory - allows for other stuff to run,
  //  and a little buffer in case we're too big.

  uint64 memFI = RI->memoryUsage();
  uint64 memBE = RI->numReads() * sizeof(BestEdgeOverlap) * 2;
  uint64 memUT = RI->numReads() * sizeof(Unitig) + RI->numReads() * sizeof(uint32) * 2;
  uint64 memUL = RI->numReads() * sizeof(ufNode);

  uint64 memEP = RI->numReads() * sizeof(uint32) * 2 + RI->numReads() * Unitig::epValueSize() * 2;
  uint64 memEO = (_memLimit == UINT64_MAX) ? (0.0) : (0.25 * _memLimit);

  uint64 memOS = (_memLimit < 0.9 * getPhysicalMemorySize()) ? (0.0) : (0.1 * getPhysicalMemorySize());

  uint64 memST = ((RI->numReads() + 1) * (sizeof(BAToverlap *) + sizeof(uint32)) +   //  Cache pointers
                  (RI->numReads() + 1) * sizeof(uint32) +                            //  Num olaps stored per read
                  (RI->numReads() + 1) * sizeof(uint32));                            //  Num olaps allocated per read


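  //  Tally the reserved allocations; whatever remains under the memory limit
  //  is what we can spend on the overlaps themselves.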
  _memReserved = memFI + memBE + memUL + memUT + memEP + memEO + memST + memOS;
  _memStore    = memST;
  _memAvail    = (_memReserved + _memStore < _memLimit) ? (_memLimit - _memReserved - _memStore) : 0;
  _memOlaps    = 0;

  writeStatus("OverlapCache()-- %7" F_U64P "MB for read data.\n",                      memFI >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for best edges.\n",                     memBE >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for tigs.\n",                           memUT >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for tigs - read layouts.\n",            memUL >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for tigs - error profiles.\n",          memEP >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for tigs - error profile overlaps.\n",  memEO >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for other processes.\n",                memOS >> 20);
  writeStatus("OverlapCache()-- ---------\n");
  writeStatus("OverlapCache()-- %7" F_U64P "MB for data structures (sum of above).\n", _memReserved >> 20);
  writeStatus("OverlapCache()-- ---------\n");
  writeStatus("OverlapCache()-- %7" F_U64P "MB for overlap store structure.\n",        _memStore >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for overlap data.\n",                   _memAvail >> 20);
  writeStatus("OverlapCache()-- ---------\n");
  writeStatus("OverlapCache()-- %7" F_U64P "MB allowed.\n",                            _memLimit >> 20);
  writeStatus("OverlapCache()--\n");

  if (_memAvail == 0) {
    writeStatus("OverlapCache()-- Out of memory before loading overlaps; increase -M.\n");
    exit(1);
  }

  _maxEvalue     = AS_OVS_encodeEvalue(maxErate);
  _minOverlap    = minOverlap;

  //  Allocate space to load overlaps.  With a NULL seqStore we can't call the bgn or end methods.

  _ovsMax  = 0;
  _ovs     = NULL;
  _ovsSco  = NULL;
  _ovsTmp  = NULL;

  //  Allocate pointers to overlaps.

  _overlapLen = new uint32       [RI->numReads() + 1];
  _overlapMax = new uint32       [RI->numReads() + 1];
  _overlaps   = new BAToverlap * [RI->numReads() + 1];

  memset(_overlapLen, 0, sizeof(uint32)       * (RI->numReads() + 1));
  memset(_overlapMax, 0, sizeof(uint32)       * (RI->numReads() + 1));
  memset(_overlaps,   0, sizeof(BAToverlap *) * (RI->numReads() + 1));

  //  Open the overlap store.

  ovStore *ovlStore = new ovStore(ovlStorePath, NULL);

  //  Load overlaps!

  computeOverlapLimit(ovlStore, genomeSize);
  loadOverlaps(ovlStore);

  delete [] _ovs;       _ovs      = NULL;   //  There is a small cost with these arrays that we'd
  delete [] _ovsSco;    _ovsSco   = NULL;   //  like to not have, and a big cost with ovlStore (in that
  delete [] _ovsTmp;    _ovsTmp   = NULL;   //  it loaded updated erates into memory), so release
  delete     ovlStore;   ovlStore = NULL;   //  these before symmetrizing overlaps.

  if (symmetrize == true)
    symmetrizeOverlaps();

  delete [] _minSco;    _minSco   = NULL;
}


OverlapCache::~OverlapCache() {

  delete [] _overlaps;
  delete [] _overlapLen;
  delete [] _overlapMax;

  delete    _overlapStorage;
}



//  Decide on limits per read.
//
//  From the memory limit, we can compute the average allowed per read.  If this is higher than
//  the expected coverage, we'll not fill memory completely as the reads in unique sequence will
//  have fewer than this number of overlaps.
//
//  We'd like to iterate this, but the unused space computation assumes all reads are assigned
//  the same amount of memory.  On the next iteration, this isn't true any more.  The benefit is
//  (hopefully) small, and the algorithm is unknown.
//
//  This isn't perfect.  It estimates based on whatever is in the store, not only those overlaps
//  below the error threshold.  Result is that memory usage is far below what it should be.  Easy to
//  fix if we assume all reads have the same properties (same library, same length, same error
//  rate) but not so easy in reality.  We need big architecture changes to make it easy (grouping
//  reads by library, collecting statistics from the overlaps, etc).
//
//  It also doesn't distinguish between 5' and 3' overlaps - it is possible for all the long
//  overlaps to be off of one end.
//

void
OverlapCache::computeOverlapLimit(ovStore *ovlStore, uint64 genomeSize) {
  uint32  frstRead  = 0;
  uint32  lastRead  = 0;
  uint32 *numPer    = ovlStore->numOverlapsPerRead();

  //  Set the minimum number of overlaps per read to twice coverage.  Then set the maximum number of
  //  overlaps per read to a guess of what it will take to fill up memory.

  _minPer = 2 * RI->numBases() / genomeSize;
  _maxPer = _memAvail / (RI->numReads() * sizeof(BAToverlap));

  writeStatus("OverlapCache()-- Retain at least " F_U32 " overlaps/read, based on %.2fx coverage.\n", _minPer, (double)RI->numBases() / genomeSize);
  writeStatus("OverlapCache()-- Initial guess at " F_U32 " overlaps/read.\n", _maxPer);
  writeStatus("OverlapCache()--\n");

  uint64  totalOlaps = ovlStore->numOverlapsInRange();

  assert(totalOlaps > 0);

  uint64  olapLoad   = 0;  //  Total overlaps we would load at this threshold
  uint64  olapMem    = 0;

  uint32  numBelow   = 0;  //  Number of reads below the threshold
  uint32  numEqual   = 0;
  uint32  numAbove   = 0;  //  Number of reads above the threshold

  writeStatus("OverlapCache()-- Adjusting for sparse overlaps.\n");
  writeStatus("OverlapCache()--\n");
  writeStatus("OverlapCache()--               reads loading olaps          olaps               memory\n");
  writeStatus("OverlapCache()--   olaps/read       all      some          loaded                 free\n");
  writeStatus("OverlapCache()--   ----------   -------   -------     ----------- -------     --------\n");

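  //  Iterate: count how many overlaps would be loaded with the current cap of
  //  _maxPer overlaps per read.  If that would blow the memory budget, lower
  //  the cap and recount; otherwise spread the leftover space across the reads
  //  that are still truncated and raise the cap, until nothing changes.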
  while (true) {
    olapLoad = 0;
    numBelow = 0;
    numEqual = 0;
    numAbove = 0;

    for (uint32 i=1; i<=RI->numReads(); i++) {
      if (numPer[i] < _maxPer) {
        numBelow += 1;
        olapLoad += numPer[i];

      } else if (numPer[i] == _maxPer) {
        numEqual += 1;
        olapLoad += _maxPer;

      } else {
        numAbove += 1;
        olapLoad += _maxPer;
      }
    }

    olapMem = olapLoad * sizeof(BAToverlap);

    //  If we're too high, decrease the threshold and compute again.  We shouldn't ever be too high.

    if (_memAvail < olapMem) {
      _maxPer--;
      continue;
    }

    //  Log what we will be loading.

    writeStatus("OverlapCache()--      %7" F_U32P "   %7" F_U32P "   %7" F_U32P "    %12" F_U64P " %6.2f%%    %7" F_U64P " MB\n",
                _maxPer,
                numBelow + numEqual,
                numAbove,
                olapLoad,
                100.0 * olapLoad / totalOlaps,
                (_memAvail - olapMem) >> 20);

    //  If there are no more overlaps to load, we're done.

    if (numAbove == 0)
      break;

    //  Otherwise, there is still (potentially) space left for more overlaps.  Estimate how much
    //  higher we could push the threshold: compute how many more overlaps we could load before
    //  exceeding the memory limit, then assume we'd load that many overlaps for each of the
    //  numAbove reads.

    int64  olapFree  = (_memAvail - olapMem) / sizeof(BAToverlap);
    int64  increase  = olapFree / numAbove;

    if (increase == 0)
      break;

    _maxPer += increase;
  }

  if (_maxPer < _minPer)
    writeStatus("OverlapCache()-- Not enough memory to load the minimum number of overlaps; increase -M.\n"), exit(1);

  delete [] numPer;
}

//  Return true if o1 is worse than o2: prefer lower error rate, then longer
//  overlap, then normal (unflipped) orientation.
bool OverlapCache::compareOverlaps(const ovOverlap &o1, const ovOverlap &o2) const {
   assert(o1.a_iid == o2.a_iid);
   assert(o1.b_iid == o2.b_iid);
   auto as_tuple = [&](const ovOverlap &o) {
      return std::make_tuple(1 - o.erate(), RI->overlapLength(o.a_iid, o.b_iid, o.a_hang(), o.b_hang()), !o.flipped());
   };
   return as_tuple(o2) > as_tuple(o1);
}


//  Return true if o1 is worse than o2, using the same ranking as above.
bool OverlapCache::compareOverlaps(const BAToverlap &o1, const BAToverlap &o2) const {
   assert(o1.a_iid == o2.a_iid);
   auto as_tuple = [&](const BAToverlap &o) {
      return std::make_tuple(1 - o.erate(), RI->overlapLength(o.a_iid, o.b_iid, o.a_hang, o.b_hang), !o.flipped);
   };
   return as_tuple(o2) > as_tuple(o1);
}

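//  Remove duplicate overlaps between the same pair of reads.  Overlaps loaded
//  from the store are expected to be sorted by b_iid, so duplicates sit next
//  to each other; the weaker of each pair is marked (by zeroing its read IDs)
//  and then squeezed out, preserving the sorted order.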
uint32
OverlapCache::filterDuplicates(uint32 &no) {
  uint32   nFiltered = 0;

  for (uint32 ii=0, jj=1; jj<no; ii++, jj++) {
    if (_ovs[ii].b_iid != _ovs[jj].b_iid)
      continue;

    //  Found duplicate B IDs.  Drop one of them.

    nFiltered++;

    //  Drop the weaker overlap.  If a tie, drop the flipped one.

    uint32 to_drop = compareOverlaps(_ovs[ii], _ovs[jj]) ? ii : jj;
    uint32 to_save = (to_drop == ii ? jj : ii);
#if 0
    writeLog("OverlapCache::filterDuplicates()-- Dropping overlap A: %9" F_U32P " B: %9" F_U32P " - %6.4f%% - %6" F_S32P " %6" F_S32P " - %s\n",
             _ovs[to_drop].a_iid, _ovs[to_drop].b_iid, 100.0 * _ovs[to_drop].erate(), _ovs[to_drop].a_hang(), _ovs[to_drop].b_hang(), _ovs[to_drop].flipped() ? "flipped" : "");
    writeLog("OverlapCache::filterDuplicates()-- Saving   overlap A: %9" F_U32P " B: %9" F_U32P " - %6.4f%% - %6" F_S32P " %6" F_S32P " - %s\n",
             _ovs[to_save].a_iid, _ovs[to_save].b_iid, 100.0 * _ovs[to_save].erate(), _ovs[to_save].a_hang(), _ovs[to_save].b_hang(), _ovs[to_save].flipped() ? "flipped" : "");
#endif

    _ovs[to_drop].a_iid = 0;
    _ovs[to_drop].b_iid = 0;
  }

  //  If nothing was filtered, return.

  if (nFiltered == 0)
    return(0);

  //  Squeeze out the filtered overlaps.  Preserve order so we can binary search later.

  for (uint32 ii=0, jj=0; jj<no; ) {
    if (_ovs[jj].a_iid == 0) {
      jj++;
      continue;
    }

    if (ii != jj)
      _ovs[ii] = _ovs[jj];

    ii++;
    jj++;
  }

  no -= nFiltered;

  //  Check for errors.

  bool  errors = false;

  for (uint32 jj=0; jj<no; jj++)
    if ((_ovs[jj].a_iid == 0) || (_ovs[jj].b_iid == 0))
      errors = true;

  if (errors == false)
    return(nFiltered);

  writeLog("ERROR: filtered overlap found in saved list for read %u.  Filtered %u overlaps.\n", _ovs[0].a_iid, nFiltered);

  for (uint32 jj=0; jj<no + nFiltered; jj++)
    writeLog("OVERLAP  %8d %8d  hangs %5d %5d  erate %.4f\n",
             _ovs[jj].a_iid, _ovs[jj].b_iid, _ovs[jj].a_hang(), _ovs[jj].b_hang(), _ovs[jj].erate());

  flushLog();

  assert(errors == false);
  return(0);
}


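//  Pack an overlap into a single 64-bit score that sorts better overlaps
//  higher; most significant bits first:
//
//    [ overlap length | ~evalue (AS_MAX_EVALUE_BITS bits) | salt (SALT_BITS bits) ]
//
//  Longer overlaps win, then lower evalue (complemented so low error sorts
//  high), and the salt - the overlap's index in the load buffer - breaks ties
//  so equal overlaps still get distinct scores.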
inline
uint64
ovlSco(uint64 olen, uint64 evalue, uint64 ii) {
  uint64  os;

  os   = olen;
  os <<= AS_MAX_EVALUE_BITS;
  os  |= (~evalue) & ERR_MASK;
  os <<= SALT_BITS;
  os  |= ii & SALT_MASK;

  return(os);
}

inline
uint64
ovlScoToLength(uint64 score) {
  return(score >> (AS_MAX_EVALUE_BITS + SALT_BITS));
}


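//  Score the overlaps for read 'aid' and decide which ones to keep.  Overlaps
//  involving deleted reads, overlaps above the error threshold, and overlaps
//  shorter than the minimum length are discarded outright.  If more than
//  _maxPer overlaps remain, the scores are sorted and _minSco[aid] is set to
//  the score of the _maxPer-th best; anything below that is flagged as junk.
//  Returns the number of overlaps that survive.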
uint32
OverlapCache::filterOverlaps(uint32 aid, uint32 maxEvalue, uint32 minOverlap, uint32 no) {
  uint32 ns        = 0;
  bool   beVerbose = false;

  //beVerbose = (_ovs[0].a_iid == 3514657);

  for (uint32 ii=0; ii<no; ii++) {
    _ovsSco[ii] = 0;                                //  Overlaps 'continue'd below will be filtered, even if 'no filtering' is needed.
    _ovsTmp[ii] = 0;

    if ((RI->readLength(_ovs[ii].a_iid) == 0) ||    //  At least one read in the overlap is deleted
        (RI->readLength(_ovs[ii].b_iid) == 0)) {
      if (beVerbose)
        writeLog("olap %d involves deleted reads - %u %s - %u %s\n",
                ii,
                _ovs[ii].a_iid, (RI->readLength(_ovs[ii].a_iid) == 0) ? "deleted" : "active",
                _ovs[ii].b_iid, (RI->readLength(_ovs[ii].b_iid) == 0) ? "deleted" : "active");
      continue;
    }

    if (_ovs[ii].evalue() > maxEvalue) {            //  Too noisy to care
      if (beVerbose)
        writeLog("olap %d too noisy evalue %f > maxEvalue %f\n",
                ii, AS_OVS_decodeEvalue(_ovs[ii].evalue()), AS_OVS_decodeEvalue(maxEvalue));
      continue;
    }

    uint32  olen = RI->overlapLength(_ovs[ii].a_iid, _ovs[ii].b_iid, _ovs[ii].a_hang(), _ovs[ii].b_hang());

    //  If too short, drop it.

    if (olen < minOverlap) {
      if (beVerbose)
        writeLog("olap %d too short olen %u minOverlap %u\n",
                ii, olen, minOverlap);
      continue;
    }

    //  Just right!

    _ovsTmp[ii] = _ovsSco[ii] = ovlSco(olen, _ovs[ii].evalue(), ii);

    ns++;
  }

  _minSco[aid] = 0;                          //  Minimum score of overlap we'll keep for this read.

  if (ns <= _maxPer)                         //  Fewer overlaps than the limit, no filtering needed.
    return(ns);

  std::sort(_ovsTmp, _ovsTmp + no);          //  Sort the scores so we can pick a minScore that
  _minSco[aid] = _ovsTmp[no - _maxPer];      //  results in the correct number of overlaps.

  ns = 0;

  for (uint32 ii=0; ii<no; ii++)
    if (_ovsSco[ii] < _minSco[aid])          //  Score too low, flag it as junk.
      _ovsSco[ii] = 0;                       //  We could also do this when copying overlaps to
    else                                     //  storage, except we need to know how many overlaps
      ns++;                                  //  to copy so we can allocate storage.

  assert(ns <= _maxPer);

  return(ns);
}



void
OverlapCache::loadOverlaps(ovStore *ovlStore) {

  writeStatus("OverlapCache()--\n");
  writeStatus("OverlapCache()-- Loading overlaps.\n");
  writeStatus("OverlapCache()--\n");
  writeStatus("OverlapCache()--          read from store           saved in cache\n");
  writeStatus("OverlapCache()--   ------------ ---------   ------------ ---------\n");

  uint64   numTotal     = 0;
  uint64   numLoaded    = 0;
  uint64   numDups      = 0;
  uint32   numReads     = 0;
  uint64   numStore     = ovlStore->numOverlapsInRange();

  assert(numStore > 0);

  _overlapStorage = new OverlapStorage(ovlStore->numOverlapsInRange());

  //  Scan the overlaps, finding the maximum number of overlaps for a single read.  This lets
  //  us pre-allocate space and simplifies the loading process.

  assert(_ovsMax == 0);
  assert(_ovs    == NULL);

  _ovsMax = 0;

  for (uint32 rr=0; rr<RI->numReads()+1; rr++)
    _ovsMax = std::max(_ovsMax, ovlStore->numOverlaps(rr));

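  //  _minSco[rr] records the lowest score kept for read rr; symmetrizeOverlaps()
  //  consults it to decide whether a missing twin was simply filtered as weak
  //  on the other read.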
  _minSco  = new uint64    [RI->numReads()+1];

  _ovs     = new ovOverlap [_ovsMax];
  _ovsSco  = new uint64    [_ovsMax];
  _ovsTmp  = new uint64    [_ovsMax];

  for (uint32 rr=0; rr<RI->numReads()+1; rr++) {

    //  Actually load the overlaps, then detect and remove overlaps between
    //  the same pair, then filter short and low quality overlaps.

    uint32  no = ovlStore->loadOverlapsForRead(rr, _ovs, _ovsMax);   //  no == total overlaps == numOvl
    uint32  nd = filterDuplicates(no);                               //  nd == duplicated overlaps (no is decreased by this amount)
    uint32  ns = filterOverlaps(rr, _maxEvalue, _minOverlap, no);    //  ns == acceptable overlaps

    //  If we still have overlaps, get official space to store them and copy
    //  to storage,

    if (ns > 0) {
      uint32  id = _ovs[0].a_iid;

      _overlapMax[id] = ns;
      _overlapLen[id] = ns;
      _overlaps[id]   = _overlapStorage->get(ns);                //  Get space for overlaps.

      _memOlaps += _overlapMax[id] * sizeof(BAToverlap);

      uint32  oo=0;

      for (uint32 ii=0; ii<no; ii++) {
        if (_ovsSco[ii] == 0)                                    //  Skip if it was filtered.
          continue;

        _overlaps[id][oo].evalue    = _ovs[ii].evalue();         //  Or copy to our storage.
        _overlaps[id][oo].a_hang    = _ovs[ii].a_hang();
        _overlaps[id][oo].b_hang    = _ovs[ii].b_hang();
        _overlaps[id][oo].flipped   = _ovs[ii].flipped();
        _overlaps[id][oo].filtered  = false;
        _overlaps[id][oo].symmetric = false;
        _overlaps[id][oo].a_iid     = _ovs[ii].a_iid;
        _overlaps[id][oo].b_iid     = _ovs[ii].b_iid;

        assert(_overlaps[id][oo].a_iid != 0);   //  Guard against some kind of weird error that
        assert(_overlaps[id][oo].b_iid != 0);   //  I can no longer remember.

        oo++;
      }

      assert(oo == _overlapLen[id]);    //  Ensure we got all the overlaps we were supposed to get.
    }

    //  Keep track of what we loaded and didn't.

    numTotal  += no + nd;   //  Because no was decremented by nd in filterDuplicates()
    numLoaded += ns;
    numDups   += nd;

    if ((numReads++ % 100000) == 99999)
      writeStatus("OverlapCache()--   %12" F_U64P " (%06.2f%%)   %12" F_U64P " (%06.2f%%)\n",
                  numTotal,  100.0 * numTotal  / numStore,
                  numLoaded, 100.0 * numLoaded / numStore);
  }

  writeStatus("OverlapCache()--   ------------ ---------   ------------ ---------\n");
  writeStatus("OverlapCache()--   %12" F_U64P " (%06.2f%%)   %12" F_U64P " (%06.2f%%)\n",
              numTotal,  100.0 * numTotal  / numStore,
              numLoaded, 100.0 * numLoaded / numStore);

  writeStatus("OverlapCache()--\n");
  writeStatus("OverlapCache()-- Ignored %lu duplicate overlaps.\n", numDups);
}



//  Binary search a list of overlaps, sorted by (b_iid, flipped), for the one
//  matching bID and flipped.  Returns UINT32_MAX if no such overlap exists.
uint32
searchForOverlap(BAToverlap *ovl, uint32 ovlLen, uint32 bID, bool flipped) {
  int32  F = 0;
  int32  L = ovlLen - 1;
  int32  M = 0;

#ifdef TEST_LINEAR_SEARCH
  bool linearSearchFound = false;

  for (uint32 ss=0; ss<ovlLen; ss++)
    if ((ovl[ss].b_iid   == bID) &&
        (ovl[ss].flipped == flipped)) {
      linearSearchFound = true;
      break;
    }
#endif

  while (F <= L) {
    M = (F + L) / 2;

    if ((ovl[M].b_iid   == bID) &&
        (ovl[M].flipped == flipped)) {
#ifdef TEST_LINEAR_SEARCH
      assert(linearSearchFound == true);
#endif
      return(M);
    }

    if (((ovl[M].b_iid  < bID)) ||
        ((ovl[M].b_iid == bID) && (ovl[M].flipped < flipped)))
      F = M+1;
    else
      L = M-1;
  }

#ifdef TEST_LINEAR_SEARCH
  assert(linearSearchFound == false);
#endif

  return(UINT32_MAX);
}




void
OverlapCache::symmetrizeOverlaps(void) {
  uint32  fiLimit    = RI->numReads() + 1;
  uint32  numThreads = getNumThreads();
  uint32  blockSize  = (fiLimit < 1000 * numThreads) ? numThreads : fiLimit / 999;

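  //  Per-read tallies: twins that disagreed on error rate, overlaps dropped
  //  because the other read considered them too weak to keep, and twins that
  //  are missing and must be created on the other read.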
  uint32  *nNonSymPerRead = new uint32 [fiLimit];
  uint32  *nFiltPerRead   = new uint32 [fiLimit];
  uint32  *nMissPerRead   = new uint32 [fiLimit];

  for (uint32 rr=0; rr < fiLimit; rr++) {
    nNonSymPerRead[rr] = 0;
    nFiltPerRead[rr]   = 0;
    nMissPerRead[rr]   = 0;
  }

  writeStatus("OverlapCache()--\n");
  writeStatus("OverlapCache()-- Symmetrizing overlaps.\n");
  writeStatus("OverlapCache()--   Finding missing twins.\n");

  FILE *NSE = AS_UTL_openOutputFile(_prefix, '.', "non-symmetric-error-rates", false);   //  These turn out to be VERY big
  FILE *NTW = AS_UTL_openOutputFile(_prefix, '.', "non-symmetric-overlaps",    false);

  if (NSE) {
    fprintf(NSE, "     aID      bID  a error b error\n");
    fprintf(NSE, "-------- --------  ------- -------\n");
  }

  //  For each overlap, see if the twin overlap exists.  It is tempting to skip searching if the
  //  b-read has loaded all overlaps (the overlap we're searching for must exist) but we can't.
  //  We must still mark the overlap as being symmetric.

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 ra=0; ra < fiLimit; ra++) {
    for (uint32 oa=0; oa<_overlapLen[ra]; oa++) {
      BAToverlap  *ova = &_overlaps[ra][oa];
      uint32       rb  =  _overlaps[ra][oa].b_iid;

      //  If already marked, we're done.

      if (ova->symmetric == true)
        continue;

      //  Search for the twin overlap.

      uint32       ob  = searchForOverlap(_overlaps[rb], _overlapLen[rb], ra, ova->flipped);

      //  If the twin was found, mark both as symmetric and make sure the
      //  error rate is symmetric too.

      if (ob < UINT32_MAX) {
        BAToverlap  *ovb = &_overlaps[rb][ob];

        ova->symmetric = true;   //  I have a twin!
        ovb->symmetric = true;   //  My twin has a twin, me!

        if (ova->evalue != ovb->evalue) {
          if (NSE)
            fprintf(NSE, "%8u %8u  %7.3f %7.3f\n",
                     ra, rb,
                     ova->erate() * 100.0,
                     ovb->erate() * 100.0);

          uint64 ev = std::min(ova->evalue, ovb->evalue);

          ova->evalue = ev;
          ovb->evalue = ev;

          nNonSymPerRead[ra]++;
        }

        continue;
      }

      //  Otherwise, the twin was not found.

      //  If this overlap is weak according to the other read (that is, if
      //  the other read dropped it for whatever reason) flag that we don't
      //  want to keep it here either.

      uint32  olen = RI->overlapLength(ova->a_iid, ova->b_iid, ova->a_hang, ova->b_hang);
      uint64  osco = ovlSco(olen, ova->evalue, UINT64_MAX);

      if (osco < _minSco[rb]) {
        if (NTW)
          fprintf(NTW, "NO TWIN for %6u -> %6u - length %lu < min %lu - WEAK\n",
                  ra, ova->b_iid, ovlScoToLength(osco), ovlScoToLength(_minSco[rb]));

        nFiltPerRead[ra]++;
        ova->filtered = true;
        continue;
      }

      //  Not weak.  We want to duplicate this in the other read.

      if (NTW)
        fprintf(NTW, "NO TWIN for %6u -> %6u - length %lu >= min %lu\n",
                ra, ova->b_iid, ovlScoToLength(osco), ovlScoToLength(_minSco[rb]));

#pragma omp critical (nMissPerRead)
      nMissPerRead[rb]++;
    }
  }

  AS_UTL_closeFile(NTW);
  AS_UTL_closeFile(NSE);

  uint64   nOverlaps  = 0;
  uint64   nNonSymErr = 0;
  uint64   nWeak      = 0;
  uint64   nMissing   = 0;

  for (uint32 rr=0; rr < fiLimit; rr++) {
    nOverlaps  += _overlapLen[rr];
    nNonSymErr += nNonSymPerRead[rr];
    nWeak      += nFiltPerRead[rr];
    nMissing   += nMissPerRead[rr];
  }

  writeStatus("OverlapCache()--   In %llu overlaps:\n", nOverlaps);
  writeStatus("OverlapCache()--     Found %llu overlaps with non-symmetric error rates.\n", nNonSymErr);
  writeStatus("OverlapCache()--     Found %llu overlaps with missing twins.\n", nMissing);
  writeStatus("OverlapCache()--     Dropped %llu weak missing-twin overlaps.\n", nWeak);

  //
  //  Expand or shrink space for the overlaps.
  //

  //  Allocate new temporary pointers for each read.

  //  The new storage must start after the old storage.  And if it starts after the old storage ends,
  //  we can copy easier.  If not, we just grab some empty overlaps to make space.

  //  A complication occurs at the end of a single segment.  If there isn't enough space in the
  //  current segment for the overlaps, we skip ahead to the next segment without accounting for the
  //  overlaps we skip.  It's possible for the new size to fit into this unused space, which would
  //  then put the old overlaps physically after the new ones.
  //
  //  [ olaps1+unused   | olaps2+unused   | olaps3+unused    |        ] [ olaps4+unused   | ..... ]
  //  [ olaps1+new      | olaps2+new      | olaps3+new | olaps4+new | ] [ olaps5+new  | ....      ]
  //
  //  So, we need to compare not overlap counts, but raw positions in the OverlapStorage object.

  BAToverlap      **nPtr = new BAToverlap * [fiLimit];            //  Pointers to new overlap storage locations
  OverlapStorage   *oldS = new OverlapStorage(_overlapStorage);   //  Recreates the existing layout without allocating anything
  OverlapStorage   *newS = _overlapStorage->reset();              //  Resets pointers for the new layout, using existing space

  for (uint32 rr=1; rr < fiLimit; rr++)
    nPtr[rr] = nullptr;

  //  Compute new pointers for overlap data that begin after the old data.

  for (uint32 rr=1; rr < fiLimit; rr++) {
    uint32  nOvl = _overlapLen[rr] + nMissPerRead[rr] - nFiltPerRead[rr];

    assert(_overlapLen[rr] + nMissPerRead[rr] >= nFiltPerRead[rr]);

    oldS->get(_overlapMax[rr]);   //  Move old storage ahead to the start of the next read.
    newS->advance(oldS);          //  Advance newS so that it is at or after where oldS is.
    nPtr[rr] = newS->get(nOvl);   //  Grab space for the number of overlaps we'll end up with on this read.

    _overlapMax[rr] = nOvl;       //  Update the maximum number of overlaps on this read.
  }

  //  Copy overlap data from the old space to the new space, backwards.
  //  THIS CANNOT BE THREADED.
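  //
  //  Each read's new block begins at or after the end of its old block, that
  //  is, in space previously occupied by higher-numbered reads.  Copying from
  //  the highest read ID downward guarantees those reads have already been
  //  moved out before their old storage is overwritten.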

  writeStatus("OverlapCache()--   Shifting overlaps.\n");

  FILE *NTD = AS_UTL_openOutputFile(_prefix, '.', "non-symmetric-weak-dropped", false);

  for (uint32 rr=fiLimit; rr-- > 0; ) {
    if (_overlapLen[rr] == 0)   //  Skip the asserts!
      continue;

    assert(_overlaps[rr][0                ].a_iid == rr);  //  Ensure that we didn't overwrite an existing
    assert(_overlaps[rr][_overlapLen[rr]-1].a_iid == rr);  //  overlap with overlaps for the reads after it.

    uint32 oo = 0;  //  Position in original overlap list
    uint32 nn = 0;  //  Position in new overlap list

    for (; oo<_overlapLen[rr]; oo++)             //  Over all the original overlaps,
      if (_overlaps[rr][oo].filtered == false)   //  Copy to new storage if they're
        nPtr[rr][nn++] = _overlaps[rr][oo];      //  not filtered.
      else
        if (NTD)
          fprintf(NTD, "DROP overlap a %u b %u\n", _overlaps[rr][oo].a_iid, _overlaps[rr][oo].b_iid);

    assert(nn == _overlapLen[rr] - nFiltPerRead[rr]);

    _overlapLen[rr] = _overlapLen[rr] - nFiltPerRead[rr];
  }

  AS_UTL_closeFile(NTD);

  //  Swap pointers to the pointers and remove the old pointer storage.

  delete [] _overlaps;
  _overlaps = nPtr;

  delete  oldS;

  //  Copy non-twin overlaps to their twin.

  writeStatus("OverlapCache()--   Adding missing twins.\n");

  FILE *NTA = AS_UTL_openOutputFile(_prefix, '.', "non-symmetric-added", false);

  //  This has several concurrency issues.
  //  1)  loop test on overlapLen[ra] can change if we increment overlapLen[rb].
  //      we could access overlaps[ra][oo] before it is copied to in another thread
  //  2)  overlapLen[rb]++
  //  3)  nMissPerRead[rb]

  //#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 ra=0; ra < fiLimit; ra++) {
    for (uint32 oo=0; oo<_overlapLen[ra]; oo++) {
      if (_overlaps[ra][oo].symmetric == true)
        continue;

      uint32  rb = _overlaps[ra][oo].b_iid;
      uint32  nn = _overlapLen[rb]++;

      assert(nn < _overlapMax[rb]);

      if (NTA)
        fprintf(NTA, "add missing twin from read %u -> read %u at pos %u out of %u\n", ra, rb, nn, _overlapMax[rb] - 1);

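      //  Build the mirrored overlap: for a flipped overlap the hangs swap,
      //  for a normal overlap each hang negates.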
      _overlaps[rb][nn].evalue    =  _overlaps[ra][oo].evalue;
      _overlaps[rb][nn].a_hang    = (_overlaps[ra][oo].flipped) ? (_overlaps[ra][oo].b_hang) : (-_overlaps[ra][oo].a_hang);
      _overlaps[rb][nn].b_hang    = (_overlaps[ra][oo].flipped) ? (_overlaps[ra][oo].a_hang) : (-_overlaps[ra][oo].b_hang);
      _overlaps[rb][nn].flipped   =  _overlaps[ra][oo].flipped;

      _overlaps[rb][nn].filtered  =  _overlaps[ra][oo].filtered;
      _overlaps[rb][nn].symmetric =  _overlaps[ra][oo].symmetric = true;

      _overlaps[rb][nn].a_iid     =  _overlaps[ra][oo].b_iid;
      _overlaps[rb][nn].b_iid     =  _overlaps[ra][oo].a_iid;

      assert(ra == _overlaps[ra][oo].a_iid);
      assert(rb == _overlaps[ra][oo].b_iid);

      assert(nMissPerRead[rb] > 0);

      nMissPerRead[rb]--;
    }
  }

  AS_UTL_closeFile(NTA);

  //  Check that everything worked.

  for (uint32 rr=0; rr < fiLimit; rr++) {
    assert(nMissPerRead[rr] == 0);

    if (_overlapLen[rr] == 0)
      continue;

    assert(_overlapLen[rr] == _overlapMax[rr]);

    assert(_overlaps[rr][0                ].a_iid == rr);
    assert(_overlaps[rr][_overlapLen[rr]-1].a_iid == rr);
  }

  //  Cleanup.

  delete [] nNonSymPerRead;
  delete [] nFiltPerRead;
  delete [] nMissPerRead;

  //  Re-sort each read's overlaps by (b_iid, flipped).  The twins added above
  //  were appended out of order, and the symmetry check below relies on
  //  searchForOverlap()'s binary search over that ordering.

  writeStatus("OverlapCache()--   Sorting overlaps.\n");

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 rr=0; rr < fiLimit; rr++)
    if (_overlapLen[rr] > 0)
      std::sort(_overlaps[rr], _overlaps[rr] + _overlapLen[rr], [](BAToverlap const &a, BAToverlap const &b) {
                                                                  return(((a.b_iid == b.b_iid) && (a.flipped < b.flipped)) || (a.b_iid < b.b_iid)); } );

  //  Check that all overlaps are present.

  writeStatus("OverlapCache()--   Checking overlap symmetry.\n");

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 ra=0; ra < fiLimit; ra++) {
    for (uint32 oa=0; oa<_overlapLen[ra]; oa++) {
      BAToverlap  *ova = &_overlaps[ra][oa];
      uint32       rb  =  _overlaps[ra][oa].b_iid;

      uint32       ob  = searchForOverlap(_overlaps[rb], _overlapLen[rb], ra, ova->flipped);

      if (ob == UINT32_MAX) {
        for(uint32 ii=0; ii<_overlapLen[ra]; ii++)
          fprintf(stderr, "olapA %u -> %u flip %c%s\n", _overlaps[ra][ii].a_iid, _overlaps[ra][ii].b_iid, _overlaps[ra][ii].flipped ? 'Y' : 'N', (ii == oa) ? " **" : "");
        for(uint32 ii=0; ii<_overlapLen[rb]; ii++)
          fprintf(stderr, "olapB %u -> %u flip %c\n",   _overlaps[rb][ii].a_iid, _overlaps[rb][ii].b_iid, _overlaps[rb][ii].flipped ? 'Y' : 'N');
      }
      assert(ob != UINT32_MAX);
    }
  }

  writeStatus("OverlapCache()--   Finished.\n");
}