1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "AS_BAT_ReadInfo.H"
19 #include "AS_BAT_OverlapCache.H"
20 #include "AS_BAT_BestOverlapGraph.H" // sizeof(BestEdgeOverlap)
21 #include "AS_BAT_Unitig.H" // sizeof(ufNode)
22 #include "AS_BAT_Logging.H"
23
24 #include "system.H"
25 #include <tuple>
26
uint64  ovlCacheMagic = 0x65686361436c766fLLU;  //  Little-endian ASCII "ovlCache"; file magic number.
28
29
30 #undef TEST_LINEAR_SEARCH
31
32
33 #define ERR_MASK (((uint64)1 << AS_MAX_EVALUE_BITS) - 1)
34
35 #define SALT_BITS (64 - AS_MAX_READLEN_BITS - AS_MAX_EVALUE_BITS)
36 #define SALT_MASK (((uint64)1 << SALT_BITS) - 1)
37
38
//  Build the in-memory overlap cache.
//
//  Decides how much memory is available, budgets it between the various data
//  structures, then loads overlaps from 'ovlStorePath' keeping only those
//  below 'maxErate' and at least 'minOverlap' bases long.  If 'symmetrize'
//  is set, ensures that every overlap A->B also exists as B->A.
//
//    ovlStorePath - path to the overlap store to load
//    prefix       - output prefix for diagnostic files
//    maxErate     - maximum error rate of overlaps to keep
//    minOverlap   - minimum length of overlaps to keep
//    memlimit     - memory limit in bytes; UINT64_MAX = use physical memory,
//                   0 = unlimited
//    genomeSize   - used to estimate coverage, hence min overlaps per read
//    symmetrize   - if true, add missing twin overlaps after loading
//
OverlapCache::OverlapCache(const char *ovlStorePath,
                           const char *prefix,
                           double      maxErate,
                           uint32      minOverlap,
                           uint64      memlimit,
                           uint64      genomeSize,
                           bool        symmetrize) {

  _prefix = prefix;

  writeStatus("\n");

  //  Pick a memory limit: total physical memory (default), a user-supplied
  //  value, or unlimited (-M 0).

  if (memlimit == UINT64_MAX) {
    _memLimit = getPhysicalMemorySize();
    writeStatus("OverlapCache()-- limited to " F_U64 "MB memory (total physical memory).\n", _memLimit >> 20);
    writeStatus("\n");
  }

  else if (memlimit > 0) {
    _memLimit = memlimit;
    writeStatus("OverlapCache()-- limited to " F_U64 "MB memory (user supplied).\n", _memLimit >> 20);
    writeStatus("\n");
  }

  else {
    _memLimit = UINT64_MAX;
    writeStatus("OverlapCache()-- using unlimited memory (-M 0).\n");
    writeStatus("\n");
  }

  //  Account for memory used by read data, best overlaps, and tigs.
  //  The chunk graph is temporary, and should be less than the size of the tigs.
  //  Likewise, the buffers used for loading and scoring overlaps aren't accounted for.
  //
  //  NOTES:
  //
  //  memFI - read length,
  //
  //  memUT - worst case, we have one unitig per read.  also, maps of read-to-unitig and read-to-vector-position.
  //
  //  memEP - each read adds two epValue points, the open and close points, and two uint32 pointers
  //  to the data.
  //
  //  memEO - overlaps for computing error profiles.  this is definitely a hack, but I can't think of
  //  any reasonable estimates.  just reserve 25% of memory, which then dominates our accounting.
  //
  //  memOS - make sure we're this much below using all the memory - allows for other stuff to run,
  //  and a little buffer in case we're too big.

  uint64 memFI = RI->memoryUsage();
  uint64 memBE = RI->numReads() * sizeof(BestEdgeOverlap) * 2;
  uint64 memUT = RI->numReads() * sizeof(Unitig) + RI->numReads() * sizeof(uint32) * 2;
  uint64 memUL = RI->numReads() * sizeof(ufNode);

  uint64 memEP = RI->numReads() * sizeof(uint32) * 2 + RI->numReads() * Unitig::epValueSize() * 2;
  uint64 memEO = (_memLimit == UINT64_MAX) ? (0.0) : (0.25 * _memLimit);

  uint64 memOS = (_memLimit < 0.9 * getPhysicalMemorySize()) ? (0.0) : (0.1 * getPhysicalMemorySize());

  uint64 memST = ((RI->numReads() + 1) * (sizeof(BAToverlap *) + sizeof(uint32)) +   //  Cache pointers
                  (RI->numReads() + 1) * sizeof(uint32) +                            //  Num olaps stored per read
                  (RI->numReads() + 1) * sizeof(uint32));                            //  Num olaps allocated per read


  _memReserved = memFI + memBE + memUL + memUT + memEP + memEO + memST + memOS;
  _memStore    = memST;
  _memAvail    = (_memReserved + _memStore < _memLimit) ? (_memLimit - _memReserved - _memStore) : 0;
  _memOlaps    = 0;

  writeStatus("OverlapCache()-- %7" F_U64P "MB for read data.\n",                      memFI >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for best edges.\n",                     memBE >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for tigs.\n",                           memUT >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for tigs - read layouts.\n",            memUL >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for tigs - error profiles.\n",          memEP >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for tigs - error profile overlaps.\n",  memEO >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for other processes.\n",                memOS >> 20);
  writeStatus("OverlapCache()-- ---------\n");
  writeStatus("OverlapCache()-- %7" F_U64P "MB for data structures (sum of above).\n", _memReserved >> 20);
  writeStatus("OverlapCache()-- ---------\n");
  writeStatus("OverlapCache()-- %7" F_U64P "MB for overlap store structure.\n",        _memStore >> 20);
  writeStatus("OverlapCache()-- %7" F_U64P "MB for overlap data.\n",                   _memAvail >> 20);
  writeStatus("OverlapCache()-- ---------\n");
  writeStatus("OverlapCache()-- %7" F_U64P "MB allowed.\n",                            _memLimit >> 20);
  writeStatus("OverlapCache()--\n");

  if (_memAvail == 0) {
    writeStatus("OverlapCache()-- Out of memory before loading overlaps; increase -M.\n");
    exit(1);
  }

  _maxEvalue  = AS_OVS_encodeEvalue(maxErate);
  _minOverlap = minOverlap;

  //  Allocate space to load overlaps.  With a NULL seqStore we can't call the bgn or end methods.

  _ovsMax = 0;
  _ovs    = NULL;
  _ovsSco = NULL;
  _ovsTmp = NULL;

  //  Allocate pointers to overlaps.

  _overlapLen = new uint32       [RI->numReads() + 1];
  _overlapMax = new uint32       [RI->numReads() + 1];
  _overlaps   = new BAToverlap * [RI->numReads() + 1];

  memset(_overlapLen, 0, sizeof(uint32)       * (RI->numReads() + 1));
  memset(_overlapMax, 0, sizeof(uint32)       * (RI->numReads() + 1));
  memset(_overlaps,   0, sizeof(BAToverlap *) * (RI->numReads() + 1));

  //  Open the overlap store.

  ovStore *ovlStore = new ovStore(ovlStorePath, NULL);

  //  Load overlaps!

  computeOverlapLimit(ovlStore, genomeSize);
  loadOverlaps(ovlStore);

  delete [] _ovs;      _ovs    = NULL;   //  There is a small cost with these arrays that we'd
  delete [] _ovsSco;   _ovsSco = NULL;   //  like to not have, and a big cost with ovlStore (in that
  delete [] _ovsTmp;   _ovsTmp = NULL;   //  it loaded updated erates into memory), so release
  delete    ovlStore;  ovlStore = NULL;  //  these before symmetrizing overlaps.

  if (symmetrize == true)
    symmetrizeOverlaps();

  delete [] _minSco;   _minSco = NULL;   //  Only needed while loading/symmetrizing.
}
168
169
~OverlapCache()170 OverlapCache::~OverlapCache() {
171
172 delete [] _overlaps;
173 delete [] _overlapLen;
174 delete [] _overlapMax;
175
176 delete _overlapStorage;
177 }
178
179
180
181 // Decide on limits per read.
182 //
183 // From the memory limit, we can compute the average allowed per read. If this is higher than
184 // the expected coverage, we'll not fill memory completely as the reads in unique sequence will
185 // have fewer than this number of overlaps.
186 //
187 // We'd like to iterate this, but the unused space computation assumes all reads are assigned
188 // the same amount of memory. On the next iteration, this isn't true any more. The benefit is
189 // (hopefully) small, and the algorithm is unknown.
190 //
191 // This isn't perfect. It estimates based on whatever is in the store, not only those overlaps
192 // below the error threshold. Result is that memory usage is far below what it should be. Easy to
193 // fix if we assume all reads have the same properties (same library, same length, same error
194 // rate) but not so easy in reality. We need big architecture changes to make it easy (grouping
195 // reads by library, collecting statistics from the overlaps, etc).
196 //
197 // It also doesn't distinguish between 5' and 3' overlaps - it is possible for all the long
198 // overlaps to be off of one end.
199 //
200
201 void
computeOverlapLimit(ovStore * ovlStore,uint64 genomeSize)202 OverlapCache::computeOverlapLimit(ovStore *ovlStore, uint64 genomeSize) {
203 uint32 frstRead = 0;
204 uint32 lastRead = 0;
205 uint32 *numPer = ovlStore->numOverlapsPerRead();
206
207 // Set the minimum number of overlaps per read to twice coverage. Then set the maximum number of
208 // overlaps per read to a guess of what it will take to fill up memory.
209
210 _minPer = 2 * RI->numBases() / genomeSize;
211 _maxPer = _memAvail / (RI->numReads() * sizeof(BAToverlap));
212
213 writeStatus("OverlapCache()-- Retain at least " F_U32 " overlaps/read, based on %.2fx coverage.\n", _minPer, (double)RI->numBases() / genomeSize);
214 writeStatus("OverlapCache()-- Initial guess at " F_U32 " overlaps/read.\n", _maxPer);
215 writeStatus("OverlapCache()--\n");
216
217 uint64 totalOlaps = ovlStore->numOverlapsInRange();
218
219 assert(totalOlaps > 0);
220
221 uint64 olapLoad = 0; // Total overlaps we would load at this threshold
222 uint64 olapMem = 0;
223
224 uint32 numBelow = 0; // Number of reads below the threshold
225 uint32 numEqual = 0;
226 uint32 numAbove = 0; // Number of reads above the threshold
227
228 writeStatus("OverlapCache()-- Adjusting for sparse overlaps.\n");
229 writeStatus("OverlapCache()--\n");
230 writeStatus("OverlapCache()-- reads loading olaps olaps memory\n");
231 writeStatus("OverlapCache()-- olaps/read all some loaded free\n");
232 writeStatus("OverlapCache()-- ---------- ------- ------- ----------- ------- --------\n");
233
234 while (true) {
235 olapLoad = 0;
236 numBelow = 0;
237 numEqual = 0;
238 numAbove = 0;
239
240 for (uint32 i=1; i<=RI->numReads(); i++) {
241 if (numPer[i] < _maxPer) {
242 numBelow += 1;
243 olapLoad += numPer[i];
244
245 } else if (numPer[i] == _maxPer) {
246 numEqual += 1;
247 olapLoad += _maxPer;
248
249 } else {
250 numAbove += 1;
251 olapLoad += _maxPer;
252 }
253 }
254
255 olapMem = olapLoad * sizeof(BAToverlap);
256
257 // If we're too high, decrease the threshold and compute again. We shouldn't ever be too high.
258
259 if (_memAvail < olapMem) {
260 _maxPer--;
261 continue;
262 }
263
264 // Log what we will be loading.
265
266 writeStatus("OverlapCache()-- %7" F_U32P " %7" F_U32P " %7" F_U32P " %12" F_U32P " %6.2f%% %7" F_U32P " MB\n",
267 _maxPer,
268 numBelow + numEqual,
269 numAbove,
270 olapLoad,
271 100.0 * olapLoad / totalOlaps,
272 (_memAvail - olapMem) >> 20);
273
274 // If there are no more overlaps to load, we're done.
275
276 if (numAbove == 0)
277 break;
278
279 // Otherwise, there is still (potentially) space left for more overlaps. Estimate how much
280 // higher we could push the threshold: compute how many more overlaps we could load before
281 // exceeding the memory limit, then assume we'd load that many overlaps for each of the
282 // numAbove reads.
283
284 int64 olapFree = (_memAvail - olapMem) / sizeof(BAToverlap);
285 int64 increase = olapFree / numAbove;
286
287 if (increase == 0)
288 break;
289
290 _maxPer += increase;
291 }
292
293 if (_maxPer < _minPer)
294 writeStatus("OverlapCache()-- Not enough memory to load the minimum number of overlaps; increase -M.\n"), exit(1);
295
296 delete [] numPer;
297 }
298
299 //return true if o1 is worse than o2
compareOverlaps(const ovOverlap & o1,const ovOverlap & o2)300 bool OverlapCache::compareOverlaps(const ovOverlap &o1, const ovOverlap &o2) const {
301 assert(o1.a_iid == o2.a_iid);
302 assert(o1.b_iid == o2.b_iid);
303 auto as_tuple = [&](const ovOverlap &o) {
304 return std::make_tuple(1 - o.erate(), RI->overlapLength(o.a_iid, o.b_iid, o.a_hang(), o.b_hang()), !o.flipped());
305 };
306 return as_tuple(o2) > as_tuple(o1);
307 }
308
309
310 //return true if o1 is worse than o2
compareOverlaps(const BAToverlap & o1,const BAToverlap & o2)311 bool OverlapCache::compareOverlaps(const BAToverlap &o1, const BAToverlap &o2) const {
312 assert(o1.a_iid == o2.a_iid);
313 auto as_tuple = [&](const BAToverlap &o) {
314 return std::make_tuple(1 - o.erate(), RI->overlapLength(o.a_iid, o.b_iid, o.a_hang, o.b_hang), !o.flipped);
315 };
316 return as_tuple(o2) > as_tuple(o1);
317 }
318
//  Remove duplicate overlaps - multiple overlaps between the same pair of
//  reads - from the _ovs[] load buffer.  Expects _ovs[] to be sorted by
//  b_iid so duplicates are adjacent.  On return, 'no' is decreased by the
//  number of overlaps removed, and that count is also returned.
uint32
OverlapCache::filterDuplicates(uint32 &no) {
  uint32  nFiltered = 0;

  //  Scan adjacent pairs; duplicates share the same B read ID.

  for (uint32 ii=0, jj=1; jj<no; ii++, jj++) {
    if (_ovs[ii].b_iid != _ovs[jj].b_iid)
      continue;

    //  Found duplicate B IDs.  Drop one of them.

    nFiltered++;

    //  Drop the weaker overlap.  If a tie, drop the flipped one.

    uint32 to_drop = compareOverlaps(_ovs[ii], _ovs[jj]) ? ii : jj;
    uint32 to_save = (to_drop == ii ? jj : ii);

    //  NOTE(review): the disabled logging below references 'to_drops' and
    //  'to_saves', which are not defined; it will not compile if enabled.
#if 0
    writeLog("OverlapCache::filterDuplicates()-- Dropping overlap A: %9" F_U64P " B: %9" F_U64P " - score %8.2f - %6.4f%% - %6" F_S32P " %6" F_S32P " - %s\n",
             _ovs[to_drop].a_iid, _ovs[to_drop].b_iid, to_drops, _ovs[to_drop].erate(), _ovs[to_drop].a_hang(), _ovs[to_drop].b_hang(), _ovs[to_drop].flipped() ? "flipped" : "");
    writeLog("OverlapCache::filterDuplicates()-- Saving overlap A: %9" F_U64P " B: %9" F_U64P " - score %8.2f - %6.4f%% - %6" F_S32P " %6" F_S32P " - %s\n",
             _ovs[to_save].a_iid, _ovs[to_save].b_iid, to_saves, _ovs[to_save].erate(), _ovs[to_save].a_hang(), _ovs[to_save].b_hang(), _ovs[to_save].flipped() ? "flipped" : "");
#endif

    //  Mark the loser by zeroing its read IDs; squeezed out below.

    _ovs[to_drop].a_iid = 0;
    _ovs[to_drop].b_iid = 0;
  }

  //  If nothing was filtered, return.

  if (nFiltered == 0)
    return(0);

  //  Squeeze out the filtered overlaps.  Preserve order so we can binary search later.

  for (uint32 ii=0, jj=0; jj<no; ) {
    if (_ovs[jj].a_iid == 0) {
      jj++;
      continue;
    }

    if (ii != jj)
      _ovs[ii] = _ovs[jj];

    ii++;
    jj++;
  }

  no -= nFiltered;

  //  Check for errors: no surviving overlap should have a zeroed read ID.

  bool  errors = false;

  for (uint32 jj=0; jj<no; jj++)
    if ((_ovs[jj].a_iid == 0) || (_ovs[jj].b_iid == 0))
      errors = true;

  if (errors == false)
    return(nFiltered);

  //  Dump the full (pre-squeeze length) list for debugging, then abort.

  writeLog("ERROR: filtered overlap found in saved list for read %u.  Filtered %u overlaps.\n", _ovs[0].a_iid, nFiltered);

  for (uint32 jj=0; jj<no + nFiltered; jj++)
    writeLog("OVERLAP %8d %8d hangs %5d %5d erate %.4f\n",
             _ovs[jj].a_iid, _ovs[jj].b_iid, _ovs[jj].a_hang(), _ovs[jj].b_hang(), _ovs[jj].erate());

  flushLog();

  assert(errors == false);
  return(0);
}
390
391
392
393 inline
394 uint64
ovlSco(uint64 olen,uint64 evalue,uint64 ii)395 ovlSco(uint64 olen, uint64 evalue, uint64 ii) {
396 uint64 os;
397
398 os = olen;
399 os <<= AS_MAX_EVALUE_BITS;
400 os |= (~evalue) & ERR_MASK;
401 os <<= SALT_BITS;
402 os |= ii & SALT_MASK;
403
404 return(os);
405 }
406
407 inline
408 uint64
ovlScoToLength(uint64 score)409 ovlScoToLength(uint64 score) {
410 return(score >> (AS_MAX_EVALUE_BITS + SALT_BITS));
411 }
412
413
414
//  Score the 'no' overlaps for read 'aid' currently in _ovs[], dropping
//  those involving deleted reads, those noisier than 'maxEvalue', and
//  those shorter than 'minOverlap'.  If more than _maxPer survive, keep
//  only the _maxPer highest-scoring ones.  A kept overlap has a nonzero
//  entry in _ovsSco[]; _minSco[aid] records the minimum score kept.
//  Returns the number of overlaps to keep.
uint32
OverlapCache::filterOverlaps(uint32 aid, uint32 maxEvalue, uint32 minOverlap, uint32 no) {
  uint32  ns        = 0;
  bool    beVerbose = false;

  //beVerbose = (_ovs[0].a_iid == 3514657);

  for (uint32 ii=0; ii<no; ii++) {
    _ovsSco[ii] = 0;   //  Overlaps 'continue'd below will be filtered, even if 'no filtering' is needed.
    _ovsTmp[ii] = 0;

    if ((RI->readLength(_ovs[ii].a_iid) == 0) ||   //  At least one read in the overlap is deleted
        (RI->readLength(_ovs[ii].b_iid) == 0)) {
      if (beVerbose)
        writeLog("olap %d involves deleted reads - %u %s - %u %s\n",
                 ii,
                 _ovs[ii].a_iid, (RI->readLength(_ovs[ii].a_iid) == 0) ? "deleted" : "active",
                 _ovs[ii].b_iid, (RI->readLength(_ovs[ii].b_iid) == 0) ? "deleted" : "active");
      continue;
    }

    if (_ovs[ii].evalue() > maxEvalue) {   //  Too noisy to care
      if (beVerbose)
        writeLog("olap %d too noisy evalue %f > maxEvalue %f\n",
                 ii, AS_OVS_decodeEvalue(_ovs[ii].evalue()), AS_OVS_decodeEvalue(maxEvalue));
      continue;
    }

    uint32  olen = RI->overlapLength(_ovs[ii].a_iid, _ovs[ii].b_iid, _ovs[ii].a_hang(), _ovs[ii].b_hang());

    //  If too short, drop it.

    if (olen < minOverlap) {
      if (beVerbose)
        writeLog("olap %d too short olen %u minOverlap %u\n",
                 ii, olen, minOverlap);
      continue;
    }

    //  Just right!  Remember the score in both arrays: _ovsSco keeps the
    //  per-overlap score, _ovsTmp is a scratch copy that gets sorted below.

    _ovsTmp[ii] = _ovsSco[ii] = ovlSco(olen, _ovs[ii].evalue(), ii);

    ns++;
  }

  _minSco[aid] = 0;                 //  Minimum score of overlap we'll keep for this read.

  if (ns <= _maxPer)                //  Fewer overlaps than the limit, no filtering needed.
    return(ns);

  //  Too many overlaps; sort the scores and take the score of the
  //  _maxPer'th best as the cutoff.  Scores are unique (the salt includes
  //  the index), so exactly _maxPer overlaps survive the cutoff.

  std::sort(_ovsTmp, _ovsTmp + no);        //  Sort the scores so we can pick a minScore that
  _minSco[aid] = _ovsTmp[no - _maxPer];    //  results in the correct number of overlaps.

  ns = 0;

  for (uint32 ii=0; ii<no; ii++)
    if (_ovsSco[ii] < _minSco[aid])   //  Score too low, flag it as junk.
      _ovsSco[ii] = 0;                //  We could also do this when copying overlaps to
    else                              //  storage, except we need to know how many overlaps
      ns++;                           //  to copy so we can allocate storage.

  assert(ns <= _maxPer);

  return(ns);
}
481
482
483
//  Load all overlaps from the store into the cache.  For each read:
//  load its overlaps into the _ovs[] buffer, remove duplicates, filter by
//  quality/length/count, then copy the survivors into _overlapStorage.
void
OverlapCache::loadOverlaps(ovStore *ovlStore) {

  writeStatus("OverlapCache()--\n");
  writeStatus("OverlapCache()-- Loading overlaps.\n");
  writeStatus("OverlapCache()--\n");
  writeStatus("OverlapCache()-- read from store saved in cache\n");
  writeStatus("OverlapCache()-- ------------ --------- ------------ ---------\n");

  uint64  numTotal  = 0;                               //  Overlaps seen in the store so far
  uint64  numLoaded = 0;                               //  Overlaps kept in the cache so far
  uint64  numDups   = 0;                               //  Duplicate overlaps dropped
  uint32  numReads  = 0;                               //  Reads processed (for progress logging)
  uint64  numStore  = ovlStore->numOverlapsInRange();  //  Total overlaps in the store

  assert(numStore > 0);

  _overlapStorage = new OverlapStorage(ovlStore->numOverlapsInRange());

  //  Scan the overlaps, finding the maximum number of overlaps for a single read.  This lets
  //  us pre-allocate space and simplifies the loading process.

  assert(_ovsMax == 0);
  assert(_ovs    == NULL);

  _ovsMax = 0;

  for (uint32 rr=0; rr<RI->numReads()+1; rr++)
    _ovsMax = std::max(_ovsMax, ovlStore->numOverlaps(rr));

  _minSco = new uint64    [RI->numReads()+1];   //  Per-read minimum kept score; used by symmetrizeOverlaps().

  _ovs    = new ovOverlap [_ovsMax];            //  Load buffer, reused for each read.
  _ovsSco = new uint64    [_ovsMax];            //  Per-overlap scores.
  _ovsTmp = new uint64    [_ovsMax];            //  Scratch copy of scores for sorting.

  for (uint32 rr=0; rr<RI->numReads()+1; rr++) {

    //  Actually load the overlaps, then detect and remove overlaps between
    //  the same pair, then filter short and low quality overlaps.

    uint32  no = ovlStore->loadOverlapsForRead(rr, _ovs, _ovsMax);   //  no == total overlaps == numOvl
    uint32  nd = filterDuplicates(no);                               //  nd == duplicated overlaps (no is decreased by this amount)
    uint32  ns = filterOverlaps(rr, _maxEvalue, _minOverlap, no);    //  ns == acceptable overlaps

    //  If we still have overlaps, get official space to store them and copy
    //  to storage,

    if (ns > 0) {
      uint32  id = _ovs[0].a_iid;

      _overlapMax[id] = ns;
      _overlapLen[id] = ns;
      _overlaps[id]   = _overlapStorage->get(ns);   //  Get space for overlaps.

      _memOlaps += _overlapMax[id] * sizeof(BAToverlap);

      uint32  oo=0;

      for (uint32 ii=0; ii<no; ii++) {
        if (_ovsSco[ii] == 0)                             //  Skip if it was filtered.
          continue;

        _overlaps[id][oo].evalue    = _ovs[ii].evalue();  //  Or copy to our storage.
        _overlaps[id][oo].a_hang    = _ovs[ii].a_hang();
        _overlaps[id][oo].b_hang    = _ovs[ii].b_hang();
        _overlaps[id][oo].flipped   = _ovs[ii].flipped();
        _overlaps[id][oo].filtered  = false;
        _overlaps[id][oo].symmetric = false;
        _overlaps[id][oo].a_iid     = _ovs[ii].a_iid;
        _overlaps[id][oo].b_iid     = _ovs[ii].b_iid;

        assert(_overlaps[id][oo].a_iid != 0);   //  Guard against some kind of weird error that
        assert(_overlaps[id][oo].b_iid != 0);   //  I can no longer remember.

        oo++;
      }

      assert(oo == _overlapLen[id]);   //  Ensure we got all the overlaps we were supposed to get.
    }

    //  Keep track of what we loaded and didn't.

    numTotal  += no + nd;   //  Because no was decremented by nd in filterDuplicates()
    numLoaded += ns;
    numDups   += nd;

    //  Log progress every 100000 reads.

    if ((numReads++ % 100000) == 99999)
      writeStatus("OverlapCache()-- %12" F_U64P " (%06.2f%%) %12" F_U64P " (%06.2f%%)\n",
                  numTotal,  100.0 * numTotal  / numStore,
                  numLoaded, 100.0 * numLoaded / numStore);
  }

  writeStatus("OverlapCache()-- ------------ --------- ------------ ---------\n");
  writeStatus("OverlapCache()-- %12" F_U64P " (%06.2f%%) %12" F_U64P " (%06.2f%%)\n",
              numTotal,  100.0 * numTotal  / numStore,
              numLoaded, 100.0 * numLoaded / numStore);

  writeStatus("OverlapCache()--\n");
  writeStatus("OverlapCache()-- Ignored %lu duplicate overlaps.\n", numDups);
}
585
586
587
588 // Binary search a list of overlaps for one matching bID and flipped.
589 uint32
searchForOverlap(BAToverlap * ovl,uint32 ovlLen,uint32 bID,bool flipped)590 searchForOverlap(BAToverlap *ovl, uint32 ovlLen, uint32 bID, bool flipped) {
591 int32 F = 0;
592 int32 L = ovlLen - 1;
593 int32 M = 0;
594
595 #ifdef TEST_LINEAR_SEARCH
596 bool linearSearchFound = false;
597
598 for (uint32 ss=0; ss<ovlLen; ss++)
599 if ((ovl[ss].b_iid == bID) &&
600 (ovl[ss].flipped == flipped)) {
601 linearSearchFound = true;
602 break;
603 }
604 #endif
605
606 while (F <= L) {
607 M = (F + L) / 2;
608
609 if ((ovl[M].b_iid == bID) &&
610 (ovl[M].flipped == flipped)) {
611 #ifdef TEST_LINEAR_SEARCH
612 assert(linearSearchFound == true);
613 #endif
614 return(M);
615 }
616
617 if (((ovl[M].b_iid < bID)) ||
618 ((ovl[M].b_iid == bID) && (ovl[M].flipped < flipped)))
619 F = M+1;
620 else
621 L = M-1;
622 }
623
624 #ifdef TEST_LINEAR_SEARCH
625 assert(linearSearchFound == false);
626 #endif
627
628 return(UINT32_MAX);
629 }
630
631
632
633
//  Make the overlap cache symmetric: for every cached overlap A->B, ensure
//  the twin overlap B->A is also cached with the same error rate.  Weak
//  un-twinned overlaps (those the other read filtered out) are dropped;
//  the rest are duplicated into the other read's list.  Finishes by
//  re-sorting each read's overlaps and verifying symmetry.
void
OverlapCache::symmetrizeOverlaps(void) {
  uint32  fiLimit    = RI->numReads() + 1;
  uint32  numThreads = getNumThreads();
  uint32  blockSize  = (fiLimit < 1000 * numThreads) ? numThreads : fiLimit / 999;

  //  Per-read counters, indexed by read ID.

  uint32  *nNonSymPerRead = new uint32 [fiLimit];   //  Twins found with differing error rates.
  uint32  *nFiltPerRead   = new uint32 [fiLimit];   //  Weak un-twinned overlaps dropped from this read.
  uint32  *nMissPerRead   = new uint32 [fiLimit];   //  Twins that must be ADDED to this read.

  for (uint32 rr=0; rr < fiLimit; rr++) {
    nNonSymPerRead[rr] = 0;
    nFiltPerRead[rr]   = 0;
    nMissPerRead[rr]   = 0;
  }

  writeStatus("OverlapCache()--\n");
  writeStatus("OverlapCache()-- Symmetrizing overlaps.\n");
  writeStatus("OverlapCache()-- Finding missing twins.\n");

  FILE *NSE = AS_UTL_openOutputFile(_prefix, '.', "non-symmetric-error-rates", false);   //  These turn out to be VERY big
  FILE *NTW = AS_UTL_openOutputFile(_prefix, '.', "non-symmetric-overlaps", false);

  if (NSE) {
    fprintf(NSE, " aID bID a error b error\n");
    fprintf(NSE, "-------- -------- ------- -------\n");
  }

  //  For each overlap, see if the twin overlap exists.  It is tempting to skip searching if the
  //  b-read has loaded all overlaps (the overlap we're searching for must exist) but we can't.
  //  We must still mark the overlap as being symmetric.

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 ra=0; ra < fiLimit; ra++) {
    for (uint32 oa=0; oa<_overlapLen[ra]; oa++) {
      BAToverlap *ova = &_overlaps[ra][oa];
      uint32      rb  = _overlaps[ra][oa].b_iid;

      //  If already marked, we're done.

      if (ova->symmetric == true)
        continue;

      //  Search for the twin overlap.

      uint32  ob = searchForOverlap(_overlaps[rb], _overlapLen[rb], ra, ova->flipped);

      //  If the twin was found, mark both as symmetric and make sure the
      //  error rate is symmetric too.

      if (ob < UINT32_MAX) {
        BAToverlap *ovb = &_overlaps[rb][ob];

        ova->symmetric = true;   //  I have a twin!
        ovb->symmetric = true;   //  My twin has a twin, me!

        //  Reconcile differing error rates by taking the lower evalue.

        if (ova->evalue != ovb->evalue) {
          if (NSE)
            fprintf(NSE, "%8u %8u %7.3f %7.3f\n",
                    ra, rb,
                    ova->erate() * 100.0,
                    ovb->erate() * 100.0);

          uint64  ev = std::min(ova->evalue, ovb->evalue);

          ova->evalue = ev;
          ovb->evalue = ev;

          nNonSymPerRead[ra]++;
        }

        continue;
      }

      //  Otherwise, the twin was not found.

      //  If this overlap is weak according to the other read (that is, if
      //  the other read dropped it for whatever reason) flag that we don't
      //  want to keep it here either.

      uint32  olen = RI->overlapLength(ova->a_iid, ova->b_iid, ova->a_hang, ova->b_hang);
      uint64  osco = ovlSco(olen, ova->evalue, UINT64_MAX);

      if (osco < _minSco[rb]) {
        if (NTW)
          fprintf(NTW, "NO TWIN for %6u -> %6u - length %lu < min %lu - WEAK\n",
                  ra, ova->b_iid, ovlScoToLength(osco), ovlScoToLength(_minSco[rb]));

        nFiltPerRead[ra]++;
        ova->filtered = true;
        continue;
      }

      //  Not weak.  We want to duplicate this in the other read.

      if (NTW)
        fprintf(NTW, "NO TWIN for %6u -> %6u - length %lu >= min %lu\n",
                ra, ova->b_iid, ovlScoToLength(osco), ovlScoToLength(_minSco[rb]));

      //  nMissPerRead is indexed by the OTHER read, which another thread
      //  may be updating concurrently, hence the critical section.

#pragma omp critical (nMissPerRead)
      nMissPerRead[rb]++;
    }
  }

  AS_UTL_closeFile(NTW);
  AS_UTL_closeFile(NSE);

  //  Summarize what we found.

  uint64  nOverlaps  = 0;
  uint64  nNonSymErr = 0;
  uint64  nWeak      = 0;
  uint64  nMissing   = 0;

  for (uint32 rr=0; rr < fiLimit; rr++) {
    nOverlaps  += _overlapLen[rr];
    nNonSymErr += nNonSymPerRead[rr];
    nWeak      += nFiltPerRead[rr];
    nMissing   += nMissPerRead[rr];
  }

  writeStatus("OverlapCache()-- In %llu overlaps:\n", nOverlaps);
  writeStatus("OverlapCache()-- Found %llu overlaps with non-symmetric error rates.\n", nNonSymErr);
  writeStatus("OverlapCache()-- Found %llu overlaps with missing twins.\n", nMissing);
  writeStatus("OverlapCache()-- Dropped %llu weak missing-twin overlaps.\n", nWeak);

  //
  //  Expand or shrink space for the overlaps.
  //

  //  Allocate new temporary pointers for each read.

  //  The new storage must start after the old storage.  And if it starts after the old storage ends,
  //  we can copy easier.  If not, we just grab some empty overlaps to make space.

  //  A complication occurs at the end of a single segment.  If there isn't enough space in the
  //  current segment for the overlaps, we skip ahead to the next segment without accounting for the
  //  overlaps we skip.  It's possible for the new size to fit into this unused space, which would
  //  then put the old overlaps physically after the new ones.
  //
  //  [ olaps1+unused | olaps2+unused | olaps3+unused |        ]  [ olaps4+unused | ..... ]
  //  [ olaps1+new | olaps2+new | olaps3+new | olaps4+new |    ]  [ olaps5+new | .... ]
  //
  //  So, we need to compare not overlap counts, but raw positions in the OverlapStorage object.

  BAToverlap     **nPtr = new BAToverlap * [fiLimit];          //  Pointers to new overlap storage locations
  OverlapStorage  *oldS = new OverlapStorage(_overlapStorage); //  Recreates the existing layout without allocating anything
  OverlapStorage  *newS = _overlapStorage->reset();            //  Resets pointers for the new layout, using existing space

  for (uint32 rr=1; rr < fiLimit; rr++)
    nPtr[rr] = nullptr;

  //  Compute new pointers for overlap data that begin after the old data.

  for (uint32 rr=1; rr < fiLimit; rr++) {
    uint32  nOvl = _overlapLen[rr] + nMissPerRead[rr] - nFiltPerRead[rr];

    assert(_overlapLen[rr] + nMissPerRead[rr] >= nFiltPerRead[rr]);

    oldS->get(_overlapMax[rr]);   //  Move old storage ahead to the start of the next read.
    newS->advance(oldS);          //  Advance newS so that it is at or after where oldS is.
    nPtr[rr] = newS->get(nOvl);   //  Grab space for the number of overlaps we'll end up with on this read.

    _overlapMax[rr] = nOvl;       //  Update the maximum number of overlaps on this read.
  }

  //  Copy overlap data from the old space to the new space, backwards.
  //  THIS CANNOT BE THREADED.

  writeStatus("OverlapCache()-- Shifting overlaps.\n");

  FILE *NTD = AS_UTL_openOutputFile(_prefix, '.', "non-symmetric-weak-dropped", false);

  for (uint32 rr=fiLimit; rr-- > 0; ) {
    if (_overlapLen[rr] == 0)   //  Skip the asserts!
      continue;

    assert(_overlaps[rr][0               ].a_iid == rr);   //  Ensure that we didn't overwrite an existing
    assert(_overlaps[rr][_overlapLen[rr]-1].a_iid == rr);  //  overlap with overlaps for the reads after it.

    uint32  oo = 0;   //  Position in original overlap list
    uint32  nn = 0;   //  Position in new overlap list

    for (; oo<_overlapLen[rr]; oo++)              //  Over all the original overlaps,
      if (_overlaps[rr][oo].filtered == false)    //  Copy to new storage if they're
        nPtr[rr][nn++] = _overlaps[rr][oo];       //  not filtered.
      else
        if (NTD)
          fprintf(NTD, "DROP overlap a %u b %u\n", _overlaps[rr][oo].a_iid, _overlaps[rr][oo].b_iid);

    assert(nn == _overlapLen[rr] - nFiltPerRead[rr]);

    _overlapLen[rr] = _overlapLen[rr] - nFiltPerRead[rr];
  }

  AS_UTL_closeFile(NTD);

  //  Swap pointers to the pointers and remove the old pointer storage.

  delete [] _overlaps;
  _overlaps = nPtr;

  delete oldS;

  //  Copy non-twin overlaps to their twin.

  writeStatus("OverlapCache()-- Adding missing twins.\n");

  FILE *NTA = AS_UTL_openOutputFile(_prefix, '.', "non-symmetric-added", false);

  //  This has several concurrency issues.
  //  1) loop test on overlapLen[ra] can change if we increment overlapLen[rb].
  //     we could access overlaps[ra][oo] before it is copied to in another thread
  //  2) overlapLen[rb]++
  //  3) nMissPerRead[rb]

  //#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 ra=0; ra < fiLimit; ra++) {
    for (uint32 oo=0; oo<_overlapLen[ra]; oo++) {
      if (_overlaps[ra][oo].symmetric == true)
        continue;

      //  Append the twin to the end of read rb's list; space for it was
      //  reserved via nMissPerRead when the new layout was computed.

      uint32  rb = _overlaps[ra][oo].b_iid;
      uint32  nn = _overlapLen[rb]++;

      assert(nn < _overlapMax[rb]);

      if (NTA)
        fprintf(NTA, "add missing twin from read %u -> read %u at pos %u out of %u\n", ra, rb, nn, _overlapMax[rb] - 1);

      //  The twin swaps A and B; hangs swap and negate unless flipped.

      _overlaps[rb][nn].evalue    = _overlaps[ra][oo].evalue;
      _overlaps[rb][nn].a_hang    = (_overlaps[ra][oo].flipped) ? (_overlaps[ra][oo].b_hang) : (-_overlaps[ra][oo].a_hang);
      _overlaps[rb][nn].b_hang    = (_overlaps[ra][oo].flipped) ? (_overlaps[ra][oo].a_hang) : (-_overlaps[ra][oo].b_hang);
      _overlaps[rb][nn].flipped   = _overlaps[ra][oo].flipped;

      _overlaps[rb][nn].filtered  = _overlaps[ra][oo].filtered;
      _overlaps[rb][nn].symmetric = _overlaps[ra][oo].symmetric = true;

      _overlaps[rb][nn].a_iid     = _overlaps[ra][oo].b_iid;
      _overlaps[rb][nn].b_iid     = _overlaps[ra][oo].a_iid;

      assert(ra == _overlaps[ra][oo].a_iid);
      assert(rb == _overlaps[ra][oo].b_iid);

      assert(nMissPerRead[rb] > 0);

      nMissPerRead[rb]--;
    }
  }

  AS_UTL_closeFile(NTA);

  //  Check that everything worked: every reserved twin slot was consumed
  //  and every read's list is exactly full.

  for (uint32 rr=0; rr < fiLimit; rr++) {
    assert(nMissPerRead[rr] == 0);

    if (_overlapLen[rr] == 0)
      continue;

    assert(_overlapLen[rr] == _overlapMax[rr]);

    assert(_overlaps[rr][0               ].a_iid == rr);
    assert(_overlaps[rr][_overlapLen[rr]-1].a_iid == rr);
  }

  //  Cleanup.

  delete [] nNonSymPerRead;
  delete [] nFiltPerRead;
  delete [] nMissPerRead;

  //  Probably should sort again.  Not sure if anything depends on this.

  writeStatus("OverlapCache()-- Sorting overlaps.\n");

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 rr=0; rr < fiLimit; rr++)
    if (_overlapLen[rr] > 0)
      std::sort(_overlaps[rr], _overlaps[rr] + _overlapLen[rr], [](BAToverlap const &a, BAToverlap const &b) {
                                                                  return(((a.b_iid == b.b_iid) && (a.flipped < b.flipped)) || (a.b_iid < b.b_iid)); } );

  //  Check that all overlaps are present.

  writeStatus("OverlapCache()-- Checking overlap symmetry.\n");

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 ra=0; ra < fiLimit; ra++) {
    for (uint32 oa=0; oa<_overlapLen[ra]; oa++) {
      BAToverlap *ova = &_overlaps[ra][oa];
      uint32      rb  = _overlaps[ra][oa].b_iid;

      uint32  ob = searchForOverlap(_overlaps[rb], _overlapLen[rb], ra, ova->flipped);

      //  On failure, dump both reads' overlap lists before asserting.

      if (ob == UINT32_MAX) {
        for(uint32 ii=0; ii<_overlapLen[ra]; ii++)
          fprintf(stderr, "olapA %u -> %u flip %c%s\n", _overlaps[ra][ii].a_iid, _overlaps[ra][ii].b_iid, _overlaps[ra][ii].flipped ? 'Y' : 'N', (ii == ra) ? " **" : "");
        for(uint32 ii=0; ii<_overlapLen[rb]; ii++)
          fprintf(stderr, "olapB %u -> %u flip %c\n", _overlaps[rb][ii].a_iid, _overlaps[rb][ii].b_iid, _overlaps[rb][ii].flipped ? 'Y' : 'N');
      }
      assert(ob != UINT32_MAX);
    }
  }

  writeStatus("OverlapCache()-- Finished.\n");
}
938
939
940