1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "runtime.H"
19 #include "system.H"
20
21 #include <pthread.h>
22
23 #include "sqStore.H"
24 #include "ovStore.H"
25
26 #include "edlib.H"
27
28 #include "overlapReadCache.H"
29
30 #include "sequence.H"
31
32 // The process will load BATCH_SIZE overlaps into memory, then load all the reads referenced by
33 // those overlaps. Once all data is loaded, compute threads are spawned. Each thread will reserve
34 // THREAD_SIZE overlaps to compute. A small THREAD_SIZE relative to BATCH_SIZE will result in
35 // better load balancing, but too small and the overhead of reserving overlaps will dominate (too
36 // small is on the order of 1). While threads are computing, the next batch of overlaps and reads
37 // is loaded.
38 //
39 // A large BATCH_SIZE will make startup cost large - no computes are started until the initial load
40 // is finished. To alleivate this (a little bit), the initial load is only 1/8 of the full
41 // BATCH_SIZE.
42
43 #define BATCH_SIZE 1024 * 1024
44 #define THREAD_SIZE 128
45
46 // Does slightly better with 2550 than 500. Speed takes a slight hit.
47 #define MHAP_SLOP 500
48
49
50
51 class alignStats {
52 public:
alignStats()53 alignStats() {
54 startTime = getTime();
55 reportThreshold = 0;
56
57 clear();
58 };
~alignStats()59 ~alignStats() {
60 };
61
clear(void)62 void clear(void) {
63 nSkipped = 0;
64 nPassed = 0;
65 nFailed = 0;
66
67 nFailExtA = 0;
68 nFailExtB = 0;
69 nFailExt = 0;
70
71 nExtendedA = 0;
72 nExtendedB = 0;
73
74 nPartial = 0;
75 nDovetail = 0;
76
77 nExt5a = 0;
78 nExt3a = 0;
79 nExt5b = 0;
80 nExt3b = 0;
81 };
82
83
84 alignStats &operator+=(alignStats &that) {
85 nSkipped += that.nSkipped;
86 nPassed += that.nPassed;
87 nFailed += that.nFailed;
88
89 nFailExtA += that.nFailExtA;
90 nFailExtB += that.nFailExtB;
91 nFailExt += that.nFailExt;
92
93 nExtendedA += that.nExtendedA;
94 nExtendedB += that.nExtendedB;
95
96 nPartial += that.nPartial;
97 nDovetail += that.nDovetail;
98
99 nExt5a += that.nExt5a;
100 nExt3a += that.nExt3a;
101 nExt5b += that.nExt5b;
102 nExt3b += that.nExt3b;
103
104 return(*this);
105 };
106
reportStatus(void)107 void reportStatus(void) {
108
109 if (nPassed + nFailed < reportThreshold)
110 return;
111
112 reportThreshold += 10000;
113
114 fprintf(stderr, "Tested %9" F_U64P " olaps -- Skipped %8.4f%% -- Passed %8.4f%% -- %8.2f olaps/sec\n",
115 nPassed + nFailed,
116 100.0 * nSkipped / (nPassed + nFailed),
117 100.0 * nPassed / (nPassed + nFailed),
118 (nPassed + nFailed) / (getTime() - startTime));
119 };
120
reportFinal(void)121 void reportFinal(void) {
122 fprintf(stderr, "\n");
123 fprintf(stderr, " -- %" F_U64P " overlaps processed.\n", nPassed + nFailed);
124 fprintf(stderr, " -- %" F_U64P " skipped.\n", nSkipped);
125 fprintf(stderr, " -- %" F_U64P " failed %" F_U64P " passed (%.4f%%).\n", nFailed, nPassed, 100.0 * nPassed / (nPassed + nFailed));
126 fprintf(stderr, " --\n");
127 fprintf(stderr, " -- %" F_U64P " failed initial alignment, allowing A to extend\n", nFailExtA);
128 fprintf(stderr, " -- %" F_U64P " failed initial alignment, allowing B to extend\n", nFailExtB);
129 fprintf(stderr, " -- %" F_U64P " failed initial alignment\n", nFailExt);
130 fprintf(stderr, " --\n");
131 fprintf(stderr, " -- %" F_U64P " partial overlaps (of any quality)\n", nPartial);
132 fprintf(stderr, " -- %" F_U64P " dovetail overlaps (before extensions, of any quality)\n", nDovetail);
133 fprintf(stderr, " --\n");
134 fprintf(stderr, " -- %" F_U64P "/%" F_U64P " A read dovetail extensions\n", nExt5a, nExt3a);
135 fprintf(stderr, " -- %" F_U64P "/%" F_U64P " B read dovetail extensions\n", nExt5b, nExt3b);
136 };
137
138 double startTime;
139 uint64 reportThreshold;
140
141 uint64 nSkipped;
142 uint64 nPassed;
143 uint64 nFailed;
144
145 uint64 nFailExtA;
146 uint64 nFailExtB;
147 uint64 nFailExt;
148
149 uint64 nExtendedA;
150 uint64 nExtendedB;
151
152 uint64 nPartial;
153 uint64 nDovetail;
154
155 uint64 nExt5a;
156 uint64 nExt3a;
157 uint64 nExt5b;
158 uint64 nExt3b;
159 };
160
161
162
163 class workSpace {
164 public:
workSpace()165 workSpace() {
166 threadID = 0;
167
168 maxErate = 0;
169 partialOverlaps = false;
170 invertOverlaps = false;
171
172 seqStore = NULL;
173 overlapsLen = 0;
174 overlaps = NULL;
175 readSeq = NULL;
176 };
~workSpace()177 ~workSpace() {
178 delete[] readSeq;
179 };
180
181 public:
182 uint32 threadID;
183 double maxErate;
184 bool partialOverlaps;
185 bool invertOverlaps;
186 char* readSeq;
187
188 sqStore *seqStore;
189
190 uint32 overlapsLen; // Not used.
191 ovOverlap *overlaps;
192 };
193
194
195
196
197
198 overlapReadCache *rcache = NULL; // Used to be just 'cache', but that conflicted with -pg: /usr/lib/libc_p.a(msgcat.po):(.bss+0x0): multiple definition of `cache'
199 uint32 batchPrtID = 0; // When to report progress
200 uint32 batchPosID = 0; // The current position of the batch
201 uint32 batchEndID = 0; // The end of the batch
202 pthread_mutex_t balanceMutex;
203
204 uint32 minOverlapLength = 0;
205
206 alignStats globalStats;
207
208 bool debug = false;
209
210
211
212
213 bool
getRange(uint32 & bgnID,uint32 & endID)214 getRange(uint32 &bgnID, uint32 &endID) {
215
216 pthread_mutex_lock(&balanceMutex);
217
218 bgnID = batchPosID;
219 batchPosID += THREAD_SIZE; // Supposed to overflow.
220 endID = batchPosID;
221
222 if (endID > batchEndID)
223 endID = batchEndID;
224
225 pthread_mutex_unlock(&balanceMutex);
226
227 // If we're out of overlaps, batchPosID is more than batchEndID (from the last call to this
228 // function), which makes bgnID > endID (in this call).
229
230 return(bgnID < endID);
231 }
232
233
234
235
236 // Try to extend the overlap on the B read. If successful, returns new bbgn,bend and editDist and alignLen.
237 //
238 bool
extendAlignment(char * aRead,int32 abgn,int32 aend,int32 UNUSED (alen),char const * Alabel,uint32 Aid,char * bRead,int32 & bbgn,int32 & bend,int32 blen,char const * Blabel,uint32 Bid,double maxErate,int32 slop,int32 & editDist,int32 & alignLen)239 extendAlignment(char *aRead, int32 abgn, int32 aend, int32 UNUSED(alen), char const *Alabel, uint32 Aid,
240 char *bRead, int32 &bbgn, int32 &bend, int32 blen, char const *Blabel, uint32 Bid,
241 double maxErate,
242 int32 slop,
243 int32 &editDist,
244 int32 &alignLen) {
245 alignStats threadStats;
246 EdlibAlignResult result = { 0, NULL, NULL, 0, NULL, 0, 0 };
247 bool success = false;
248
249 // Find an alignment, allowing extensions on the B read.
250
251 int32 bbgnExt = std::max(0, bbgn - slop);
252 int32 bendExt = std::min(blen, bend + slop);
253
254 // This probably isn't exactly correct, but close enough.
255 int32 maxEdit = (int32)ceil(std::max(aend - abgn, bendExt - bbgnExt) * maxErate * 1.1);
256
257 if (debug)
258 fprintf(stderr, " align %s %6u %6d-%-6d to %s %6u %6d-%-6d", Alabel, Aid, abgn, aend, Blabel, Bid, bbgnExt, bendExt);
259
260 result = edlibAlign(aRead + abgn, aend - abgn,
261 bRead + bbgnExt, bendExt - bbgnExt,
262 edlibNewAlignConfig(maxEdit, EDLIB_MODE_HW, EDLIB_TASK_LOC));
263
264 // Change the overlap for any extension found.
265
266 if (result.numLocations > 0) {
267 bbgn = bbgnExt + result.startLocations[0];
268 bend = bbgnExt + result.endLocations[0] + 1; // Edlib returns 0-based positions, add one to end to get space-based.
269
270 editDist = result.editDistance;
271 alignLen = result.alignmentLength; // Edlib 'alignmentLength' isn't populated for TASK_LOC, so we approximate it.
272 alignLen = ((aend - abgn) + (bend - bbgn) + (editDist)) / 2;
273
274 if (debug)
275 fprintf(stderr, " aligned to %s at %6d-%-6d editDist %5d alignLen %6d qual %6.4f\n",
276 Blabel, bbgn, bend, editDist, alignLen, 1.0 - editDist / (double)alignLen);
277
278 success = true;
279 } else {
280 if (debug)
281 fprintf(stderr, "\n");
282 }
283
284 edlibFreeAlignResult(result);
285
286 return(success);
287 }
288
289
290
291 bool
finalAlignment(char * aRead,int32 alen,char * bRead,int32 blen,ovOverlap * ovl,double maxErate,int32 & editDist,int32 & alignLen)292 finalAlignment(char *aRead, int32 alen,// char *Alabel, uint32 Aid,
293 char *bRead, int32 blen,// char *Blabel, uint32 Bid,
294 ovOverlap *ovl,
295 double maxErate,
296 int32 &editDist,
297 int32 &alignLen) {
298 EdlibAlignResult result = { 0, NULL, NULL, 0, NULL, 0, 0 };
299 bool success = false;
300
301 int32 abgn = (int32) ovl->dat.ovl.ahg5;
302 int32 aend = (int32)alen - ovl->dat.ovl.ahg3;
303 int32 bbgn = (int32) ovl->dat.ovl.bhg5;
304 int32 bend = (int32)blen - ovl->dat.ovl.bhg3;
305
306 int32 maxEdit = (int32)ceil(std::max(aend - abgn, bend - bbgn) * maxErate * 1.1);
307
308 result = edlibAlign(aRead + abgn, aend - abgn,
309 bRead + bbgn, bend - bbgn,
310 edlibNewAlignConfig(maxEdit, EDLIB_MODE_NW, EDLIB_TASK_LOC)); // NOTE! Global alignment.
311
312 if (result.numLocations > 0) {
313 editDist = result.editDistance;
314 alignLen = result.alignmentLength; // Edlib 'alignmentLength' isn't populated for TASK_LOC, so we approximate it.
315 alignLen = ((aend - abgn) + (bend - bbgn) + (editDist)) / 2;
316
317 success = true;
318 } else {
319 }
320
321 edlibFreeAlignResult(result);
322
323 return(success);
324 }
325
326
327
328 void *
recomputeOverlaps(void * ptr)329 recomputeOverlaps(void *ptr) {
330 workSpace *WA = (workSpace *)ptr;
331
332 uint32 bgnID = 0;
333 uint32 endID = 0;
334
335 while (getRange(bgnID, endID)) {
336 alignStats localStats;
337
338 for (uint32 oo=bgnID; oo<endID; oo++) {
339 ovOverlap *ovl = WA->overlaps + oo;
340
341 // Swap IDs if requested (why would anyone want to do this?)
342
343 if (WA->invertOverlaps) {
344 ovOverlap swapped = WA->overlaps[oo];
345
346 WA->overlaps[oo].swapIDs(swapped); // Needs to be from a temporary!
347 }
348
349 // Initialize early, just so we can use goto.
350
351 uint32 aID = ovl->a_iid;
352 char *aRead = rcache->getRead(aID);
353 int32 alen = (int32)rcache->getLength(aID);
354 int32 abgn = (int32) ovl->dat.ovl.ahg5;
355 int32 aend = (int32)alen - ovl->dat.ovl.ahg3;
356
357 uint32 bID = ovl->b_iid;
358 char *bRead = WA->readSeq;
359 int32 blen = (int32)rcache->getLength(bID);
360 int32 bbgn = (int32) ovl->dat.ovl.bhg5;
361 int32 bend = (int32)blen - ovl->dat.ovl.bhg3;
362
363 int32 alignLen = 1;
364 int32 editDist = INT32_MAX;
365
366 EdlibAlignResult result = { 0, NULL, NULL, 0, NULL, 0, 0 };
367
368 if (debug) {
369 fprintf(stderr, "--------\n");
370 fprintf(stderr, "OLAP A %7" F_U32P " %6d-%-6d\n", aID, abgn, aend);
371 fprintf(stderr, " B %7" F_U32P " %6d-%-6d %s\n", bID, bbgn, bend, (ovl->flipped() == false) ? "" : " flipped");
372 fprintf(stderr, "\n");
373 }
374
375 // Invalidate the overlap.
376
377 ovl->evalue(AS_MAX_EVALUE);
378 ovl->dat.ovl.forOBT = false;
379 ovl->dat.ovl.forDUP = false;
380 ovl->dat.ovl.forUTG = false;
381
382 // Make some bad changes, for testing
383 #if 0
384 abgn += 100;
385 aend -= 100;
386 bbgn += 100;
387 bend -= 100;
388 #endif
389
390 // Too short? Don't bother doing anything.
391 //
392 // Warning! Edlib failed on a 10bp to 10bp (extended to 5kbp) alignment.
393
394 if ((aend - abgn < minOverlapLength) ||
395 (bend - bbgn < minOverlapLength)) {
396 localStats.nSkipped++;
397 goto finished;
398 }
399
400 // Grab the B read sequence.
401
402 strcpy(bRead, rcache->getRead(bID));
403
404 // If flipped, reverse complement the B read.
405
406 if (ovl->flipped() == true)
407 reverseComplementSequence(bRead, blen);
408
409 //
410 // Find initial alignments, allowing one, then the other, sequence to be extended as needed.
411 //
412
413 if (extendAlignment(bRead, bbgn, bend, blen, "B", bID,
414 aRead, abgn, aend, alen, "A", aID,
415 WA->maxErate, MHAP_SLOP,
416 editDist,
417 alignLen) == false) {
418 localStats.nFailExtA++;
419 }
420
421 if (extendAlignment(aRead, abgn, aend, alen, "A", aID,
422 bRead, bbgn, bend, blen, "B", bID,
423 WA->maxErate, MHAP_SLOP,
424 editDist,
425 alignLen) == false) {
426 localStats.nFailExtB++;
427 }
428
429 // If no alignments were found, fail.
430
431 if (alignLen == 1) {
432 localStats.nFailExt++;
433 goto finished;
434 }
435
436 // Update the overlap.
437
438 ovl->dat.ovl.ahg5 = abgn;
439 ovl->dat.ovl.ahg3 = alen - aend;
440
441 ovl->dat.ovl.bhg5 = bbgn;
442 ovl->dat.ovl.bhg3 = blen - bend;
443
444 if (debug) {
445 fprintf(stderr, "\n");
446 fprintf(stderr, "init A %7" F_U32P " %6d-%-6d\n", aID, abgn, aend);
447 fprintf(stderr, " B %7" F_U32P " %6d-%-6d\n", bID, bbgn, bend);
448 fprintf(stderr, "\n");
449 }
450
451 // If we're just doing partial alignments or if we've found a dovetail, we're all done.
452
453 if (WA->partialOverlaps == true) {
454 localStats.nPartial++;
455 goto finished;
456 }
457
458 if (ovl->overlapIsDovetail() == true) {
459 localStats.nDovetail++;
460 goto finished;
461 }
462
463 #warning do we need to check for contained too?
464
465
466
467 // Otherwise, try to extend the alignment to make a dovetail overlap.
468
469 {
470 int32 ahg5 = ovl->dat.ovl.ahg5;
471 int32 ahg3 = ovl->dat.ovl.ahg3;
472
473 int32 bhg5 = ovl->dat.ovl.bhg5;
474 int32 bhg3 = ovl->dat.ovl.bhg3;
475
476 int32 slop = 0;
477
478 if ((ahg5 >= bhg5) && (bhg5 > 0)) {
479 //fprintf(stderr, "extend 5' by B=%d\n", bhg5);
480 ahg5 -= bhg5;
481 bhg5 -= bhg5; // Now zero.
482 slop = bhg5 * WA->maxErate + 100;
483
484 abgn = (int32) ahg5;
485 aend = (int32)alen - ahg3;
486
487 bbgn = (int32) bhg5;
488 bend = (int32)blen - bhg3;
489
490 if (extendAlignment(bRead, bbgn, bend, blen, "Bb5", bID,
491 aRead, abgn, aend, alen, "Ab5", aID,
492 WA->maxErate, slop,
493 editDist,
494 alignLen) == true) {
495 ahg5 = abgn;
496 //ahg3 = alen - aend;
497 } else {
498 ahg5 = ovl->dat.ovl.ahg5;
499 bhg5 = ovl->dat.ovl.bhg5;
500 }
501 localStats.nExt5b++;
502 }
503
504 if ((bhg5 >= ahg5) && (ahg5 > 0)) {
505 //fprintf(stderr, "extend 5' by A=%d\n", ahg5);
506 bhg5 -= ahg5;
507 ahg5 -= ahg5; // Now zero.
508 slop = ahg5 * WA->maxErate + 100;
509
510 abgn = (int32) ahg5;
511 aend = (int32)alen - ahg3;
512
513 bbgn = (int32) bhg5;
514 bend = (int32)blen - bhg3;
515
516 if (extendAlignment(aRead, abgn, aend, alen, "Aa5", aID,
517 bRead, bbgn, bend, blen, "Ba5", bID,
518 WA->maxErate, slop,
519 editDist,
520 alignLen) == true) {
521 bhg5 = bbgn;
522 //bhg3 = blen - bend;
523 } else {
524 bhg5 = ovl->dat.ovl.bhg5;
525 ahg5 = ovl->dat.ovl.ahg5;
526 }
527 localStats.nExt5a++;
528 }
529
530
531
532 if ((bhg3 >= ahg3) && (ahg3 > 0)) {
533 //fprintf(stderr, "extend 3' by A=%d\n", ahg3);
534 bhg3 -= ahg3;
535 ahg3 -= ahg3; // Now zero.
536 slop = ahg3 * WA->maxErate + 100;
537
538 abgn = (int32) ahg5;
539 aend = (int32)alen - ahg3;
540
541 bbgn = (int32) bhg5;
542 bend = (int32)blen - bhg3;
543
544 if (extendAlignment(aRead, abgn, aend, alen, "Aa3", aID,
545 bRead, bbgn, bend, blen, "Ba3", bID,
546 WA->maxErate, slop,
547 editDist,
548 alignLen) == true) {
549 //bhg5 = bbgn;
550 bhg3 = blen - bend;
551 } else {
552 bhg3 = ovl->dat.ovl.bhg3;
553 ahg3 = ovl->dat.ovl.ahg3;
554 }
555 localStats.nExt3a++;
556 }
557
558 if ((ahg3 >= bhg3) && (bhg3 > 0)) {
559 //fprintf(stderr, "extend 3' by B=%d\n", bhg3);
560 ahg3 -= bhg3;
561 bhg3 -= bhg3; // Now zero.
562 slop = bhg3 * WA->maxErate + 100;
563
564 abgn = (int32) ahg5;
565 aend = (int32)alen - ahg3;
566
567 bbgn = (int32) bhg5;
568 bend = (int32)blen - bhg3;
569
570 if (extendAlignment(bRead, bbgn, bend, blen, "Bb3", bID,
571 aRead, abgn, aend, alen, "Ab3", aID,
572 WA->maxErate, slop,
573 editDist,
574 alignLen) == true) {
575 //ahg5 = abgn;
576 ahg3 = alen - aend;
577 } else {
578 ahg3 = ovl->dat.ovl.ahg3;
579 bhg3 = ovl->dat.ovl.bhg3;
580 }
581 localStats.nExt3b++;
582 }
583
584 // Now reset the overlap.
585
586 ovl->dat.ovl.ahg5 = ahg5;
587 ovl->dat.ovl.ahg3 = ahg3;
588
589 ovl->dat.ovl.bhg5 = bhg5;
590 ovl->dat.ovl.bhg3 = bhg3;
591 } // If not a contained overlap
592
593
594
595 // If we're still not dovetail, nothing more we want to do. Let the overlap be trashed.
596
597
598 if (debug) {
599 fprintf(stderr, "\n");
600 fprintf(stderr, "fini A %7" F_U32P " %6d-%-6d %d %d\n", aID, abgn, aend, ovl->a_bgn(), ovl->a_end());
601 fprintf(stderr, " B %7" F_U32P " %6d-%-6d %d %d %s\n", bID, bbgn, bend, ovl->b_bgn(), ovl->b_end(), (ovl->flipped() == false) ? "" : " flipped");
602 fprintf(stderr, "\n");
603 }
604
605 finalAlignment(aRead, alen,// "A", aID,
606 bRead, blen,// "B", bID,
607 ovl, WA->maxErate, editDist, alignLen);
608
609
610 finished:
611
612 // Trash the overlap if it's junky quality.
613
614 double eRate = editDist / (double)alignLen;
615
616 if ((alignLen < minOverlapLength) ||
617 (eRate > WA->maxErate)) {
618 localStats.nFailed++;
619 ovl->evalue(AS_MAX_EVALUE);
620 ovl->dat.ovl.forOBT = false;
621 ovl->dat.ovl.forDUP = false;
622 ovl->dat.ovl.forUTG = false;
623
624 } else {
625 localStats.nPassed++;
626 ovl->erate(eRate);
627 ovl->dat.ovl.forOBT = (WA->partialOverlaps == true);
628 ovl->dat.ovl.forDUP = (WA->partialOverlaps == true);
629 ovl->dat.ovl.forUTG = (WA->partialOverlaps == false) && (ovl->overlapIsDovetail() == true);
630 }
631
632 } // Over all overlaps in this range
633
634
635 // Log that we've done stuff
636
637 pthread_mutex_lock(&balanceMutex);
638 globalStats += localStats;
639 globalStats.reportStatus();
640 localStats.clear();
641 pthread_mutex_unlock(&balanceMutex);
642 } // Over all ranges
643
644 return(NULL);
645 }
646
647
648
649
650
651
652 int
main(int argc,char ** argv)653 main(int argc, char **argv) {
654 char *seqName = NULL;
655 char *ovlName = NULL;
656 char *outName = NULL;
657
658 uint32 bgnID = 0;
659 uint32 endID = UINT32_MAX;
660
661 uint32 numThreads = getMaxThreadsAllowed();
662
663 double maxErate = 0.12;
664 bool partialOverlaps = false;
665 bool invertOverlaps = false;
666
667 uint64 memLimit = 4;
668
669 argc = AS_configure(argc, argv);
670
671 int err=0;
672 int arg=1;
673 while (arg < argc) {
674 if (strcmp(argv[arg], "-S") == 0) {
675 seqName = argv[++arg];
676
677 } else if (strcmp(argv[arg], "-O") == 0) {
678 ovlName = argv[++arg];
679
680 } else if (strcmp(argv[arg], "-o") == 0) {
681 outName = argv[++arg];
682
683 } else if (strcmp(argv[arg], "-b") == 0) {
684 bgnID = atoi(argv[++arg]);
685
686 } else if (strcmp(argv[arg], "-e") == 0) {
687 endID = atoi(argv[++arg]);
688
689 } else if (strcmp(argv[arg], "-t") == 0) {
690 numThreads = setNumThreads(argv[++arg]);
691
692 } else if (strcmp(argv[arg], "-erate") == 0) {
693 maxErate = atof(argv[++arg]);
694
695 } else if (strcmp(argv[arg], "-partial") == 0) {
696 partialOverlaps = true;
697
698 } else if (strcmp(argv[arg], "-invert") == 0) {
699 invertOverlaps = true;
700
701 } else if (strcmp(argv[arg], "-memory") == 0) {
702 memLimit = atoi(argv[++arg]);
703
704 } else if (strcmp(argv[arg], "-len") == 0) {
705 minOverlapLength = atoi(argv[++arg]);
706
707 } else {
708 err++;
709 }
710
711 arg++;
712 }
713
714 if (seqName == NULL)
715 err++;
716 if (ovlName == NULL)
717 err++;
718 if (outName == NULL)
719 err++;
720
721 if (err) {
722 fprintf(stderr, "usage: %s ...\n", argv[0]);
723 fprintf(stderr, " -S seqStore Mandatory, path to seqStore\n");
724 fprintf(stderr, "\n");
725 fprintf(stderr, "Inputs can come from either a store or a file.\n");
726 fprintf(stderr, " -O ovlStore \n");
727 fprintf(stderr, " -O ovlFile \n");
728 fprintf(stderr, "\n");
729 fprintf(stderr, "If from an ovlStore, the range of reads processed can be restricted.\n");
730 fprintf(stderr, " -b bgnID \n");
731 fprintf(stderr, " -e endID \n");
732 fprintf(stderr, "\n");
733 fprintf(stderr, "Outputs will be written to a store or file, depending on the input type\n");
734 fprintf(stderr, " -o ovlStore \n");
735 fprintf(stderr, " -o ovlFile \n");
736 fprintf(stderr, "\n");
737 fprintf(stderr, " -erate e Overlaps are computed at 'e' fraction error; must be larger than the original erate\n");
738 fprintf(stderr, " -partial Overlaps are 'overlapInCore -S' partial overlaps\n");
739 fprintf(stderr, " -memory m Use up to 'm' GB of memory\n");
740 fprintf(stderr, "\n");
741 fprintf(stderr, " -t n Use up to 'n' cores\n");
742 fprintf(stderr, "\n");
743 fprintf(stderr, "Advanced options:\n");
744 fprintf(stderr, "\n");
745 fprintf(stderr, " -invert Invert the overlap A <-> B before aligning (they are not re-inverted before output)\n");
746 fprintf(stderr, "\n");
747 exit(1);
748 }
749
750 sqStore *seqStore = new sqStore(seqName);
751
752 ovStore *ovlStore = NULL;
753 ovStoreWriter *outStore = NULL;
754 ovFile *ovlFile = NULL;
755 ovFile *outFile = NULL;
756
757 if (directoryExists(ovlName)) {
758 fprintf(stderr, "Reading overlaps from store '%s' and writing to '%s'\n",
759 ovlName, outName);
760 ovlStore = new ovStore(ovlName, seqStore);
761 outStore = new ovStoreWriter(outName, seqStore);
762
763 if (bgnID < 1)
764 bgnID = 1;
765 if (endID > seqStore->sqStore_lastReadID())
766 endID = seqStore->sqStore_lastReadID();
767
768 ovlStore->setRange(bgnID, endID);
769
770 } else {
771 fprintf(stderr, "Reading overlaps from file '%s' and writing to '%s'\n",
772 ovlName, outName);
773 ovlFile = new ovFile(seqStore, ovlName, ovFileFull);
774 outFile = new ovFile(seqStore, outName, ovFileFullWrite);
775 }
776
777 workSpace *WA = new workSpace [numThreads];
778 pthread_t *tID = new pthread_t [numThreads];
779 pthread_attr_t attr;
780
781 pthread_attr_init(&attr);
782 pthread_attr_setstacksize(&attr, 12 * 131072);
783 pthread_mutex_init(&balanceMutex, NULL);
784
785 // Initialize thread work areas. Mirrored from overlapInCore.C
786
787 for (uint32 tt=0; tt<numThreads; tt++) {
788 fprintf(stderr, "Initialize thread %u\n", tt);
789
790 WA[tt].threadID = tt;
791 WA[tt].maxErate = maxErate;
792 WA[tt].partialOverlaps = partialOverlaps;
793 WA[tt].invertOverlaps = invertOverlaps;
794
795 WA[tt].seqStore = seqStore;
796 WA[tt].overlaps = NULL;
797
798 // preallocate some work thread memory for common tasks to avoid allocation
799 WA[tt].readSeq = new char[AS_MAX_READLEN+1];
800 }
801
802
803 // Thread flow:
804 //
805 // for reads bgn to end {
806 // Load N overlaps
807 // Load new reads - set touched reads to age=0
808 // Wait for threads to finish
809 // Increment age of reads in cache, delete reads that are too old
810 // Launch threads
811 // }
812 //
813 // instead of fixed cutoff on age, use max memory usage and cull the oldest to remain below
814
815 class overlapBlock {
816 public:
817 overlapBlock() {
818 _len = 0;
819 _max = BATCH_SIZE;
820 _ovl = new ovOverlap[BATCH_SIZE];
821 }
822 ~overlapBlock() {
823 delete [] _ovl;
824 };
825 uint32 _len;
826 uint32 _max;
827 ovOverlap *_ovl;
828 };
829
830 overlapBlock overlapsA;
831 overlapBlock overlapsB;
832
833 overlapBlock *overlaps = &overlapsA;
834
835 rcache = new overlapReadCache(seqStore, memLimit);
836
837 // Load the first batch of overlaps and reads.
838
839 if (ovlStore)
840 overlaps->_len = ovlStore->loadBlockOfOverlaps(overlaps->_ovl, overlaps->_max);
841
842 if (ovlFile)
843 overlaps->_len = ovlFile->readOverlaps(overlaps->_ovl, overlaps->_max);
844
845 fprintf(stderr, "Loaded %u overlaps.\n", overlaps->_len);
846
847 rcache->loadReads(overlaps->_ovl, overlaps->_len);
848
849 // Loop over all the overlaps.
850
851 while (overlapsA._len + overlapsB._len > 0) {
852
853 // Launch next batch of threads
854 //fprintf(stderr, "LAUNCH THREADS\n");
855
856 // Globals, ugh. These limit the threads to the range of overlaps we have loaded. Each thread
857 // will pull out THREAD_SIZE overlaps at a time to compute, updating batchPosID as it does so.
858 // Each thread will stop when batchPosID > batchEndID.
859
860 batchPrtID = 0;
861 batchPosID = 0;
862 batchEndID = overlaps->_len;
863
864 for (uint32 tt=0; tt<numThreads; tt++) {
865 WA[tt].overlapsLen = overlaps->_len;
866 WA[tt].overlaps = overlaps->_ovl;
867
868 int32 status = pthread_create(tID + tt, &attr, recomputeOverlaps, WA + tt);
869
870 if (status != 0)
871 fprintf(stderr, "pthread_create error: %s\n", strerror(status)), exit(1);
872 }
873
874 // Flip back to the now computed overlaps
875
876 if (overlaps == &overlapsA)
877 overlaps = &overlapsB;
878 else
879 overlaps = &overlapsA;
880
881 // Write recomputed overlaps - if this is the first pass through the loop,
882 // then overlapsLen will be zero
883 //
884 // Should we output overlaps that failed to recompute?
885
886 if (ovlStore)
887 for (uint64 oo=0; oo<overlaps->_len; oo++)
888 outStore->writeOverlap(overlaps->_ovl + oo);
889 if (ovlFile)
890 outFile->writeOverlaps(overlaps->_ovl, overlaps->_len);
891
892 // Load more overlaps
893
894 if (ovlStore)
895 overlaps->_len = ovlStore->loadBlockOfOverlaps(overlaps->_ovl, overlaps->_max);
896 if (ovlFile)
897 overlaps->_len = ovlFile->readOverlaps(overlaps->_ovl, overlaps->_max);
898
899 fprintf(stderr, "Loaded %u overlaps.\n", overlaps->_len);
900
901 rcache->loadReads(overlaps->_ovl, overlaps->_len);
902
903 // Wait for threads to finish
904
905 for (uint32 tt=0; tt<numThreads; tt++) {
906 //fprintf(stderr, "wait for thread %u\n", tt);
907 int32 status = pthread_join(tID[tt], NULL);
908 //fprintf(stderr, "joined thread %u\n", tt);
909
910 if (status != 0)
911 fprintf(stderr, "pthread_join error: %s\n", strerror(status)), exit(1);
912 }
913
914 // Expire old reads
915
916 rcache->purgeReads();
917 }
918
919 // Report. The last batch has no work to do.
920
921 globalStats.reportFinal();
922
923 // Goodbye.
924
925 delete rcache;
926
927 delete seqStore;
928
929 delete ovlStore;
930 delete outStore;
931
932 delete ovlFile;
933 delete outFile;
934
935 delete [] WA;
936 delete [] tID;
937
938 fprintf(stderr, "\n");
939 fprintf(stderr, "Bye.\n");
940
941 return(0);
942 }
943
944
945