1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "overlapInCore.H"
19 #include "sequence.H"
20
21 // Find and output all overlaps between strings in store and those in the global hash table.
22 // This is the entry point for each compute thread.
23
24 void *
Process_Overlaps(void * ptr)25 Process_Overlaps(void *ptr){
26 Work_Area_t *WA = (Work_Area_t *)ptr;
27
28 uint32 seqptrLen = 0;
29 uint32 seqptrMax = AS_MAX_READLEN + 1;
30 char *seqptr = new char [seqptrMax];
31 char *bases = new char [AS_MAX_READLEN + 1];
32
33 while (WA->bgnID < G.endRefID) {
34 WA->overlapsLen = 0;
35
36 WA->Total_Overlaps = 0;
37 WA->Contained_Overlap_Ct = 0;
38 WA->Dovetail_Overlap_Ct = 0;
39
40 WA->Kmer_Hits_Without_Olap_Ct = 0;
41 WA->Kmer_Hits_With_Olap_Ct = 0;
42 WA->Kmer_Hits_Skipped_Ct = 0;
43 WA->Multi_Overlap_Ct = 0;
44
45 fprintf(stderr, "Thread %02u processes reads " F_U32 "-" F_U32 "\n",
46 WA->thread_id, WA->bgnID, WA->endID);
47
48 for (uint32 fi=WA->bgnID; fi<=WA->endID; fi++) {
49 uint32 libID = WA->readStore->sqStore_getLibraryIDForRead(fi);
50 uint32 readLen = WA->readCache->sqCache_getLength(fi);
51
52 // Load sequence/quality data
53 // Duplicated in Build_Hash_Index()
54
55 if ((libID < G.minLibToRef) ||
56 (libID > G.maxLibToRef))
57 continue;
58
59 if (readLen < G.Min_Olap_Len)
60 continue;
61
62 WA->readCache->sqCache_getSequence(fi, seqptr, seqptrLen, seqptrMax);
63
64 for (uint32 i=0; i<readLen; i++)
65 bases[i] = tolower(seqptr[i]);
66
67 bases[readLen] = 0;
68
69 assert(strlen(bases) == readLen);
70
71 // Generate overlaps.
72
73 //fprintf(stderr, "FORWARD\n");
74 Find_Overlaps(bases, readLen, fi, FORWARD, WA);
75
76 reverseComplementSequence(bases, readLen);
77
78 //fprintf(stderr, "REVERSE\n");
79 Find_Overlaps(bases, readLen, fi, REVERSE, WA);
80 }
81
82 // Write out this block of overlaps, no need to keep them in core!
83 // While we have a mutex, also find the next block of things to process.
84
85 fprintf(stderr, "Thread %02u writes reads " F_U32 "-" F_U32 " (" F_U64 " overlaps " F_U64 "/" F_U64 "/" F_U64 " kmer hits with/without overlap/skipped)\n",
86 WA->thread_id, WA->bgnID, WA->endID,
87 WA->overlapsLen,
88 WA->Kmer_Hits_With_Olap_Ct, WA->Kmer_Hits_Without_Olap_Ct, WA->Kmer_Hits_Skipped_Ct);
89
90 // Flush any remaining overlaps and update statistics.
91
92 #pragma omp critical
93 {
94 for (int zz=0; zz<WA->overlapsLen; zz++)
95 Out_BOF->writeOverlap(WA->overlaps + zz);
96
97 WA->overlapsLen = 0;
98
99 Total_Overlaps += WA->Total_Overlaps;
100 Contained_Overlap_Ct += WA->Contained_Overlap_Ct;
101 Dovetail_Overlap_Ct += WA->Dovetail_Overlap_Ct;
102
103 Kmer_Hits_Without_Olap_Ct += WA->Kmer_Hits_Without_Olap_Ct;
104 Kmer_Hits_With_Olap_Ct += WA->Kmer_Hits_With_Olap_Ct;
105 Kmer_Hits_Skipped_Ct += WA->Kmer_Hits_Skipped_Ct;
106 Multi_Overlap_Ct += WA->Multi_Overlap_Ct;
107
108 WA->bgnID = G.curRefID;
109 WA->endID = G.curRefID + G.perThread - 1;
110
111 if (WA->endID > G.endRefID)
112 WA->endID = G.endRefID;
113
114 G.curRefID = WA->endID + 1;
115 }
116 }
117
118 delete [] bases;
119 delete [] seqptr;
120
121 return(ptr);
122 }
123
124
125