1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "overlapInCore.H"
19 #include "sequence.H"
20 
21 //  Find and output all overlaps between strings in store and those in the global hash table.
22 //  This is the entry point for each compute thread.
23 
24 void *
Process_Overlaps(void * ptr)25 Process_Overlaps(void *ptr){
26   Work_Area_t  *WA = (Work_Area_t *)ptr;
27 
28   uint32        seqptrLen = 0;
29   uint32        seqptrMax = AS_MAX_READLEN + 1;
30   char         *seqptr    = new char [seqptrMax];
31   char         *bases     = new char [AS_MAX_READLEN + 1];
32 
33   while (WA->bgnID < G.endRefID) {
34     WA->overlapsLen                = 0;
35 
36     WA->Total_Overlaps             = 0;
37     WA->Contained_Overlap_Ct       = 0;
38     WA->Dovetail_Overlap_Ct        = 0;
39 
40     WA->Kmer_Hits_Without_Olap_Ct  = 0;
41     WA->Kmer_Hits_With_Olap_Ct     = 0;
42     WA->Kmer_Hits_Skipped_Ct       = 0;
43     WA->Multi_Overlap_Ct           = 0;
44 
45     fprintf(stderr, "Thread %02u processes reads " F_U32 "-" F_U32 "\n",
46             WA->thread_id, WA->bgnID, WA->endID);
47 
48     for (uint32 fi=WA->bgnID; fi<=WA->endID; fi++) {
49       uint32  libID   = WA->readStore->sqStore_getLibraryIDForRead(fi);
50       uint32  readLen = WA->readCache->sqCache_getLength(fi);
51 
52       //  Load sequence/quality data
53       //  Duplicated in Build_Hash_Index()
54 
55       if ((libID < G.minLibToRef) ||
56           (libID > G.maxLibToRef))
57         continue;
58 
59       if (readLen < G.Min_Olap_Len)
60         continue;
61 
62       WA->readCache->sqCache_getSequence(fi, seqptr, seqptrLen, seqptrMax);
63 
64       for (uint32 i=0; i<readLen; i++)
65         bases[i] = tolower(seqptr[i]);
66 
67       bases[readLen] = 0;
68 
69       assert(strlen(bases) == readLen);
70 
71       //  Generate overlaps.
72 
73       //fprintf(stderr, "FORWARD\n");
74       Find_Overlaps(bases, readLen, fi, FORWARD, WA);
75 
76       reverseComplementSequence(bases, readLen);
77 
78       //fprintf(stderr, "REVERSE\n");
79       Find_Overlaps(bases, readLen, fi, REVERSE, WA);
80     }
81 
82     //  Write out this block of overlaps, no need to keep them in core!
83     //  While we have a mutex, also find the next block of things to process.
84 
85     fprintf(stderr, "Thread %02u writes    reads " F_U32 "-" F_U32 " (" F_U64 " overlaps " F_U64 "/" F_U64 "/" F_U64 " kmer hits with/without overlap/skipped)\n",
86             WA->thread_id, WA->bgnID, WA->endID,
87             WA->overlapsLen,
88             WA->Kmer_Hits_With_Olap_Ct, WA->Kmer_Hits_Without_Olap_Ct, WA->Kmer_Hits_Skipped_Ct);
89 
90     //  Flush any remaining overlaps and update statistics.
91 
92 #pragma omp critical
93     {
94       for (int zz=0; zz<WA->overlapsLen; zz++)
95         Out_BOF->writeOverlap(WA->overlaps + zz);
96 
97       WA->overlapsLen = 0;
98 
99       Total_Overlaps            += WA->Total_Overlaps;
100       Contained_Overlap_Ct      += WA->Contained_Overlap_Ct;
101       Dovetail_Overlap_Ct       += WA->Dovetail_Overlap_Ct;
102 
103       Kmer_Hits_Without_Olap_Ct += WA->Kmer_Hits_Without_Olap_Ct;
104       Kmer_Hits_With_Olap_Ct    += WA->Kmer_Hits_With_Olap_Ct;
105       Kmer_Hits_Skipped_Ct      += WA->Kmer_Hits_Skipped_Ct;
106       Multi_Overlap_Ct          += WA->Multi_Overlap_Ct;
107 
108       WA->bgnID = G.curRefID;
109       WA->endID = G.curRefID + G.perThread - 1;
110 
111       if (WA->endID > G.endRefID)
112         WA->endID = G.endRefID;
113 
114       G.curRefID = WA->endID + 1;
115     }
116   }
117 
118   delete [] bases;
119   delete [] seqptr;
120 
121   return(ptr);
122 }
123 
124 
125