1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "runtime.H"
19 #include "sqStore.H"
20 #include "ovStore.H"
21 #include "tgStore.H"
22 
23 #include "strings.H"
24 #include "files.H"
25 #include "intervalList.H"
26 #include "sequence.H"
27 
28 #include <set>
29 
30 //  Debugging on which reads are filtered, which are used, and which are removed
31 //  to meet coverage thresholds.  Very big.
32 #undef DEBUG_LAYOUT
33 
34 //FILE *flgFile = stderr;
35 FILE *flgFile = NULL;
36 
37 
38 
39 uint16 *
loadThresholds(sqStore * seqStore,ovStore * ovlStore,char * scoreName,uint32 expectedCoverage,FILE * scoFile)40 loadThresholds(sqStore *seqStore,
41                ovStore *ovlStore,
42                char    *scoreName,
43                uint32   expectedCoverage,
44                FILE    *scoFile) {
45   uint32   numReads   = seqStore->sqStore_lastReadID();
46   uint16  *olapThresh = new uint16 [numReads + 1];
47 
48   if (scoreName != NULL)
49     AS_UTL_loadFile(scoreName, olapThresh, numReads + 1);
50 
51   else {
52     ovStoreHistogram  *ovlHisto = ovlStore->getHistogram();
53 
54     for (uint32 ii=0; ii<numReads+1; ii++)
55       olapThresh[ii] = ovlHisto->overlapScoreEstimate(ii, expectedCoverage, scoFile);
56 
57     delete ovlHisto;
58   }
59 
60   return(olapThresh);
61 }
62 
63 
64 
65 static
66 void
generateLayout(tgTig * layout,uint16 * olapThresh,uint32 minEvidenceLength,double maxEvidenceErate,double maxEvidenceCoverage,ovOverlap * ovl,uint32 ovlLen,FILE * logFile)67 generateLayout(tgTig      *layout,
68                uint16     *olapThresh,
69                uint32      minEvidenceLength,
70                double      maxEvidenceErate,
71                double      maxEvidenceCoverage,
72                ovOverlap *ovl,
73                uint32      ovlLen,
74                FILE       *logFile) {
75 
76   //  Generate a layout for the read in ovl[0].a_iid, using most or all of the overlaps in ovl.
77 
78   layout->allocateChildren(ovlLen);
79 
80   if (logFile)
81     fprintf(logFile, "Generate layout for read " F_U32 " length " F_U32 " using up to " F_U32 " overlaps.\n",
82             layout->_tigID, layout->_layoutLen, ovlLen);
83 
84   std::set<uint32_t>  children;
85 
86   for (uint32 oo=0; oo<ovlLen; oo++) {
87     uint64   ovlLength = ovl[oo].b_len();
88     uint16   ovlScore  = ovl[oo].overlapScore(true);
89 
90     if (ovlLength > AS_MAX_READLEN) {
91       char ovlString[1024];
92       fprintf(stderr, "ERROR: bogus overlap '%s'\n", ovl[oo].toString(ovlString, ovOverlapAsCoords, false));
93     }
94     assert(ovlLength < AS_MAX_READLEN);
95 
96     if (ovl[oo].erate() > maxEvidenceErate) {
97       if (logFile)
98         fprintf(logFile, "  filter read %9u at position %6u,%6u length %5lu erate %.4f - low quality (threshold %.4f)\n",
99                 ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate(), maxEvidenceErate);
100       continue;
101     }
102 
103     if (ovl[oo].a_end() - ovl[oo].a_bgn() < minEvidenceLength) {
104       if (logFile)
105         fprintf(logFile, "  filter read %9u at position %6u,%6u length %5lu erate %.4f - too short (threshold %u)\n",
106                 ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate(), minEvidenceLength);
107       continue;
108     }
109 
110     if ((olapThresh != NULL) &&
111         (ovlScore < olapThresh[ovl[oo].b_iid])) {
112       if (logFile)
113         fprintf(logFile, "  filter read %9u at position %6u,%6u length %5lu erate %.4f - filtered by global filter (" F_U16 " < " F_U16 ")\n",
114                 ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate(), ovlScore, olapThresh[ovl[oo].b_iid]);
115       continue;
116     }
117 
118     if (children.find(ovl[oo].b_iid) != children.end()) {
119       if (logFile)
120         fprintf(logFile, "  filter read %9u at position %6u,%6u length %5lu erate %.4f - duplicate\n",
121                 ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate());
122       continue;
123     }
124 
125     if (logFile)
126       fprintf(logFile, "  allow  read %9u at position %6u,%6u length %5lu erate %.4f - allowed  by global filter (" F_U16 " > " F_U16 ")\n",
127               ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate(), ovlScore, olapThresh[ovl[oo].b_iid]);
128 
129     tgPosition   *pos = layout->addChild();
130 
131     //  Set the read.  Parent is always the read we're building for, hangs and position come from
132     //  the overlap.  Easy as pie!
133 
134     if (ovl[oo].flipped() == false) {
135       pos->set(ovl[oo].b_iid,
136                ovl[oo].a_iid,
137                ovl[oo].a_hang(),
138                ovl[oo].b_hang(),
139                ovl[oo].a_bgn(), ovl[oo].a_end());
140 
141     } else {
142       pos->set(ovl[oo].b_iid,
143                ovl[oo].a_iid,
144                ovl[oo].a_hang(),
145                ovl[oo].b_hang(),
146                ovl[oo].a_end(), ovl[oo].a_bgn());
147     }
148 
149     //  Remember the unaligned bit!
150 
151     pos->_askip = ovl[oo].dat.ovl.bhg5;
152     pos->_bskip = ovl[oo].dat.ovl.bhg3;
153 
154     //  Remember we added this read - to filter read with both fwd/rev overlaps.
155 
156     children.insert(ovl[oo].b_iid);
157   }
158 
159   //  Dro excess coverage in the evidence by dropping short reads.  This also
160   //  sorts by position.
161 
162   layout->dropExcessCoverage(maxEvidenceCoverage);
163 }
164 
165 
166 
167 
168 
169 int
main(int argc,char ** argv)170 main(int argc, char **argv) {
171   char             *seqName    = 0L;
172   char             *ovlName    = 0L;
173   char             *corName    = 0L;
174   char             *flgName    = 0L;
175 
176   char             *scoreName  = 0L;
177 
178   uint32            errorRate  = AS_OVS_encodeEvalue(0.015);
179 
180   bool              dumpScores = false;
181   bool              doLogging  = false;
182 
183   uint32            expectedCoverage    = 40;    //  How many overlaps per read to save, global filter
184 
185   uint32            iidMin = 1;
186   uint32            iidMax = UINT32_MAX;
187 
188   uint32            minEvidenceLength   = 0;
189   double            maxEvidenceErate    = 1.0;
190   double            maxEvidenceCoverage = DBL_MAX;
191 
192 
193   argc = AS_configure(argc, argv, 1);
194 
195   int arg=1;
196   int err=0;
197 
198   while (arg < argc) {
199     if        (strcmp(argv[arg], "-S") == 0) {   //  INPUTS
200       seqName = argv[++arg];
201 
202     } else if (strcmp(argv[arg], "-O") == 0) {
203       ovlName = argv[++arg];
204 
205     } else if (strcmp(argv[arg], "-scores") == 0) {
206       scoreName = argv[++arg];
207 
208 
209     } else if (strcmp(argv[arg], "-C") == 0) {   //  OUTPUT FORMAT
210       corName = argv[++arg];
211 
212 
213     } else if (strcmp(argv[arg], "-b") == 0) {   //  READ SELECTION
214       iidMin  = strtouint32(argv[++arg]);
215 
216     } else if (strcmp(argv[arg], "-e") == 0) {
217       iidMax  = strtouint32(argv[++arg]);
218 
219     } else if (strcmp(argv[arg], "-eL") == 0) {   //  EVIDENCE SELECTION
220       minEvidenceLength  = strtouint32(argv[++arg]);
221 
222     } else if (strcmp(argv[arg], "-eE") == 0) {
223       maxEvidenceErate = strtodouble(argv[++arg]);
224 
225     } else if (strcmp(argv[arg], "-eC") == 0) {
226       maxEvidenceCoverage = strtodouble(argv[++arg]);
227 
228     } else if (strcmp(argv[arg], "-xC") == 0) {
229       expectedCoverage = strtouint32(argv[++arg]);
230 
231     } else if (strcmp(argv[arg], "-V") == 0) {
232       doLogging = true;
233 
234     } else if (strcmp(argv[arg], "-D") == 0) {
235       dumpScores = true;
236 
237     } else {
238       fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
239       err++;
240     }
241 
242     arg++;
243   }
244   if (seqName == NULL)
245     err++;
246   if (corName == NULL)
247     err++;
248   if (err) {
249     fprintf(stderr, "usage: %s -S seqStore -O ovlStore ...\n", argv[0]);
250     fprintf(stderr, "\n");
251     fprintf(stderr, "INPUTS\n");
252     fprintf(stderr, "  -S seqStore      mandatory path to seqStore\n");
253     fprintf(stderr, "  -O ovlStore      mandatory path to ovlStore\n");
254     fprintf(stderr, "\n");
255     fprintf(stderr, "  -scores sf       overlap score thresholds (from filterCorrectionOverlaps)\n");
256     fprintf(stderr, "                   if not supplied, will be estimated from ovlStore\n");
257     fprintf(stderr, "\n");
258     fprintf(stderr, "OUTPUTS\n");
259     fprintf(stderr, "  -C corStore      output layouts to store 'corStore'\n");
260     fprintf(stderr, "  -V               write extremely verbose logging to 'corStore.log'\n");
261     fprintf(stderr, "  -D               dump the data used to estimate overlap scores to 'corStore.scores'\n");
262     fprintf(stderr, "\n");
263     fprintf(stderr, "READ SELECTION\n");
264     fprintf(stderr, "  -b bgnID         process reads starting at bgnID\n");
265     fprintf(stderr, "  -e endID         process reads up to but not including endID\n");
266     fprintf(stderr, "\n");
267     fprintf(stderr, "EVIDENCE SELECTION\n");
268     fprintf(stderr, "  -eL length       minimum length of evidence overlaps\n");
269     fprintf(stderr, "  -eE erate        maximum error rate of evidence overlaps\n");
270     fprintf(stderr, "  -eC coverage     maximum coverage of evidence reads to emit\n");
271     fprintf(stderr, "\n");
272     fprintf(stderr, "  -xC coverage     estimated coverage in input reads\n");
273     fprintf(stderr, "\n");
274 
275     if (seqName == NULL)
276       fprintf(stderr, "ERROR: no input seqStore (-S) supplied.\n");
277     if (corName == NULL)
278       fprintf(stderr, "ERROR: no output corStore (-C) supplied.\n");
279     exit(1);
280   }
281 
282   //  Open inputs and output tigStore.
283 
284   sqRead_setDefaultVersion(sqRead_raw);
285 
286   sqStore  *seqStore = new sqStore(seqName);
287   ovStore  *ovlStore = new ovStore(ovlName, seqStore);
288   tgStore  *corStore = new tgStore(corName);
289 
290   uint32    numReads = seqStore->sqStore_lastReadID();
291 
292   //  Threshold the range of reads to operate on.
293 
294   if (numReads < iidMin) {
295     fprintf(stderr, "ERROR: only " F_U32 " reads in the store (IDs 0-" F_U32 " inclusive); can't process requested range -b " F_U32 " -e " F_U32 "\n",
296             numReads,
297             numReads-1,
298             iidMin, iidMax);
299     exit(1);
300   }
301   if (numReads < iidMax)
302     iidMax = numReads;
303 
304   ovlStore->setRange(iidMin, iidMax);
305 
306   //  Open logging and summary files
307 
308   FILE *logFile = AS_UTL_openOutputFile(corName, '.', "log",    doLogging);
309   FILE *scoFile = AS_UTL_openOutputFile(corName, '.', "scores", dumpScores);
310 
311   //  Load read scores, if supplied.
312 
313   uint16   *olapThresh = loadThresholds(seqStore, ovlStore, scoreName, expectedCoverage, scoFile);
314 
315   //  Initialize processing.
316 
317   uint32             ovlMax    = 0;
318   ovOverlap         *ovl       = NULL;
319 
320   //  And process.
321 
322   for (uint32 rr=1; rr<numReads+1; rr++) {
323     uint32 ovlLen = ovlStore->loadOverlapsForRead(rr, ovl, ovlMax);
324 
325     if (ovlLen > 0) {
326       tgTig   *layout = new tgTig;
327 
328       layout->_tigID     = rr;
329       layout->_layoutLen = seqStore->sqStore_getReadLength(rr, sqRead_raw);
330 
331       generateLayout(layout,
332                      olapThresh,
333                      minEvidenceLength, maxEvidenceErate, maxEvidenceCoverage,
334                      ovl, ovlLen,
335                      logFile);
336 
337       corStore->insertTig(layout, false);
338 
339       delete layout;
340     }
341   }
342 
343   //  Close files and clean up.
344 
345   AS_UTL_closeFile(logFile);
346 
347   delete [] olapThresh;
348   delete [] ovl;
349   delete    corStore;
350   delete    ovlStore;
351 
352   delete seqStore;
353 
354   fprintf(stderr, "Bye.\n");
355 
356   return(0);
357 }
358