1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "runtime.H"
19 #include "sqStore.H"
20 #include "ovStore.H"
21 #include "tgStore.H"
22
23 #include "strings.H"
24 #include "files.H"
25 #include "intervalList.H"
26 #include "sequence.H"
27
28 #include <set>
29
30 // Debugging on which reads are filtered, which are used, and which are removed
31 // to meet coverage thresholds. Very big.
32 #undef DEBUG_LAYOUT
33
34 //FILE *flgFile = stderr;
35 FILE *flgFile = NULL;
36
37
38
39 uint16 *
loadThresholds(sqStore * seqStore,ovStore * ovlStore,char * scoreName,uint32 expectedCoverage,FILE * scoFile)40 loadThresholds(sqStore *seqStore,
41 ovStore *ovlStore,
42 char *scoreName,
43 uint32 expectedCoverage,
44 FILE *scoFile) {
45 uint32 numReads = seqStore->sqStore_lastReadID();
46 uint16 *olapThresh = new uint16 [numReads + 1];
47
48 if (scoreName != NULL)
49 AS_UTL_loadFile(scoreName, olapThresh, numReads + 1);
50
51 else {
52 ovStoreHistogram *ovlHisto = ovlStore->getHistogram();
53
54 for (uint32 ii=0; ii<numReads+1; ii++)
55 olapThresh[ii] = ovlHisto->overlapScoreEstimate(ii, expectedCoverage, scoFile);
56
57 delete ovlHisto;
58 }
59
60 return(olapThresh);
61 }
62
63
64
65 static
66 void
generateLayout(tgTig * layout,uint16 * olapThresh,uint32 minEvidenceLength,double maxEvidenceErate,double maxEvidenceCoverage,ovOverlap * ovl,uint32 ovlLen,FILE * logFile)67 generateLayout(tgTig *layout,
68 uint16 *olapThresh,
69 uint32 minEvidenceLength,
70 double maxEvidenceErate,
71 double maxEvidenceCoverage,
72 ovOverlap *ovl,
73 uint32 ovlLen,
74 FILE *logFile) {
75
76 // Generate a layout for the read in ovl[0].a_iid, using most or all of the overlaps in ovl.
77
78 layout->allocateChildren(ovlLen);
79
80 if (logFile)
81 fprintf(logFile, "Generate layout for read " F_U32 " length " F_U32 " using up to " F_U32 " overlaps.\n",
82 layout->_tigID, layout->_layoutLen, ovlLen);
83
84 std::set<uint32_t> children;
85
86 for (uint32 oo=0; oo<ovlLen; oo++) {
87 uint64 ovlLength = ovl[oo].b_len();
88 uint16 ovlScore = ovl[oo].overlapScore(true);
89
90 if (ovlLength > AS_MAX_READLEN) {
91 char ovlString[1024];
92 fprintf(stderr, "ERROR: bogus overlap '%s'\n", ovl[oo].toString(ovlString, ovOverlapAsCoords, false));
93 }
94 assert(ovlLength < AS_MAX_READLEN);
95
96 if (ovl[oo].erate() > maxEvidenceErate) {
97 if (logFile)
98 fprintf(logFile, " filter read %9u at position %6u,%6u length %5lu erate %.4f - low quality (threshold %.4f)\n",
99 ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate(), maxEvidenceErate);
100 continue;
101 }
102
103 if (ovl[oo].a_end() - ovl[oo].a_bgn() < minEvidenceLength) {
104 if (logFile)
105 fprintf(logFile, " filter read %9u at position %6u,%6u length %5lu erate %.4f - too short (threshold %u)\n",
106 ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate(), minEvidenceLength);
107 continue;
108 }
109
110 if ((olapThresh != NULL) &&
111 (ovlScore < olapThresh[ovl[oo].b_iid])) {
112 if (logFile)
113 fprintf(logFile, " filter read %9u at position %6u,%6u length %5lu erate %.4f - filtered by global filter (" F_U16 " < " F_U16 ")\n",
114 ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate(), ovlScore, olapThresh[ovl[oo].b_iid]);
115 continue;
116 }
117
118 if (children.find(ovl[oo].b_iid) != children.end()) {
119 if (logFile)
120 fprintf(logFile, " filter read %9u at position %6u,%6u length %5lu erate %.4f - duplicate\n",
121 ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate());
122 continue;
123 }
124
125 if (logFile)
126 fprintf(logFile, " allow read %9u at position %6u,%6u length %5lu erate %.4f - allowed by global filter (" F_U16 " > " F_U16 ")\n",
127 ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate(), ovlScore, olapThresh[ovl[oo].b_iid]);
128
129 tgPosition *pos = layout->addChild();
130
131 // Set the read. Parent is always the read we're building for, hangs and position come from
132 // the overlap. Easy as pie!
133
134 if (ovl[oo].flipped() == false) {
135 pos->set(ovl[oo].b_iid,
136 ovl[oo].a_iid,
137 ovl[oo].a_hang(),
138 ovl[oo].b_hang(),
139 ovl[oo].a_bgn(), ovl[oo].a_end());
140
141 } else {
142 pos->set(ovl[oo].b_iid,
143 ovl[oo].a_iid,
144 ovl[oo].a_hang(),
145 ovl[oo].b_hang(),
146 ovl[oo].a_end(), ovl[oo].a_bgn());
147 }
148
149 // Remember the unaligned bit!
150
151 pos->_askip = ovl[oo].dat.ovl.bhg5;
152 pos->_bskip = ovl[oo].dat.ovl.bhg3;
153
154 // Remember we added this read - to filter read with both fwd/rev overlaps.
155
156 children.insert(ovl[oo].b_iid);
157 }
158
159 // Dro excess coverage in the evidence by dropping short reads. This also
160 // sorts by position.
161
162 layout->dropExcessCoverage(maxEvidenceCoverage);
163 }
164
165
166
167
168
169 int
main(int argc,char ** argv)170 main(int argc, char **argv) {
171 char *seqName = 0L;
172 char *ovlName = 0L;
173 char *corName = 0L;
174 char *flgName = 0L;
175
176 char *scoreName = 0L;
177
178 uint32 errorRate = AS_OVS_encodeEvalue(0.015);
179
180 bool dumpScores = false;
181 bool doLogging = false;
182
183 uint32 expectedCoverage = 40; // How many overlaps per read to save, global filter
184
185 uint32 iidMin = 1;
186 uint32 iidMax = UINT32_MAX;
187
188 uint32 minEvidenceLength = 0;
189 double maxEvidenceErate = 1.0;
190 double maxEvidenceCoverage = DBL_MAX;
191
192
193 argc = AS_configure(argc, argv, 1);
194
195 int arg=1;
196 int err=0;
197
198 while (arg < argc) {
199 if (strcmp(argv[arg], "-S") == 0) { // INPUTS
200 seqName = argv[++arg];
201
202 } else if (strcmp(argv[arg], "-O") == 0) {
203 ovlName = argv[++arg];
204
205 } else if (strcmp(argv[arg], "-scores") == 0) {
206 scoreName = argv[++arg];
207
208
209 } else if (strcmp(argv[arg], "-C") == 0) { // OUTPUT FORMAT
210 corName = argv[++arg];
211
212
213 } else if (strcmp(argv[arg], "-b") == 0) { // READ SELECTION
214 iidMin = strtouint32(argv[++arg]);
215
216 } else if (strcmp(argv[arg], "-e") == 0) {
217 iidMax = strtouint32(argv[++arg]);
218
219 } else if (strcmp(argv[arg], "-eL") == 0) { // EVIDENCE SELECTION
220 minEvidenceLength = strtouint32(argv[++arg]);
221
222 } else if (strcmp(argv[arg], "-eE") == 0) {
223 maxEvidenceErate = strtodouble(argv[++arg]);
224
225 } else if (strcmp(argv[arg], "-eC") == 0) {
226 maxEvidenceCoverage = strtodouble(argv[++arg]);
227
228 } else if (strcmp(argv[arg], "-xC") == 0) {
229 expectedCoverage = strtouint32(argv[++arg]);
230
231 } else if (strcmp(argv[arg], "-V") == 0) {
232 doLogging = true;
233
234 } else if (strcmp(argv[arg], "-D") == 0) {
235 dumpScores = true;
236
237 } else {
238 fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
239 err++;
240 }
241
242 arg++;
243 }
244 if (seqName == NULL)
245 err++;
246 if (corName == NULL)
247 err++;
248 if (err) {
249 fprintf(stderr, "usage: %s -S seqStore -O ovlStore ...\n", argv[0]);
250 fprintf(stderr, "\n");
251 fprintf(stderr, "INPUTS\n");
252 fprintf(stderr, " -S seqStore mandatory path to seqStore\n");
253 fprintf(stderr, " -O ovlStore mandatory path to ovlStore\n");
254 fprintf(stderr, "\n");
255 fprintf(stderr, " -scores sf overlap score thresholds (from filterCorrectionOverlaps)\n");
256 fprintf(stderr, " if not supplied, will be estimated from ovlStore\n");
257 fprintf(stderr, "\n");
258 fprintf(stderr, "OUTPUTS\n");
259 fprintf(stderr, " -C corStore output layouts to store 'corStore'\n");
260 fprintf(stderr, " -V write extremely verbose logging to 'corStore.log'\n");
261 fprintf(stderr, " -D dump the data used to estimate overlap scores to 'corStore.scores'\n");
262 fprintf(stderr, "\n");
263 fprintf(stderr, "READ SELECTION\n");
264 fprintf(stderr, " -b bgnID process reads starting at bgnID\n");
265 fprintf(stderr, " -e endID process reads up to but not including endID\n");
266 fprintf(stderr, "\n");
267 fprintf(stderr, "EVIDENCE SELECTION\n");
268 fprintf(stderr, " -eL length minimum length of evidence overlaps\n");
269 fprintf(stderr, " -eE erate maximum error rate of evidence overlaps\n");
270 fprintf(stderr, " -eC coverage maximum coverage of evidence reads to emit\n");
271 fprintf(stderr, "\n");
272 fprintf(stderr, " -xC coverage estimated coverage in input reads\n");
273 fprintf(stderr, "\n");
274
275 if (seqName == NULL)
276 fprintf(stderr, "ERROR: no input seqStore (-S) supplied.\n");
277 if (corName == NULL)
278 fprintf(stderr, "ERROR: no output corStore (-C) supplied.\n");
279 exit(1);
280 }
281
282 // Open inputs and output tigStore.
283
284 sqRead_setDefaultVersion(sqRead_raw);
285
286 sqStore *seqStore = new sqStore(seqName);
287 ovStore *ovlStore = new ovStore(ovlName, seqStore);
288 tgStore *corStore = new tgStore(corName);
289
290 uint32 numReads = seqStore->sqStore_lastReadID();
291
292 // Threshold the range of reads to operate on.
293
294 if (numReads < iidMin) {
295 fprintf(stderr, "ERROR: only " F_U32 " reads in the store (IDs 0-" F_U32 " inclusive); can't process requested range -b " F_U32 " -e " F_U32 "\n",
296 numReads,
297 numReads-1,
298 iidMin, iidMax);
299 exit(1);
300 }
301 if (numReads < iidMax)
302 iidMax = numReads;
303
304 ovlStore->setRange(iidMin, iidMax);
305
306 // Open logging and summary files
307
308 FILE *logFile = AS_UTL_openOutputFile(corName, '.', "log", doLogging);
309 FILE *scoFile = AS_UTL_openOutputFile(corName, '.', "scores", dumpScores);
310
311 // Load read scores, if supplied.
312
313 uint16 *olapThresh = loadThresholds(seqStore, ovlStore, scoreName, expectedCoverage, scoFile);
314
315 // Initialize processing.
316
317 uint32 ovlMax = 0;
318 ovOverlap *ovl = NULL;
319
320 // And process.
321
322 for (uint32 rr=1; rr<numReads+1; rr++) {
323 uint32 ovlLen = ovlStore->loadOverlapsForRead(rr, ovl, ovlMax);
324
325 if (ovlLen > 0) {
326 tgTig *layout = new tgTig;
327
328 layout->_tigID = rr;
329 layout->_layoutLen = seqStore->sqStore_getReadLength(rr, sqRead_raw);
330
331 generateLayout(layout,
332 olapThresh,
333 minEvidenceLength, maxEvidenceErate, maxEvidenceCoverage,
334 ovl, ovlLen,
335 logFile);
336
337 corStore->insertTig(layout, false);
338
339 delete layout;
340 }
341 }
342
343 // Close files and clean up.
344
345 AS_UTL_closeFile(logFile);
346
347 delete [] olapThresh;
348 delete [] ovl;
349 delete corStore;
350 delete ovlStore;
351
352 delete seqStore;
353
354 fprintf(stderr, "Bye.\n");
355
356 return(0);
357 }
358