1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "trimReads.H"
19 #include "trimStat.H"
20 #include "clearRangeFile.H"
21
22 #include "strings.H"
23
24
25
26
27 // Enforce any maximum clear range, if it exists (mbgn < mend)
28 //
29 // There are six cases:
30 //
31 // ---MAX-RANGE---
32 // ---
33 // -------------------
34 // -----
35 // -----
36 // -----
37 // ---
38 //
39 // If the begin is below the max-bgn, we reset it to max-bgn.
40 // If the end is after the max-end, we reset it to max-end.
41 //
42 // If after the resets we have an invalid clear (bgn > end)
43 // the original clear range was completely outside the max range.
44 //
45 bool
enforceMaximumClearRange(uint32 readID,uint32 UNUSED (ibgn),uint32 UNUSED (iend),uint32 & fbgn,uint32 & fend,char * logMsg,clearRangeFile * maxClr)46 enforceMaximumClearRange(uint32 readID,
47 uint32 UNUSED(ibgn),
48 uint32 UNUSED(iend),
49 uint32 &fbgn,
50 uint32 &fend,
51 char *logMsg,
52 clearRangeFile *maxClr) {
53
54 if (maxClr == NULL)
55 return(true);
56
57 if (fbgn == fend)
58 return(true);
59
60 uint32 mbgn = maxClr->bgn(readID);
61 uint32 mend = maxClr->end(readID);
62
63 assert(mbgn < mend);
64 assert(fbgn <= fend);
65
66 if ((fend < mbgn) ||
67 (mend < fbgn)) {
68 // Final clear not intersecting maximum clear.
69 strcat(logMsg, (logMsg[0]) ? " - " : "\t");
70 strcat(logMsg, "outside maximum allowed clear range");
71 return(false);
72
73 } else if ((fbgn < mbgn) ||
74 (mend < fend)) {
75 // Final clear extends outside the maximum clear.
76 fbgn = std::max(fbgn, mbgn);
77 fend = std::min(fend, mend);
78
79 strcat(logMsg, (logMsg[0]) ? " - " : "\t");
80 strcat(logMsg, "adjusted to obey maximum allowed clear range");
81 return(true);
82
83 } else {
84 // Final clear already within the maximum clear.
85 return(true);
86 }
87 }
88
89
90
91 int
main(int argc,char ** argv)92 main(int argc, char **argv) {
93 char *seqName = 0L;
94 char *ovsName = 0L;
95
96 char *iniClrName = NULL;
97 char *maxClrName = NULL;
98 char *outClrName = NULL;
99
100 uint32 errorValue = AS_OVS_encodeEvalue(0.015);
101 uint32 minAlignLength = 40;
102 uint32 minReadLength = 64;
103
104 char *outputPrefix = NULL;
105 char logName[FILENAME_MAX] = {0};
106 char sumName[FILENAME_MAX] = {0};
107 FILE *logFile = 0L;
108 FILE *staFile = 0L;
109
110 uint32 idMin = 1;
111 uint32 idMax = UINT32_MAX;
112
113 uint32 minEvidenceOverlap = 40;
114 uint32 minEvidenceCoverage = 1;
115
116 // Statistics on the trimming
117
118 trimStat readsIn; // Read is eligible for trimming
119 trimStat deletedIn; // Read was deleted already
120 trimStat noTrimIn; // Read not requesting trimming
121
122 trimStat readsOut; // Read was trimmed to a valid read
123 trimStat noOvlOut; // Read was deleted; no ovelaps
124 trimStat deletedOut; // Read was deleted; too small after trimming
125 trimStat noChangeOut; // Read was untrimmed
126
127 trimStat trim5; // Bases trimmed from the 5' end
128 trimStat trim3;
129
130
131 argc = AS_configure(argc, argv, 1);
132
133 int arg=1;
134 int err=0;
135 while (arg < argc) {
136 if (strcmp(argv[arg], "-S") == 0) {
137 seqName = argv[++arg];
138
139 } else if (strcmp(argv[arg], "-O") == 0) {
140 ovsName = argv[++arg];
141
142 } else if (strcmp(argv[arg], "-Ci") == 0) {
143 iniClrName = argv[++arg];
144 } else if (strcmp(argv[arg], "-Cm") == 0) {
145 maxClrName = argv[++arg];
146 } else if (strcmp(argv[arg], "-Co") == 0) {
147 outClrName = argv[++arg];
148
149 } else if (strcmp(argv[arg], "-e") == 0) {
150 double erate = atof(argv[++arg]);
151 errorValue = AS_OVS_encodeEvalue(erate);
152
153 } else if (strcmp(argv[arg], "-l") == 0) {
154 minAlignLength = atoi(argv[++arg]);
155
156 } else if (strcmp(argv[arg], "-minlength") == 0) {
157 minReadLength = atoi(argv[++arg]);
158
159 } else if (strcmp(argv[arg], "-ol") == 0) {
160 minEvidenceOverlap = atoi(argv[++arg]);
161
162 } else if (strcmp(argv[arg], "-oc") == 0) {
163 minEvidenceCoverage = atoi(argv[++arg]);
164
165 } else if (strcmp(argv[arg], "-o") == 0) {
166 outputPrefix = argv[++arg];
167
168 } else if (strcmp(argv[arg], "-t") == 0) {
169 decodeRange(argv[++arg], idMin, idMax);
170
171 } else {
172 fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
173 err++;
174 }
175
176 arg++;
177 }
178 if ((seqName == NULL) ||
179 (ovsName == NULL) ||
180 (outClrName == NULL) ||
181 (outputPrefix == NULL) ||
182 (err)) {
183 fprintf(stderr, "usage: %s -S seqStore -O ovlStore -Co output.clearFile -o outputPrefix\n", argv[0]);
184 fprintf(stderr, "\n");
185 fprintf(stderr, " -S seqStore path to read store\n");
186 fprintf(stderr, " -O ovlStore path to overlap store\n");
187 fprintf(stderr, "\n");
188 fprintf(stderr, " -o name output prefix, for logging\n");
189 fprintf(stderr, "\n");
190 fprintf(stderr, " -t bgn-end limit processing to only reads from bgn to end (inclusive)\n");
191 fprintf(stderr, "\n");
192 fprintf(stderr, " -Ci clearFile path to input clear ranges (NOT SUPPORTED)\n");
193 //fprintf(stderr, " -Cm clearFile path to maximal clear ranges\n");
194 fprintf(stderr, " -Co clearFile path to ouput clear ranges\n");
195 fprintf(stderr, "\n");
196 fprintf(stderr, " -e erate ignore overlaps with more than 'erate' percent error\n");
197 //fprintf(stderr, " -l length ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n");
198 fprintf(stderr, "\n");
199 fprintf(stderr, " -ol l the minimum evidence overlap length\n");
200 fprintf(stderr, " -oc c the minimum evidence overlap coverage\n");
201 fprintf(stderr, " evidence overlaps must overlap by 'l' bases to be joined, and\n");
202 fprintf(stderr, " must be at least 'c' deep to be retained\n");
203 fprintf(stderr, "\n");
204 fprintf(stderr, " -minlength l reads trimmed below this many bases are deleted\n");
205 fprintf(stderr, "\n");
206 exit(1);
207 }
208
209 sqStore *seq = new sqStore(seqName);
210 ovStore *ovs = new ovStore(ovsName, seq);
211
212 clearRangeFile *iniClr = (iniClrName == NULL) ? NULL : new clearRangeFile(iniClrName, seq);
213 clearRangeFile *maxClr = (maxClrName == NULL) ? NULL : new clearRangeFile(maxClrName, seq);
214 clearRangeFile *outClr = new clearRangeFile(outClrName, seq);
215
216 if (outClr)
217 // If the outClr file exists, those clear ranges are loaded. We need to reset them
218 // back to 'untrimmed' for now.
219 outClr->reset(seq);
220
221 if (iniClr && outClr)
222 // An iniClr file was supplied, so use those as the initial clear ranges.
223 outClr->copy(iniClr);
224
225
226 if (outputPrefix) {
227 snprintf(logName, FILENAME_MAX, "%s.log", outputPrefix);
228
229 logFile = AS_UTL_openOutputFile(logName);
230
231 fprintf(logFile, "id\tinitL\tinitR\tfinalL\tfinalR\tmessage (DEL=deleted NOC=no change MOD=modified)\n");
232 }
233
234
235 uint32 ovlLen = 0;
236 uint32 ovlMax = 0;
237 ovOverlap *ovl = NULL;
238
239 char logMsg[1024] = {0};
240
241 if (idMin < 1)
242 idMin = 1;
243 if (idMax > seq->sqStore_lastReadID())
244 idMax = seq->sqStore_lastReadID();
245
246 fprintf(stderr, "Processing from ID " F_U32 " to " F_U32 " out of " F_U32 " reads.\n",
247 idMin,
248 idMax,
249 seq->sqStore_lastReadID());
250
251
252 for (uint32 id=idMin; id<=idMax; id++) {
253 sqLibrary *libr = seq->sqStore_getLibraryForRead(id);
254
255 logMsg[0] = 0;
256
257 // If the fragment is deleted, do nothing. If the fragment was deleted AFTER overlaps were
258 // generated, then the overlaps will be out of sync -- we'll get overlaps for these fragments
259 // we skip.
260 //
261 if ((iniClr) && (iniClr->isDeleted(id) == true)) {
262 deletedIn += seq->sqStore_getReadLength(id);
263 continue;
264 }
265
266 // If it did not request trimming, do nothing. Similar to the above, we'll get overlaps to
267 // fragments we skip.
268 //
269 #if 0
270 // (yes, this is nonsense)
271 if ((libr->sqLibrary_finalTrim() == SQ_FINALTRIM_LARGEST_COVERED) &&
272 (libr->sqLibrary_finalTrim() == SQ_FINALTRIM_BEST_EDGE)) {
273 noTrimIn += seq->sqStore_getReadLength(id);
274 continue;
275 }
276 #endif
277
278 readsIn += seq->sqStore_getReadLength(id);
279
280
281 // Decide on the initial trimming. We copied any iniClr into outClr above, and if there wasn't
282 // an iniClr, then outClr is the full read.
283
284 uint32 ibgn = outClr->bgn(id);
285 uint32 iend = outClr->end(id);
286
287 // Set the, ahem, initial final trimming.
288
289 bool isGood = false;
290 uint32 fbgn = ibgn;
291 uint32 fend = iend;
292
293 // Load overlaps.
294
295 ovlLen = ovs->loadOverlapsForRead(id, ovl, ovlMax);
296
297 // Trim!
298
299 // No overlaps, so mark it as junk.
300 if (ovlLen == 0) {
301 isGood = false;
302 }
303
304 // Use the largest region covered by overlaps as the trim
305 else {
306
307 assert(ovlLen > 0);
308 assert(id == ovl[0].a_iid);
309
310 isGood = largestCovered(ovl, ovlLen,
311 id, seq->sqStore_getReadLength(id),
312 ibgn, iend, fbgn, fend,
313 logMsg,
314 errorValue,
315 minEvidenceOverlap,
316 minEvidenceCoverage,
317 minReadLength);
318 assert(fbgn <= fend);
319 }
320
321 #if 0
322 // Use the largest region covered by overlaps as the trim
323 else if (libr->sqLibrary_finalTrim() == SQ_FINALTRIM_BEST_EDGE) {
324
325 assert(ovlLen > 0);
326 assert(id == ovl[0].a_iid);
327
328 isGood = bestEdge(ovl, ovlLen,
329 id, seq->sqStore_getReadLength(id),
330 ibgn, iend, fbgn, fend,
331 logMsg,
332 errorValue,
333 minEvidenceOverlap,
334 minEvidenceCoverage,
335 minReadLength);
336 assert(fbgn <= fend);
337 }
338
339 // Do nothing. Really shouldn't get here.
340 else {
341 assert(0);
342 continue;
343 }
344 #endif
345
346 // Enforce the maximum clear range
347
348 if ((isGood) && (maxClr)) {
349 isGood = enforceMaximumClearRange(id,
350 ibgn, iend, fbgn, fend,
351 logMsg,
352 maxClr);
353 assert(fbgn <= fend);
354 }
355
356 //
357 // Trimmed. Make sense of the result, write some logs, and update the output.
358 //
359
360
361 // If bad trimming or too small, write the log and keep going.
362 //
363 if (ovlLen == 0) {
364 noOvlOut += seq->sqStore_getReadLength(id);
365
366 outClr->setbgn(id) = fbgn;
367 outClr->setend(id) = fend;
368 outClr->setDeleted(id); // Gah, just obliterates the clear range.
369
370 fprintf(logFile, F_U32"\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tNOV%s\n",
371 id,
372 ibgn, iend,
373 fbgn, fend,
374 (logMsg[0] == 0) ? "" : logMsg);
375 }
376
377 else if ((isGood == false) || (fend - fbgn < minReadLength)) {
378 deletedOut += seq->sqStore_getReadLength(id);
379
380 outClr->setbgn(id) = fbgn;
381 outClr->setend(id) = fend;
382 outClr->setDeleted(id); // Gah, just obliterates the clear range.
383
384 fprintf(logFile, F_U32"\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tDEL%s\n",
385 id,
386 ibgn, iend,
387 fbgn, fend,
388 (logMsg[0] == 0) ? "" : logMsg);
389 }
390
391 // If we didn't change anything, also write a log.
392 //
393 else if ((ibgn == fbgn) &&
394 (iend == fend)) {
395 noChangeOut += seq->sqStore_getReadLength(id);
396
397 fprintf(logFile, F_U32"\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tNOC%s\n",
398 id,
399 ibgn, iend,
400 fbgn, fend,
401 (logMsg[0] == 0) ? "" : logMsg);
402 continue;
403 }
404
405 // Otherwise, we actually did something.
406
407 else {
408 readsOut += fend - fbgn;
409
410 outClr->setbgn(id) = fbgn;
411 outClr->setend(id) = fend;
412
413 assert(ibgn <= fbgn);
414 assert(fend <= iend);
415
416 if (fbgn - ibgn > 0) trim5 += fbgn - ibgn;
417 if (iend - fend > 0) trim3 += iend - fend;
418
419 fprintf(logFile, F_U32"\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tMOD%s\n",
420 id,
421 ibgn, iend,
422 fbgn, fend,
423 (logMsg[0] == 0) ? "" : logMsg);
424 }
425 }
426
427 // Clean up.
428
429 delete seq;
430
431 delete [] ovl;
432 delete ovs;
433
434 delete iniClr;
435 delete maxClr;
436 delete outClr;
437
438 AS_UTL_closeFile(logFile, logName);
439
440 // should fprintf() the numbers directly here so an explanation of each category can be supplied;
441 // simpler for now to have report() do it.
442
443 // Dump the statistics and plots
444
445 if (outputPrefix) {
446 snprintf(sumName, FILENAME_MAX, "%s.stats", outputPrefix);
447
448 staFile = AS_UTL_openOutputFile(sumName);
449 }
450
451 if (staFile == NULL)
452 staFile = stderr;
453
454 fprintf(staFile, "PARAMETERS:\n");
455 fprintf(staFile, "----------\n");
456 fprintf(staFile, "%7u (reads trimmed below this many bases are deleted)\n", minReadLength);
457 fprintf(staFile, "%7.4f (use overlaps at or below this fraction error)\n", AS_OVS_decodeEvalue(errorValue));
458 fprintf(staFile, "%7u (break region if overlap is less than this long, for 'largest covered' algorithm)\n", minEvidenceOverlap);
459 fprintf(staFile, "%7u (break region if overlap coverage is less than this many read%s, for 'largest covered' algorithm)\n", minEvidenceCoverage, (minEvidenceCoverage == 1) ? "" : "s");
460 fprintf(staFile, "\n");
461
462 fprintf(staFile, "INPUT READS:\n");
463 fprintf(staFile, "-----------\n");
464 fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads processed)\n", readsIn.nReads, readsIn.nBases);
465 fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads not processed, previously deleted)\n", deletedIn.nReads, deletedIn.nBases);
466 fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads not processed, in a library where trimming isn't allowed)\n", noTrimIn.nReads, noTrimIn.nBases);
467
468 readsIn .generatePlots(outputPrefix, "inputReads", 250);
469 deletedIn.generatePlots(outputPrefix, "inputDeletedReads", 250);
470 noTrimIn .generatePlots(outputPrefix, "inputNoTrimReads", 250);
471
472 fprintf(staFile, "\n");
473 fprintf(staFile, "OUTPUT READS:\n");
474 fprintf(staFile, "------------\n");
475 fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (trimmed reads output)\n", readsOut.nReads, readsOut.nBases);
476 fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads with no change, kept as is)\n", noChangeOut.nReads, noChangeOut.nBases);
477 fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads with no overlaps, deleted)\n", noOvlOut.nReads, noOvlOut.nBases);
478 fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads with short trimmed length, deleted)\n", deletedOut.nReads, deletedOut.nBases);
479
480 readsOut .generatePlots(outputPrefix, "outputTrimmedReads", 250);
481 noOvlOut .generatePlots(outputPrefix, "outputNoOvlReads", 250);
482 deletedOut .generatePlots(outputPrefix, "outputDeletedReads", 250);
483 noChangeOut.generatePlots(outputPrefix, "outputUnchangedReads", 250);
484
485 fprintf(staFile, "\n");
486 fprintf(staFile, "TRIMMING DETAILS:\n");
487 fprintf(staFile, "----------------\n");
488 fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (bases trimmed from the 5' end of a read)\n", trim5.nReads, trim5.nBases);
489 fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (bases trimmed from the 3' end of a read)\n", trim3.nReads, trim3.nBases);
490
491 trim5.generatePlots(outputPrefix, "trim5", 25);
492 trim3.generatePlots(outputPrefix, "trim3", 25);
493
494 AS_UTL_closeFile(staFile, sumName);
495
496 // Buh-bye.
497
498 exit(0);
499 }
500