1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "trimReads.H"
19 #include "trimStat.H"
20 #include "clearRangeFile.H"
21 
22 #include "strings.H"
23 
24 
25 
26 
27 //  Enforce any maximum clear range, if it exists (mbgn < mend)
28 //
29 //  There are six cases:
30 //
31 //       ---MAX-RANGE---
32 //   ---
33 //     -------------------
34 //     -----
35 //            -----
36 //                   -----
37 //                       ---
38 //
39 //  If the begin is below the max-bgn, we reset it to max-bgn.
40 //  If the end   is after the max-end, we reset it to max-end.
41 //
42 //  If after the resets we have an invalid clear (bgn > end)
43 //  the original clear range was completely outside the max range.
44 //
45 bool
enforceMaximumClearRange(uint32 readID,uint32 UNUSED (ibgn),uint32 UNUSED (iend),uint32 & fbgn,uint32 & fend,char * logMsg,clearRangeFile * maxClr)46 enforceMaximumClearRange(uint32           readID,
47                          uint32    UNUSED(ibgn),
48                          uint32    UNUSED(iend),
49                          uint32          &fbgn,
50                          uint32          &fend,
51                          char            *logMsg,
52                          clearRangeFile  *maxClr) {
53 
54   if (maxClr == NULL)
55     return(true);
56 
57   if (fbgn == fend)
58     return(true);
59 
60   uint32 mbgn = maxClr->bgn(readID);
61   uint32 mend = maxClr->end(readID);
62 
63   assert(mbgn <  mend);
64   assert(fbgn <= fend);
65 
66   if ((fend < mbgn) ||
67       (mend < fbgn)) {
68     //  Final clear not intersecting maximum clear.
69     strcat(logMsg, (logMsg[0]) ? " - " : "\t");
70     strcat(logMsg, "outside maximum allowed clear range");
71     return(false);
72 
73   } else if ((fbgn < mbgn) ||
74              (mend < fend)) {
75     //  Final clear extends outside the maximum clear.
76     fbgn = std::max(fbgn, mbgn);
77     fend = std::min(fend, mend);
78 
79     strcat(logMsg, (logMsg[0]) ? " - " : "\t");
80     strcat(logMsg, "adjusted to obey maximum allowed clear range");
81     return(true);
82 
83   } else {
84     //  Final clear already within the maximum clear.
85     return(true);
86   }
87 }
88 
89 
90 
91 int
main(int argc,char ** argv)92 main(int argc, char **argv) {
93   char       *seqName = 0L;
94   char       *ovsName = 0L;
95 
96   char       *iniClrName = NULL;
97   char       *maxClrName = NULL;
98   char       *outClrName = NULL;
99 
100   uint32      errorValue     = AS_OVS_encodeEvalue(0.015);
101   uint32      minAlignLength = 40;
102   uint32      minReadLength  = 64;
103 
104   char       *outputPrefix  = NULL;
105   char        logName[FILENAME_MAX] = {0};
106   char        sumName[FILENAME_MAX] = {0};
107   FILE       *logFile = 0L;
108   FILE       *staFile = 0L;
109 
110   uint32      idMin = 1;
111   uint32      idMax = UINT32_MAX;
112 
113   uint32      minEvidenceOverlap  = 40;
114   uint32      minEvidenceCoverage = 1;
115 
116   //  Statistics on the trimming
117 
118   trimStat    readsIn;      //  Read is eligible for trimming
119   trimStat    deletedIn;    //  Read was deleted already
120   trimStat    noTrimIn;     //  Read not requesting trimming
121 
122   trimStat    readsOut;     //  Read was trimmed to a valid read
123   trimStat    noOvlOut;     //  Read was deleted; no ovelaps
124   trimStat    deletedOut;   //  Read was deleted; too small after trimming
125   trimStat    noChangeOut;  //  Read was untrimmed
126 
127   trimStat    trim5;        //  Bases trimmed from the 5' end
128   trimStat    trim3;
129 
130 
131   argc = AS_configure(argc, argv, 1);
132 
133   int arg=1;
134   int err=0;
135   while (arg < argc) {
136     if        (strcmp(argv[arg], "-S") == 0) {
137       seqName = argv[++arg];
138 
139     } else if (strcmp(argv[arg], "-O") == 0) {
140       ovsName = argv[++arg];
141 
142     } else if (strcmp(argv[arg], "-Ci") == 0) {
143       iniClrName = argv[++arg];
144     } else if (strcmp(argv[arg], "-Cm") == 0) {
145       maxClrName = argv[++arg];
146     } else if (strcmp(argv[arg], "-Co") == 0) {
147       outClrName = argv[++arg];
148 
149     } else if (strcmp(argv[arg], "-e") == 0) {
150       double erate = atof(argv[++arg]);
151       errorValue = AS_OVS_encodeEvalue(erate);
152 
153     } else if (strcmp(argv[arg], "-l") == 0) {
154       minAlignLength = atoi(argv[++arg]);
155 
156     } else if (strcmp(argv[arg], "-minlength") == 0) {
157       minReadLength = atoi(argv[++arg]);
158 
159     } else if (strcmp(argv[arg], "-ol") == 0) {
160       minEvidenceOverlap = atoi(argv[++arg]);
161 
162     } else if (strcmp(argv[arg], "-oc") == 0) {
163       minEvidenceCoverage = atoi(argv[++arg]);
164 
165     } else if (strcmp(argv[arg], "-o") == 0) {
166       outputPrefix = argv[++arg];
167 
168     } else if (strcmp(argv[arg], "-t") == 0) {
169       decodeRange(argv[++arg], idMin, idMax);
170 
171     } else {
172       fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
173       err++;
174     }
175 
176     arg++;
177   }
178   if ((seqName       == NULL) ||
179       (ovsName       == NULL) ||
180       (outClrName    == NULL) ||
181       (outputPrefix  == NULL) ||
182       (err)) {
183     fprintf(stderr, "usage: %s -S seqStore -O ovlStore -Co output.clearFile -o outputPrefix\n", argv[0]);
184     fprintf(stderr, "\n");
185     fprintf(stderr, "  -S seqStore    path to read store\n");
186     fprintf(stderr, "  -O ovlStore    path to overlap store\n");
187     fprintf(stderr, "\n");
188     fprintf(stderr, "  -o name        output prefix, for logging\n");
189     fprintf(stderr, "\n");
190     fprintf(stderr, "  -t bgn-end     limit processing to only reads from bgn to end (inclusive)\n");
191     fprintf(stderr, "\n");
192     fprintf(stderr, "  -Ci clearFile  path to input clear ranges (NOT SUPPORTED)\n");
193     //fprintf(stderr, "  -Cm clearFile  path to maximal clear ranges\n");
194     fprintf(stderr, "  -Co clearFile  path to ouput clear ranges\n");
195     fprintf(stderr, "\n");
196     fprintf(stderr, "  -e erate       ignore overlaps with more than 'erate' percent error\n");
197     //fprintf(stderr, "  -l length      ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n");
198     fprintf(stderr, "\n");
199     fprintf(stderr, "  -ol l          the minimum evidence overlap length\n");
200     fprintf(stderr, "  -oc c          the minimum evidence overlap coverage\n");
201     fprintf(stderr, "                   evidence overlaps must overlap by 'l' bases to be joined, and\n");
202     fprintf(stderr, "                   must be at least 'c' deep to be retained\n");
203     fprintf(stderr, "\n");
204     fprintf(stderr, "  -minlength l   reads trimmed below this many bases are deleted\n");
205     fprintf(stderr, "\n");
206     exit(1);
207   }
208 
209   sqStore          *seq = new sqStore(seqName);
210   ovStore          *ovs = new ovStore(ovsName, seq);
211 
212   clearRangeFile   *iniClr = (iniClrName == NULL) ? NULL : new clearRangeFile(iniClrName, seq);
213   clearRangeFile   *maxClr = (maxClrName == NULL) ? NULL : new clearRangeFile(maxClrName, seq);
214   clearRangeFile   *outClr =                               new clearRangeFile(outClrName, seq);
215 
216   if (outClr)
217     //  If the outClr file exists, those clear ranges are loaded.  We need to reset them
218     //  back to 'untrimmed' for now.
219     outClr->reset(seq);
220 
221   if (iniClr && outClr)
222     //  An iniClr file was supplied, so use those as the initial clear ranges.
223     outClr->copy(iniClr);
224 
225 
226   if (outputPrefix) {
227     snprintf(logName, FILENAME_MAX, "%s.log",   outputPrefix);
228 
229     logFile = AS_UTL_openOutputFile(logName);
230 
231     fprintf(logFile, "id\tinitL\tinitR\tfinalL\tfinalR\tmessage (DEL=deleted NOC=no change MOD=modified)\n");
232   }
233 
234 
235   uint32      ovlLen       = 0;
236   uint32      ovlMax       = 0;
237   ovOverlap  *ovl          = NULL;
238 
239   char        logMsg[1024] = {0};
240 
241   if (idMin < 1)
242     idMin = 1;
243   if (idMax > seq->sqStore_lastReadID())
244     idMax = seq->sqStore_lastReadID();
245 
246   fprintf(stderr, "Processing from ID " F_U32 " to " F_U32 " out of " F_U32 " reads.\n",
247           idMin,
248           idMax,
249           seq->sqStore_lastReadID());
250 
251 
252   for (uint32 id=idMin; id<=idMax; id++) {
253     sqLibrary  *libr = seq->sqStore_getLibraryForRead(id);
254 
255     logMsg[0] = 0;
256 
257     //  If the fragment is deleted, do nothing.  If the fragment was deleted AFTER overlaps were
258     //  generated, then the overlaps will be out of sync -- we'll get overlaps for these fragments
259     //  we skip.
260     //
261     if ((iniClr) && (iniClr->isDeleted(id) == true)) {
262       deletedIn += seq->sqStore_getReadLength(id);
263       continue;
264     }
265 
266     //  If it did not request trimming, do nothing.  Similar to the above, we'll get overlaps to
267     //  fragments we skip.
268     //
269 #if 0
270     //  (yes, this is nonsense)
271     if ((libr->sqLibrary_finalTrim() == SQ_FINALTRIM_LARGEST_COVERED) &&
272         (libr->sqLibrary_finalTrim() == SQ_FINALTRIM_BEST_EDGE)) {
273       noTrimIn += seq->sqStore_getReadLength(id);
274       continue;
275     }
276 #endif
277 
278     readsIn += seq->sqStore_getReadLength(id);
279 
280 
281     //  Decide on the initial trimming.  We copied any iniClr into outClr above, and if there wasn't
282     //  an iniClr, then outClr is the full read.
283 
284     uint32      ibgn   = outClr->bgn(id);
285     uint32      iend   = outClr->end(id);
286 
287     //  Set the, ahem, initial final trimming.
288 
289     bool        isGood = false;
290     uint32      fbgn   = ibgn;
291     uint32      fend   = iend;
292 
293     //  Load overlaps.
294 
295     ovlLen = ovs->loadOverlapsForRead(id, ovl, ovlMax);
296 
297     //  Trim!
298 
299     //  No overlaps, so mark it as junk.
300     if (ovlLen == 0) {
301       isGood = false;
302     }
303 
304     //  Use the largest region covered by overlaps as the trim
305     else {
306 
307       assert(ovlLen > 0);
308       assert(id == ovl[0].a_iid);
309 
310       isGood = largestCovered(ovl, ovlLen,
311                               id, seq->sqStore_getReadLength(id),
312                               ibgn, iend, fbgn, fend,
313                               logMsg,
314                               errorValue,
315                               minEvidenceOverlap,
316                               minEvidenceCoverage,
317                               minReadLength);
318       assert(fbgn <= fend);
319     }
320 
321 #if 0
322     //  Use the largest region covered by overlaps as the trim
323     else if (libr->sqLibrary_finalTrim() == SQ_FINALTRIM_BEST_EDGE) {
324 
325       assert(ovlLen > 0);
326       assert(id == ovl[0].a_iid);
327 
328       isGood = bestEdge(ovl, ovlLen,
329                         id, seq->sqStore_getReadLength(id),
330                         ibgn, iend, fbgn, fend,
331                         logMsg,
332                         errorValue,
333                         minEvidenceOverlap,
334                         minEvidenceCoverage,
335                         minReadLength);
336       assert(fbgn <= fend);
337     }
338 
339     //  Do nothing.  Really shouldn't get here.
340     else {
341       assert(0);
342       continue;
343     }
344 #endif
345 
346     //  Enforce the maximum clear range
347 
348     if ((isGood) && (maxClr)) {
349       isGood = enforceMaximumClearRange(id,
350                                         ibgn, iend, fbgn, fend,
351                                         logMsg,
352                                         maxClr);
353       assert(fbgn <= fend);
354     }
355 
356     //
357     //  Trimmed.  Make sense of the result, write some logs, and update the output.
358     //
359 
360 
361     //  If bad trimming or too small, write the log and keep going.
362     //
363     if (ovlLen == 0) {
364       noOvlOut += seq->sqStore_getReadLength(id);
365 
366       outClr->setbgn(id) = fbgn;
367       outClr->setend(id) = fend;
368       outClr->setDeleted(id);  //  Gah, just obliterates the clear range.
369 
370       fprintf(logFile, F_U32"\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tNOV%s\n",
371               id,
372               ibgn, iend,
373               fbgn, fend,
374               (logMsg[0] == 0) ? "" : logMsg);
375     }
376 
377     else if ((isGood == false) || (fend - fbgn < minReadLength)) {
378       deletedOut += seq->sqStore_getReadLength(id);
379 
380       outClr->setbgn(id) = fbgn;
381       outClr->setend(id) = fend;
382       outClr->setDeleted(id);  //  Gah, just obliterates the clear range.
383 
384       fprintf(logFile, F_U32"\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tDEL%s\n",
385               id,
386               ibgn, iend,
387               fbgn, fend,
388               (logMsg[0] == 0) ? "" : logMsg);
389     }
390 
391     //  If we didn't change anything, also write a log.
392     //
393     else if ((ibgn == fbgn) &&
394              (iend == fend)) {
395       noChangeOut += seq->sqStore_getReadLength(id);
396 
397       fprintf(logFile, F_U32"\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tNOC%s\n",
398               id,
399               ibgn, iend,
400               fbgn, fend,
401               (logMsg[0] == 0) ? "" : logMsg);
402       continue;
403     }
404 
405     //  Otherwise, we actually did something.
406 
407     else {
408       readsOut += fend - fbgn;
409 
410       outClr->setbgn(id) = fbgn;
411       outClr->setend(id) = fend;
412 
413       assert(ibgn <= fbgn);
414       assert(fend <= iend);
415 
416       if (fbgn - ibgn > 0)   trim5 += fbgn - ibgn;
417       if (iend - fend > 0)   trim3 += iend - fend;
418 
419       fprintf(logFile, F_U32"\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tMOD%s\n",
420               id,
421               ibgn, iend,
422               fbgn, fend,
423               (logMsg[0] == 0) ? "" : logMsg);
424     }
425   }
426 
427   //  Clean up.
428 
429   delete seq;
430 
431   delete [] ovl;
432   delete    ovs;
433 
434   delete    iniClr;
435   delete    maxClr;
436   delete    outClr;
437 
438   AS_UTL_closeFile(logFile, logName);
439 
440   //  should fprintf() the numbers directly here so an explanation of each category can be supplied;
441   //  simpler for now to have report() do it.
442 
443   //  Dump the statistics and plots
444 
445   if (outputPrefix) {
446     snprintf(sumName, FILENAME_MAX, "%s.stats", outputPrefix);
447 
448     staFile = AS_UTL_openOutputFile(sumName);
449   }
450 
451   if (staFile == NULL)
452     staFile = stderr;
453 
454   fprintf(staFile, "PARAMETERS:\n");
455   fprintf(staFile, "----------\n");
456   fprintf(staFile, "%7u    (reads trimmed below this many bases are deleted)\n", minReadLength);
457   fprintf(staFile, "%7.4f    (use overlaps at or below this fraction error)\n", AS_OVS_decodeEvalue(errorValue));
458   fprintf(staFile, "%7u    (break region if overlap is less than this long, for 'largest covered' algorithm)\n", minEvidenceOverlap);
459   fprintf(staFile, "%7u    (break region if overlap coverage is less than this many read%s, for 'largest covered' algorithm)\n", minEvidenceCoverage, (minEvidenceCoverage == 1) ? "" : "s");
460   fprintf(staFile, "\n");
461 
462   fprintf(staFile, "INPUT READS:\n");
463   fprintf(staFile, "-----------\n");
464   fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads processed)\n", readsIn.nReads,  readsIn.nBases);
465   fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads not processed, previously deleted)\n", deletedIn.nReads, deletedIn.nBases);
466   fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads not processed, in a library where trimming isn't allowed)\n", noTrimIn.nReads, noTrimIn.nBases);
467 
468   readsIn  .generatePlots(outputPrefix, "inputReads",        250);
469   deletedIn.generatePlots(outputPrefix, "inputDeletedReads", 250);
470   noTrimIn .generatePlots(outputPrefix, "inputNoTrimReads",  250);
471 
472   fprintf(staFile, "\n");
473   fprintf(staFile, "OUTPUT READS:\n");
474   fprintf(staFile, "------------\n");
475   fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (trimmed reads output)\n", readsOut.nReads,    readsOut.nBases);
476   fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads with no change, kept as is)\n", noChangeOut.nReads, noChangeOut.nBases);
477   fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads with no overlaps, deleted)\n", noOvlOut.nReads,    noOvlOut.nBases);
478   fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads with short trimmed length, deleted)\n", deletedOut.nReads,  deletedOut.nBases);
479 
480   readsOut   .generatePlots(outputPrefix, "outputTrimmedReads",   250);
481   noOvlOut   .generatePlots(outputPrefix, "outputNoOvlReads",     250);
482   deletedOut .generatePlots(outputPrefix, "outputDeletedReads",   250);
483   noChangeOut.generatePlots(outputPrefix, "outputUnchangedReads", 250);
484 
485   fprintf(staFile, "\n");
486   fprintf(staFile, "TRIMMING DETAILS:\n");
487   fprintf(staFile, "----------------\n");
488   fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (bases trimmed from the 5' end of a read)\n", trim5.nReads, trim5.nBases);
489   fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (bases trimmed from the 3' end of a read)\n", trim3.nReads, trim3.nBases);
490 
491   trim5.generatePlots(outputPrefix, "trim5", 25);
492   trim3.generatePlots(outputPrefix, "trim3", 25);
493 
494   AS_UTL_closeFile(staFile, sumName);
495 
496   //  Buh-bye.
497 
498   exit(0);
499 }
500