1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "trimReads.H"
19 
20 #include <vector>
21 
22 //  Generates plots for each trim.
23 #undef  GNUPLOT
24 
25 //  Logging to the screen.
26 #undef  VERBOSE
27 
28 
29 bool
bestEdge(ovOverlap * ovl,uint32 ovlLen,uint32 readID,uint32 readLen,uint32 ibgn,uint32 iend,uint32 & fbgn,uint32 & fend,char * logMsg,uint32 errorValue,uint32 minOverlap,uint32 minCoverage,uint32 minReadLength)30 bestEdge(ovOverlap  *ovl,
31          uint32       ovlLen,
32          uint32       readID,
33          uint32       readLen,
34          uint32       ibgn,
35          uint32       iend,
36          uint32      &fbgn,
37          uint32      &fend,
38          char        *logMsg,
39          uint32       errorValue,
40          uint32       minOverlap,
41          uint32       minCoverage,
42          uint32       minReadLength) {
43 
44   fbgn      = ibgn;
45   fend      = iend;
46   logMsg[0] = 0;
47 
48   assert(readID == ovl[0].a_iid);
49   assert(ovlLen > 0);
50 
51   //
52   //  Trim once, using the largest covered rule.  This gets rid of any gaps in overlap coverage,
53   //  etc.
54   //
55 
56   uint32  lbgn = 0;
57   uint32  lend = 0;
58 
59   if (largestCovered(ovl, ovlLen, readID, readLen, ibgn, iend, lbgn, lend, logMsg, errorValue, minOverlap, minCoverage, minReadLength) == false)
60     return(false);
61 
62   //
63   //  Trim again, to maximize overlap length.
64   //
65 
66   std::vector<uint32>    trim5;
67   std::vector<uint32>    trim3;
68 
69   uint32                 nContained = 0;
70 
71 #ifdef GNUPLOT
72   std::vector<uint32>    trim5iid, trim5sco;
73   std::vector<uint32>    trim3iid, trim3sco;
74 #endif
75 
76   //  For each overlap, add potential trim points where the overlap ends.
77 
78   for (uint32 i=0; i<ovlLen; i++) {
79     uint32 tbgn = ovl[i].a_bgn();
80     uint32 tend = ovl[i].a_end();
81 
82     if ((lend <= tbgn) ||
83         (tend <= lbgn))
84       //  Doesn't intersect the largest covered region.
85       continue;
86 
87     assert(tbgn < tend);
88 
89     assert(ibgn <= tbgn);
90     assert(tend <= iend);
91 
92     assert(readID == ovl[i].a_iid);
93 
94     if (ovl[i].evalue() > errorValue)
95       //  Overlap is crappy.
96       continue;
97 
98     //  Add trim points as long as they aren't outside the largest covered region.
99 
100     if (lbgn <= tbgn)
101       trim5.push_back(tbgn);
102 
103     if (tend <= lend)
104       trim3.push_back(tend);
105 
106     //  If overlap indicates this read is contained, we're done.  There isn't any
107     //  trimming needed.  We can set the final trim and return, or let the rest
108     //  of the algorithm/logging finish.
109 
110     if ((tbgn == ibgn) &&
111         (tend == iend))
112       nContained++;
113   }
114 
115   //  If the read is contained in more than one other read, we're done.  No trimming needed.
116 
117   if (nContained >= 2) {
118     //fbgn = ibgn;
119     //fend = iend;
120     //return(true);
121 
122     trim5.clear();
123     trim3.clear();
124   }
125 
126   //  Add trim points for the largest covered, i.e., no trimming past what largest covered did.
127 
128   trim5.push_back(lbgn);
129   trim3.push_back(lend);
130 
131   //  Duplicate removal, and possibly the processing algorithm, need the trim points
132   //  sorted from outside-in.
133 
134   sort(trim5.begin(), trim5.end(), std::less<uint32>());
135   sort(trim3.begin(), trim3.end(), std::greater<uint32>());
136 
137   //  Remove duplicate points (easier here than in the loops below)
138 
139   {
140     uint32 old = 0;
141     uint32 sav = 0;
142 
143     for (old=0, sav=0; old<trim5.size(); old++)
144       if (trim5[old] != trim5[sav])
145         trim5[++sav] = trim5[old];
146     trim5.resize(sav+1);
147 
148 #ifdef GNUPLOT
149     trim5iid.resize(sav+1);
150     trim5sco.resize(sav+1);
151 #endif
152 
153     for (old=0, sav=0; old<trim3.size(); old++)
154       if (trim3[old] != trim3[sav])
155         trim3[++sav] = trim3[old];
156     trim3.resize(sav+1);
157 
158 #ifdef GNUPLOT
159     trim3iid.resize(sav+1);
160     trim3sco.resize(sav+1);
161 #endif
162   }
163 
164   uint32   best5pt    = 0;  //  Index into trim5
165   uint32   best5score = 0;  //  Score for the best index
166   uint32   best5iid   = 0;  //  IID for the best index
167   uint32   best5end   = 0;  //  Coord of the end point for the best index
168 
169   uint32   best3pt    = 0;
170   uint32   best3score = 0;
171   uint32   best3iid   = 0;
172   uint32   best3bgn   = UINT32_MAX;
173 
174   //  Find the best 5' point.
175 
176   for (uint32 pt=0; pt < trim5.size(); pt++) {
177     uint32   triml    = trim5[pt];
178     uint32   score    = 0;
179     uint32   sciid    = 0;
180     uint32   scend    = 0;
181 
182     if (best5score >= readLen - triml)
183       //  Not possible to get a higher score by trimming more.
184       break;
185 
186     //fprintf(stderr, "trim5 pt %u out of %u\n", pt, trim5.size());
187 
188     for (uint32 i=0; i < ovlLen; i++) {
189       uint32 tbgn = ibgn + ovl[i].a_bgn();
190       uint32 tend = ibgn + ovl[i].a_end();
191 
192       if ((triml <  tbgn) ||
193           (tend  <= triml))
194         //  Alignment starts after of the trim point; not a valid overlap.
195         //  or, alignment ends before the trim point (trimmed out).
196         continue;
197 
198       //  Limit the overlap to the largest covered.
199       if (tend > lend)
200         tend = lend;
201 
202       assert(tend >= triml);
203 
204       uint32 tlen = tend - triml;
205 
206       assert(tlen <= readLen);
207 
208       //  Save the best score.  Break ties by favoring the current best.
209 
210       if (((score  < tlen)) ||
211           ((score == tlen) && (ovl[i].b_iid == best5iid))) {
212         score = tlen;
213         sciid = ovl[i].b_iid;
214         scend = tend;
215 #ifdef GNUPLOT
216         trim5sco[pt] = score;
217         trim5iid[pt] = sciid;
218 #endif
219       }
220     }
221 
222     //  Give up if we're not finding anything longer after 1/3 of the current read.  Previous
223     //  attempts at this wanted to ensure that the current read (sciid) is the same as the best
224     //  (best5iid) but ties and slightly different begin/end points make this unreliable.  For
225     //  example, an overlap from 100-500 and an overlap from 200-501.  The first is longest up until
226     //  trim point 200, then the second becomes longest.  The BEST longest is still the first
227     //  overlap, at trim point 100.
228 
229     if (score < best5score * 0.66)
230       break;
231 
232     //  Save a new best if the score is better, and the endpoint is more interior to the read.
233 
234     if ((best5score < score) &&
235         (best5end   < scend)) {
236       //fprintf(stderr, "RESET 5 end to pt %d score %d iid %d end %d\n",
237       //        pt, score, sciid, scend);
238       best5pt    = pt;
239       best5score = score;
240       best5iid   = sciid;
241       best5end   = scend;
242     }
243   }
244 
245   //fprintf(stderr, "BEST at %u position %u pt %u\n", best5score, trim5[best5pt], best5pt);
246 
247   //  Find the best 3' point.
248 
249   for (uint32 pt=0; pt<trim3.size(); pt++) {
250     uint32   trimr    = trim3[pt];;
251     uint32   score    = 0;
252     uint32   sciid    = 0;
253     uint32   scbgn    = 0;
254 
255     if (best3score >= trimr - 0)
256       //  Not possible to get a higher score by trimming more.
257       break;
258 
259     //fprintf(stderr, "trim3 pt %u out of %u\n", pt, trim3.size());
260 
261     for (uint32 i=0; i < ovlLen; i++) {
262       uint32 tbgn = ibgn + ovl[i].a_bgn();
263       uint32 tend = ibgn + ovl[i].a_end();
264 
265       if ((tend < trimr) ||
266           (trimr <= tbgn))
267         //  Alignment ends before the trim point; not a valid overlap,
268         //  or, alignment starts after the trim point (trimmed out)
269         continue;
270 
271       //  Limit the overlap to the largest covered.
272       if (tbgn < lbgn)
273         tbgn = lbgn;
274 
275       assert(trimr >= tbgn);
276 
277       uint32 tlen = trimr - tbgn;
278 
279       assert(tlen <= readLen);
280 
281       if (((score  < tlen)) ||
282           ((score == tlen) && (ovl[i].b_iid == best3iid))) {
283         score = tlen;
284         sciid = ovl[i].b_iid;
285         scbgn = tbgn;
286 #ifdef GNUPLOT
287         trim3sco[pt] = score;
288         trim3iid[pt] = sciid;
289 #endif
290       }
291     }
292 
293     if (score < best3score * 0.66)
294       break;
295 
296     if ((best3score < score) &&
297         (scbgn      < best3bgn)) {
298       //fprintf(stderr, "RESET 3 end to pt %d score %d iid %d bgn %d\n",
299       //        pt, score, sciid, scbgn);
300       best3pt    = pt;
301       best3score = score;
302       best3iid   = sciid;
303       best3bgn   = scbgn;
304     }
305   }
306 
307 #ifdef GNUPLOT
308   {
309     char  D[FILENAME_MAX];
310     char  G[FILENAME_MAX];
311     char  S[FILENAME_MAX];
312 
313     FILE *F;
314 
315     snprintf(D, FILENAME_MAX, "trim-%08d.dat", readID);
316     snprintf(G, FILENAME_MAX, "trim-%08d.gp",  readID);
317     snprintf(S, FILENAME_MAX, "gnuplot < trim-%08d.gp", readID);
318 
319     F = fopen(D, "w");
320     for (uint32 i=0; i<MAX(trim5.size(), trim3.size()); i++) {
321       if      (i < trim5.size() && i < trim3.size())
322         fprintf(F, "trim[%03d] pt %4d len %4d iid %7d -- pt %4d len %4d iid %7d \n",
323                 i,
324                 trim5[i], trim5sco[i], trim5iid[i],
325                 trim3[i], trim3sco[i], trim3iid[i]);
326       else if (i < trim5.size())
327         fprintf(F, "trim[%03d] pt %4d len %4d iid %7d -- pt %4d len %4d iid %7d \n",
328                 i,
329                 trim5[i], trim5sco[i], trim5iid[i],
330                 0, 0, 0);
331       else
332         fprintf(F, "trim[%03d] pt %4d len %4d iid %7d -- pt %4d len %4d iid %7d \n",
333                 i,
334                 0, 0, 0,
335                 trim3[i], trim3sco[i], trim3iid[i]);
336     }
337     AS_UTL_closeFile(F, D);
338 
339 
340     F = fopen(G, "w");
341     fprintf(F, "set terminal png\n");
342     fprintf(F, "set output \"trim-%08d.png\"\n",
343             readID);
344     fprintf(F, "plot \"trim-%08d.dat\" using 3:5 with linespoints, \"trim-%08d.dat\" using 10:12 with linespoints\n",
345             readID,
346             readID);
347     AS_UTL_closeFile(F, G);
348 
349 
350     system(S);
351   }
352 #endif
353 
354   //fprintf(stderr, "BEST at %u position %u pt %u\n", best3score, trim3[best3pt], best3pt);
355 
356   //  Set trimming.  Be just a little aggressive, and get rid of an extra base or two, if possible.
357   //  If not possible, the read is crap, and will be deleted anyway.
358 
359   fbgn = trim5[best5pt];
360   fend = trim3[best3pt];
361 
362   if ((fbgn + 4 < fend) &&
363       (2        < fend)) {
364     fbgn += 2;
365     fend -= 2;
366   }
367 
368   //  Did we go outside the largestCovered region?  Just reset.  Ideally we'd assert, and crash.
369 
370   if (fbgn < lbgn)  fbgn = lbgn;
371   if (lend < fend)  fend = lend;
372 
373   //  Lastly, did we just end up with a bogus trim?  There isn't any guard against picking the 5'
374   //  trim point to the right of the 3' trim point.
375 
376 #if 1
377   if (fend < fbgn) {
378     fprintf(stderr, "iid = %u\n", readID);
379     fbgn = lbgn;
380     fend = lend;
381   }
382 #endif
383 
384   assert(fbgn <= fend);
385 
386   return(true);
387 }
388