1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "trimReads.H"
19
20 #include <vector>
21
22 // Generates plots for each trim.
23 #undef GNUPLOT
24
25 // Logging to the screen.
26 #undef VERBOSE
27
28
29 bool
bestEdge(ovOverlap * ovl,uint32 ovlLen,uint32 readID,uint32 readLen,uint32 ibgn,uint32 iend,uint32 & fbgn,uint32 & fend,char * logMsg,uint32 errorValue,uint32 minOverlap,uint32 minCoverage,uint32 minReadLength)30 bestEdge(ovOverlap *ovl,
31 uint32 ovlLen,
32 uint32 readID,
33 uint32 readLen,
34 uint32 ibgn,
35 uint32 iend,
36 uint32 &fbgn,
37 uint32 &fend,
38 char *logMsg,
39 uint32 errorValue,
40 uint32 minOverlap,
41 uint32 minCoverage,
42 uint32 minReadLength) {
43
44 fbgn = ibgn;
45 fend = iend;
46 logMsg[0] = 0;
47
48 assert(readID == ovl[0].a_iid);
49 assert(ovlLen > 0);
50
51 //
52 // Trim once, using the largest covered rule. This gets rid of any gaps in overlap coverage,
53 // etc.
54 //
55
56 uint32 lbgn = 0;
57 uint32 lend = 0;
58
59 if (largestCovered(ovl, ovlLen, readID, readLen, ibgn, iend, lbgn, lend, logMsg, errorValue, minOverlap, minCoverage, minReadLength) == false)
60 return(false);
61
62 //
63 // Trim again, to maximize overlap length.
64 //
65
66 std::vector<uint32> trim5;
67 std::vector<uint32> trim3;
68
69 uint32 nContained = 0;
70
71 #ifdef GNUPLOT
72 std::vector<uint32> trim5iid, trim5sco;
73 std::vector<uint32> trim3iid, trim3sco;
74 #endif
75
76 // For each overlap, add potential trim points where the overlap ends.
77
78 for (uint32 i=0; i<ovlLen; i++) {
79 uint32 tbgn = ovl[i].a_bgn();
80 uint32 tend = ovl[i].a_end();
81
82 if ((lend <= tbgn) ||
83 (tend <= lbgn))
84 // Doesn't intersect the largest covered region.
85 continue;
86
87 assert(tbgn < tend);
88
89 assert(ibgn <= tbgn);
90 assert(tend <= iend);
91
92 assert(readID == ovl[i].a_iid);
93
94 if (ovl[i].evalue() > errorValue)
95 // Overlap is crappy.
96 continue;
97
98 // Add trim points as long as they aren't outside the largest covered region.
99
100 if (lbgn <= tbgn)
101 trim5.push_back(tbgn);
102
103 if (tend <= lend)
104 trim3.push_back(tend);
105
106 // If overlap indicates this read is contained, we're done. There isn't any
107 // trimming needed. We can set the final trim and return, or let the rest
108 // of the algorithm/logging finish.
109
110 if ((tbgn == ibgn) &&
111 (tend == iend))
112 nContained++;
113 }
114
115 // If the read is contained in more than one other read, we're done. No trimming needed.
116
117 if (nContained >= 2) {
118 //fbgn = ibgn;
119 //fend = iend;
120 //return(true);
121
122 trim5.clear();
123 trim3.clear();
124 }
125
126 // Add trim points for the largest covered, i.e., no trimming past what largest covered did.
127
128 trim5.push_back(lbgn);
129 trim3.push_back(lend);
130
131 // Duplicate removal, and possibly the processing algorithm, need the trim points
132 // sorted from outside-in.
133
134 sort(trim5.begin(), trim5.end(), std::less<uint32>());
135 sort(trim3.begin(), trim3.end(), std::greater<uint32>());
136
137 // Remove duplicate points (easier here than in the loops below)
138
139 {
140 uint32 old = 0;
141 uint32 sav = 0;
142
143 for (old=0, sav=0; old<trim5.size(); old++)
144 if (trim5[old] != trim5[sav])
145 trim5[++sav] = trim5[old];
146 trim5.resize(sav+1);
147
148 #ifdef GNUPLOT
149 trim5iid.resize(sav+1);
150 trim5sco.resize(sav+1);
151 #endif
152
153 for (old=0, sav=0; old<trim3.size(); old++)
154 if (trim3[old] != trim3[sav])
155 trim3[++sav] = trim3[old];
156 trim3.resize(sav+1);
157
158 #ifdef GNUPLOT
159 trim3iid.resize(sav+1);
160 trim3sco.resize(sav+1);
161 #endif
162 }
163
164 uint32 best5pt = 0; // Index into trim5
165 uint32 best5score = 0; // Score for the best index
166 uint32 best5iid = 0; // IID for the best index
167 uint32 best5end = 0; // Coord of the end point for the best index
168
169 uint32 best3pt = 0;
170 uint32 best3score = 0;
171 uint32 best3iid = 0;
172 uint32 best3bgn = UINT32_MAX;
173
174 // Find the best 5' point.
175
176 for (uint32 pt=0; pt < trim5.size(); pt++) {
177 uint32 triml = trim5[pt];
178 uint32 score = 0;
179 uint32 sciid = 0;
180 uint32 scend = 0;
181
182 if (best5score >= readLen - triml)
183 // Not possible to get a higher score by trimming more.
184 break;
185
186 //fprintf(stderr, "trim5 pt %u out of %u\n", pt, trim5.size());
187
188 for (uint32 i=0; i < ovlLen; i++) {
189 uint32 tbgn = ibgn + ovl[i].a_bgn();
190 uint32 tend = ibgn + ovl[i].a_end();
191
192 if ((triml < tbgn) ||
193 (tend <= triml))
194 // Alignment starts after of the trim point; not a valid overlap.
195 // or, alignment ends before the trim point (trimmed out).
196 continue;
197
198 // Limit the overlap to the largest covered.
199 if (tend > lend)
200 tend = lend;
201
202 assert(tend >= triml);
203
204 uint32 tlen = tend - triml;
205
206 assert(tlen <= readLen);
207
208 // Save the best score. Break ties by favoring the current best.
209
210 if (((score < tlen)) ||
211 ((score == tlen) && (ovl[i].b_iid == best5iid))) {
212 score = tlen;
213 sciid = ovl[i].b_iid;
214 scend = tend;
215 #ifdef GNUPLOT
216 trim5sco[pt] = score;
217 trim5iid[pt] = sciid;
218 #endif
219 }
220 }
221
222 // Give up if we're not finding anything longer after 1/3 of the current read. Previous
223 // attempts at this wanted to ensure that the current read (sciid) is the same as the best
224 // (best5iid) but ties and slightly different begin/end points make this unreliable. For
225 // example, an overlap from 100-500 and an overlap from 200-501. The first is longest up until
226 // trim point 200, then the second becomes longest. The BEST longest is still the first
227 // overlap, at trim point 100.
228
229 if (score < best5score * 0.66)
230 break;
231
232 // Save a new best if the score is better, and the endpoint is more interior to the read.
233
234 if ((best5score < score) &&
235 (best5end < scend)) {
236 //fprintf(stderr, "RESET 5 end to pt %d score %d iid %d end %d\n",
237 // pt, score, sciid, scend);
238 best5pt = pt;
239 best5score = score;
240 best5iid = sciid;
241 best5end = scend;
242 }
243 }
244
245 //fprintf(stderr, "BEST at %u position %u pt %u\n", best5score, trim5[best5pt], best5pt);
246
247 // Find the best 3' point.
248
249 for (uint32 pt=0; pt<trim3.size(); pt++) {
250 uint32 trimr = trim3[pt];;
251 uint32 score = 0;
252 uint32 sciid = 0;
253 uint32 scbgn = 0;
254
255 if (best3score >= trimr - 0)
256 // Not possible to get a higher score by trimming more.
257 break;
258
259 //fprintf(stderr, "trim3 pt %u out of %u\n", pt, trim3.size());
260
261 for (uint32 i=0; i < ovlLen; i++) {
262 uint32 tbgn = ibgn + ovl[i].a_bgn();
263 uint32 tend = ibgn + ovl[i].a_end();
264
265 if ((tend < trimr) ||
266 (trimr <= tbgn))
267 // Alignment ends before the trim point; not a valid overlap,
268 // or, alignment starts after the trim point (trimmed out)
269 continue;
270
271 // Limit the overlap to the largest covered.
272 if (tbgn < lbgn)
273 tbgn = lbgn;
274
275 assert(trimr >= tbgn);
276
277 uint32 tlen = trimr - tbgn;
278
279 assert(tlen <= readLen);
280
281 if (((score < tlen)) ||
282 ((score == tlen) && (ovl[i].b_iid == best3iid))) {
283 score = tlen;
284 sciid = ovl[i].b_iid;
285 scbgn = tbgn;
286 #ifdef GNUPLOT
287 trim3sco[pt] = score;
288 trim3iid[pt] = sciid;
289 #endif
290 }
291 }
292
293 if (score < best3score * 0.66)
294 break;
295
296 if ((best3score < score) &&
297 (scbgn < best3bgn)) {
298 //fprintf(stderr, "RESET 3 end to pt %d score %d iid %d bgn %d\n",
299 // pt, score, sciid, scbgn);
300 best3pt = pt;
301 best3score = score;
302 best3iid = sciid;
303 best3bgn = scbgn;
304 }
305 }
306
307 #ifdef GNUPLOT
308 {
309 char D[FILENAME_MAX];
310 char G[FILENAME_MAX];
311 char S[FILENAME_MAX];
312
313 FILE *F;
314
315 snprintf(D, FILENAME_MAX, "trim-%08d.dat", readID);
316 snprintf(G, FILENAME_MAX, "trim-%08d.gp", readID);
317 snprintf(S, FILENAME_MAX, "gnuplot < trim-%08d.gp", readID);
318
319 F = fopen(D, "w");
320 for (uint32 i=0; i<MAX(trim5.size(), trim3.size()); i++) {
321 if (i < trim5.size() && i < trim3.size())
322 fprintf(F, "trim[%03d] pt %4d len %4d iid %7d -- pt %4d len %4d iid %7d \n",
323 i,
324 trim5[i], trim5sco[i], trim5iid[i],
325 trim3[i], trim3sco[i], trim3iid[i]);
326 else if (i < trim5.size())
327 fprintf(F, "trim[%03d] pt %4d len %4d iid %7d -- pt %4d len %4d iid %7d \n",
328 i,
329 trim5[i], trim5sco[i], trim5iid[i],
330 0, 0, 0);
331 else
332 fprintf(F, "trim[%03d] pt %4d len %4d iid %7d -- pt %4d len %4d iid %7d \n",
333 i,
334 0, 0, 0,
335 trim3[i], trim3sco[i], trim3iid[i]);
336 }
337 AS_UTL_closeFile(F, D);
338
339
340 F = fopen(G, "w");
341 fprintf(F, "set terminal png\n");
342 fprintf(F, "set output \"trim-%08d.png\"\n",
343 readID);
344 fprintf(F, "plot \"trim-%08d.dat\" using 3:5 with linespoints, \"trim-%08d.dat\" using 10:12 with linespoints\n",
345 readID,
346 readID);
347 AS_UTL_closeFile(F, G);
348
349
350 system(S);
351 }
352 #endif
353
354 //fprintf(stderr, "BEST at %u position %u pt %u\n", best3score, trim3[best3pt], best3pt);
355
356 // Set trimming. Be just a little aggressive, and get rid of an extra base or two, if possible.
357 // If not possible, the read is crap, and will be deleted anyway.
358
359 fbgn = trim5[best5pt];
360 fend = trim3[best3pt];
361
362 if ((fbgn + 4 < fend) &&
363 (2 < fend)) {
364 fbgn += 2;
365 fend -= 2;
366 }
367
368 // Did we go outside the largestCovered region? Just reset. Ideally we'd assert, and crash.
369
370 if (fbgn < lbgn) fbgn = lbgn;
371 if (lend < fend) fend = lend;
372
373 // Lastly, did we just end up with a bogus trim? There isn't any guard against picking the 5'
374 // trim point to the right of the 3' trim point.
375
376 #if 1
377 if (fend < fbgn) {
378 fprintf(stderr, "iid = %u\n", readID);
379 fbgn = lbgn;
380 fend = lend;
381 }
382 #endif
383
384 assert(fbgn <= fend);
385
386 return(true);
387 }
388