1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "trimReads.H"
19
20 #include "intervalList.H"
21
22
23 bool
largestCovered(ovOverlap * ovl,uint32 ovlLen,uint32 readID,uint32 readLen,uint32 UNUSED (ibgn),uint32 iend,uint32 & fbgn,uint32 & fend,char * logMsg,uint32 errorValue,uint32 minOverlap,uint32 minCoverage,uint32 minReadLength)24 largestCovered(ovOverlap *ovl,
25 uint32 ovlLen,
26 uint32 readID,
27 uint32 readLen,
28 uint32 UNUSED(ibgn),
29 uint32 iend,
30 uint32 &fbgn,
31 uint32 &fend,
32 char *logMsg,
33 uint32 errorValue,
34 uint32 minOverlap,
35 uint32 minCoverage,
36 uint32 minReadLength) {
37
38 bool verbose = false;
39
40 logMsg[0] = 0;
41
42 assert(readID == ovl[0].a_iid);
43 assert(ovlLen > 0);
44
45 intervalList<uint32> IL;
46 intervalList<uint32> ID;
47 int32 iid = readID;
48
49 uint32 nSkip = 0;
50 uint32 nUsed = 0;
51
52 for (uint32 i=0; i<ovlLen; i++) {
53 uint32 tbgn = ovl[i].a_bgn();
54 uint32 tend = ovl[i].a_end();
55
56 assert(tbgn < tend);
57 assert(iid == ovl[i].a_iid);
58
59 if (ovl[i].evalue() > errorValue || (tend-tbgn < minOverlap)) {
60 // Overlap is crappy.
61 //fprintf(stderr, "skip %2u\n", i);
62 nSkip++;
63 continue;
64 }
65
66 //fprintf(stderr, "Processing interval %d - %d in read %d and currently I have %d stored\n", tbgn, tend, readID, IL.numberOfIntervals());
67 // check if this overlap ends at a point we already captured
68 bool doSkip = false;
69 for (uint32 j = 0; j < IL.numberOfIntervals(); j++) {
70 //fprintf(stderr, "Comparing interval from %d - %d in read %d to %d - %d with start bounds %d - %d\n", tbgn, tend, readID, IL.lo(j), IL.hi(j), (int32)(tbgn - floor(minOverlap/10)), (int32)(tbgn + floor(minOverlap/10)));
71 if (tbgn > 15 && (int32)(tbgn - floor(minOverlap/50)) <= (int32)IL.lo(j) && (int32)(tbgn + floor(minOverlap/50)) >= (int32)IL.lo(j)) {
72 doSkip = true;
73 // fprintf(stderr, "Skip interval in read %ul from %ul - %ul because it is too close to %ul - %ul\n", readID, tbgn, tend, IL.lo(j), IL.hi(j));
74 break;
75 }
76 //fprintf(stderr, "Comparing interval from %d - %d in read %d to %d - %d with end bounds %d - %d\n", tbgn, tend, readID, IL.lo(j), IL.hi(j), (int32)(tend - floor(minOverlap/10)), (int32)(tend + floor(minOverlap/10)));
77
78 if (tend < readLen-15 && (int32)(tend - floor(minOverlap/50)) <= IL.hi(j) && (int32)(tend + floor(minOverlap/50)) >= IL.hi(j)) {
79 doSkip = true;
80 // fprintf(stderr, "Skip interval in read %ul from %ul - %ul because it is too close to %ul - %ul\n", readID, tbgn, tend, IL.lo(j), IL.hi(j));
81 break;
82 }
83 }
84 if (doSkip) {
85 nSkip++;
86 continue;
87 }
88
89 //fprintf(stderr, "save %2u from %d - %d\n", i, tbgn, tend);
90 nUsed++;
91
92
93 uint32 trimLen = floor((tend-tbgn) * 0.05);
94 if (trimLen > 250) trimLen = 250;
95 //fprintf(stderr, "For interval from %d - %d the trim is %d\n", tbgn, tend, trimLen);
96
97 if (tend + trimLen >= readLen)
98 IL.add(tbgn, tend - tbgn);
99 else
100 IL.add(tbgn, tend - tbgn - trimLen);
101 }
102
103 if (verbose)
104 for (uint32 it=0; it<IL.numberOfIntervals(); it++)
105 fprintf(stderr, "IL - %3u - %5u %5u\n", readID, IL.lo(it), IL.hi(it));
106
107 // I thought I'd allow low coverage at the end of the read, but not internally, but that is hard,
108 // and largely unnecessary. We'll just not be assembling at low coverage joins, which is
109 // acceptable.
110
111 if (minCoverage > 0) {
112 intervalDepth<uint32> DE(IL);
113
114 uint32 it = 0;
115 uint32 ib = 0;
116 uint32 ie = 0;
117
118 while (it < DE.numberOfIntervals()) {
119 if (verbose)
120 fprintf(stderr, "DE - %3u - %5u %5u depth %u\n", readID, DE.lo(it), DE.hi(it), DE.depth(it));
121
122 if (DE.depth(it) < minCoverage) {
123 // Dropped below good coverage depth. If we have an interval, save it. Reset.
124 if (ie > ib) {
125 //fprintf(stderr, "AD1 %d-%d len %d\n", ib, ie, ie - ib);
126 ID.add(ib, ie - ib);
127 }
128 ib = 0;
129 ie = 0;
130
131 } else if ((ib == 0) && (ie == 0)) {
132 // Depth is good. If no current interval, make a new one.
133 ib = DE.lo(it);
134 ie = DE.hi(it);
135 //fprintf(stderr, "NE1 %d-%d len %d\n", ib, ie, ie - ib);
136
137 } else if (ie == DE.lo(it)) {
138 // Depth is good. If this interval is adjacent to the current, extend.
139 ie = DE.hi(it);
140 //fprintf(stderr, "EXT %d-%d len %d\n", ib, ie, ie - ib);
141
142 } else {
143 // Depth is good, but we just had a gap in coverage. Save any current interval. Reset.
144 if (ie > ib) {
145 //fprintf(stderr, "AD2 %d-%d len %d\n", ib, ie, ie - ib);
146 ID.add(ib, ie - ib);
147 }
148 ib = DE.lo(it);
149 ie = DE.hi(it);
150 //fprintf(stderr, "NE2 %d-%d len %d\n", ib, ie, ie - ib);
151 }
152
153 it++;
154 }
155
156 if (ie > ib) {
157 //fprintf(stderr, "AD3 %d-%d len %d\n", ib, ie, ie - ib);
158 ID.add(ib, ie - ib);
159 }
160 }
161
162 // Now that we've created depth, merge the intervals.
163
164 IL.merge(minOverlap);
165
166 // IL - covered interavls enforcing a minimum overlap size (these can overlap)
167 // ID - covered intervals enforcing a minimum depth (these cannot overlap)
168 //
169 // Create new intervals from the intersection of IL and ID.
170 //
171 // This catches one nasty case, where a thin overlap has more than minDepth coverage.
172 //
173 // ------------- 3x coverage
174 // ------------- all overlaps 1 or 2 dashes long
175 // ---------
176 // -----------
177
178 if (minCoverage > 0) {
179 intervalList<uint32> FI;
180
181 uint32 li = 0;
182 uint32 di = 0;
183
184 while ((li < IL.numberOfIntervals()) &&
185 (di < ID.numberOfIntervals())) {
186 uint32 ll = IL.lo(li);
187 uint32 lh = IL.hi(li);
188 uint32 dl = ID.lo(di);
189 uint32 dh = ID.hi(di);
190 uint32 nl = 0;
191 uint32 nh = 0;
192
193 // If they intersect, make a new region
194
195 if ((ll <= dl) && (dl < lh)) {
196 nl = dl;
197 nh = (lh < dh) ? lh : dh;
198 }
199
200 if ((dl <= ll) && (ll < dh)) {
201 nl = ll;
202 nh = (lh < dh) ? lh : dh;
203 }
204
205 if (nl < nh)
206 FI.add(nl, nh - nl);
207
208 // Advance the list with the earlier region.
209
210 if (lh <= dh)
211 // IL ends at or before ID
212 li++;
213
214 if (dh <= lh) {
215 // ID ends at or before IL
216 di++;
217 }
218 }
219
220 // Replace the intervals to use with the intersection.
221
222 IL = FI;
223 }
224
225 ////////////////////////////////////////
226
227 if (IL.numberOfIntervals() == 0) {
228 if (verbose)
229 fprintf(stderr, "no high quality overlaps\n");
230 strcpy(logMsg, "\tno high quality overlaps");
231 return(false);
232 }
233
234 sprintf(logMsg, "\tskipped %u overlaps; used %u overlaps", nSkip, nUsed);
235
236 fbgn = IL.lo(0);
237 fend = IL.hi(0);
238 if (verbose)
239 fprintf(stderr, "IL - %3u - %5u %5u SAVE\n", readID, fbgn, fend);
240
241 for (uint32 it=1; it<IL.numberOfIntervals(); it++) {
242 if (IL.hi(it) - IL.lo(it) > fend - fbgn) {
243 fbgn = IL.lo(it);
244 fend = IL.hi(it);
245 if (verbose)
246 fprintf(stderr, "IL - %3u - %5u %5u SAVE\n", readID, IL.lo(it), IL.hi(it));
247 } else {
248 if (verbose)
249 fprintf(stderr, "IL - %3u - %5u %5u\n", readID, IL.lo(it), IL.hi(it));
250 }
251 }
252
253 return(true);
254 }
255