1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "trimReads.H"
19 
20 #include "intervalList.H"
21 
22 
23 bool
largestCovered(ovOverlap * ovl,uint32 ovlLen,uint32 readID,uint32 readLen,uint32 UNUSED (ibgn),uint32 iend,uint32 & fbgn,uint32 & fend,char * logMsg,uint32 errorValue,uint32 minOverlap,uint32 minCoverage,uint32 minReadLength)24 largestCovered(ovOverlap    *ovl,
25                uint32        ovlLen,
26                uint32        readID,
27                uint32        readLen,
28                uint32 UNUSED(ibgn),
29                uint32        iend,
30                uint32       &fbgn,
31                uint32       &fend,
32                char         *logMsg,
33                uint32        errorValue,
34                uint32        minOverlap,
35                uint32        minCoverage,
36                uint32        minReadLength) {
37 
38   bool verbose = false;
39 
40   logMsg[0] = 0;
41 
42   assert(readID == ovl[0].a_iid);
43   assert(ovlLen > 0);
44 
45   intervalList<uint32>  IL;
46   intervalList<uint32>  ID;
47   int32                 iid = readID;
48 
49   uint32                nSkip = 0;
50   uint32                nUsed = 0;
51 
52   for (uint32 i=0; i<ovlLen; i++) {
53     uint32 tbgn = ovl[i].a_bgn();
54     uint32 tend = ovl[i].a_end();
55 
56     assert(tbgn < tend);
57     assert(iid == ovl[i].a_iid);
58 
59     if (ovl[i].evalue() > errorValue || (tend-tbgn < minOverlap)) {
60       //  Overlap is crappy.
61       //fprintf(stderr, "skip %2u\n", i);
62       nSkip++;
63       continue;
64     }
65 
66 //fprintf(stderr, "Processing interval %d - %d in read %d and currently I have %d stored\n", tbgn, tend, readID, IL.numberOfIntervals());
67 // check if this overlap ends at a point we already captured
68 bool doSkip = false;
69 for (uint32 j = 0; j < IL.numberOfIntervals(); j++) {
70 //fprintf(stderr, "Comparing interval from %d - %d in read %d to %d - %d with start bounds %d - %d\n", tbgn, tend, readID, IL.lo(j), IL.hi(j), (int32)(tbgn - floor(minOverlap/10)), (int32)(tbgn + floor(minOverlap/10)));
71    if (tbgn > 15 && (int32)(tbgn - floor(minOverlap/50)) <= (int32)IL.lo(j) && (int32)(tbgn + floor(minOverlap/50)) >= (int32)IL.lo(j)) {
72        doSkip = true;
73 //      fprintf(stderr, "Skip interval in read %ul from %ul - %ul because it is too close to %ul - %ul\n", readID, tbgn, tend, IL.lo(j), IL.hi(j));
74       break;
75    }
76 //fprintf(stderr, "Comparing interval from %d - %d in read %d to %d - %d with end bounds %d - %d\n", tbgn, tend, readID, IL.lo(j), IL.hi(j), (int32)(tend - floor(minOverlap/10)), (int32)(tend + floor(minOverlap/10)));
77 
78    if (tend < readLen-15 && (int32)(tend - floor(minOverlap/50)) <= IL.hi(j) && (int32)(tend + floor(minOverlap/50)) >= IL.hi(j)) {
79       doSkip = true;
80 //      fprintf(stderr, "Skip interval in read %ul from %ul - %ul because it is too close to %ul - %ul\n", readID, tbgn, tend, IL.lo(j), IL.hi(j));
81       break;
82    }
83 }
84 if (doSkip) {
85       nSkip++;
86 continue;
87 }
88 
89     //fprintf(stderr, "save %2u from %d - %d\n", i, tbgn, tend);
90     nUsed++;
91 
92 
93 uint32 trimLen = floor((tend-tbgn) * 0.05);
94 if (trimLen > 250) trimLen = 250;
95 //fprintf(stderr, "For interval from %d - %d the trim is %d\n", tbgn, tend, trimLen);
96 
97     if (tend + trimLen  >= readLen)
98        IL.add(tbgn, tend - tbgn);
99     else
100        IL.add(tbgn, tend - tbgn - trimLen);
101   }
102 
103   if (verbose)
104     for (uint32 it=0; it<IL.numberOfIntervals(); it++)
105       fprintf(stderr, "IL - %3u - %5u %5u\n", readID, IL.lo(it), IL.hi(it));
106 
107   //  I thought I'd allow low coverage at the end of the read, but not internally, but that is hard,
108   //  and largely unnecessary.  We'll just not be assembling at low coverage joins, which is
109   //  acceptable.
110 
111   if (minCoverage > 0) {
112     intervalDepth<uint32>  DE(IL);
113 
114     uint32  it = 0;
115     uint32  ib = 0;
116     uint32  ie = 0;
117 
118     while (it < DE.numberOfIntervals()) {
119       if (verbose)
120         fprintf(stderr, "DE - %3u - %5u %5u depth %u\n", readID, DE.lo(it), DE.hi(it), DE.depth(it));
121 
122       if (DE.depth(it) < minCoverage) {
123         //  Dropped below good coverage depth.  If we have an interval, save it.  Reset.
124         if (ie > ib) {
125           //fprintf(stderr, "AD1 %d-%d len %d\n", ib, ie, ie - ib);
126           ID.add(ib, ie - ib);
127         }
128         ib = 0;
129         ie = 0;
130 
131       } else if ((ib == 0) && (ie == 0)) {
132         //  Depth is good.  If no current interval, make a new one.
133         ib = DE.lo(it);
134         ie = DE.hi(it);
135         //fprintf(stderr, "NE1 %d-%d len %d\n", ib, ie, ie - ib);
136 
137       } else if (ie == DE.lo(it)) {
138         //  Depth is good.  If this interval is adjacent to the current, extend.
139         ie = DE.hi(it);
140         //fprintf(stderr, "EXT %d-%d len %d\n", ib, ie, ie - ib);
141 
142       } else {
143         //  Depth is good, but we just had a gap in coverage.  Save any current interval.  Reset.
144         if (ie > ib) {
145           //fprintf(stderr, "AD2 %d-%d len %d\n", ib, ie, ie - ib);
146           ID.add(ib, ie - ib);
147         }
148         ib = DE.lo(it);
149         ie = DE.hi(it);
150         //fprintf(stderr, "NE2 %d-%d len %d\n", ib, ie, ie - ib);
151       }
152 
153       it++;
154     }
155 
156     if (ie > ib) {
157       //fprintf(stderr, "AD3 %d-%d len %d\n", ib, ie, ie - ib);
158       ID.add(ib, ie - ib);
159     }
160   }
161 
162   //  Now that we've created depth, merge the intervals.
163 
164   IL.merge(minOverlap);
165 
166   //  IL - covered interavls enforcing a minimum overlap size (these can overlap)
167   //  ID - covered intervals enforcing a minimum depth (these cannot overlap)
168   //
169   //  Create new intervals from the intersection of IL and ID.
170   //
171   //  This catches one nasty case, where a thin overlap has more than minDepth coverage.
172   //
173   //         -------------               3x coverage
174   //          -------------              all overlaps 1 or 2 dashes long
175   //                     ---------
176   //                      -----------
177 
178   if (minCoverage > 0) {
179     intervalList<uint32> FI;
180 
181     uint32  li = 0;
182     uint32  di = 0;
183 
184     while ((li < IL.numberOfIntervals()) &&
185            (di < ID.numberOfIntervals())) {
186       uint32   ll = IL.lo(li);
187       uint32   lh = IL.hi(li);
188       uint32   dl = ID.lo(di);
189       uint32   dh = ID.hi(di);
190       uint32   nl  = 0;
191       uint32   nh  = 0;
192 
193       //  If they intersect, make a new region
194 
195       if ((ll <= dl) && (dl < lh)) {
196         nl = dl;
197         nh = (lh < dh) ? lh : dh;
198       }
199 
200       if ((dl <= ll) && (ll < dh)) {
201         nl = ll;
202         nh = (lh < dh) ? lh : dh;
203       }
204 
205       if (nl < nh)
206         FI.add(nl, nh - nl);
207 
208       //  Advance the list with the earlier region.
209 
210       if (lh <= dh)
211         //  IL ends at or before ID
212         li++;
213 
214       if (dh <= lh) {
215         //  ID ends at or before IL
216         di++;
217       }
218     }
219 
220     //  Replace the intervals to use with the intersection.
221 
222     IL = FI;
223   }
224 
225   ////////////////////////////////////////
226 
227   if (IL.numberOfIntervals() == 0) {
228     if (verbose)
229       fprintf(stderr, "no high quality overlaps\n");
230     strcpy(logMsg, "\tno high quality overlaps");
231     return(false);
232   }
233 
234   sprintf(logMsg, "\tskipped %u overlaps; used %u overlaps", nSkip, nUsed);
235 
236   fbgn = IL.lo(0);
237   fend = IL.hi(0);
238   if (verbose)
239     fprintf(stderr, "IL - %3u - %5u %5u  SAVE\n", readID, fbgn, fend);
240 
241   for (uint32 it=1; it<IL.numberOfIntervals(); it++) {
242     if (IL.hi(it) - IL.lo(it) > fend - fbgn) {
243       fbgn = IL.lo(it);
244       fend = IL.hi(it);
245       if (verbose)
246         fprintf(stderr, "IL - %3u - %5u %5u  SAVE\n", readID, IL.lo(it), IL.hi(it));
247     } else {
248       if (verbose)
249         fprintf(stderr, "IL - %3u - %5u %5u\n", readID, IL.lo(it), IL.hi(it));
250     }
251   }
252 
253   return(true);
254 }
255