1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "splitReads.H"
19 
20 //  Adjust the overlap for trimming already done.  Filter out useless overlaps.  Save the
21 //  overlap into a useful format.
22 
23 void
addAndFilterOverlaps(sqStore * seq,clearRangeFile * finClr,double errorRate,ovOverlap * ovl,uint32 ovlLen)24 workUnit::addAndFilterOverlaps(sqStore *seq,
25                                clearRangeFile *finClr,
26                                double errorRate,
27                                ovOverlap *ovl, uint32 ovlLen) {
28 
29   if (adjMax < ovlLen) {
30     delete [] adj;
31 
32     adjMax = ovlLen;
33     adj    = new adjOverlap [adjMax];
34   }
35 
36   adjLen = 0;
37 
38   for (uint32 oo=0; oo<ovlLen; oo++) {
39     ovOverlap *o = ovl + oo;
40     adjOverlap *a = adj + adjLen;
41 
42     uint32 idA    = o->a_iid;
43     uint32 idB    = o->b_iid;
44 
45     if (finClr->isDeleted(idA) ||
46         finClr->isDeleted(idB))
47       continue;
48 
49     //  Returns the finClr clear range of the two reads (in *clr*), and the overlap adjusted to
50     //  that clear range (in *ovl*).  The B read coordinates are reverse-complemented if the
51     //  overlap is reversed (so bovlbgn < bovlend always).
52 
53 
54     a->a_iid   = o->a_iid;
55     a->b_iid   = o->b_iid;
56     a->flipped = o->flipped();
57 
58     bool  valid = false;
59 
60     if (o->flipped() == false)
61       valid = adjustNormal(finClr, o,
62                            a->aovlbgn, a->aovlend, a->bovlbgn, a->bovlend,
63                            a->aclrbgn, a->aclrend, a->bclrbgn, a->bclrend);
64     else
65       valid = adjustFlipped(finClr, o,
66                             a->aovlbgn, a->aovlend, a->bovlbgn, a->bovlend,
67                             a->aclrbgn, a->aclrend, a->bclrbgn, a->bclrend,
68                             seq);
69 
70     if (valid == false)
71       //  adjust() says the overlap doesn't intersect the clear range, so nothing here.
72       continue;
73 
74     assert(a->aclrbgn <= a->aovlbgn);  //  Also checked in adjust()...
75     assert(a->bclrbgn <= a->bovlbgn);
76 
77     assert(a->aovlend <= a->aclrend);
78     assert(a->bovlend <= a->bclrend);
79 
80 
81     //  Reset evidence hangs that are close to zero to be zero.  This shifts the overlap end point
82     //  to remove as much of the useless hang as possible.
83 
84     if (a->bovlbgn - a->bclrbgn < 15) {
85       uint32 limit  = a->aovlbgn - a->aclrbgn;
86       uint32 adjust = a->bovlbgn - a->bclrbgn;
87 
88       if (adjust > limit)
89         adjust = limit;
90 
91       if ((a->aovlbgn < adjust) || (a->bovlbgn < adjust))
92         fprintf(stderr, "ovl %u %u-%u %u %u-%u -> clr %u-%u %u-%u adj %u-%u %u-%u\n",
93                 o->a_iid, o->a_bgn(), o->a_end(), o->b_iid, o->b_bgn(), o->b_end(),
94                 a->aclrbgn, a->aclrend, a->bclrbgn, a->bclrend,
95                 a->aovlbgn, a->aovlend, a->bovlbgn, a->bovlend);
96       assert(a->aovlbgn >= adjust);
97       assert(a->bovlbgn >= adjust);
98 
99       a->aovlbgn -= adjust;
100       a->bovlbgn -= adjust;
101 
102       assert(a->aclrbgn <= a->aovlbgn);
103       assert(a->bclrbgn <= a->bovlbgn);
104     }
105 
106 
107     if (a->bclrend - a->bovlend < 15) {
108       uint32 limit  = a->aclrend - a->aovlend;
109       uint32 adjust = a->bclrend - a->bovlend;
110 
111       if (adjust > limit)
112         adjust = limit;
113 
114       a->aovlend += adjust;
115       a->bovlend += adjust;
116 
117       assert(a->aovlend <= a->aclrend);
118       assert(a->bovlend <= a->bclrend);
119     }
120 
121 
122     //  Filter out garbage overlaps
123     //
124     //  The first version used hard and fast cutoffs of (>=35bp and <= 0.02 error) or (>= 70bp).
125     //  These were not fair to short reads.
126     //
127     //  The second version used compilacted rules (see cvs), but based the decision off of the
128     //  length of the A-read.  Fair, but only for assemblies of similarily sized reads.  Totally
129     //  unfair for Sanger-vs-Illumin overlaps.
130     //
131     //  The third version sets a minimum length and identity based on the shorter fragment. These
132     //  mirror closely what the second version was doing.  It was extended to allow even shorter
133     //  lengths if either read is aligned to an end.
134     //
135     //  The fourth version, for long noisy reads, accepts all overlaps.
136 
137     if (0)
138       continue;
139 
140 
141     //  Save the overlap.
142 
143     adjLen++;
144   }
145 }
146 
147