1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "splitReads.H"
19
20 // Adjust the overlap for trimming already done. Filter out useless overlaps. Save the
21 // overlap into a useful format.
22
23 void
addAndFilterOverlaps(sqStore * seq,clearRangeFile * finClr,double errorRate,ovOverlap * ovl,uint32 ovlLen)24 workUnit::addAndFilterOverlaps(sqStore *seq,
25 clearRangeFile *finClr,
26 double errorRate,
27 ovOverlap *ovl, uint32 ovlLen) {
28
29 if (adjMax < ovlLen) {
30 delete [] adj;
31
32 adjMax = ovlLen;
33 adj = new adjOverlap [adjMax];
34 }
35
36 adjLen = 0;
37
38 for (uint32 oo=0; oo<ovlLen; oo++) {
39 ovOverlap *o = ovl + oo;
40 adjOverlap *a = adj + adjLen;
41
42 uint32 idA = o->a_iid;
43 uint32 idB = o->b_iid;
44
45 if (finClr->isDeleted(idA) ||
46 finClr->isDeleted(idB))
47 continue;
48
49 // Returns the finClr clear range of the two reads (in *clr*), and the overlap adjusted to
50 // that clear range (in *ovl*). The B read coordinates are reverse-complemented if the
51 // overlap is reversed (so bovlbgn < bovlend always).
52
53
54 a->a_iid = o->a_iid;
55 a->b_iid = o->b_iid;
56 a->flipped = o->flipped();
57
58 bool valid = false;
59
60 if (o->flipped() == false)
61 valid = adjustNormal(finClr, o,
62 a->aovlbgn, a->aovlend, a->bovlbgn, a->bovlend,
63 a->aclrbgn, a->aclrend, a->bclrbgn, a->bclrend);
64 else
65 valid = adjustFlipped(finClr, o,
66 a->aovlbgn, a->aovlend, a->bovlbgn, a->bovlend,
67 a->aclrbgn, a->aclrend, a->bclrbgn, a->bclrend,
68 seq);
69
70 if (valid == false)
71 // adjust() says the overlap doesn't intersect the clear range, so nothing here.
72 continue;
73
74 assert(a->aclrbgn <= a->aovlbgn); // Also checked in adjust()...
75 assert(a->bclrbgn <= a->bovlbgn);
76
77 assert(a->aovlend <= a->aclrend);
78 assert(a->bovlend <= a->bclrend);
79
80
81 // Reset evidence hangs that are close to zero to be zero. This shifts the overlap end point
82 // to remove as much of the useless hang as possible.
83
84 if (a->bovlbgn - a->bclrbgn < 15) {
85 uint32 limit = a->aovlbgn - a->aclrbgn;
86 uint32 adjust = a->bovlbgn - a->bclrbgn;
87
88 if (adjust > limit)
89 adjust = limit;
90
91 if ((a->aovlbgn < adjust) || (a->bovlbgn < adjust))
92 fprintf(stderr, "ovl %u %u-%u %u %u-%u -> clr %u-%u %u-%u adj %u-%u %u-%u\n",
93 o->a_iid, o->a_bgn(), o->a_end(), o->b_iid, o->b_bgn(), o->b_end(),
94 a->aclrbgn, a->aclrend, a->bclrbgn, a->bclrend,
95 a->aovlbgn, a->aovlend, a->bovlbgn, a->bovlend);
96 assert(a->aovlbgn >= adjust);
97 assert(a->bovlbgn >= adjust);
98
99 a->aovlbgn -= adjust;
100 a->bovlbgn -= adjust;
101
102 assert(a->aclrbgn <= a->aovlbgn);
103 assert(a->bclrbgn <= a->bovlbgn);
104 }
105
106
107 if (a->bclrend - a->bovlend < 15) {
108 uint32 limit = a->aclrend - a->aovlend;
109 uint32 adjust = a->bclrend - a->bovlend;
110
111 if (adjust > limit)
112 adjust = limit;
113
114 a->aovlend += adjust;
115 a->bovlend += adjust;
116
117 assert(a->aovlend <= a->aclrend);
118 assert(a->bovlend <= a->bclrend);
119 }
120
121
122 // Filter out garbage overlaps
123 //
124 // The first version used hard and fast cutoffs of (>=35bp and <= 0.02 error) or (>= 70bp).
125 // These were not fair to short reads.
126 //
127 // The second version used compilacted rules (see cvs), but based the decision off of the
128 // length of the A-read. Fair, but only for assemblies of similarily sized reads. Totally
129 // unfair for Sanger-vs-Illumin overlaps.
130 //
131 // The third version sets a minimum length and identity based on the shorter fragment. These
132 // mirror closely what the second version was doing. It was extended to allow even shorter
133 // lengths if either read is aligned to an end.
134 //
135 // The fourth version, for long noisy reads, accepts all overlaps.
136
137 if (0)
138 continue;
139
140
141 // Save the overlap.
142
143 adjLen++;
144 }
145 }
146
147