1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "findErrors.H"
19 #include "computeDiff.H"
20
21 #include "sequence.H"
22 #include <tuple>
23
24 #define DISPLAY_WIDTH 250
25
26 // Show (to stdout ) the alignment encoded in delta [0 .. (deltaLen - 1)]
27 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] .
28
29 static
30 void
Display_Alignment(char * a,int32 aLen,char * b,int32 bLen,int32 * delta,int32 deltaLen)31 Display_Alignment(char *a, int32 aLen,
32 char *b, int32 bLen,
33 int32 *delta,
34 int32 deltaLen) {
35
36 int32 i = 0;
37 int32 j = 0;
38
39 char *top = new char [32 * 1024];
40 int32 topLen = 0;
41
42 char *bot = new char [32 * 1024];
43 int32 botLen = 0;
44
45 for (int32 k = 0; k < deltaLen; k++) {
46 for (int32 m = 1; m < abs(delta[k]); m++) {
47 top[topLen++] = a[i++];
48 j++;
49 }
50
51 if (delta[k] < 0) {
52 top[topLen++] = '-';
53 j++;
54 } else {
55 top[topLen++] = a[i++];
56 }
57 }
58
59 while (i < aLen && j < bLen) {
60 top[topLen++] = a[i++];
61 j++;
62 }
63 top[topLen] = '\0';
64
65
66 i = j = 0;
67
68 for (int32 k = 0; k < deltaLen; k++) {
69 for (int32 m = 1; m < abs(delta[k]); m++) {
70 bot[botLen++] = b[j++];
71 i++;
72 }
73
74 if (delta[k] > 0) {
75 bot[botLen++] = '-';
76 i++;
77 } else {
78 bot[botLen++] = b[j++];
79 }
80 }
81
82 while (j < bLen && i < aLen) {
83 bot[botLen++] = b[j++];
84 i++;
85 }
86 bot[botLen] = '\0';
87
88
89 for (i = 0; i < topLen || i < botLen; i += DISPLAY_WIDTH) {
90 putc('\n', stderr);
91
92 fprintf(stderr, "A: ");
93 for (j = 0; j < DISPLAY_WIDTH && i + j < topLen; j++)
94 putc(top[i + j], stderr);
95 putc('\n', stderr);
96
97 fprintf(stderr, "B: ");
98 for (j = 0; j < DISPLAY_WIDTH && i + j < botLen; j++)
99 putc(bot[i + j], stderr);
100 putc('\n', stderr);
101
102 fprintf(stderr, " ");
103 for (j = 0; j < DISPLAY_WIDTH && i + j < botLen && i + j < topLen; j++)
104 if (top[i + j] != ' ' && bot[i + j] != ' ' && top[i + j] != bot[i + j])
105 putc('^', stderr);
106 else
107 putc(' ', stderr);
108 putc('\n', stderr);
109 }
110
111 delete [] top;
112 delete [] bot;
113 }
114
115
116 int32
117 Prefix_Edit_Dist(char *A, int m,
118 char *T, int n,
119 int Error_Limit,
120 int32 &A_End,
121 int32 &T_End,
122 bool &Match_To_End,
123 pedWorkArea_t * wa);
124
125 void
126 Analyze_Alignment(Thread_Work_Area_t *wa,
127 char *a_part, int32 a_len, int32 a_offset,
128 char *b_part, int32 b_len,
129 int32 sub);
130
131
132 // Find the alignment referred to in olap , where the a_iid
133 // fragment is in Frag and the b_iid sequence is in b_seq .
134 // Use the alignment to increment the appropriate vote fields
135 // for the a fragment. shredded is true iff the b fragment
136 // is from shredded data, in which case the overlap will be
137 // ignored if the a fragment is also shredded.
138 // rev_seq is a buffer to hold the reverse complement of b_seq
139 // if needed. (* rev_id) is used to keep track of whether
140 // rev_seq has been created yet. (* wa) is the work-area
141 // containing space for the process to use in case of multi-threading.
142
143
144 void
Process_Olap(Olap_Info_t * olap,char * b_seq,bool shredded,Thread_Work_Area_t * wa)145 Process_Olap(Olap_Info_t *olap,
146 char *b_seq,
147 bool shredded,
148 Thread_Work_Area_t *wa) {
149
150 #if 0
151 fprintf(stderr, "Process_Olap: %8d %8d %5ld %5ld %c\n",
152 olap->a_iid, olap->b_iid,
153 olap->a_hang, olap->b_hang,
154 olap->innie == true ? 'I' : 'N');
155
156 //if (olap->a_iid != 39861 && olap->b_iid != 39861 && olap->a_iid != 2283 && olap->b_iid != 2283)
157 // return;
158 #endif
159
160 int32 ri = olap->a_iid - wa->G->bgnID;
161
162 if ((shredded == true) && (wa->G->reads[ri].shredded == true)) {
163 //fprintf(stderr, "%8d %8d shredded\n", olap->a_iid, olap->b_iid);
164 return;
165 }
166 //fprintf(stderr, "%8d %8d not shredded\n", olap->a_iid, olap->b_iid);
167
168 char *a_part = wa->G->reads[ri].sequence;
169 int32 a_offset = 0;
170
171 char *b_part = (olap->normal == true) ? b_seq : wa->rev_seq;
172 int32 b_offset = 0;
173
174 // If innie, reverse-complement the B sequence.
175
176 if ((olap->innie == true) && (wa->rev_id != olap->b_iid)) {
177 strcpy(b_part, b_seq);
178 reverseComplementSequence(b_part, 0);
179 wa->rev_id = olap->b_iid;
180 }
181
182 // Adjust for hangs.
183
184 if (olap->a_hang > 0) {
185 a_offset = olap->a_hang;
186 a_part += a_offset;
187 }
188
189 if (olap->a_hang < 0) {
190 b_offset = -olap->a_hang;
191 b_part += b_offset;
192 }
193
194 // Count degree - just how many times we cover the end of the read?
195
196 if ((olap->a_hang <= 0) && (wa->G->reads[ri].left_degree < MAX_DEGREE))
197 wa->G->reads[ri].left_degree++;
198
199 if ((olap->b_hang >= 0) && (wa->G->reads[ri].right_degree < MAX_DEGREE))
200 wa->G->reads[ri].right_degree++;
201
202 // Get the alignment
203
204 uint32 a_part_len = strlen(a_part);
205 uint32 b_part_len = strlen(b_part);
206
207 int32 a_end = 0;
208 int32 b_end = 0;
209
210 bool match_to_end = false;
211
212 //fprintf(stderr, "A: offset %d length %d\n", a_offset, a_part_len);
213 //fprintf(stderr, "B: offset %d length %d\n", b_offset, b_part_len);
214
215 auto *ped = &wa->ped;
216
217 int32 all_errors = Prefix_Edit_Dist(a_part, a_part_len,
218 b_part, b_part_len,
219 wa->G->Error_Bound[std::min(a_part_len, b_part_len)],
220 a_end,
221 b_end,
222 match_to_end,
223 ped);
224
225 if ((a_end < 0) || (a_end > a_part_len) || (b_end < 0) || (b_end > b_part_len)) {
226 fprintf (stderr, "ERROR: Bad edit distance.\n");
227 fprintf (stderr, " errors = %d a_end = %d b_end = %d\n", all_errors, a_end, b_end);
228 fprintf (stderr, " a_part_len = %d b_part_len = %d\n", a_part_len, b_part_len);
229 fprintf (stderr, " a_iid = %d b_iid = %d match_to_end = %c\n", olap->a_iid, olap->b_iid, match_to_end ? 'T' : 'F');
230 }
231 assert(a_end >= 0);
232 assert(a_end <= a_part_len);
233 assert(b_end >= 0);
234 assert(b_end <= b_part_len);
235
236 //fprintf(stderr, " all_errors = %d delta_len = %d\n", all_errors, ped.deltaLen);
237 //fprintf(stderr, " a_align = %d/%d b_align = %d/%d\n", a_end, a_part_len, b_end, b_part_len);
238 //Display_Alignment(a_part, a_end, b_part, b_end, ped.delta, ped.deltaLen);//, wa->G->reads[ri].clear_len - a_offset);
239
240 if ((match_to_end == false) && (a_end + a_offset >= wa->G->reads[ri].clear_len - 1)) {
241 match_to_end = true;
242 }
243
244 //TODO Adjusting the extremities?
245 //all_errors -= TrimStartingIndels(a_part, a_end, ped->delta, ped->deltaLen, 1);
246 //all_errors -= TrimStartingIndels(b_part, b_end, ped->delta, ped->deltaLen, -1);
247
248 //fprintf(stderr, "Showing alignment\n");
249 //Display_Alignment(a_part, a_end, b_part, b_end, ped->delta, ped->deltaLen);
250
251 if (match_to_end && a_end > 0 && b_end > 0) {
252 int32 events;
253 int32 alignment_len;
254
255 //fprintf(stderr, "Computing errors\n");
256 //fprintf(stderr, "Checking for trivial DNA regions: %d\n", check_trivial_dna);
257 std::tie(events, alignment_len) = ComputeErrors(a_part, b_part, ped->deltaLen, ped->delta,
258 a_end, b_end, wa->G->checkTrivialDNA);
259
260 assert(events >= 0 && alignment_len > 0);
261 //fprintf(stderr, "Old errors %d new events %d\n", all_errors, events);
262
263 // We used to assert that either checkTrivialDNA was true or that
264 // all_errors == events. This was to ensure that no alignment errors
265 // were rejected by ComputeErrors() -- but we're now ignoring alignment errors
266 // near the end of a read in all cases, so all_errors != events.
267 //
268 //assert(wa->G->checkTrivialDNA || all_errors == events);
269
270 if (events <= (int32) ceil(alignment_len * wa->G->maskedErrorRate)) {
271 wa->passedOlaps++;
272 //fprintf(stderr, "%8d %8d passed overlap\n", olap->a_iid, olap->b_iid);
273 Analyze_Alignment(wa,
274 a_part, a_end, a_offset,
275 b_part, b_end,
276 ri);
277 return;
278 }
279 }
280
281 wa->failedOlaps++;
282 //fprintf(stderr, "%8d %8d failed overlap\n", olap->a_iid, olap->b_iid);
283 //fprintf(stderr, "%8d %8d match to end %c\n", olap->a_iid, olap->b_iid, match_to_end ? 'T' : 'F');
284 //fprintf(stderr, "%8d %8d too many errors %c\n", olap->a_iid, olap->b_iid, (errors > wa->G->Error_Bound[olap_len]) ? 'T' : 'F');
285 }
286