1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "correctOverlaps.H"
19 #include "correctionOutput.H"
20 #include "sequence.H"
21 #include "computeDiff.H"
22
23 #include <tuple>
24
25
26
27 void
28 correctRead(uint32 curID,
29 char *fseq, uint32 &fseqLen, Adjust_t *fadj, uint32 &fadjLen,
30 const char *oseq, uint32 oseqLen,
31 Correction_Output_t *C,
32 uint64 &Cpos,
33 uint64 Clen,
34 uint64 *changes=NULL);
35
36
37 int32
38 Prefix_Edit_Dist(const char *A, int32 m,
39 const char *T, int32 n,
40 int32 Error_Limit,
41 int32 &A_End,
42 int32 &T_End,
43 bool &Match_To_End,
44 pedWorkArea_t *ped);
45
46
47
48
49
50
51
52
53
54
55 #define DISPLAY_WIDTH 250
56
57 // Show (to stdout ) the alignment encoded in delta [0 .. (deltaLen - 1)]
58 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] .
59
60 static
61 void
Display_Alignment(const char * a,int32 aLen,const char * b,int32 bLen,int32 * delta,int32 deltaLen)62 Display_Alignment(const char *a, int32 aLen,
63 const char *b, int32 bLen,
64 int32 *delta,
65 int32 deltaLen) {
66
67 int32 i = 0;
68 int32 j = 0;
69
70 char *top = new char [32 * 1024];
71 int32 topLen = 0;
72
73 char *bot = new char [32 * 1024];
74 int32 botLen = 0;
75
76 for (int32 k = 0; k < deltaLen; k++) {
77 for (int32 m = 1; m < abs(delta[k]); m++) {
78 top[topLen++] = a[i++];
79 j++;
80 }
81
82 if (delta[k] < 0) {
83 top[topLen++] = '-';
84 j++;
85 } else {
86 top[topLen++] = a[i++];
87 }
88 }
89
90 while (i < aLen && j < bLen) {
91 top[topLen++] = a[i++];
92 j++;
93 }
94 top[topLen] = '\0';
95
96
97 i = j = 0;
98
99 for (int32 k = 0; k < deltaLen; k++) {
100 for (int32 m = 1; m < abs(delta[k]); m++) {
101 bot[botLen++] = b[j++];
102 i++;
103 }
104
105 if (delta[k] > 0) {
106 bot[botLen++] = '-';
107 i++;
108 } else {
109 bot[botLen++] = b[j++];
110 }
111 }
112
113 while (j < bLen && i < aLen) {
114 bot[botLen++] = b[j++];
115 i++;
116 }
117 bot[botLen] = '\0';
118
119
120 for (i = 0; i < topLen || i < botLen; i += DISPLAY_WIDTH) {
121 putc('\n', stderr);
122
123 fprintf(stderr, "A: ");
124 for (j = 0; j < DISPLAY_WIDTH && i + j < topLen; j++)
125 putc(top[i + j], stderr);
126 putc('\n', stderr);
127
128 fprintf(stderr, "B: ");
129 for (j = 0; j < DISPLAY_WIDTH && i + j < botLen; j++)
130 putc(bot[i + j], stderr);
131 putc('\n', stderr);
132
133 fprintf(stderr, " ");
134 for (j = 0; j < DISPLAY_WIDTH && i + j < botLen && i + j < topLen; j++)
135 if (top[i + j] != ' ' && bot[i + j] != ' ' && top[i + j] != bot[i + j])
136 putc('^', stderr);
137 else
138 putc(' ', stderr);
139 putc('\n', stderr);
140 }
141
142 delete [] top;
143 delete [] bot;
144 }
145
146
147
148
149
150
151
152
153
154
155
156
157
158 // Set hanging offset values for reversed fragment in
159 // rev_adj[0 .. (adj_ct - 1)] based on corresponding forward
160 // values in fadj[0 .. (adj_ct - 1)]. frag_len is the length
161 // of the fragment.
162
163 static
164 void
Make_Rev_Adjust(Adjust_t * radj,Adjust_t * fadj,int32 adj_ct,int32 frag_len)165 Make_Rev_Adjust(Adjust_t *radj,
166 Adjust_t *fadj,
167 int32 adj_ct,
168 int32 frag_len) {
169
170 if (adj_ct == 0)
171 return;
172
173 int32 i = 0;
174 int32 j = 0;
175 int32 prev = 0;
176
177 for (i=adj_ct-1; i>0; i--) {
178 if (fadj[i].adjust == fadj[i-1].adjust + 1) {
179 radj[j].adjpos = 2 + frag_len - fadj[i].adjpos;
180 radj[j].adjust = prev + 1;
181
182 prev = radj[j].adjust;
183 }
184
185 else if (fadj[i].adjust == fadj[i-1].adjust - 1) {
186 radj[j].adjpos = 3 + frag_len - fadj[i].adjpos;
187 radj[j].adjust = prev - 1;
188
189 prev = radj[j].adjust;
190 }
191
192 else {
193 fprintf(stderr, "ERROR: Bad adjustment value. i = %d adj_ct = %d adjust[i] = %d adjust[i-1] = %d\n",
194 i, adj_ct, fadj[i].adjust, fadj[i-1].adjust);
195 assert(0);
196 }
197
198 j++;
199 }
200
201 assert(i == 0);
202
203 if (fadj[i].adjust == 1) {
204 radj[j].adjpos = 2 + frag_len - fadj[i].adjpos;
205 radj[j].adjust = prev + 1;
206 }
207
208 else if (fadj[i].adjust == -1) {
209 radj[j].adjpos = 3 + frag_len - fadj[i].adjpos;
210 radj[j].adjust = prev - 1;
211 }
212
213 else {
214 fprintf(stderr, "ERROR: Bad adjustment value. i = %d adj_ct = %d adjust[i] = %d\n",
215 i, adj_ct, fadj[i].adjust);
216 assert(0);
217 }
218
219 assert(j+1 == adj_ct);
220 }
221
222
223
224
225
226 // Return the adjusted value of hang based on
227 // adjust[0 .. (adjust_ct - 1)] .
228 static
229 int32
Hang_Adjust(int32 hang,Adjust_t * adjust,int32 adjust_ct)230 Hang_Adjust(int32 hang,
231 Adjust_t *adjust,
232 int32 adjust_ct) {
233 int32 delta = 0;
234
235 assert(hang >= 0);
236
237 // Replacing second test >= with just > didn't change anything. Both had 14 fails.
238
239 for (int32 i=0; (i < adjust_ct) && (hang >= adjust[i].adjpos); i++) {
240 //if (delta != adjust[i].adjust)
241 // fprintf(stderr, "hang_adjust i=%d adjust_ct=%d adjust=%d pos=%d\n", i, adjust_ct, adjust[i].adjust, adjust[i].adjpos);
242 delta = adjust[i].adjust;
243 }
244
245 if (hang + delta < 0) {
246 int32 i=0;
247
248 fprintf(stderr, "\n");
249 fprintf(stderr, "hang_adjust hang=%d\n", hang);
250
251 for (; (i < adjust_ct) && (hang >= adjust[i].adjpos); i++)
252 fprintf(stderr, "hang_adjust i=%d adjust_ct=%d adjust=%d pos=%d --\n", i, adjust_ct, adjust[i].adjust, adjust[i].adjpos);
253
254 for (int32 j=i+10; (i < adjust_ct) && (i < j); i++)
255 fprintf(stderr, "hang_adjust i=%d adjust_ct=%d adjust=%d pos=%d\n", i, adjust_ct, adjust[i].adjust, adjust[i].adjpos);
256
257 return(0);
258 }
259
260 //fprintf(stderr, "hang adjust delta %d\n", delta);
261 return(hang + delta);
262 }
263
264 static
265 void
PrepareRead(sqStore * seqStore,uint32 curID,uint32 & fseqLen,char * fseq,char * rseq,uint32 & fadjLen,Adjust_t * fadj,Adjust_t * radj,Correction_Output_t * C,uint64 & Cpos,uint64 Clen)266 PrepareRead(/*const*/ sqStore *seqStore, uint32 curID,
267 uint32 &fseqLen, char *fseq, char *rseq,
268 uint32 &fadjLen, Adjust_t *fadj, Adjust_t *radj,
269 Correction_Output_t *C, uint64 &Cpos, uint64 Clen) {
270 sqRead read;
271 seqStore->sqStore_getRead(curID, &read);
272 // Apply corrections to the B read (also converts to lower case, reverses it, etc)
273
274 //fprintf(stderr, "Correcting B read %u at Cpos=%u Clen=%u\n", curID, Cpos, Clen);
275
276 fseqLen = 0;
277 fadjLen = 0;
278
279 //Correcting "b" read. "a" reads were corrected beforehand.
280 correctRead(curID,
281 fseq, fseqLen, fadj, fadjLen,
282 read.sqRead_sequence(),
283 read.sqRead_length(),
284 C, Cpos, Clen);
285
286 //fprintf(stderr, "Finished B read %u at Cpos=%u Clen=%u\n", curID, Cpos, Clen);
287
288 // Create copies of the sequence for forward and reverse. There isn't a need for the forward copy (except that
289 // we mutate it with corrections), and the reverse copy could be deferred until it is needed.
290
291 memcpy(rseq, fseq, sizeof(char) * (fseqLen + 1));
292
293 reverseComplementSequence(rseq, fseqLen);
294
295 Make_Rev_Adjust(radj, fadj, fadjLen, fseqLen);
296 }
297
298 //FIXME code duplication
299 //returns error rate of the alignment or -1. if (!match_to_end || invalid_olap)
300 static
301 double
ProcessAlignment(const int32 a_part_len,char * a_part,const int32 b_part_len,char * b_part,int32 error_bound,bool check_trivial_dna,pedWorkArea_t * ped,bool * match_to_end,bool * invalid_olap)302 ProcessAlignment(const int32 a_part_len, char *a_part,
303 const int32 b_part_len, char *b_part,
304 int32 error_bound, bool check_trivial_dna,
305 pedWorkArea_t *ped, bool *match_to_end, bool *invalid_olap) {
306 int32 a_end = 0;
307 int32 b_end = 0;
308
309 //fprintf(stderr, ">A\n%s\n", a_part);
310 //fprintf(stderr, ">B\n%s\n", b_part);
311
312 int32 all_errors = Prefix_Edit_Dist(a_part, a_part_len,
313 b_part, b_part_len,
314 error_bound,
315 a_end,
316 b_end,
317 *match_to_end,
318 ped);
319
320
321 //Adjusting the extremities
322 //TODO discuss the logic!
323 //TODO discuss changes regarding a_hang usage!
324 //TODO refactor out the code duplication
325 all_errors -= TrimStartingIndels(a_part, a_end, ped->delta, ped->deltaLen, 1);
326 all_errors -= TrimStartingIndels(b_part, b_end, ped->delta, ped->deltaLen, -1);
327
328 //fprintf(stderr, "Showing alignment\n");
329 //Display_Alignment(a_part, a_end, b_part, b_end, ped->delta, ped->deltaLen);
330
331 *invalid_olap = (std::min(a_end, b_end) <= 0);
332
333 if (!*match_to_end || *invalid_olap) {
334 return -1.;
335 }
336
337 int32 events;
338 int32 alignment_len;
339
340 //fprintf(stderr, "Computing errors\n");
341 //fprintf(stderr, "Checking for trivial DNA regions: %d\n", check_trivial_dna);
342 std::tie(events, alignment_len) = ComputeErrors(a_part, b_part, ped->deltaLen, ped->delta,
343 a_end, b_end, check_trivial_dna);
344
345 //fprintf(stderr, "Old errors %d new events %d\n", all_errors, events);
346
347 //Assert can fail if DEFAULT_FLANK_IGNORE > 0
348 //assert(check_trivial_dna || all_errors == events);
349
350 assert(events >= 0 && alignment_len > 0);
351 return (double) events / alignment_len;
352 }
353
354 // Read old fragments in seqStore and choose the ones that
355 // have overlaps with fragments in Frag. Recompute the
356 // overlaps, using fragment corrections and output the revised error.
357 void
Redo_Olaps(coParameters * G,sqStore * seqStore)358 Redo_Olaps(coParameters *G, /*const*/ sqStore *seqStore) {
359
360 // Figure out the range of B reads we care about. We probably could just loop over every read in
361 // the store with minimal penalty.
362
363 uint64 thisOvl = 0;
364 uint64 lastOvl = G->olapsLen - 1;
365
366 uint32 loBid = G->olaps[thisOvl].b_iid;
367 uint32 hiBid = G->olaps[lastOvl].b_iid;
368
369 // Open all the corrections.
370
371 memoryMappedFile *Cfile = new memoryMappedFile(G->correctionsName);
372 Correction_Output_t *C = (Correction_Output_t *)Cfile->get();
373 uint64 Cpos = 0;
374 uint64 Clen = Cfile->length() / sizeof(Correction_Output_t);
375
376 // Allocate some temporary work space for the forward and reverse corrected B reads.
377
378 fprintf(stderr, "--Allocate " F_SIZE_T " MB for fseq and rseq.\n", (2 * sizeof(char) * 2 * (AS_MAX_READLEN + 1)) >> 20);
379 char *fseq = new char [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1];
380 uint32 fseqLen = 0;
381
382 char *rseq = new char [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1];
383
384 fprintf(stderr, "--Allocate " F_SIZE_T " MB for fadj and radj.\n", (2 * sizeof(Adjust_t) * (AS_MAX_READLEN + 1)) >> 20);
385 Adjust_t *fadj = new Adjust_t [AS_MAX_READLEN + 1];
386 Adjust_t *radj = new Adjust_t [AS_MAX_READLEN + 1];
387 uint32 fadjLen = 0; // radj is the same length
388
389 fprintf(stderr, "--Allocate " F_SIZE_T " MB for pedWorkArea_t.\n", sizeof(pedWorkArea_t) >> 20);
390 pedWorkArea_t *ped = new pedWorkArea_t;
391
392 uint64 Total_Alignments_Ct = 0;
393
394 uint64 Failed_Alignments_Ct = 0;
395 uint64 Failed_Alignments_Both_Ct = 0;
396 uint64 Failed_Alignments_End_Ct = 0;
397 uint64 Failed_Alignments_Length_Ct = 0;
398
399 uint64 olapsFwd = 0;
400 uint64 olapsRev = 0;
401
402 uint64 nBetter = 0;
403 uint64 nWorse = 0;
404 uint64 nSame = 0;
405
406
407 ped->initialize(G, G->errorRate);
408
409 // Process overlaps. Loop over the B reads, and recompute each overlap.
410 // Loop over the B reads ...
411 for (uint32 curID = loBid; curID <= hiBid; curID++) {
412 if (((curID - loBid) % 1024) == 0)
413 fprintf(stderr, "Recomputing overlaps - %9u - %9u - %9u\n", loBid, curID, hiBid);
414
415 // ... towards the read for current overlap
416 if (curID < G->olaps[thisOvl].b_iid)
417 continue;
418
419 assert(curID == G->olaps[thisOvl].b_iid);
420
421 // Load and correct the B read
422 PrepareRead(seqStore, curID,
423 fseqLen, fseq, rseq,
424 fadjLen, fadj, radj,
425 C, Cpos, Clen);
426
427 // Recompute alignments for ALL overlaps involving the B read
428 for (; thisOvl <= lastOvl && G->olaps[thisOvl].b_iid == curID; thisOvl++) {
429 const Olap_Info_t &olap = G->olaps[thisOvl];
430
431 if (G->secondID != UINT32_MAX && olap.b_iid != G->secondID)
432 continue;
433
434 if (olap.normal) {
435 // fprintf(stderr, "b_part = fseq %40.40s\n", fseq);
436 olapsFwd++;
437 } else {
438 // fprintf(stderr, "b_part = rseq %40.40s\n", rseq);
439 olapsRev++;
440 }
441
442 // Find the A segment. It's always forward. It's already been corrected.
443 char *a_part = G->reads[olap.a_iid - G->bgnID].bases;
444 if (olap.a_hang > 0) {
445 int32 ha = Hang_Adjust(olap.a_hang,
446 G->reads[olap.a_iid - G->bgnID].adjusts,
447 G->reads[olap.a_iid - G->bgnID].adjustsLen);
448 a_part += ha;
449 //fprintf(stderr, "offset a_part by ha=%d\n", ha);
450 }
451
452 // Find the B segment.
453 char *b_part = (olap.normal == true) ? fseq : rseq;
454
455 if (olap.a_hang < 0) {
456 int32 ha = olap.normal ? Hang_Adjust(-olap.a_hang, fadj, fadjLen) :
457 Hang_Adjust(-olap.a_hang, radj, fadjLen);
458 b_part += ha;
459 //fprintf(stderr, "offset b_part by ha=%d normal=%d\n", ha, olap.normal);
460 }
461
462 // Compute and process the alignment
463 Total_Alignments_Ct++;
464 //TODO discuss difference with error finding code
465 //In errors finding one of the sequences is the (almost) entire read and the length of its prefix is passed
466 int32 a_part_len = strlen(a_part);
467 int32 b_part_len = strlen(b_part);
468
469 bool match_to_end = false;
470 bool invalid_olap = false;
471 double err_rate = ProcessAlignment(a_part_len, a_part, //olap.a_hang,
472 b_part_len, b_part,
473 G->Error_Bound[std::min(a_part_len, b_part_len)],
474 /*check trivial DNA*/G->checkTrivialDNA,
475 ped, &match_to_end, &invalid_olap);
476
477 if (err_rate >= 0.) {
478 //if (err_rate > /*report_threshold*/ 0.) {
479 // fprintf(stderr, "Err rate of overlap %u - %u is %f\n", olap.a_iid, olap.b_iid, err_rate);
480 //}
481
482 const uint32 err_encoded = AS_OVS_encodeEvalue(err_rate);
483
484 const uint32 base_encoded = G->olaps[thisOvl].evalue;
485 if (err_encoded < base_encoded)
486 nBetter++;
487 else if (err_encoded > base_encoded)
488 nWorse++;
489 else
490 nSame++;
491
492 G->olaps[thisOvl].evalue = err_encoded;
493 //fprintf(stderr, "REDO - err rate = %f\n", AS_OVS_decodeEvalue(G->olaps[thisOvl].evalue));
494 } else {
495 //fprintf(stderr, "Err rate of overlap %u - %u failed\n", olap.a_iid, olap.b_iid);
496
497 Failed_Alignments_Ct++;
498
499 if (!match_to_end && invalid_olap)
500 Failed_Alignments_Both_Ct++;
501
502 if (!match_to_end)
503 Failed_Alignments_End_Ct++;
504
505 if (invalid_olap)
506 Failed_Alignments_Length_Ct++;
507
508 #if 0
509 // I can't find any patterns in these errors. I thought that it was caused by the corrections, but I
510 // found a case where no corrections were made and the alignment still failed. Perhaps it is differences
511 // in the alignment code (the forward vs reverse prefix distance in overlapper vs only the forward here)?
512
513 fprintf(stderr, "Redo_Olaps()--\n");
514 fprintf(stderr, "Redo_Olaps()--\n");
515 fprintf(stderr, "Redo_Olaps()-- Bad alignment errors %d a_end %d b_end %d match_to_end %d olapLen %d\n",
516 errors, a_end, b_end, match_to_end, olapLen);
517 fprintf(stderr, "Redo_Olaps()-- Overlap a_hang %d b_hang %d innie %d\n",
518 olap.a_hang, olap.b_hang, olap.innie);
519 fprintf(stderr, "Redo_Olaps()-- Reads a_id %u a_length %d b_id %u b_length %d\n",
520 G->olaps[thisOvl].a_iid,
521 G->reads[ G->olaps[thisOvl].a_iid ].basesLen,
522 G->olaps[thisOvl].b_iid,
523 G->reads[ G->olaps[thisOvl].b_iid ].basesLen);
524 fprintf(stderr, "Redo_Olaps()-- A %s\n", a_part);
525 fprintf(stderr, "Redo_Olaps()-- B %s\n", b_part);
526
527 Display_Alignment(a_part, a_part_len, b_part, b_part_len, ped->delta, ped->deltaLen);
528
529 fprintf(stderr, "\n");
530 #endif
531 }
532 }
533 }
534
535 fprintf(stderr, "\n");
536
537 delete ped;
538 delete [] radj;
539 delete [] fadj;
540 delete [] rseq;
541 delete [] fseq;
542 delete Cfile;
543
544 fprintf(stderr, "-- Release bases, adjusts and reads.\n");
545
546 delete [] G->bases; G->bases = NULL;
547 delete [] G->adjusts; G->adjusts = NULL;
548 delete [] G->reads; G->reads = NULL;
549
550 fprintf(stderr, "Olaps Fwd " F_U64 "\n", olapsFwd);
551 fprintf(stderr, "Olaps Rev " F_U64 "\n", olapsRev);
552
553 fprintf(stderr, "Total: " F_U64 "\n", Total_Alignments_Ct);
554 fprintf(stderr, "Failed: " F_U64 " (both)\n", Failed_Alignments_Both_Ct);
555 fprintf(stderr, "Failed: " F_U64 " (either)\n", Failed_Alignments_Ct);
556 fprintf(stderr, "Failed: " F_U64 " (match to end)\n", Failed_Alignments_End_Ct);
557 fprintf(stderr, "Failed: " F_U64 " (negative length)\n", Failed_Alignments_Length_Ct);
558
559 fprintf(stderr, "Changed " F_U64 " overlaps.\n", nBetter + nWorse + nSame);
560 fprintf(stderr, "Better: " F_U64 " overlaps.\n", nBetter);
561 fprintf(stderr, "Worse: " F_U64 " overlaps.\n", nWorse);
562 fprintf(stderr, "Same: " F_U64 " overlaps.\n", nSame);
563 }
564