1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "correctOverlaps.H"
19 #include "correctionOutput.H"
20 #include "sequence.H"
21 #include "computeDiff.H"
22 
23 #include <tuple>
24 
25 
26 
27 void
28 correctRead(uint32 curID,
29             char *fseq, uint32 &fseqLen, Adjust_t *fadj, uint32 &fadjLen,
30             const char *oseq, uint32  oseqLen,
31             Correction_Output_t  *C,
32             uint64               &Cpos,
33             uint64                Clen,
34             uint64               *changes=NULL);
35 
36 
37 int32
38 Prefix_Edit_Dist(const char    *A,  int32 m,
39                  const char    *T,  int32 n,
40                  int32    Error_Limit,
41                  int32   &A_End,
42                  int32   &T_End,
43                  bool    &Match_To_End,
44                  pedWorkArea_t *ped);
45 
46 
47 
48 
49 
50 
51 
52 
53 
54 
55 #define  DISPLAY_WIDTH   250
56 
57 //  Show (to  stdout ) the alignment encoded in  delta [0 .. (deltaLen - 1)]
58 //  between strings  a [0 .. (a_len - 1)]  and  b [0 .. (b_len - 1)] .
59 
60 static
61 void
Display_Alignment(const char * a,int32 aLen,const char * b,int32 bLen,int32 * delta,int32 deltaLen)62 Display_Alignment(const char    *a,   int32 aLen,
63                   const char    *b,   int32 bLen,
64                   int32   *delta,
65                   int32    deltaLen) {
66 
67   int32  i = 0;
68   int32  j = 0;
69 
70   char  *top    = new char [32 * 1024];
71   int32  topLen = 0;
72 
73   char  *bot    = new char [32 * 1024];
74   int32  botLen = 0;
75 
76   for (int32 k = 0;  k < deltaLen;  k++) {
77     for (int32 m = 1;  m < abs(delta[k]);  m++) {
78       top[topLen++] = a[i++];
79       j++;
80     }
81 
82     if (delta[k] < 0) {
83       top[topLen++] = '-';
84       j++;
85     } else {
86       top[topLen++] = a[i++];
87     }
88   }
89 
90   while (i < aLen && j < bLen) {
91     top[topLen++] = a[i++];
92     j++;
93   }
94   top[topLen] = '\0';
95 
96 
97   i = j = 0;
98 
99   for (int32 k = 0;  k < deltaLen;  k++) {
100     for (int32 m = 1;  m < abs(delta[k]);  m++) {
101       bot[botLen++] = b[j++];
102       i++;
103     }
104 
105     if (delta[k] > 0) {
106       bot[botLen++] = '-';
107       i++;
108     } else {
109       bot[botLen++] = b[j++];
110     }
111   }
112 
113   while (j < bLen && i < aLen) {
114     bot[botLen++] = b[j++];
115     i++;
116   }
117   bot[botLen] = '\0';
118 
119 
120   for (i = 0;  i < topLen || i < botLen;  i += DISPLAY_WIDTH) {
121     putc('\n', stderr);
122 
123     fprintf(stderr, "A: ");
124     for (j = 0;  j < DISPLAY_WIDTH && i + j < topLen;  j++)
125       putc(top[i + j], stderr);
126     putc('\n', stderr);
127 
128     fprintf(stderr, "B: ");
129     for (j = 0;  j < DISPLAY_WIDTH && i + j < botLen;  j++)
130       putc(bot[i + j], stderr);
131     putc('\n', stderr);
132 
133     fprintf(stderr, "   ");
134     for (j = 0;  j < DISPLAY_WIDTH && i + j < botLen && i + j < topLen; j++)
135       if (top[i + j] != ' ' && bot[i + j] != ' ' && top[i + j] != bot[i + j])
136         putc('^', stderr);
137       else
138         putc(' ', stderr);
139     putc('\n', stderr);
140   }
141 
142   delete [] top;
143   delete [] bot;
144 }
145 
146 
147 
148 
149 
150 
151 
152 
153 
154 
155 
156 
157 
158 //  Set hanging offset values for reversed fragment in
159 //   rev_adj[0 .. (adj_ct - 1)]  based on corresponding forward
160 //  values in  fadj[0 .. (adj_ct - 1)].  frag_len  is the length
161 //  of the fragment.
162 
163 static
164 void
Make_Rev_Adjust(Adjust_t * radj,Adjust_t * fadj,int32 adj_ct,int32 frag_len)165 Make_Rev_Adjust(Adjust_t    *radj,
166                 Adjust_t    *fadj,
167                 int32        adj_ct,
168                 int32        frag_len) {
169 
170   if (adj_ct == 0)
171     return;
172 
173   int32  i = 0;
174   int32  j = 0;
175   int32  prev = 0;
176 
177   for (i=adj_ct-1; i>0; i--) {
178     if (fadj[i].adjust == fadj[i-1].adjust + 1) {
179       radj[j].adjpos = 2 + frag_len - fadj[i].adjpos;
180       radj[j].adjust = prev + 1;
181 
182       prev = radj[j].adjust;
183     }
184 
185     else if (fadj[i].adjust == fadj[i-1].adjust - 1) {
186       radj[j].adjpos = 3 + frag_len - fadj[i].adjpos;
187       radj[j].adjust = prev - 1;
188 
189       prev = radj[j].adjust;
190     }
191 
192     else {
193       fprintf(stderr, "ERROR:  Bad adjustment value.  i = %d  adj_ct = %d  adjust[i] = %d  adjust[i-1] = %d\n",
194               i, adj_ct, fadj[i].adjust, fadj[i-1].adjust);
195       assert(0);
196     }
197 
198     j++;
199   }
200 
201   assert(i == 0);
202 
203   if (fadj[i].adjust == 1) {
204     radj[j].adjpos = 2 + frag_len - fadj[i].adjpos;
205     radj[j].adjust = prev + 1;
206   }
207 
208   else if (fadj[i].adjust == -1) {
209     radj[j].adjpos = 3 + frag_len - fadj[i].adjpos;
210     radj[j].adjust = prev - 1;
211   }
212 
213   else {
214     fprintf(stderr, "ERROR:  Bad adjustment value.  i = %d  adj_ct = %d  adjust[i] = %d\n",
215              i, adj_ct, fadj[i].adjust);
216     assert(0);
217   }
218 
219   assert(j+1 == adj_ct);
220 }
221 
222 
223 
224 
225 
226 //  Return the adjusted value of  hang  based on
227 //   adjust[0 .. (adjust_ct - 1)] .
228 static
229 int32
Hang_Adjust(int32 hang,Adjust_t * adjust,int32 adjust_ct)230 Hang_Adjust(int32     hang,
231             Adjust_t *adjust,
232             int32     adjust_ct) {
233   int32  delta = 0;
234 
235   assert(hang >= 0);
236 
237   //  Replacing second test >= with just > didn't change anything.  Both had 14 fails.
238 
239   for  (int32 i=0; (i < adjust_ct) && (hang >= adjust[i].adjpos); i++) {
240     //if (delta != adjust[i].adjust)
241     //  fprintf(stderr, "hang_adjust i=%d adjust_ct=%d adjust=%d pos=%d\n", i, adjust_ct, adjust[i].adjust, adjust[i].adjpos);
242     delta = adjust[i].adjust;
243   }
244 
245   if (hang + delta < 0) {
246     int32 i=0;
247 
248     fprintf(stderr, "\n");
249     fprintf(stderr, "hang_adjust hang=%d\n", hang);
250 
251     for  (; (i < adjust_ct) && (hang >= adjust[i].adjpos); i++)
252       fprintf(stderr, "hang_adjust i=%d adjust_ct=%d adjust=%d pos=%d --\n", i, adjust_ct, adjust[i].adjust, adjust[i].adjpos);
253 
254     for  (int32 j=i+10; (i < adjust_ct) && (i < j); i++)
255       fprintf(stderr, "hang_adjust i=%d adjust_ct=%d adjust=%d pos=%d\n", i, adjust_ct, adjust[i].adjust, adjust[i].adjpos);
256 
257     return(0);
258   }
259 
260   //fprintf(stderr, "hang adjust delta %d\n", delta);
261   return(hang + delta);
262 }
263 
264 static
265 void
PrepareRead(sqStore * seqStore,uint32 curID,uint32 & fseqLen,char * fseq,char * rseq,uint32 & fadjLen,Adjust_t * fadj,Adjust_t * radj,Correction_Output_t * C,uint64 & Cpos,uint64 Clen)266 PrepareRead(/*const*/ sqStore *seqStore, uint32 curID,
267             uint32 &fseqLen, char *fseq, char *rseq,
268             uint32 &fadjLen, Adjust_t *fadj, Adjust_t *radj,
269             Correction_Output_t  *C, uint64 &Cpos, uint64 Clen) {
270   sqRead read;
271   seqStore->sqStore_getRead(curID, &read);
272   //  Apply corrections to the B read (also converts to lower case, reverses it, etc)
273 
274   //fprintf(stderr, "Correcting B read %u at Cpos=%u Clen=%u\n", curID, Cpos, Clen);
275 
276   fseqLen = 0;
277   fadjLen = 0;
278 
279   //Correcting "b" read. "a" reads were corrected beforehand.
280   correctRead(curID,
281               fseq, fseqLen, fadj, fadjLen,
282               read.sqRead_sequence(),
283               read.sqRead_length(),
284               C, Cpos, Clen);
285 
286   //fprintf(stderr, "Finished   B read %u at Cpos=%u Clen=%u\n", curID, Cpos, Clen);
287 
288   //  Create copies of the sequence for forward and reverse.  There isn't a need for the forward copy (except that
289   //  we mutate it with corrections), and the reverse copy could be deferred until it is needed.
290 
291   memcpy(rseq, fseq, sizeof(char) * (fseqLen + 1));
292 
293   reverseComplementSequence(rseq, fseqLen);
294 
295   Make_Rev_Adjust(radj, fadj, fadjLen, fseqLen);
296 }
297 
298 //FIXME code duplication
299 //returns error rate of the alignment or -1. if (!match_to_end || invalid_olap)
300 static
301 double
ProcessAlignment(const int32 a_part_len,char * a_part,const int32 b_part_len,char * b_part,int32 error_bound,bool check_trivial_dna,pedWorkArea_t * ped,bool * match_to_end,bool * invalid_olap)302 ProcessAlignment(const int32 a_part_len, char *a_part,
303                  const int32 b_part_len, char *b_part,
304                  int32 error_bound, bool check_trivial_dna,
305                  pedWorkArea_t *ped, bool *match_to_end, bool *invalid_olap) {
306   int32   a_end        = 0;
307   int32   b_end        = 0;
308 
309   //fprintf(stderr, ">A\n%s\n", a_part);
310   //fprintf(stderr, ">B\n%s\n", b_part);
311 
312   int32 all_errors = Prefix_Edit_Dist(a_part, a_part_len,
313                                       b_part, b_part_len,
314                                       error_bound,
315                                       a_end,
316                                       b_end,
317                                       *match_to_end,
318                                       ped);
319 
320 
321   //Adjusting the extremities
322   //TODO discuss the logic!
323   //TODO discuss changes regarding a_hang usage!
324   //TODO refactor out the code duplication
325   all_errors -= TrimStartingIndels(a_part, a_end, ped->delta, ped->deltaLen, 1);
326   all_errors -= TrimStartingIndels(b_part, b_end, ped->delta, ped->deltaLen, -1);
327 
328   //fprintf(stderr, "Showing alignment\n");
329   //Display_Alignment(a_part, a_end, b_part, b_end, ped->delta, ped->deltaLen);
330 
331   *invalid_olap = (std::min(a_end, b_end) <= 0);
332 
333   if (!*match_to_end || *invalid_olap) {
334     return -1.;
335   }
336 
337   int32 events;
338   int32 alignment_len;
339 
340   //fprintf(stderr, "Computing errors\n");
341   //fprintf(stderr, "Checking for trivial DNA regions: %d\n", check_trivial_dna);
342   std::tie(events, alignment_len) = ComputeErrors(a_part, b_part, ped->deltaLen, ped->delta,
343                                                   a_end, b_end, check_trivial_dna);
344 
345   //fprintf(stderr, "Old errors %d new events %d\n", all_errors, events);
346 
347   //Assert can fail if DEFAULT_FLANK_IGNORE > 0
348   //assert(check_trivial_dna || all_errors == events);
349 
350   assert(events >= 0 && alignment_len > 0);
351   return (double) events / alignment_len;
352 }
353 
354 //  Read old fragments in  seqStore  and choose the ones that
355 //  have overlaps with fragments in  Frag. Recompute the
356 //  overlaps, using fragment corrections and output the revised error.
357 void
Redo_Olaps(coParameters * G,sqStore * seqStore)358 Redo_Olaps(coParameters *G, /*const*/ sqStore *seqStore) {
359 
360   //  Figure out the range of B reads we care about.  We probably could just loop over every read in
361   //  the store with minimal penalty.
362 
363   uint64     thisOvl = 0;
364   uint64     lastOvl = G->olapsLen - 1;
365 
366   uint32     loBid   = G->olaps[thisOvl].b_iid;
367   uint32     hiBid   = G->olaps[lastOvl].b_iid;
368 
369   //  Open all the corrections.
370 
371   memoryMappedFile     *Cfile = new memoryMappedFile(G->correctionsName);
372   Correction_Output_t  *C     = (Correction_Output_t *)Cfile->get();
373   uint64                Cpos  = 0;
374   uint64                Clen  = Cfile->length() / sizeof(Correction_Output_t);
375 
376   //  Allocate some temporary work space for the forward and reverse corrected B reads.
377 
378   fprintf(stderr, "--Allocate " F_SIZE_T " MB for fseq and rseq.\n", (2 * sizeof(char) * 2 * (AS_MAX_READLEN + 1)) >> 20);
379   char          *fseq    = new char     [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1];
380   uint32         fseqLen = 0;
381 
382   char          *rseq    = new char     [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1];
383 
384   fprintf(stderr, "--Allocate " F_SIZE_T " MB for fadj and radj.\n", (2 * sizeof(Adjust_t) * (AS_MAX_READLEN + 1)) >> 20);
385   Adjust_t      *fadj    = new Adjust_t [AS_MAX_READLEN + 1];
386   Adjust_t      *radj    = new Adjust_t [AS_MAX_READLEN + 1];
387   uint32         fadjLen  = 0;  //  radj is the same length
388 
389   fprintf(stderr, "--Allocate " F_SIZE_T " MB for pedWorkArea_t.\n", sizeof(pedWorkArea_t) >> 20);
390   pedWorkArea_t *ped      = new pedWorkArea_t;
391 
392   uint64         Total_Alignments_Ct           = 0;
393 
394   uint64         Failed_Alignments_Ct          = 0;
395   uint64         Failed_Alignments_Both_Ct     = 0;
396   uint64         Failed_Alignments_End_Ct      = 0;
397   uint64         Failed_Alignments_Length_Ct   = 0;
398 
399   uint64         olapsFwd = 0;
400   uint64         olapsRev = 0;
401 
402   uint64         nBetter = 0;
403   uint64         nWorse  = 0;
404   uint64         nSame   = 0;
405 
406 
407   ped->initialize(G, G->errorRate);
408 
409   //  Process overlaps.  Loop over the B reads, and recompute each overlap.
410   //  Loop over the B reads ...
411   for (uint32 curID = loBid; curID <= hiBid; curID++) {
412     if (((curID - loBid) % 1024) == 0)
413       fprintf(stderr, "Recomputing overlaps - %9u - %9u - %9u\n", loBid, curID, hiBid);
414 
415     //  ... towards the read for current overlap
416     if (curID < G->olaps[thisOvl].b_iid)
417       continue;
418 
419     assert(curID == G->olaps[thisOvl].b_iid);
420 
421     //  Load and correct the B read
422     PrepareRead(seqStore, curID,
423                 fseqLen, fseq, rseq,
424                 fadjLen, fadj, radj,
425                 C, Cpos, Clen);
426 
427     //  Recompute alignments for ALL overlaps involving the B read
428     for (; thisOvl <= lastOvl && G->olaps[thisOvl].b_iid == curID; thisOvl++) {
429       const Olap_Info_t &olap = G->olaps[thisOvl];
430 
431       if (G->secondID != UINT32_MAX && olap.b_iid != G->secondID)
432         continue;
433 
434       if (olap.normal) {
435       //  fprintf(stderr, "b_part = fseq %40.40s\n", fseq);
436         olapsFwd++;
437       } else {
438       //  fprintf(stderr, "b_part = rseq %40.40s\n", rseq);
439         olapsRev++;
440       }
441 
442       //  Find the A segment.  It's always forward.  It's already been corrected.
443       char *a_part = G->reads[olap.a_iid - G->bgnID].bases;
444       if (olap.a_hang > 0) {
445         int32 ha = Hang_Adjust(olap.a_hang,
446                                G->reads[olap.a_iid - G->bgnID].adjusts,
447                                G->reads[olap.a_iid - G->bgnID].adjustsLen);
448         a_part += ha;
449         //fprintf(stderr, "offset a_part by ha=%d\n", ha);
450       }
451 
452       //  Find the B segment.
453       char *b_part = (olap.normal == true) ? fseq : rseq;
454 
455       if (olap.a_hang < 0) {
456         int32 ha = olap.normal ? Hang_Adjust(-olap.a_hang, fadj, fadjLen) :
457                                             Hang_Adjust(-olap.a_hang, radj, fadjLen);
458         b_part += ha;
459         //fprintf(stderr, "offset b_part by ha=%d normal=%d\n", ha, olap.normal);
460       }
461 
462       //  Compute and process the alignment
463       Total_Alignments_Ct++;
464       //TODO discuss difference with error finding code
465       //In errors finding one of the sequences is the (almost) entire read and the length of its prefix is passed
466       int32   a_part_len  = strlen(a_part);
467       int32   b_part_len  = strlen(b_part);
468 
469       bool    match_to_end = false;
470       bool    invalid_olap = false;
471       double err_rate = ProcessAlignment(a_part_len, a_part, //olap.a_hang,
472                                          b_part_len, b_part,
473                                          G->Error_Bound[std::min(a_part_len, b_part_len)],
474                                          /*check trivial DNA*/G->checkTrivialDNA,
475                                          ped, &match_to_end, &invalid_olap);
476 
477       if (err_rate >= 0.) {
478         //if (err_rate > /*report_threshold*/ 0.) {
479         //  fprintf(stderr, "Err rate of overlap %u - %u is %f\n", olap.a_iid, olap.b_iid, err_rate);
480         //}
481 
482         const uint32 err_encoded = AS_OVS_encodeEvalue(err_rate);
483 
484         const uint32 base_encoded = G->olaps[thisOvl].evalue;
485         if (err_encoded < base_encoded)
486           nBetter++;
487         else if (err_encoded > base_encoded)
488           nWorse++;
489         else
490           nSame++;
491 
492         G->olaps[thisOvl].evalue = err_encoded;
493         //fprintf(stderr, "REDO - err rate = %f\n", AS_OVS_decodeEvalue(G->olaps[thisOvl].evalue));
494       } else {
495         //fprintf(stderr, "Err rate of overlap %u - %u failed\n", olap.a_iid, olap.b_iid);
496 
497         Failed_Alignments_Ct++;
498 
499         if (!match_to_end && invalid_olap)
500           Failed_Alignments_Both_Ct++;
501 
502         if (!match_to_end)
503           Failed_Alignments_End_Ct++;
504 
505         if (invalid_olap)
506           Failed_Alignments_Length_Ct++;
507 
508       #if 0
509         //  I can't find any patterns in these errors.  I thought that it was caused by the corrections, but I
510         //  found a case where no corrections were made and the alignment still failed.  Perhaps it is differences
511         //  in the alignment code (the forward vs reverse prefix distance in overlapper vs only the forward here)?
512 
513         fprintf(stderr, "Redo_Olaps()--\n");
514         fprintf(stderr, "Redo_Olaps()--\n");
515         fprintf(stderr, "Redo_Olaps()--  Bad alignment  errors %d  a_end %d  b_end %d  match_to_end %d  olapLen %d\n",
516                 errors, a_end, b_end, match_to_end, olapLen);
517         fprintf(stderr, "Redo_Olaps()--  Overlap        a_hang %d b_hang %d innie %d\n",
518                 olap.a_hang, olap.b_hang, olap.innie);
519         fprintf(stderr, "Redo_Olaps()--  Reads          a_id %u a_length %d b_id %u b_length %d\n",
520                 G->olaps[thisOvl].a_iid,
521                 G->reads[ G->olaps[thisOvl].a_iid ].basesLen,
522                 G->olaps[thisOvl].b_iid,
523                 G->reads[ G->olaps[thisOvl].b_iid ].basesLen);
524         fprintf(stderr, "Redo_Olaps()--  A %s\n", a_part);
525         fprintf(stderr, "Redo_Olaps()--  B %s\n", b_part);
526 
527         Display_Alignment(a_part, a_part_len, b_part, b_part_len, ped->delta, ped->deltaLen);
528 
529         fprintf(stderr, "\n");
530       #endif
531       }
532     }
533   }
534 
535   fprintf(stderr, "\n");
536 
537   delete    ped;
538   delete [] radj;
539   delete [] fadj;
540   delete [] rseq;
541   delete [] fseq;
542   delete    Cfile;
543 
544   fprintf(stderr, "--  Release bases, adjusts and reads.\n");
545 
546   delete [] G->bases;     G->bases   = NULL;
547   delete [] G->adjusts;   G->adjusts = NULL;
548   delete [] G->reads;     G->reads   = NULL;
549 
550   fprintf(stderr, "Olaps Fwd " F_U64 "\n", olapsFwd);
551   fprintf(stderr, "Olaps Rev " F_U64 "\n", olapsRev);
552 
553   fprintf(stderr, "Total:  " F_U64 "\n", Total_Alignments_Ct);
554   fprintf(stderr, "Failed: " F_U64 " (both)\n", Failed_Alignments_Both_Ct);
555   fprintf(stderr, "Failed: " F_U64 " (either)\n", Failed_Alignments_Ct);
556   fprintf(stderr, "Failed: " F_U64 " (match to end)\n", Failed_Alignments_End_Ct);
557   fprintf(stderr, "Failed: " F_U64 " (negative length)\n", Failed_Alignments_Length_Ct);
558 
559   fprintf(stderr, "Changed " F_U64 " overlaps.\n", nBetter + nWorse + nSame);
560   fprintf(stderr, "Better: " F_U64 " overlaps.\n", nBetter);
561   fprintf(stderr, "Worse:  " F_U64 " overlaps.\n", nWorse);
562   fprintf(stderr, "Same:   " F_U64 " overlaps.\n", nSame);
563 }
564