1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "findErrors.H"
19 #include "computeDiff.H"
20 
21 #include "sequence.H"
22 #include <tuple>
23 
24 #define  DISPLAY_WIDTH   250
25 
26 //  Show (to  stdout ) the alignment encoded in  delta [0 .. (deltaLen - 1)]
27 //  between strings  a [0 .. (a_len - 1)]  and  b [0 .. (b_len - 1)] .
28 
29 static
30 void
Display_Alignment(char * a,int32 aLen,char * b,int32 bLen,int32 * delta,int32 deltaLen)31 Display_Alignment(char    *a,   int32 aLen,
32                   char    *b,   int32 bLen,
33                   int32   *delta,
34                   int32    deltaLen) {
35 
36   int32  i = 0;
37   int32  j = 0;
38 
39   char  *top    = new char [32 * 1024];
40   int32  topLen = 0;
41 
42   char  *bot    = new char [32 * 1024];
43   int32  botLen = 0;
44 
45   for (int32 k = 0;  k < deltaLen;  k++) {
46     for (int32 m = 1;  m < abs(delta[k]);  m++) {
47       top[topLen++] = a[i++];
48       j++;
49     }
50 
51     if (delta[k] < 0) {
52       top[topLen++] = '-';
53       j++;
54     } else {
55       top[topLen++] = a[i++];
56     }
57   }
58 
59   while (i < aLen && j < bLen) {
60     top[topLen++] = a[i++];
61     j++;
62   }
63   top[topLen] = '\0';
64 
65 
66   i = j = 0;
67 
68   for (int32 k = 0;  k < deltaLen;  k++) {
69     for (int32 m = 1;  m < abs(delta[k]);  m++) {
70       bot[botLen++] = b[j++];
71       i++;
72     }
73 
74     if (delta[k] > 0) {
75       bot[botLen++] = '-';
76       i++;
77     } else {
78       bot[botLen++] = b[j++];
79     }
80   }
81 
82   while (j < bLen && i < aLen) {
83     bot[botLen++] = b[j++];
84     i++;
85   }
86   bot[botLen] = '\0';
87 
88 
89   for (i = 0;  i < topLen || i < botLen;  i += DISPLAY_WIDTH) {
90     putc('\n', stderr);
91 
92     fprintf(stderr, "A: ");
93     for (j = 0;  j < DISPLAY_WIDTH && i + j < topLen;  j++)
94       putc(top[i + j], stderr);
95     putc('\n', stderr);
96 
97     fprintf(stderr, "B: ");
98     for (j = 0;  j < DISPLAY_WIDTH && i + j < botLen;  j++)
99       putc(bot[i + j], stderr);
100     putc('\n', stderr);
101 
102     fprintf(stderr, "   ");
103     for (j = 0;  j < DISPLAY_WIDTH && i + j < botLen && i + j < topLen; j++)
104       if (top[i + j] != ' ' && bot[i + j] != ' ' && top[i + j] != bot[i + j])
105         putc('^', stderr);
106       else
107         putc(' ', stderr);
108     putc('\n', stderr);
109   }
110 
111   delete [] top;
112   delete [] bot;
113 }
114 
115 
116 int32
117 Prefix_Edit_Dist(char   *A, int m,
118                  char   *T, int n,
119                  int     Error_Limit,
120                  int32  &A_End,
121                  int32  &T_End,
122                  bool   &Match_To_End,
123                  pedWorkArea_t * wa);
124 
125 void
126 Analyze_Alignment(Thread_Work_Area_t *wa,
127                   char   *a_part, int32 a_len, int32 a_offset,
128                   char   *b_part, int32 b_len,
129                   int32   sub);
130 
131 
132 //  Find the alignment referred to in  olap , where the  a_iid
133 //  fragment is in  Frag  and the  b_iid  sequence is in  b_seq .
134 //  Use the alignment to increment the appropriate vote fields
135 //  for the a fragment.   shredded  is true iff the b fragment
136 //  is from shredded data, in which case the overlap will be
137 //  ignored if the a fragment is also shredded.
138 //  rev_seq  is a buffer to hold the reverse complement of  b_seq
139 //  if needed.  (* rev_id) is used to keep track of whether
140 //  rev_seq  has been created yet.  (* wa) is the work-area
141 //  containing space for the process to use in case of multi-threading.
142 
143 
144 void
Process_Olap(Olap_Info_t * olap,char * b_seq,bool shredded,Thread_Work_Area_t * wa)145 Process_Olap(Olap_Info_t        *olap,
146              char               *b_seq,
147              bool                shredded,
148              Thread_Work_Area_t *wa) {
149 
150 #if 0
151   fprintf(stderr, "Process_Olap:  %8d %8d %5ld %5ld  %c\n",
152           olap->a_iid, olap->b_iid,
153           olap->a_hang, olap->b_hang,
154           olap->innie == true ? 'I' : 'N');
155 
156   //if (olap->a_iid != 39861 && olap->b_iid != 39861 && olap->a_iid != 2283 && olap->b_iid != 2283)
157   //  return;
158 #endif
159 
160   int32  ri = olap->a_iid - wa->G->bgnID;
161 
162   if ((shredded == true) && (wa->G->reads[ri].shredded == true)) {
163     //fprintf(stderr, "%8d %8d shredded\n", olap->a_iid, olap->b_iid);
164     return;
165   }
166   //fprintf(stderr, "%8d %8d not shredded\n", olap->a_iid, olap->b_iid);
167 
168   char  *a_part   = wa->G->reads[ri].sequence;
169   int32  a_offset = 0;
170 
171   char  *b_part   = (olap->normal == true) ? b_seq : wa->rev_seq;
172   int32  b_offset = 0;
173 
174   //  If innie, reverse-complement the B sequence.
175 
176   if ((olap->innie == true) && (wa->rev_id != olap->b_iid)) {
177     strcpy(b_part, b_seq);
178     reverseComplementSequence(b_part, 0);
179     wa->rev_id = olap->b_iid;
180   }
181 
182   //  Adjust for hangs.
183 
184   if (olap->a_hang > 0) {
185     a_offset  = olap->a_hang;
186     a_part   += a_offset;
187   }
188 
189   if (olap->a_hang < 0) {
190     b_offset  = -olap->a_hang;
191     b_part   +=  b_offset;
192   }
193 
194   //  Count degree - just how many times we cover the end of the read?
195 
196   if ((olap->a_hang <= 0) && (wa->G->reads[ri].left_degree < MAX_DEGREE))
197     wa->G->reads[ri].left_degree++;
198 
199   if ((olap->b_hang >= 0) && (wa->G->reads[ri].right_degree < MAX_DEGREE))
200     wa->G->reads[ri].right_degree++;
201 
202   // Get the alignment
203 
204   uint32   a_part_len = strlen(a_part);
205   uint32   b_part_len = strlen(b_part);
206 
207   int32    a_end = 0;
208   int32    b_end = 0;
209 
210   bool     match_to_end = false;
211 
212   //fprintf(stderr, "A: offset %d length %d\n", a_offset, a_part_len);
213   //fprintf(stderr, "B: offset %d length %d\n", b_offset, b_part_len);
214 
215   auto *ped = &wa->ped;
216 
217   int32    all_errors = Prefix_Edit_Dist(a_part, a_part_len,
218                                          b_part, b_part_len,
219                                          wa->G->Error_Bound[std::min(a_part_len, b_part_len)],
220                                          a_end,
221                                          b_end,
222                                          match_to_end,
223                                          ped);
224 
225   if ((a_end < 0) || (a_end > a_part_len) || (b_end < 0) || (b_end > b_part_len)) {
226     fprintf (stderr, "ERROR:  Bad edit distance.\n");
227     fprintf (stderr, "  errors = %d  a_end = %d  b_end = %d\n", all_errors, a_end, b_end);
228     fprintf (stderr, "  a_part_len = %d  b_part_len = %d\n", a_part_len, b_part_len);
229     fprintf (stderr, "  a_iid = %d  b_iid = %d  match_to_end = %c\n", olap->a_iid, olap->b_iid, match_to_end ? 'T' : 'F');
230   }
231   assert(a_end >= 0);
232   assert(a_end <= a_part_len);
233   assert(b_end >= 0);
234   assert(b_end <= b_part_len);
235 
236   //fprintf(stderr, "  all_errors = %d  delta_len = %d\n", all_errors, ped.deltaLen);
237   //fprintf(stderr, "  a_align = %d/%d  b_align = %d/%d\n", a_end, a_part_len, b_end, b_part_len);
238   //Display_Alignment(a_part, a_end, b_part, b_end, ped.delta, ped.deltaLen);//, wa->G->reads[ri].clear_len - a_offset);
239 
240   if ((match_to_end == false) && (a_end + a_offset >= wa->G->reads[ri].clear_len - 1)) {
241     match_to_end = true;
242   }
243 
244   //TODO Adjusting the extremities?
245   //all_errors -= TrimStartingIndels(a_part, a_end, ped->delta, ped->deltaLen, 1);
246   //all_errors -= TrimStartingIndels(b_part, b_end, ped->delta, ped->deltaLen, -1);
247 
248   //fprintf(stderr, "Showing alignment\n");
249   //Display_Alignment(a_part, a_end, b_part, b_end, ped->delta, ped->deltaLen);
250 
251   if (match_to_end && a_end > 0 && b_end > 0) {
252     int32 events;
253     int32 alignment_len;
254 
255     //fprintf(stderr, "Computing errors\n");
256     //fprintf(stderr, "Checking for trivial DNA regions: %d\n", check_trivial_dna);
257     std::tie(events, alignment_len) = ComputeErrors(a_part, b_part, ped->deltaLen, ped->delta,
258                                                     a_end, b_end, wa->G->checkTrivialDNA);
259 
260     assert(events >= 0 && alignment_len > 0);
261     //fprintf(stderr, "Old errors %d new events %d\n", all_errors, events);
262 
263     //  We used to assert that either checkTrivialDNA was true or that
264     //  all_errors == events.  This was to ensure that no alignment errors
265     //  were rejected by ComputeErrors() -- but we're now ignoring alignment errors
266     //  near the end of a read in all cases, so all_errors != events.
267     //
268     //assert(wa->G->checkTrivialDNA || all_errors == events);
269 
270     if (events <= (int32) ceil(alignment_len * wa->G->maskedErrorRate)) {
271       wa->passedOlaps++;
272       //fprintf(stderr, "%8d %8d passed overlap\n", olap->a_iid, olap->b_iid);
273       Analyze_Alignment(wa,
274                         a_part, a_end, a_offset,
275                         b_part, b_end,
276                         ri);
277       return;
278     }
279   }
280 
281   wa->failedOlaps++;
282   //fprintf(stderr, "%8d %8d failed overlap\n", olap->a_iid, olap->b_iid);
283   //fprintf(stderr, "%8d %8d match to end %c\n", olap->a_iid, olap->b_iid, match_to_end ? 'T' : 'F');
284   //fprintf(stderr, "%8d %8d too many errors %c\n", olap->a_iid, olap->b_iid, (errors > wa->G->Error_Bound[olap_len]) ? 'T' : 'F');
285 }
286