1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include  "correctOverlaps.H"
19 
20 
21 static
22 void
Compute_Delta(pedWorkArea_t * WA,int32 e,int32 d,int32 row)23 Compute_Delta(pedWorkArea_t   *WA,
24               int32            e,
25               int32            d,
26               int32            row) {
27   int32  last     = row;
28   int32  stackLen = 0;
29 
30   for (int32 k=e; k>0; k--) {
31     int32  from = d;
32     int32  max  = 1 + WA->Edit_Array_Lazy[k-1][d];
33 
34     int32  j = WA->Edit_Array_Lazy[k-1][d-1];
35 
36     if (j > max) {
37       from = d-1;
38       max = j;
39     }
40 
41     j = 1 + WA->Edit_Array_Lazy[k-1][d+1];
42 
43     if (j > max) {
44       from = d+1;
45       max = j;
46     }
47 
48     if (from == d-1) {
49       WA->deltaStack[stackLen++] = max - last - 1;
50       d--;
51       last = WA->Edit_Array_Lazy[k-1][from];
52     }
53 
54     else if (from == d+1) {
55       WA->deltaStack[stackLen++] = last - (max - 1);
56       d++;
57       last = WA->Edit_Array_Lazy[k-1][from];
58     }
59   }
60 
61   WA->deltaStack[stackLen++] = last + 1;
62 
63   for (int32 k=0, i=stackLen-1; i>0; i--)
64     WA->delta[k++] = abs(WA->deltaStack[i]) * Sign(WA->deltaStack[i-1]);
65 
66   WA->deltaLen = stackLen - 1;
67 }
68 
69 
70 
71 
72 
73 //  Allocate another block of 64mb for edits
74 
75 //  Needs to be at least:
76 //       52,432 to handle 40% error at  64k overlap
77 //      104,860 to handle 80% error at  64k overlap
78 //      209,718 to handle 40% error at 256k overlap
79 //      419,434 to handle 80% error at 256k overlap
80 //    3,355,446 to handle 40% error at   4m overlap
81 //    6,710,890 to handle 80% error at   4m overlap
82 //  Bigger means we can assign more than one Edit_Array[] in one allocation.
83 
84 uint32  EDIT_SPACE_SIZE  = 16 * 1024 * 1024;
85 
86 static
87 void
Allocate_More_Edit_Space(pedWorkArea_t * WA)88 Allocate_More_Edit_Space(pedWorkArea_t *WA) {
89 
90   //  Determine the last allocated block, and the last assigned block
91 
92   int32  b = 0;                 //  Last edit array assigned
93   int32  e = 0;                 //  Last edit array assigned more space
94   int32  a = WA->alloc.size();  //  First unallocated block
95 
96   while (WA->Edit_Array_Lazy[b] != NULL)
97     b++;
98 
99   //  Fill in the edit space array.  Well, not quite yet.  First, decide the minimum size.
100   //
101   //  Element [0] can access from [-2] to [2] = 5 elements.
102   //  Element [1] can access from [-3] to [3] = 7 elements.
103   //
104   //  Element [e] can access from [-2-e] to [2+e] = 5 + e * 2 elements
105   //
106   //  So, our offset for this new block needs to put [e][0] at offset...
107 
108   int32 Offset = 2 + b;
109   int32 Del    = 6 + b * 2;
110   int32 Size   = EDIT_SPACE_SIZE;
111 
112   while (Size < Offset + Del)
113     Size *= 2;
114 
115   //  Allocate another block
116 
117   int32  *alloc = new int32 [Size];
118 
119   WA->alloc.push_back(alloc);
120 
121   //  And, now, fill in the edit space array.
122 
123   e = b;
124 
125   while ((Offset + Del < Size) &&
126          (e < WA->Edit_Array_Max)) {
127     WA->Edit_Array_Lazy[e++] = alloc + Offset;
128 
129     Offset += Del;
130     Del    += 2;
131   }
132 
133   if (e == b)
134     fprintf(stderr, "Allocate_More_Edit_Space()-- ERROR: couldn't allocate enough space for even one more entry!  e=%d\n", e);
135   assert(e != b);
136 
137   fprintf(stderr, "--Allocate %d MB for edit array work space %d (positions %u-%u)\n", Size >> 20, a, b, e-1);
138 }
139 
140 
141 
142 
143 
144 //  Return the minimum number of changes (inserts, deletes, replacements)
145 //  needed to match string  A[0 .. (m-1)]  with a prefix of string
146 //   T[0 .. (n-1)]  if it's not more than  Error_Limit .
147 //
148 //  Put delta description of alignment in  WA->delta  and set
149 //  WA->deltaLen  to the number of entries there if it's a complete
150 //  match.
151 //  Set  A_End  and  T_End  to the rightmost positions where the
152 //  alignment ended in  A  and  T , respectively.
153 //  Set  Match_To_End  true if the match extended to the end
154 //  of at least one string; otherwise, set it false to indicate
155 //  a branch point.
156 
157 int32
Prefix_Edit_Dist(const char * A,int32 m,const char * T,int32 n,int32 Error_Limit,int32 & A_End,int32 & T_End,bool & Match_To_End,pedWorkArea_t * WA)158 Prefix_Edit_Dist(const char    *A,  int32 m,
159                  const char    *T,  int32 n,
160                  int32    Error_Limit,
161                  int32   &A_End,
162                  int32   &T_End,
163                  bool    &Match_To_End,
164                  pedWorkArea_t *WA) {
165 
166   //assert (m <= n);
167 
168   int32 Best_d  = 0;
169   int32 Best_e  = 0;
170   int32 Longest = 0;
171 
172   WA->deltaLen = 0;
173 
174   int32 shorter = std::min(m, n);
175 
176   int32 Row = 0;
177   while ((Row < shorter) && (A[Row] == T[Row]))
178     Row++;
179 
180   //fprintf(stderr, "Row=%d matches at the start\n", Row);
181 
182   if (WA->Edit_Array_Lazy[0] == NULL)
183     Allocate_More_Edit_Space(WA);
184 
185   WA->Edit_Array_Lazy[0][0] = Row;
186 
187   // Exact match?
188   if (Row == shorter) {
189     A_End        = Row;
190     T_End        = Row;
191     Match_To_End = true;
192     return(0);
193   }
194 
195   int32  Left = 0;
196   int32  Right = 0;
197   double Max_Score = 0.0;
198   int32  Max_Score_Len = 0;
199   int32  Max_Score_Best_d = 0;
200   int32  Max_Score_Best_e = 0;
201 
202   int32 e;
203   for (e = 1; e <= Error_Limit; e++) {
204     if (WA->Edit_Array_Lazy[e] == NULL)
205       Allocate_More_Edit_Space(WA);
206 
207     Left  = std::max(Left  - 1, -e);
208     Right = std::min(Right + 1,  e);
209 
210     WA->Edit_Array_Lazy[e-1][Left]    = -2;
211     WA->Edit_Array_Lazy[e-1][Left-1]  = -2;
212     WA->Edit_Array_Lazy[e-1][Right]   = -2;
213     WA->Edit_Array_Lazy[e-1][Right+1] = -2;
214 
215     for (int32 d=Left; d<=Right; d++) {
216       Row = 1 + WA->Edit_Array_Lazy[e-1][d];
217       Row = std::max(Row, WA->Edit_Array_Lazy[e-1][d-1]);
218       Row = std::max(Row, WA->Edit_Array_Lazy[e-1][d+1] + 1);
219 
220       while ((Row < m) && (Row + d < n) && (A[Row] == T[Row + d]))
221         Row++;
222 
223       //fprintf(stderr, "Row=%d matches at error e=%d\n", Row, e);
224 
225       assert(e < WA->Edit_Array_Max);
226 
227       WA->Edit_Array_Lazy[e][d] = Row;
228 
229       if (Row == m || Row + d == n) {
230         //fprintf(stderr, "Hit end Row=%d m=%d   Row+d=%d n=%d\n", Row, m, Row+d, n);
231 
232         // Force last error to be mismatch rather than insertion
233         if ((Row == m) &&
234             (1 + WA->Edit_Array_Lazy[e-1][d+1] == WA->Edit_Array_Lazy[e][d]) &&
235             (d < Right)) {
236           d++;
237           WA->Edit_Array_Lazy[e][d] = WA->Edit_Array_Lazy[e][d-1];
238         }
239 
240         A_End        = Row;           // One past last align position
241         T_End        = Row + d;
242         Match_To_End = true;
243 
244         Compute_Delta(WA, e, d, Row);
245 
246         return(e);
247       }
248     }
249 
250     while (Left <= Right && Left < 0
251            && WA->Edit_Array_Lazy[e][Left] < WA->G->Edit_Match_Limit[e])
252       Left++;
253 
254     if (Left >= 0)
255       while (Left <= Right
256              && WA->Edit_Array_Lazy[e][Left] + Left < WA->G->Edit_Match_Limit[e])
257         Left++;
258 
259     if (Left > Right)
260       break;
261 
262     while (Right > 0
263            && WA->Edit_Array_Lazy[e][Right] + Right < WA->G->Edit_Match_Limit[e])
264       Right--;
265 
266     if (Right <= 0)
267       while (WA->Edit_Array_Lazy[e][Right] < WA->G->Edit_Match_Limit[e])
268         Right--;
269 
270     assert (Left <= Right);
271 
272     for (int32 d=Left;  d <= Right;  d++)
273       if (WA->Edit_Array_Lazy[e][d] > Longest) {
274         Best_d  = d;
275         Best_e  = e;
276         Longest = WA->Edit_Array_Lazy[e][d];
277       }
278 
279     int32  Score = Longest * BRANCH_PT_MATCH_VALUE - e;
280 
281     // Assumes  BRANCH_PT_MATCH_VALUE - BRANCH_PT_ERROR_VALUE == 1.0
282 
283     //  findErrors also included a second test; overlapper doesn't.
284 
285     if (Score > Max_Score) {
286       Max_Score        = Score;
287       Max_Score_Len    = Longest;
288       Max_Score_Best_d = Best_d;
289       Max_Score_Best_e = Best_e;
290     }
291   }
292 
293   //  findErrors does this call.  Overlapper doesn't.
294   Compute_Delta(WA, Max_Score_Best_e, Max_Score_Best_d, Max_Score_Len);
295 
296   A_End        = Max_Score_Len;
297   T_End        = Max_Score_Len + Max_Score_Best_d;
298   Match_To_End = false;
299 
300   //assert(e == Max_Score_Best_e);
301   //fprintf(stderr, "e=%d Max_Score_Best_e=%d", e, Max_Score_Best_e);
302   return e;
303 }
304 
305