1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "correctOverlaps.H"
19
20
21 static
22 void
Compute_Delta(pedWorkArea_t * WA,int32 e,int32 d,int32 row)23 Compute_Delta(pedWorkArea_t *WA,
24 int32 e,
25 int32 d,
26 int32 row) {
27 int32 last = row;
28 int32 stackLen = 0;
29
30 for (int32 k=e; k>0; k--) {
31 int32 from = d;
32 int32 max = 1 + WA->Edit_Array_Lazy[k-1][d];
33
34 int32 j = WA->Edit_Array_Lazy[k-1][d-1];
35
36 if (j > max) {
37 from = d-1;
38 max = j;
39 }
40
41 j = 1 + WA->Edit_Array_Lazy[k-1][d+1];
42
43 if (j > max) {
44 from = d+1;
45 max = j;
46 }
47
48 if (from == d-1) {
49 WA->deltaStack[stackLen++] = max - last - 1;
50 d--;
51 last = WA->Edit_Array_Lazy[k-1][from];
52 }
53
54 else if (from == d+1) {
55 WA->deltaStack[stackLen++] = last - (max - 1);
56 d++;
57 last = WA->Edit_Array_Lazy[k-1][from];
58 }
59 }
60
61 WA->deltaStack[stackLen++] = last + 1;
62
63 for (int32 k=0, i=stackLen-1; i>0; i--)
64 WA->delta[k++] = abs(WA->deltaStack[i]) * Sign(WA->deltaStack[i-1]);
65
66 WA->deltaLen = stackLen - 1;
67 }
68
69
70
71
72
73 // Allocate another block of 64mb for edits
74
75 // Needs to be at least:
76 // 52,432 to handle 40% error at 64k overlap
77 // 104,860 to handle 80% error at 64k overlap
78 // 209,718 to handle 40% error at 256k overlap
79 // 419,434 to handle 80% error at 256k overlap
80 // 3,355,446 to handle 40% error at 4m overlap
81 // 6,710,890 to handle 80% error at 4m overlap
82 // Bigger means we can assign more than one Edit_Array[] in one allocation.
83
84 uint32 EDIT_SPACE_SIZE = 16 * 1024 * 1024;
85
86 static
87 void
Allocate_More_Edit_Space(pedWorkArea_t * WA)88 Allocate_More_Edit_Space(pedWorkArea_t *WA) {
89
90 // Determine the last allocated block, and the last assigned block
91
92 int32 b = 0; // Last edit array assigned
93 int32 e = 0; // Last edit array assigned more space
94 int32 a = WA->alloc.size(); // First unallocated block
95
96 while (WA->Edit_Array_Lazy[b] != NULL)
97 b++;
98
99 // Fill in the edit space array. Well, not quite yet. First, decide the minimum size.
100 //
101 // Element [0] can access from [-2] to [2] = 5 elements.
102 // Element [1] can access from [-3] to [3] = 7 elements.
103 //
104 // Element [e] can access from [-2-e] to [2+e] = 5 + e * 2 elements
105 //
106 // So, our offset for this new block needs to put [e][0] at offset...
107
108 int32 Offset = 2 + b;
109 int32 Del = 6 + b * 2;
110 int32 Size = EDIT_SPACE_SIZE;
111
112 while (Size < Offset + Del)
113 Size *= 2;
114
115 // Allocate another block
116
117 int32 *alloc = new int32 [Size];
118
119 WA->alloc.push_back(alloc);
120
121 // And, now, fill in the edit space array.
122
123 e = b;
124
125 while ((Offset + Del < Size) &&
126 (e < WA->Edit_Array_Max)) {
127 WA->Edit_Array_Lazy[e++] = alloc + Offset;
128
129 Offset += Del;
130 Del += 2;
131 }
132
133 if (e == b)
134 fprintf(stderr, "Allocate_More_Edit_Space()-- ERROR: couldn't allocate enough space for even one more entry! e=%d\n", e);
135 assert(e != b);
136
137 fprintf(stderr, "--Allocate %d MB for edit array work space %d (positions %u-%u)\n", Size >> 20, a, b, e-1);
138 }
139
140
141
142
143
144 // Return the minimum number of changes (inserts, deletes, replacements)
145 // needed to match string A[0 .. (m-1)] with a prefix of string
146 // T[0 .. (n-1)] if it's not more than Error_Limit .
147 //
148 // Put delta description of alignment in WA->delta and set
149 // WA->deltaLen to the number of entries there if it's a complete
150 // match.
151 // Set A_End and T_End to the rightmost positions where the
152 // alignment ended in A and T , respectively.
153 // Set Match_To_End true if the match extended to the end
154 // of at least one string; otherwise, set it false to indicate
155 // a branch point.
156
157 int32
Prefix_Edit_Dist(const char * A,int32 m,const char * T,int32 n,int32 Error_Limit,int32 & A_End,int32 & T_End,bool & Match_To_End,pedWorkArea_t * WA)158 Prefix_Edit_Dist(const char *A, int32 m,
159 const char *T, int32 n,
160 int32 Error_Limit,
161 int32 &A_End,
162 int32 &T_End,
163 bool &Match_To_End,
164 pedWorkArea_t *WA) {
165
166 //assert (m <= n);
167
168 int32 Best_d = 0;
169 int32 Best_e = 0;
170 int32 Longest = 0;
171
172 WA->deltaLen = 0;
173
174 int32 shorter = std::min(m, n);
175
176 int32 Row = 0;
177 while ((Row < shorter) && (A[Row] == T[Row]))
178 Row++;
179
180 //fprintf(stderr, "Row=%d matches at the start\n", Row);
181
182 if (WA->Edit_Array_Lazy[0] == NULL)
183 Allocate_More_Edit_Space(WA);
184
185 WA->Edit_Array_Lazy[0][0] = Row;
186
187 // Exact match?
188 if (Row == shorter) {
189 A_End = Row;
190 T_End = Row;
191 Match_To_End = true;
192 return(0);
193 }
194
195 int32 Left = 0;
196 int32 Right = 0;
197 double Max_Score = 0.0;
198 int32 Max_Score_Len = 0;
199 int32 Max_Score_Best_d = 0;
200 int32 Max_Score_Best_e = 0;
201
202 int32 e;
203 for (e = 1; e <= Error_Limit; e++) {
204 if (WA->Edit_Array_Lazy[e] == NULL)
205 Allocate_More_Edit_Space(WA);
206
207 Left = std::max(Left - 1, -e);
208 Right = std::min(Right + 1, e);
209
210 WA->Edit_Array_Lazy[e-1][Left] = -2;
211 WA->Edit_Array_Lazy[e-1][Left-1] = -2;
212 WA->Edit_Array_Lazy[e-1][Right] = -2;
213 WA->Edit_Array_Lazy[e-1][Right+1] = -2;
214
215 for (int32 d=Left; d<=Right; d++) {
216 Row = 1 + WA->Edit_Array_Lazy[e-1][d];
217 Row = std::max(Row, WA->Edit_Array_Lazy[e-1][d-1]);
218 Row = std::max(Row, WA->Edit_Array_Lazy[e-1][d+1] + 1);
219
220 while ((Row < m) && (Row + d < n) && (A[Row] == T[Row + d]))
221 Row++;
222
223 //fprintf(stderr, "Row=%d matches at error e=%d\n", Row, e);
224
225 assert(e < WA->Edit_Array_Max);
226
227 WA->Edit_Array_Lazy[e][d] = Row;
228
229 if (Row == m || Row + d == n) {
230 //fprintf(stderr, "Hit end Row=%d m=%d Row+d=%d n=%d\n", Row, m, Row+d, n);
231
232 // Force last error to be mismatch rather than insertion
233 if ((Row == m) &&
234 (1 + WA->Edit_Array_Lazy[e-1][d+1] == WA->Edit_Array_Lazy[e][d]) &&
235 (d < Right)) {
236 d++;
237 WA->Edit_Array_Lazy[e][d] = WA->Edit_Array_Lazy[e][d-1];
238 }
239
240 A_End = Row; // One past last align position
241 T_End = Row + d;
242 Match_To_End = true;
243
244 Compute_Delta(WA, e, d, Row);
245
246 return(e);
247 }
248 }
249
250 while (Left <= Right && Left < 0
251 && WA->Edit_Array_Lazy[e][Left] < WA->G->Edit_Match_Limit[e])
252 Left++;
253
254 if (Left >= 0)
255 while (Left <= Right
256 && WA->Edit_Array_Lazy[e][Left] + Left < WA->G->Edit_Match_Limit[e])
257 Left++;
258
259 if (Left > Right)
260 break;
261
262 while (Right > 0
263 && WA->Edit_Array_Lazy[e][Right] + Right < WA->G->Edit_Match_Limit[e])
264 Right--;
265
266 if (Right <= 0)
267 while (WA->Edit_Array_Lazy[e][Right] < WA->G->Edit_Match_Limit[e])
268 Right--;
269
270 assert (Left <= Right);
271
272 for (int32 d=Left; d <= Right; d++)
273 if (WA->Edit_Array_Lazy[e][d] > Longest) {
274 Best_d = d;
275 Best_e = e;
276 Longest = WA->Edit_Array_Lazy[e][d];
277 }
278
279 int32 Score = Longest * BRANCH_PT_MATCH_VALUE - e;
280
281 // Assumes BRANCH_PT_MATCH_VALUE - BRANCH_PT_ERROR_VALUE == 1.0
282
283 // findErrors also included a second test; overlapper doesn't.
284
285 if (Score > Max_Score) {
286 Max_Score = Score;
287 Max_Score_Len = Longest;
288 Max_Score_Best_d = Best_d;
289 Max_Score_Best_e = Best_e;
290 }
291 }
292
293 // findErrors does this call. Overlapper doesn't.
294 Compute_Delta(WA, Max_Score_Best_e, Max_Score_Best_d, Max_Score_Len);
295
296 A_End = Max_Score_Len;
297 T_End = Max_Score_Len + Max_Score_Best_d;
298 Match_To_End = false;
299
300 //assert(e == Max_Score_Best_e);
301 //fprintf(stderr, "e=%d Max_Score_Best_e=%d", e, Max_Score_Best_e);
302 return e;
303 }
304
305