1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "overlapInCore.H"
19
20 // Add information for the match in ref to the list
21 // starting at subscript (* start). The matching window begins
22 // offset bytes from the beginning of this string.
23
24 static
25 void
Add_Match(String_Ref_t ref,int * start,int offset,int * consistent,Work_Area_t * WA)26 Add_Match(String_Ref_t ref,
27 int * start,
28 int offset,
29 int * consistent,
30 Work_Area_t * WA) {
31 int * p, save;
32 int diag = 0, new_diag, expected_start = 0, num_checked = 0;
33 int move_to_front = false;
34
35 new_diag = getStringRefOffset(ref) - offset;
36
37 for (p = start; (* p) != 0; p = & (WA->Match_Node_Space [(* p)].Next)) {
38 expected_start = WA->Match_Node_Space [(* p)].Start + WA->Match_Node_Space [(* p)].Len - G.Kmer_Len + 1 + HASH_KMER_SKIP;
39
40 diag = WA->Match_Node_Space [(* p)].Offset - WA->Match_Node_Space [(* p)].Start;
41
42 if (expected_start < offset)
43 break;
44
45 if (expected_start == offset) {
46 if (new_diag == diag) {
47 WA->Match_Node_Space [(* p)].Len += 1 + HASH_KMER_SKIP;
48 if (move_to_front) {
49 save = (* p);
50 (* p) = WA->Match_Node_Space [(* p)].Next;
51 WA->Match_Node_Space [save].Next = (* start);
52 (* start) = save;
53 }
54 return;
55 } else
56 move_to_front = true;
57 }
58 num_checked ++;
59 }
60
61 if (WA->Next_Avail_Match_Node == WA->Match_Node_Size) {
62 int32 newSize = WA->Match_Node_Size * 2;
63 Match_Node_t *newSpace = new Match_Node_t [newSize];
64
65 memcpy(newSpace, WA->Match_Node_Space, sizeof(Match_Node_t) * WA->Match_Node_Size);
66
67 delete [] WA->Match_Node_Space;
68
69 WA->Match_Node_Size = newSize;
70 WA->Match_Node_Space = newSpace;
71 }
72
73 if ((* start) != 0
74 && (num_checked > 0
75 || abs (diag - new_diag) > 3
76 || offset < expected_start + G.Kmer_Len - 2))
77 (* consistent) = false;
78
79 save = (* start);
80 (* start) = WA->Next_Avail_Match_Node;
81 WA->Next_Avail_Match_Node ++;
82
83 WA->Match_Node_Space [(* start)].Offset = getStringRefOffset(ref);
84 WA->Match_Node_Space [(* start)].Len = G.Kmer_Len;
85 WA->Match_Node_Space [(* start)].Start = offset;
86 WA->Match_Node_Space [(* start)].Next = save;
87
88 #if 0
89 fprintf(stderr, "Add_Match()-- %3d offset %d len %d start %d next %d\n",
90 *start,
91 WA->Match_Node_Space [(* start)].Offset,
92 WA->Match_Node_Space [(* start)].Len,
93 WA->Match_Node_Space [(* start)].Start,
94 WA->Match_Node_Space [(* start)].Next);
95 #endif
96 }
97
98
99
100 // Add information for Ref and all its matches to the global hash table in String_Olap_Space. Grow
101 // the space if necessary. The matching window begins Offset bytes from the beginning of this
102 // string.
103 static
104 void
Add_Ref(String_Ref_t Ref,int Offset,Work_Area_t * WA)105 Add_Ref(String_Ref_t Ref, int Offset, Work_Area_t * WA) {
106 uint32 Prev, StrNum, Sub;
107 int consistent;
108
109 StrNum = getStringRefStringNum(Ref);
110 Sub = (StrNum ^ (StrNum >> STRING_OLAP_SHIFT)) & STRING_OLAP_MASK;
111
112 #if 0
113 fprintf(stderr, "Add_Ref()- StrNum %d Sub %d Offset %d\n", StrNum, Sub, Offset);
114 #endif
115
116 while (WA->String_Olap_Space [Sub].Full
117 && WA->String_Olap_Space [Sub].String_Num != StrNum) {
118 Prev = Sub;
119 Sub = WA->String_Olap_Space [Sub].Next;
120 if (Sub == 0) {
121 if (WA->Next_Avail_String_Olap == WA->String_Olap_Size) {
122 int32 newSize = WA->String_Olap_Size * 2;
123 String_Olap_t *newSpace = new String_Olap_t [newSize];
124
125 memcpy(newSpace, WA->String_Olap_Space, sizeof(String_Olap_t) * WA->String_Olap_Size);
126
127 delete [] WA->String_Olap_Space;
128
129 WA->String_Olap_Size = newSize;
130 WA->String_Olap_Space = newSpace;
131 }
132
133 Sub = WA->Next_Avail_String_Olap ++;
134 WA->String_Olap_Space [Prev].Next = Sub;
135 WA->String_Olap_Space [Sub].Full = false;
136 break;
137 }
138 }
139
140 if (! WA->String_Olap_Space [Sub].Full) {
141 WA->String_Olap_Space [Sub].String_Num = StrNum;
142 WA->String_Olap_Space [Sub].Match_List = 0;
143 WA->String_Olap_Space [Sub].diag_sum = 0.0;
144 WA->String_Olap_Space [Sub].diag_ct = 0;
145 WA->String_Olap_Space [Sub].diag_bgn = AS_MAX_READLEN;
146 WA->String_Olap_Space [Sub].diag_end = 0;
147 WA->String_Olap_Space [Sub].Next = 0;
148 WA->String_Olap_Space [Sub].Full = true;
149 WA->String_Olap_Space [Sub].consistent = true;
150 }
151
152 consistent = WA->String_Olap_Space [Sub].consistent;
153
154 WA->String_Olap_Space [Sub].diag_sum += (double)getStringRefOffset(Ref) - Offset;
155 WA->String_Olap_Space [Sub].diag_ct ++;
156 if (WA->String_Olap_Space [Sub].diag_bgn > Offset) WA->String_Olap_Space [Sub].diag_bgn = Offset;
157 if (WA->String_Olap_Space [Sub].diag_end < Offset) WA->String_Olap_Space [Sub].diag_end = Offset;
158 Add_Match (Ref, & (WA->String_Olap_Space [Sub].Match_List), Offset, & consistent, WA);
159
160 WA->String_Olap_Space [Sub].consistent = consistent;
161
162 return;
163 }
164
165
166
167
168 // Search for string S with hash key Key in the global
169 // Hash_Table starting at subscript Sub. Return the matching
170 // reference in the hash table if there is one, or else a reference
171 // with the Empty bit set true. Set (* Where) to the subscript in
172 // Extra_Ref_Space where the reference was found if it was found there.
173 // Set (* hi_hits) to true if hash table entry is found but is empty
174 // because it was screened out, otherwise set to false.
175 static
176 String_Ref_t
Hash_Find(uint64 Key,int64 Sub,char * S,int64 * Where,int * hi_hits)177 Hash_Find(uint64 Key, int64 Sub, char * S, int64 * Where, int * hi_hits) {
178 String_Ref_t H_Ref = 0;
179 char * T;
180 unsigned char Key_Check;
181 int64 Ct, Probe;
182 int i;
183
184 Key_Check = KEY_CHECK_FUNCTION (Key);
185 Probe = PROBE_FUNCTION (Key);
186
187 (* hi_hits) = false;
188 Ct = 0;
189 do {
190 for (i = 0; i < Hash_Table [Sub].Entry_Ct; i ++)
191 if (Hash_Table [Sub].Check [i] == Key_Check) {
192 int is_empty;
193
194 H_Ref = Hash_Table [Sub].Entry [i];
195 //fprintf(stderr, "Href = Hash_Table %u Entry %u = " F_U64 "\n", Sub, i, H_Ref);
196
197 is_empty = getStringRefEmpty(H_Ref);
198 if (! getStringRefLast(H_Ref) && ! is_empty) {
199 (* Where) = ((uint64)getStringRefStringNum(H_Ref) << OFFSET_BITS) + getStringRefOffset(H_Ref);
200 H_Ref = Extra_Ref_Space [(* Where)];
201 //fprintf(stderr, "Href = Extra_Ref_Space " F_U64 " = " F_U64 "\n", *Where, H_Ref);
202 }
203 //fprintf(stderr, "Href = " F_U64 " Get String_Start[ " F_U64 " ] + " F_U64 "\n", getStringRefStringNum(H_Ref), getStringRefOffset(H_Ref));
204 T = basesData + String_Start [getStringRefStringNum(H_Ref)] + getStringRefOffset(H_Ref);
205 if (strncmp (S, T, G.Kmer_Len) == 0) {
206 if (is_empty) {
207 setStringRefEmpty(H_Ref, TRUELY_ONE);
208 (* hi_hits) = true;
209 }
210 return H_Ref;
211 }
212 }
213 if (Hash_Table [Sub].Entry_Ct < ENTRIES_PER_BUCKET) {
214 setStringRefEmpty(H_Ref, TRUELY_ONE);
215 return H_Ref;
216 }
217 Sub = (Sub + Probe) % HASH_TABLE_SIZE;
218 } while (++ Ct < HASH_TABLE_SIZE);
219
220 setStringRefEmpty(H_Ref, TRUELY_ONE);
221 return H_Ref;
222 }
223
224
225
226
227
228
229 // Find and output all overlaps and branch points between string
230 // Frag and any fragment currently in the global hash table.
231 // Frag_Len is the length of Frag and Frag_Num is its ID number.
232 // Dir is the orientation of Frag .
233
234 void
Find_Overlaps(char Frag[],int Frag_Len,uint32 Frag_Num,Direction_t Dir,Work_Area_t * WA)235 Find_Overlaps(char Frag [], int Frag_Len, uint32 Frag_Num, Direction_t Dir, Work_Area_t * WA) {
236 String_Ref_t Ref;
237 char * P, * Window;
238 uint64 Key, Next_Key;
239 int64 Sub, Next_Sub, Where;
240 Check_Vector_t This_Check, Next_Check;
241 int Offset, Shift, Next_Shift;
242 int hi_hits;
243 int j;
244
245 memset (WA->String_Olap_Space, 0, STRING_OLAP_MODULUS * sizeof (String_Olap_t));
246 WA->Next_Avail_String_Olap = STRING_OLAP_MODULUS;
247 WA->Next_Avail_Match_Node = 1;
248
249 assert (Frag_Len >= G.Kmer_Len);
250
251 Offset = 0;
252 P = Window = Frag;
253
254 WA->left_end_screened = false;
255 WA->right_end_screened = false;
256
257 WA->A_Olaps_For_Frag = 0;
258 WA->B_Olaps_For_Frag = 0;
259
260 Key = 0;
261 for (j = 0; j < G.Kmer_Len; j ++)
262 Key |= (uint64) (Bit_Equivalent [(int) * (P ++)]) << (2 * j);
263
264 Sub = HASH_FUNCTION (Key);
265 Shift = HASH_CHECK_FUNCTION (Key);
266 Next_Key = (Key >> 2);
267 Next_Key |= ((uint64) (Bit_Equivalent [(int) * P])) << (2 * (G.Kmer_Len - 1));
268 Next_Sub = HASH_FUNCTION (Next_Key);
269 Next_Shift = HASH_CHECK_FUNCTION (Next_Key);
270 Next_Check = Hash_Check_Array [Next_Sub];
271
272 if ((Hash_Check_Array [Sub] & (((Check_Vector_t) 1) << Shift)) != 0) {
273 Ref = Hash_Find (Key, Sub, Window, & Where, & hi_hits);
274 if (hi_hits) {
275 WA->left_end_screened = true;
276 }
277 if (! getStringRefEmpty(Ref)) {
278 while (true) {
279 if (Frag_Num < getStringRefStringNum(Ref) + Hash_String_Num_Offset)
280 Add_Ref (Ref, Offset, WA);
281
282 if (getStringRefLast(Ref))
283 break;
284 else {
285 Ref = Extra_Ref_Space [++ Where];
286 assert (! getStringRefEmpty(Ref));
287 }
288 }
289 }
290 }
291
292 while ((* P) != '\0') {
293 Window ++;
294 Offset ++;
295
296 Key = Next_Key;
297 Shift = Next_Shift;
298 Sub = Next_Sub;
299 This_Check = Next_Check;
300 P ++;
301 Next_Key = (Key >> 2);
302 Next_Key |= ((uint64)
303 (Bit_Equivalent [(int) * P])) << (2 * (G.Kmer_Len - 1));
304 Next_Sub = HASH_FUNCTION (Next_Key);
305 Next_Shift = HASH_CHECK_FUNCTION (Next_Key);
306 Next_Check = Hash_Check_Array [Next_Sub];
307
308 if ((This_Check & (((Check_Vector_t) 1) << Shift)) != 0) {
309 Ref = Hash_Find (Key, Sub, Window, & Where, & hi_hits);
310 if (hi_hits) {
311 if (Offset < HOPELESS_MATCH) {
312 WA->left_end_screened = true;
313 }
314 if (Frag_Len - Offset - G.Kmer_Len + 1 < HOPELESS_MATCH) {
315 WA->right_end_screened = true;
316 }
317 }
318 if (! getStringRefEmpty(Ref)) {
319 while (true) {
320 if (Frag_Num < getStringRefStringNum(Ref) + Hash_String_Num_Offset)
321 Add_Ref (Ref, Offset, WA);
322
323 if (getStringRefLast(Ref))
324 break;
325 else {
326 Ref = Extra_Ref_Space [++ Where];
327 assert (! getStringRefEmpty(Ref));
328 }
329 }
330 }
331 }
332 }
333
334
335 Process_String_Olaps (Frag, Frag_Len, Frag_Num, Dir, WA);
336 }
337
338