1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "overlapInCore.H"
19 
20 //  Add information for the match in  ref  to the list
21 //  starting at subscript  (* start). The matching window begins
22 //  offset  bytes from the beginning of this string.
23 
24 static
25 void
Add_Match(String_Ref_t ref,int * start,int offset,int * consistent,Work_Area_t * WA)26 Add_Match(String_Ref_t ref,
27           int * start,
28           int offset,
29           int * consistent,
30           Work_Area_t * WA) {
31   int  * p, save;
32   int  diag = 0, new_diag, expected_start = 0, num_checked = 0;
33   int  move_to_front = false;
34 
35   new_diag = getStringRefOffset(ref) - offset;
36 
37   for (p = start;  (* p) != 0;  p = & (WA->Match_Node_Space [(* p)].Next)) {
38     expected_start = WA->Match_Node_Space [(* p)].Start + WA->Match_Node_Space [(* p)].Len - G.Kmer_Len + 1 + HASH_KMER_SKIP;
39 
40     diag = WA->Match_Node_Space [(* p)].Offset - WA->Match_Node_Space [(* p)].Start;
41 
42     if (expected_start < offset)
43       break;
44 
45     if (expected_start == offset) {
46       if (new_diag == diag) {
47         WA->Match_Node_Space [(* p)].Len += 1 + HASH_KMER_SKIP;
48         if (move_to_front) {
49           save = (* p);
50           (* p) = WA->Match_Node_Space [(* p)].Next;
51           WA->Match_Node_Space [save].Next = (* start);
52           (* start) = save;
53         }
54         return;
55       } else
56         move_to_front = true;
57     }
58     num_checked ++;
59   }
60 
61   if (WA->Next_Avail_Match_Node == WA->Match_Node_Size) {
62     int32          newSize  = WA->Match_Node_Size * 2;
63     Match_Node_t  *newSpace = new Match_Node_t [newSize];
64 
65     memcpy(newSpace, WA->Match_Node_Space, sizeof(Match_Node_t) * WA->Match_Node_Size);
66 
67     delete [] WA->Match_Node_Space;
68 
69     WA->Match_Node_Size  = newSize;
70     WA->Match_Node_Space = newSpace;
71   }
72 
73   if ((* start) != 0
74       && (num_checked > 0
75           || abs (diag - new_diag) > 3
76           || offset < expected_start + G.Kmer_Len - 2))
77     (* consistent) = false;
78 
79   save = (* start);
80   (* start) = WA->Next_Avail_Match_Node;
81   WA->Next_Avail_Match_Node ++;
82 
83   WA->Match_Node_Space [(* start)].Offset = getStringRefOffset(ref);
84   WA->Match_Node_Space [(* start)].Len = G.Kmer_Len;
85   WA->Match_Node_Space [(* start)].Start = offset;
86   WA->Match_Node_Space [(* start)].Next = save;
87 
88 #if 0
89   fprintf(stderr, "Add_Match()-- %3d offset %d len %d start %d next %d\n",
90           *start,
91           WA->Match_Node_Space [(* start)].Offset,
92           WA->Match_Node_Space [(* start)].Len,
93           WA->Match_Node_Space [(* start)].Start,
94           WA->Match_Node_Space [(* start)].Next);
95 #endif
96 }
97 
98 
99 
100 //  Add information for Ref and all its matches to the global hash table in String_Olap_Space. Grow
101 //  the space if necessary. The matching window begins Offset bytes from the beginning of this
102 //  string.
103 static
104 void
Add_Ref(String_Ref_t Ref,int Offset,Work_Area_t * WA)105 Add_Ref(String_Ref_t Ref, int Offset, Work_Area_t * WA) {
106   uint32  Prev, StrNum, Sub;
107   int  consistent;
108 
109   StrNum = getStringRefStringNum(Ref);
110   Sub = (StrNum ^ (StrNum >> STRING_OLAP_SHIFT)) & STRING_OLAP_MASK;
111 
112 #if 0
113   fprintf(stderr, "Add_Ref()- StrNum %d Sub %d Offset %d\n", StrNum, Sub, Offset);
114 #endif
115 
116   while (WA->String_Olap_Space [Sub].Full
117           && WA->String_Olap_Space [Sub].String_Num != StrNum) {
118     Prev = Sub;
119     Sub = WA->String_Olap_Space [Sub].Next;
120     if (Sub == 0) {
121       if (WA->Next_Avail_String_Olap == WA->String_Olap_Size) {
122         int32          newSize  = WA->String_Olap_Size * 2;
123         String_Olap_t *newSpace = new String_Olap_t [newSize];
124 
125         memcpy(newSpace, WA->String_Olap_Space, sizeof(String_Olap_t) * WA->String_Olap_Size);
126 
127         delete [] WA->String_Olap_Space;
128 
129         WA->String_Olap_Size  = newSize;
130         WA->String_Olap_Space = newSpace;
131       }
132 
133       Sub = WA->Next_Avail_String_Olap ++;
134       WA->String_Olap_Space [Prev].Next = Sub;
135       WA->String_Olap_Space [Sub].Full = false;
136       break;
137     }
138   }
139 
140   if (! WA->String_Olap_Space [Sub].Full) {
141     WA->String_Olap_Space [Sub].String_Num = StrNum;
142     WA->String_Olap_Space [Sub].Match_List = 0;
143     WA->String_Olap_Space [Sub].diag_sum = 0.0;
144     WA->String_Olap_Space [Sub].diag_ct = 0;
145     WA->String_Olap_Space [Sub].diag_bgn = AS_MAX_READLEN;
146     WA->String_Olap_Space [Sub].diag_end = 0;
147     WA->String_Olap_Space [Sub].Next = 0;
148     WA->String_Olap_Space [Sub].Full = true;
149     WA->String_Olap_Space [Sub].consistent = true;
150   }
151 
152   consistent = WA->String_Olap_Space [Sub].consistent;
153 
154   WA->String_Olap_Space [Sub].diag_sum += (double)getStringRefOffset(Ref) - Offset;
155   WA->String_Olap_Space [Sub].diag_ct ++;
156   if (WA->String_Olap_Space [Sub].diag_bgn > Offset) WA->String_Olap_Space [Sub].diag_bgn = Offset;
157   if (WA->String_Olap_Space [Sub].diag_end < Offset) WA->String_Olap_Space [Sub].diag_end = Offset;
158   Add_Match (Ref, & (WA->String_Olap_Space [Sub].Match_List), Offset, & consistent, WA);
159 
160   WA->String_Olap_Space [Sub].consistent = consistent;
161 
162   return;
163 }
164 
165 
166 
167 
168 //  Search for string  S  with hash key  Key  in the global
169 //  Hash_Table  starting at subscript  Sub. Return the matching
170 //  reference in the hash table if there is one, or else a reference
171 //  with the  Empty bit set true.  Set  (* Where)  to the subscript in
172 //  Extra_Ref_Space  where the reference was found if it was found there.
173 //  Set  (* hi_hits)  to  true  if hash table entry is found but is empty
174 //  because it was screened out, otherwise set to false.
175 static
176 String_Ref_t
Hash_Find(uint64 Key,int64 Sub,char * S,int64 * Where,int * hi_hits)177 Hash_Find(uint64 Key, int64 Sub, char * S, int64 * Where, int * hi_hits) {
178   String_Ref_t  H_Ref = 0;
179   char  * T;
180   unsigned char  Key_Check;
181   int64  Ct, Probe;
182   int  i;
183 
184   Key_Check = KEY_CHECK_FUNCTION (Key);
185   Probe = PROBE_FUNCTION (Key);
186 
187   (* hi_hits) = false;
188   Ct = 0;
189   do {
190     for (i = 0;  i < Hash_Table [Sub].Entry_Ct;  i ++)
191       if (Hash_Table [Sub].Check [i] == Key_Check) {
192         int  is_empty;
193 
194         H_Ref = Hash_Table [Sub].Entry [i];
195         //fprintf(stderr, "Href = Hash_Table %u Entry %u = " F_U64 "\n", Sub, i, H_Ref);
196 
197         is_empty = getStringRefEmpty(H_Ref);
198         if (! getStringRefLast(H_Ref) && ! is_empty) {
199           (* Where) = ((uint64)getStringRefStringNum(H_Ref) << OFFSET_BITS) + getStringRefOffset(H_Ref);
200           H_Ref = Extra_Ref_Space [(* Where)];
201           //fprintf(stderr, "Href = Extra_Ref_Space " F_U64 " = " F_U64 "\n", *Where, H_Ref);
202         }
203         //fprintf(stderr, "Href = " F_U64 "  Get String_Start[ " F_U64 " ] + " F_U64 "\n", getStringRefStringNum(H_Ref), getStringRefOffset(H_Ref));
204         T = basesData + String_Start [getStringRefStringNum(H_Ref)] + getStringRefOffset(H_Ref);
205         if (strncmp (S, T, G.Kmer_Len) == 0) {
206           if (is_empty) {
207             setStringRefEmpty(H_Ref, TRUELY_ONE);
208             (* hi_hits) = true;
209           }
210           return  H_Ref;
211         }
212       }
213     if (Hash_Table [Sub].Entry_Ct < ENTRIES_PER_BUCKET) {
214       setStringRefEmpty(H_Ref, TRUELY_ONE);
215       return  H_Ref;
216     }
217     Sub = (Sub + Probe) % HASH_TABLE_SIZE;
218   }  while (++ Ct < HASH_TABLE_SIZE);
219 
220   setStringRefEmpty(H_Ref, TRUELY_ONE);
221   return  H_Ref;
222 }
223 
224 
225 
226 
227 
228 
229 //  Find and output all overlaps and branch points between string
230 //   Frag  and any fragment currently in the global hash table.
231 //   Frag_Len  is the length of  Frag  and  Frag_Num  is its ID number.
232 //   Dir  is the orientation of  Frag .
233 
234 void
Find_Overlaps(char Frag[],int Frag_Len,uint32 Frag_Num,Direction_t Dir,Work_Area_t * WA)235 Find_Overlaps(char Frag [], int Frag_Len, uint32 Frag_Num, Direction_t Dir, Work_Area_t * WA) {
236   String_Ref_t  Ref;
237   char  * P, * Window;
238   uint64  Key, Next_Key;
239   int64  Sub, Next_Sub, Where;
240   Check_Vector_t  This_Check, Next_Check;
241   int  Offset, Shift, Next_Shift;
242   int  hi_hits;
243   int  j;
244 
245   memset (WA->String_Olap_Space, 0, STRING_OLAP_MODULUS * sizeof (String_Olap_t));
246   WA->Next_Avail_String_Olap = STRING_OLAP_MODULUS;
247   WA->Next_Avail_Match_Node = 1;
248 
249   assert (Frag_Len >= G.Kmer_Len);
250 
251   Offset = 0;
252   P = Window = Frag;
253 
254   WA->left_end_screened  = false;
255   WA->right_end_screened = false;
256 
257   WA->A_Olaps_For_Frag = 0;
258   WA->B_Olaps_For_Frag = 0;
259 
260   Key = 0;
261   for (j = 0;  j < G.Kmer_Len;  j ++)
262     Key |= (uint64) (Bit_Equivalent [(int) * (P ++)]) << (2 * j);
263 
264   Sub = HASH_FUNCTION (Key);
265   Shift = HASH_CHECK_FUNCTION (Key);
266   Next_Key = (Key >> 2);
267   Next_Key |= ((uint64) (Bit_Equivalent [(int) * P])) << (2 * (G.Kmer_Len - 1));
268   Next_Sub = HASH_FUNCTION (Next_Key);
269   Next_Shift = HASH_CHECK_FUNCTION (Next_Key);
270   Next_Check = Hash_Check_Array [Next_Sub];
271 
272   if ((Hash_Check_Array [Sub] & (((Check_Vector_t) 1) << Shift)) != 0) {
273     Ref = Hash_Find (Key, Sub, Window, & Where, & hi_hits);
274     if (hi_hits) {
275       WA->left_end_screened = true;
276     }
277     if (! getStringRefEmpty(Ref)) {
278       while (true) {
279         if (Frag_Num < getStringRefStringNum(Ref) + Hash_String_Num_Offset)
280           Add_Ref  (Ref, Offset, WA);
281 
282         if (getStringRefLast(Ref))
283           break;
284         else {
285           Ref = Extra_Ref_Space [++ Where];
286           assert (! getStringRefEmpty(Ref));
287         }
288       }
289     }
290   }
291 
292   while ((* P) != '\0') {
293     Window ++;
294     Offset ++;
295 
296     Key = Next_Key;
297     Shift = Next_Shift;
298     Sub = Next_Sub;
299     This_Check = Next_Check;
300     P ++;
301     Next_Key = (Key >> 2);
302     Next_Key |= ((uint64)
303                  (Bit_Equivalent [(int) * P])) << (2 * (G.Kmer_Len - 1));
304     Next_Sub = HASH_FUNCTION (Next_Key);
305     Next_Shift = HASH_CHECK_FUNCTION (Next_Key);
306     Next_Check = Hash_Check_Array [Next_Sub];
307 
308     if ((This_Check & (((Check_Vector_t) 1) << Shift)) != 0) {
309       Ref = Hash_Find (Key, Sub, Window, & Where, & hi_hits);
310       if (hi_hits) {
311         if (Offset < HOPELESS_MATCH) {
312           WA->left_end_screened = true;
313         }
314         if (Frag_Len - Offset - G.Kmer_Len + 1 < HOPELESS_MATCH) {
315           WA->right_end_screened = true;
316         }
317       }
318       if (! getStringRefEmpty(Ref)) {
319         while (true) {
320           if (Frag_Num < getStringRefStringNum(Ref) + Hash_String_Num_Offset)
321             Add_Ref  (Ref, Offset, WA);
322 
323           if (getStringRefLast(Ref))
324             break;
325           else {
326             Ref = Extra_Ref_Space [++ Where];
327             assert (! getStringRefEmpty(Ref));
328           }
329         }
330       }
331     }
332   }
333 
334 
335   Process_String_Olaps  (Frag, Frag_Len, Frag_Num, Dir, WA);
336 }
337 
338