1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "runtime.H"
19 #include "system.H"
20 
21 #include "sqStore.H"
22 #include "sqCache.H"
23 #include "ovStore.H"
24 
25 #include "prefixEditDistance.H"
26 
27 
28 #ifndef OVERLAPINCORE_H
29 #define OVERLAPINCORE_H
30 
31 #define  HASH_KMER_SKIP           0
32 //  Skip this many kmers between the kmers put into the hash
33 //  table.  Setting this to 0 will make every kmer go
34 //  into the hash table.
35 
36 
37 #define  BAD_WINDOW_LEN           50
38 //  Length of window in which to look for clustered errors
39 //  to invalidate an overlap
40 
41 #define  BAD_WINDOW_VALUE         (8 * QUALITY_CUTOFF)
42 //  This many or more errors in a window of  BAD_WINDOW_LEN
43 //  invalidates an overlap
44 
45 #define  CHECK_MASK              0xff
46 //  To set Check field in hash bucket
47 
48 #define  DEFAULT_HI_HIT_LIMIT    INT_MAX
49 //  Any kmer in hash table with at least this many hits
50 //  cannot initiate an overlap.  Can be changed on the command
51 //  line with  -K  option
52 
53 #define  DELETED_FRAG            2
54 //  Indicates fragment was marked as deleted in the store
55 
56 #define  DISPLAY_WIDTH           60
57 //  Number of characters per line when displaying sequences
58 
59 #define  ENTRIES_PER_BUCKET      21
60 //  In main hash table.  Recommended values are 21, 31 or 42
61 //  depending on cache line size.
62 
63 #define  HASH_CHECK_MASK         0x1f
64 //  Used to set and check bit in Hash_Check_Array
65 //  Change if change  Check_Vector_t
66 
67 #define  HASH_EXPANSION_FACTOR   1.4
68 //  Hash table size is >= this times  MAX_HASH_STRINGS
69 
70 #define  HASH_MASK               (((uint64)1 << G.Hash_Mask_Bits) - 1)
71 //  Extract right Hash_Mask_Bits bits of hash key
72 
73 #define  HASH_TABLE_SIZE         (1 + HASH_MASK)
74 //  Number of buckets in hash table
75 
76 #define  HIGHEST_KMER_LIMIT      255
77 //  If  Hi_Hit_Limit  is more than this, it's ignored
78 
79 #define  HOPELESS_MATCH          90
80 //  A string this long or longer without an exact kmer
81 //  match is assumed to be hopeless to find a match
82 //  within the error threshold
83 
84 #define  IID_GAP_LIMIT           100
85 //  When using a list of fragment IID's, gaps between
86 //  IID's this long or longer force a new load partial
87 //  store to be done
88 
89 #define  INIT_MATCH_NODE_SIZE    10000
90 //  Initial number of nodes to hold exact matches
91 
92 #define  INIT_SCREEN_MATCHES     50
93 //  Initial number of screen-match entries per fragment
94 
95 #define  INIT_STRING_OLAP_SIZE   5000
96 //  Initial number of different New fragments that
97 //  overlap a single Old fragment
98 
99 #define  K_MER_STEP          1
100 //  1 = every k-mer in search
101 //  2 = every other k-mer
102 //  3 = every third, ...
103 //  Used to skip some k-mers in finding matches
104 
105 #define  MAX_BRANCH_COUNT        UCHAR_MAX
106 //  The largest branch point count before an overflow
107 
108 #define  MAX_DISTINCT_OLAPS      3
109 //  Most possible really different overlaps (i.e., not
110 //  just shifts from periodic regions) between 2 fragments
111 //  in a given orientation.  For fragments of approximately
112 //  same size, should never be more than 2.
113 
114 #define  MAX_NAME_LEN            500
115 //  Longest file name allowed
116 
117 #define  MIN_CALC_KMER           4
118 //  When calculating the  Hi_Hit_Limit  based on genome length, etc,
119 //  don't set it below this
120 
121 #define  MIN_INTERSECTION        10
122 //  Minimum length of match region to be worth reporting
123 
124 #define  MIN_OLAP_OUTSIDE_SCREEN 30
125 //  Minimum number of bases outside of screened regions
126 //  to be a reportable overlap.  Entire overlap (including screened
127 //  portion) must still be  MIN_OLAP_LEN .
128 
129 #define  OUTPUT_OVERLAP_DELTAS   0
130 //  If true include delta-encoding of overlap alignment
131 //  in overlap messages.  Otherwise, omit them.
132 //  As of 6 Oct 2008, support for overlap deltas has been removed.
133 //  However, there are enough remnants in AS_MSG to output them.
134 //  Just enabling OUTPUT_OVERLAP_DELTAS will not compile; see
135 //  AS_MSG_USE_OVL_DELTA in AS_MSG.
136 
137 #define  PROBE_MASK              0x3e
138 //  Used to determine probe step to resolve collisions
139 
140 #define  QUALITY_CUTOFF          20
141 //  Regard quality values higher than this as equal to this
142 //  for purposes of finding bad windows
143 
144 #define  SCRIPT_NAME             "lsf-ovl"
145 //  Default name of script produced by  make-ovl-script
146 
147 #define  SHIFT_SLACK  1
148 // Allow to be off by this many bases in combining/comparing alignments
149 
150 #define  STRING_OLAP_SHIFT       8
151 //  To compute hash function into the String_Olap hash table.
152 
153 #define  STRING_OLAP_MODULUS     (1 << STRING_OLAP_SHIFT)
154 //  The size of the  String_Olap  hash table.  The rest of
155 //  the space is used for chaining.  This number should be
156 //  relatively small to reflect the number of fragments a
157 //  given fragment has exact matches with.
158 
159 #define  STRING_OLAP_MASK        (STRING_OLAP_MODULUS - 1)
160 //  To compute hash function into the String_Olap hash table.
161 
162 #define  THREAD_STACKSIZE        (16 * 512 * 512)
163 //  The amount of stack space to allocate to each thread.
164 
165 #define  VALID_FRAG              1
166 //  Indicates fragment was valid in the fragment store
167 
168 //#define  WINDOW_SCREEN_OLAP      10
169 //  Amount by which k-mers can overlap a screen region and still
170 //  be added to the hash table.
171 
172 #define  MAX_EXTRA_SUBCOUNT        (AS_MAX_READLEN / G.Kmer_Len)
173 
174 
175 #define  HASH_FUNCTION(k)        (((k) ^ ((k) >> HSF1) ^ ((k) >> HSF2)) & HASH_MASK)
176 //  Gives subscript in hash table for key  k
177 
178 #define  HASH_CHECK_FUNCTION(k)  (((k) ^ ((k) >> SV1) ^ ((k) >> SV2)) & HASH_CHECK_MASK)
179 //  Gives bit position to see if key could be in bucket
180 
181 #define  KEY_CHECK_FUNCTION(k)   (((k) ^ ((k) >> SV1) ^ ((k) >> SV3)) & CHECK_MASK)
182 //  Gives bit pattern to see if key could match
183 
184 #define  PROBE_FUNCTION(k)       ((((k) ^ ((k) >> SV2) ^ ((k) >> SV3)) & PROBE_MASK) | 1)
185 //  Gives secondary hash function.  Force to be odd so that will be relatively
186 //  prime wrt the hash table size, which is a power of 2.
187 
188 
189 
190 typedef  enum Direction_Type {
191   FORWARD,
192   REVERSE
193 } Direction_t;
194 
195 typedef  struct String_Olap_Node {
196   uint32  String_Num;                // Of hash-table frag that have exact match with
197   int32  Match_List;                 // Subscript of start of list of exact matches
198   double  diag_sum;                  // Sum of diagonals of all k-mer matches to this frag
199   int32   diag_ct;                   // Count of all k-mer matches to this frag
200   int     diag_bgn;
201   int     diag_end;
202   signed int  Next : 29;             // Next match if this is a collision
203   unsigned  Full : 1;
204   unsigned  consistent : 1;
205 }  String_Olap_t;
206 
207 
208 typedef  struct Olap_Info {
209   int  s_lo, s_hi;
210   int  t_lo, t_hi;
211   double  quality;
212   int  delta [AS_MAX_READLEN+1];  //  needs only MAX_ERRORS
213   int  delta_ct;
214   int  s_left_boundary, s_right_boundary;
215   int  t_left_boundary, t_right_boundary;
216   int  min_diag, max_diag;
217 }  Olap_Info_t;
218 
219 //  The following structure holds what used to be global information, but
220 //  is now encapsulated so that multiple copies can be made for multiple
221 //  parallel threads.
222 
223 typedef  struct Work_Area {
224 
225   //int   *Error_Bound;
226 
227   String_Olap_t  * String_Olap_Space;
228   int32  String_Olap_Size;
229   int32  Next_Avail_String_Olap;
230 
231   Match_Node_t  * Match_Node_Space;
232   int32  Match_Node_Size;
233   int32  Next_Avail_Match_Node;
234 
235   //  Counts the number of overlaps for each fragment.  Cuts off
236   //  overlaps above a limit.
237   int32  A_Olaps_For_Frag;
238   int32  B_Olaps_For_Frag;
239 
240   sqStore  *readStore;
241   sqCache  *readCache;
242 
243   int    left_end_screened;
244   int    right_end_screened;
245 
246   int    status;
247   int    thread_id;
248 
249   uint32         bgnID;  //  Range of reads we are processing
250   uint32         endID;  //  was frag_segment_lo and frag_segment_hi (all lowercase)
251 
252   //  Instead of outputting each overlap as we create it, we
253   //  buffer them and output blocks of overlaps.
254   uint64         overlapsLen;
255   uint64         overlapsMax;
256   ovOverlap     *overlaps;
257 
258   //  Various stats that used to be global and updated whenever we
259   //  output an overlap or finished processing a set of hits.
260   //  Needed a mutex to update.
261   uint64         Total_Overlaps;
262   uint64         Contained_Overlap_Ct;
263   uint64         Dovetail_Overlap_Ct;
264 
265   uint64         Kmer_Hits_Without_Olap_Ct;
266   uint64         Kmer_Hits_With_Olap_Ct;
267   uint64         Kmer_Hits_Skipped_Ct;
268   uint64         Multi_Overlap_Ct;
269 
270   prefixEditDistance  *editDist;
271 
272 
273    char * q_diff;
274    Olap_Info_t  *distinct_olap;
275 }  Work_Area_t;
276 
277 
278 
279 
280 typedef  uint32  Check_Vector_t;
281 // Bit vector to see if hash bucket could possibly contain a match
282 
283 
284 typedef  uint64   String_Ref_t;
285 
286 #define  BIT_EMPT  62
287 #define  BIT_LAST  63
288 
289 extern uint32 STRING_NUM_BITS;
290 extern uint32 OFFSET_BITS;
291 
292 extern uint64 TRUELY_ZERO;
293 extern uint64 TRUELY_ONE;
294 
295 extern uint64 STRING_NUM_MASK;
296 extern uint64 OFFSET_MASK;
297 
298 extern uint64 MAX_STRING_NUM;
299 
300 
301 
302 //
303 //  [ Last (1) ][ Empty (1) ][ Offset (11) ][ StringNum (19) ]
304 //
305 
306 #define getStringRefStringNum(X)      (((X)                   ) & STRING_NUM_MASK)
307 #define getStringRefOffset(X)         (((X) >> STRING_NUM_BITS) & OFFSET_MASK)
308 #define getStringRefEmpty(X)          (((X) >> BIT_EMPT       ) & TRUELY_ONE)
309 #define getStringRefLast(X)           (((X) >> BIT_LAST       ) & TRUELY_ONE)
310 
311 #define setStringRefStringNum(X, Y)   ((X) = (((X) & ~(STRING_NUM_MASK                   )) | ((Y))))
312 #define setStringRefOffset(X, Y)      ((X) = (((X) & ~(OFFSET_MASK     << STRING_NUM_BITS)) | ((Y) << STRING_NUM_BITS)))
313 #define setStringRefEmpty(X, Y)       ((X) = (((X) & ~(TRUELY_ONE      << BIT_EMPT       )) | ((Y) << BIT_EMPT)))
314 #define setStringRefLast(X, Y)        ((X) = (((X) & ~(TRUELY_ONE      << BIT_LAST       )) | ((Y) << BIT_LAST)))
315 
316 
317 typedef  struct Hash_Bucket {
318   String_Ref_t  Entry [ENTRIES_PER_BUCKET];
319   unsigned char  Check [ENTRIES_PER_BUCKET];
320   unsigned char  Hits [ENTRIES_PER_BUCKET];
321   int16  Entry_Ct;
322 }  Hash_Bucket_t;
323 
324 typedef  struct Hash_Frag_Info {
325   uint32  length             : 30;
326   uint32  lfrag_end_screened : 1;
327   uint32  rfrag_end_screened : 1;
328 }  Hash_Frag_Info_t;
329 
330 
331 extern char           *basesData;
332 extern String_Ref_t   *nextRef;
333 extern size_t          Data_Len;
334 
335 extern int64   Bad_Short_Window_Ct;
336 extern int64   Bad_Long_Window_Ct;
337 
338 extern size_t  Extra_Data_Len;
339 
340 extern uint64  Max_Extra_Ref_Space;
341 extern uint64  Extra_Ref_Ct;
342 extern String_Ref_t  * Extra_Ref_Space;
343 extern uint64  Extra_String_Ct;
344 extern uint64  Extra_String_Subcount;
345 
346 extern Check_Vector_t  * Hash_Check_Array;
347 extern uint64  Hash_String_Num_Offset;
348 extern Hash_Bucket_t  * Hash_Table;
349 extern uint64  Kmer_Hits_With_Olap_Ct;
350 extern uint64  Kmer_Hits_Without_Olap_Ct;
351 extern uint64  Kmer_Hits_Skipped_Ct;
352 extern uint64  Multi_Overlap_Ct;
353 extern uint64  String_Ct;
354 extern Hash_Frag_Info_t  * String_Info;
355 
356 extern int64  * String_Start;
357 extern uint32  String_Start_Size;
358 
359 extern size_t  Used_Data_Len;
360 
361 extern int32  Bit_Equivalent [256];
362 extern int32  Char_Is_Bad [256];
363 extern uint64  Hash_Entries;
364 extern uint64  Total_Overlaps;
365 extern uint64  Contained_Overlap_Ct;
366 extern uint64  Dovetail_Overlap_Ct;
367 
368 class oicParameters {
369 public:
oicParameters()370   oicParameters() {
371     initialize();
372   };
~oicParameters()373   ~oicParameters() {};
374 
initialize(void)375   void   initialize(void) {
376     maxErate               = 0.06;
377     alignNoise             = 1.0;
378     Doing_Partial_Overlaps = false;
379 
380     bgnHashID = 1;
381     endHashID = UINT32_MAX;
382     minLibToHash = 0;
383     maxLibToHash = UINT32_MAX;
384 
385     bgnRefID = 1;
386     endRefID = UINT32_MAX;
387     minLibToRef  = 0;
388     maxLibToRef  = UINT32_MAX;
389 
390     Kmer_Len = 0;
391     kmerSkipFileName = NULL;
392     Filter_By_Kmer_Count = 0;
393 
394     Frag_Olap_Limit = UINT64_MAX;
395 
396     Unique_Olap_Per_Pair = true;
397 
398     Hash_Mask_Bits       = 22;
399     Max_Hash_Load        = 0.6;
400     Max_Hash_Data_Len    = 100000000;
401 
402     Outfile_Name = NULL;
403     Outstat_Name = NULL;
404 
405     Num_PThreads = getMaxThreadsAllowed();
406 
407     Min_Olap_Len = 0;
408 
409     Use_Hopeless_Check = true;
410 
411     Frag_Store_Path = NULL;
412   };
413 
414   double maxErate;
415   double alignNoise;
416 
417   //  If set true by the G option (G for Granger)
418   //  then allow overlaps that do not extend to the end
419   //  of either read.
420   bool  Doing_Partial_Overlaps;  //  -G
421 
422   uint32  bgnHashID;     //  -h
423   uint32  endHashID;
424   uint32  minLibToHash;  //  -H
425   uint32  maxLibToHash;
426 
427 
428   //  The range of reads we need to process
429 
430   uint32         frag_segment_lo;
431   uint32         frag_segment_hi;
432 
433   uint32  bgnRefID;      //  -r
434   uint32  curRefID;     //  When processing, this is where we are at, bgn < cur <= end.
435   uint32  endRefID;
436   uint32  minLibToRef;   //  -R
437   uint32  maxLibToRef;
438 
439   uint32  perThread;        //  When processing, how many to do per block
440 
441   uint64  Kmer_Len;         //  -k
442   uint64  Filter_By_Kmer_Count;
443   char   *kmerSkipFileName; //  -k
444 
445   //  Maximum number of overlaps for end of an old fragment against
446   //  a single hash table of frags, in each orientation
447   uint64  Frag_Olap_Limit;  //  -l
448 
449   //  If true will allow at most
450   //  one overlap output message per oriented fragment pair
451   //  Set true by  -u  command-line option; set false by  -m
452   bool  Unique_Olap_Per_Pair;  //  -m and -u
453 
454   uint32  Hash_Mask_Bits;  //  --hashbits
455 
456   uint64  Max_Hash_Data_Len;  //  --hashdatalen
457   double  Max_Hash_Load;  //  --hashload
458 
459   //  --maxreadlen sets OFFSET_BITS, STRING_NUM_BITS, STRING_NUM_MASK and MAX_STRING_NUM.
460 
461   char  *Outfile_Name;  //  -o
462   char  *Outstat_Name;  //  -s
463 
464   uint32  Num_PThreads;  //  -t
465 
466   int32  Min_Olap_Len;  //  --minlength, former -v
467 
468   //  Determines whether check for absence of kmer matches
469   //  at the end of a read is used to abort the overlap before
470   //  the extension from a single kmer match is attempted.
471   bool  Use_Hopeless_Check;  //  -z
472 
473   char *Frag_Store_Path;
474 };
475 
476 extern oicParameters G;
477 
478 
479 extern uint64  HSF1;
480 extern uint64  HSF2;
481 extern uint64  SV1;
482 extern uint64  SV2;
483 extern uint64  SV3;
484 
485 extern ovFile  *Out_BOF;
486 
487 
488 
489 
490 void
491 Output_Overlap(uint32 S_ID, int S_Len, Direction_t S_Dir,
492                uint32 T_ID, int T_Len, Olap_Info_t * olap,
493                Work_Area_t *WA);
494 
495 void
496 Output_Partial_Overlap(uint32 s_id, uint32 t_id, Direction_t dir,
497                        const Olap_Info_t * p, int s_len, int t_len,
498                        Work_Area_t  *WA);
499 
500 
501 int
502 Process_String_Olaps (char * S,
503                       int Len,
504                       uint32 ID,
505                       Direction_t Dir,
506                       Work_Area_t * WA);
507 
508 void
509 Find_Overlaps (char Frag [], int Frag_Len, uint32 Frag_Num, Direction_t Dir, Work_Area_t * WA);
510 
511 void *
512 Process_Overlaps (void *);
513 
514 int
515 Build_Hash_Index(sqStore *store, uint32 bgnID, uint32 endID);
516 
517 #endif  //  OVERLAPINCORE_H
518