1 2 /****************************************************************************** 3 * 4 * This file is part of canu, a software program that assembles whole-genome 5 * sequencing reads into contigs. 6 * 7 * This software is based on: 8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) 9 * the 'kmer package' r1994 (http://kmer.sourceforge.net) 10 * 11 * Except as indicated otherwise, this is a 'United States Government Work', 12 * and is released in the public domain. 13 * 14 * File 'README.licenses' in the root directory of this distribution 15 * contains full conditions and disclaimers. 16 */ 17 18 #include "runtime.H" 19 #include "system.H" 20 21 #include "sqStore.H" 22 #include "sqCache.H" 23 #include "ovStore.H" 24 25 #include "prefixEditDistance.H" 26 27 28 #ifndef OVERLAPINCORE_H 29 #define OVERLAPINCORE_H 30 31 #define HASH_KMER_SKIP 0 32 // Skip this many kmers between the kmers put into the hash 33 // table. Setting this to 0 will make every kmer go 34 // into the hash table. 35 36 37 #define BAD_WINDOW_LEN 50 38 // Length of window in which to look for clustered errors 39 // to invalidate an overlap 40 41 #define BAD_WINDOW_VALUE (8 * QUALITY_CUTOFF) 42 // This many or more errors in a window of BAD_WINDOW_LEN 43 // invalidates an overlap 44 45 #define CHECK_MASK 0xff 46 // To set Check field in hash bucket 47 48 #define DEFAULT_HI_HIT_LIMIT INT_MAX 49 // Any kmer in hash table with at least this many hits 50 // cannot initiate an overlap. Can be changed on the command 51 // line with -K option 52 53 #define DELETED_FRAG 2 54 // Indicates fragment was marked as deleted in the store 55 56 #define DISPLAY_WIDTH 60 57 // Number of characters per line when displaying sequences 58 59 #define ENTRIES_PER_BUCKET 21 60 // In main hash table. Recommended values are 21, 31 or 42 61 // depending on cache line size. 62 63 #define HASH_CHECK_MASK 0x1f 64 // Used to set and check bit in Hash_Check_Array 65 // Change if change Check_Vector_t 66 67 #define HASH_EXPANSION_FACTOR 1.4 68 // Hash table size is >= this times MAX_HASH_STRINGS 69 70 #define HASH_MASK (((uint64)1 << G.Hash_Mask_Bits) - 1) 71 // Extract right Hash_Mask_Bits bits of hash key 72 73 #define HASH_TABLE_SIZE (1 + HASH_MASK) 74 // Number of buckets in hash table 75 76 #define HIGHEST_KMER_LIMIT 255 77 // If Hi_Hit_Limit is more than this, it's ignored 78 79 #define HOPELESS_MATCH 90 80 // A string this long or longer without an exact kmer 81 // match is assumed to be hopeless to find a match 82 // within the error threshold 83 84 #define IID_GAP_LIMIT 100 85 // When using a list of fragment IID's, gaps between 86 // IID's this long or longer force a new load partial 87 // store to be done 88 89 #define INIT_MATCH_NODE_SIZE 10000 90 // Initial number of nodes to hold exact matches 91 92 #define INIT_SCREEN_MATCHES 50 93 // Initial number of screen-match entries per fragment 94 95 #define INIT_STRING_OLAP_SIZE 5000 96 // Initial number of different New fragments that 97 // overlap a single Old fragment 98 99 #define K_MER_STEP 1 100 // 1 = every k-mer in search 101 // 2 = every other k-mer 102 // 3 = every third, ... 103 // Used to skip some k-mers in finding matches 104 105 #define MAX_BRANCH_COUNT UCHAR_MAX 106 // The largest branch point count before an overflow 107 108 #define MAX_DISTINCT_OLAPS 3 109 // Most possible really different overlaps (i.e., not 110 // just shifts from periodic regions) between 2 fragments 111 // in a given orientation. For fragments of approximately 112 // same size, should never be more than 2. 113 114 #define MAX_NAME_LEN 500 115 // Longest file name allowed 116 117 #define MIN_CALC_KMER 4 118 // When calculating the Hi_Hit_Limit based on genome length, etc, 119 // don't set it below this 120 121 #define MIN_INTERSECTION 10 122 // Minimum length of match region to be worth reporting 123 124 #define MIN_OLAP_OUTSIDE_SCREEN 30 125 // Minimum number of bases outside of screened regions 126 // to be a reportable overlap. Entire overlap (including screened 127 // portion) must still be MIN_OLAP_LEN . 128 129 #define OUTPUT_OVERLAP_DELTAS 0 130 // If true include delta-encoding of overlap alignment 131 // in overlap messages. Otherwise, omit them. 132 // As of 6 Oct 2008, support for overlap deltas has been removed. 133 // However, there are enough remnants in AS_MSG to output them. 134 // Just enabling OUTPUT_OVERLAP_DELTAS will not compile; see 135 // AS_MSG_USE_OVL_DELTA in AS_MSG. 136 137 #define PROBE_MASK 0x3e 138 // Used to determine probe step to resolve collisions 139 140 #define QUALITY_CUTOFF 20 141 // Regard quality values higher than this as equal to this 142 // for purposes of finding bad windows 143 144 #define SCRIPT_NAME "lsf-ovl" 145 // Default name of script produced by make-ovl-script 146 147 #define SHIFT_SLACK 1 148 // Allow to be off by this many bases in combining/comparing alignments 149 150 #define STRING_OLAP_SHIFT 8 151 // To compute hash function into the String_Olap hash table. 152 153 #define STRING_OLAP_MODULUS (1 << STRING_OLAP_SHIFT) 154 // The size of the String_Olap hash table. The rest of 155 // the space is used for chaining. This number should be 156 // relatively small to reflect the number of fragments a 157 // given fragment has exact matches with. 158 159 #define STRING_OLAP_MASK (STRING_OLAP_MODULUS - 1) 160 // To compute hash function into the String_Olap hash table. 161 162 #define THREAD_STACKSIZE (16 * 512 * 512) 163 // The amount of stack space to allocate to each thread. 164 165 #define VALID_FRAG 1 166 // Indicates fragment was valid in the fragment store 167 168 //#define WINDOW_SCREEN_OLAP 10 169 // Amount by which k-mers can overlap a screen region and still 170 // be added to the hash table. 171 172 #define MAX_EXTRA_SUBCOUNT (AS_MAX_READLEN / G.Kmer_Len) 173 174 175 #define HASH_FUNCTION(k) (((k) ^ ((k) >> HSF1) ^ ((k) >> HSF2)) & HASH_MASK) 176 // Gives subscript in hash table for key k 177 178 #define HASH_CHECK_FUNCTION(k) (((k) ^ ((k) >> SV1) ^ ((k) >> SV2)) & HASH_CHECK_MASK) 179 // Gives bit position to see if key could be in bucket 180 181 #define KEY_CHECK_FUNCTION(k) (((k) ^ ((k) >> SV1) ^ ((k) >> SV3)) & CHECK_MASK) 182 // Gives bit pattern to see if key could match 183 184 #define PROBE_FUNCTION(k) ((((k) ^ ((k) >> SV2) ^ ((k) >> SV3)) & PROBE_MASK) | 1) 185 // Gives secondary hash function. Force to be odd so that will be relatively 186 // prime wrt the hash table size, which is a power of 2. 187 188 189 190 typedef enum Direction_Type { 191 FORWARD, 192 REVERSE 193 } Direction_t; 194 195 typedef struct String_Olap_Node { 196 uint32 String_Num; // Of hash-table frag that have exact match with 197 int32 Match_List; // Subscript of start of list of exact matches 198 double diag_sum; // Sum of diagonals of all k-mer matches to this frag 199 int32 diag_ct; // Count of all k-mer matches to this frag 200 int diag_bgn; 201 int diag_end; 202 signed int Next : 29; // Next match if this is a collision 203 unsigned Full : 1; 204 unsigned consistent : 1; 205 } String_Olap_t; 206 207 208 typedef struct Olap_Info { 209 int s_lo, s_hi; 210 int t_lo, t_hi; 211 double quality; 212 int delta [AS_MAX_READLEN+1]; // needs only MAX_ERRORS 213 int delta_ct; 214 int s_left_boundary, s_right_boundary; 215 int t_left_boundary, t_right_boundary; 216 int min_diag, max_diag; 217 } Olap_Info_t; 218 219 // The following structure holds what used to be global information, but 220 // is now encapsulated so that multiple copies can be made for multiple 221 // parallel threads. 222 223 typedef struct Work_Area { 224 225 //int *Error_Bound; 226 227 String_Olap_t * String_Olap_Space; 228 int32 String_Olap_Size; 229 int32 Next_Avail_String_Olap; 230 231 Match_Node_t * Match_Node_Space; 232 int32 Match_Node_Size; 233 int32 Next_Avail_Match_Node; 234 235 // Counts the number of overlaps for each fragment. Cuts off 236 // overlaps above a limit. 237 int32 A_Olaps_For_Frag; 238 int32 B_Olaps_For_Frag; 239 240 sqStore *readStore; 241 sqCache *readCache; 242 243 int left_end_screened; 244 int right_end_screened; 245 246 int status; 247 int thread_id; 248 249 uint32 bgnID; // Range of reads we are processing 250 uint32 endID; // was frag_segment_lo and frag_segment_hi (all lowercase) 251 252 // Instead of outputting each overlap as we create it, we 253 // buffer them and output blocks of overlaps. 254 uint64 overlapsLen; 255 uint64 overlapsMax; 256 ovOverlap *overlaps; 257 258 // Various stats that used to be global and updated whenever we 259 // output an overlap or finished processing a set of hits. 260 // Needed a mutex to update. 261 uint64 Total_Overlaps; 262 uint64 Contained_Overlap_Ct; 263 uint64 Dovetail_Overlap_Ct; 264 265 uint64 Kmer_Hits_Without_Olap_Ct; 266 uint64 Kmer_Hits_With_Olap_Ct; 267 uint64 Kmer_Hits_Skipped_Ct; 268 uint64 Multi_Overlap_Ct; 269 270 prefixEditDistance *editDist; 271 272 273 char * q_diff; 274 Olap_Info_t *distinct_olap; 275 } Work_Area_t; 276 277 278 279 280 typedef uint32 Check_Vector_t; 281 // Bit vector to see if hash bucket could possibly contain a match 282 283 284 typedef uint64 String_Ref_t; 285 286 #define BIT_EMPT 62 287 #define BIT_LAST 63 288 289 extern uint32 STRING_NUM_BITS; 290 extern uint32 OFFSET_BITS; 291 292 extern uint64 TRUELY_ZERO; 293 extern uint64 TRUELY_ONE; 294 295 extern uint64 STRING_NUM_MASK; 296 extern uint64 OFFSET_MASK; 297 298 extern uint64 MAX_STRING_NUM; 299 300 301 302 // 303 // [ Last (1) ][ Empty (1) ][ Offset (11) ][ StringNum (19) ] 304 // 305 306 #define getStringRefStringNum(X) (((X) ) & STRING_NUM_MASK) 307 #define getStringRefOffset(X) (((X) >> STRING_NUM_BITS) & OFFSET_MASK) 308 #define getStringRefEmpty(X) (((X) >> BIT_EMPT ) & TRUELY_ONE) 309 #define getStringRefLast(X) (((X) >> BIT_LAST ) & TRUELY_ONE) 310 311 #define setStringRefStringNum(X, Y) ((X) = (((X) & ~(STRING_NUM_MASK )) | ((Y)))) 312 #define setStringRefOffset(X, Y) ((X) = (((X) & ~(OFFSET_MASK << STRING_NUM_BITS)) | ((Y) << STRING_NUM_BITS))) 313 #define setStringRefEmpty(X, Y) ((X) = (((X) & ~(TRUELY_ONE << BIT_EMPT )) | ((Y) << BIT_EMPT))) 314 #define setStringRefLast(X, Y) ((X) = (((X) & ~(TRUELY_ONE << BIT_LAST )) | ((Y) << BIT_LAST))) 315 316 317 typedef struct Hash_Bucket { 318 String_Ref_t Entry [ENTRIES_PER_BUCKET]; 319 unsigned char Check [ENTRIES_PER_BUCKET]; 320 unsigned char Hits [ENTRIES_PER_BUCKET]; 321 int16 Entry_Ct; 322 } Hash_Bucket_t; 323 324 typedef struct Hash_Frag_Info { 325 uint32 length : 30; 326 uint32 lfrag_end_screened : 1; 327 uint32 rfrag_end_screened : 1; 328 } Hash_Frag_Info_t; 329 330 331 extern char *basesData; 332 extern String_Ref_t *nextRef; 333 extern size_t Data_Len; 334 335 extern int64 Bad_Short_Window_Ct; 336 extern int64 Bad_Long_Window_Ct; 337 338 extern size_t Extra_Data_Len; 339 340 extern uint64 Max_Extra_Ref_Space; 341 extern uint64 Extra_Ref_Ct; 342 extern String_Ref_t * Extra_Ref_Space; 343 extern uint64 Extra_String_Ct; 344 extern uint64 Extra_String_Subcount; 345 346 extern Check_Vector_t * Hash_Check_Array; 347 extern uint64 Hash_String_Num_Offset; 348 extern Hash_Bucket_t * Hash_Table; 349 extern uint64 Kmer_Hits_With_Olap_Ct; 350 extern uint64 Kmer_Hits_Without_Olap_Ct; 351 extern uint64 Kmer_Hits_Skipped_Ct; 352 extern uint64 Multi_Overlap_Ct; 353 extern uint64 String_Ct; 354 extern Hash_Frag_Info_t * String_Info; 355 356 extern int64 * String_Start; 357 extern uint32 String_Start_Size; 358 359 extern size_t Used_Data_Len; 360 361 extern int32 Bit_Equivalent [256]; 362 extern int32 Char_Is_Bad [256]; 363 extern uint64 Hash_Entries; 364 extern uint64 Total_Overlaps; 365 extern uint64 Contained_Overlap_Ct; 366 extern uint64 Dovetail_Overlap_Ct; 367 368 class oicParameters { 369 public: oicParameters()370 oicParameters() { 371 initialize(); 372 }; ~oicParameters()373 ~oicParameters() {}; 374 initialize(void)375 void initialize(void) { 376 maxErate = 0.06; 377 alignNoise = 1.0; 378 Doing_Partial_Overlaps = false; 379 380 bgnHashID = 1; 381 endHashID = UINT32_MAX; 382 minLibToHash = 0; 383 maxLibToHash = UINT32_MAX; 384 385 bgnRefID = 1; 386 endRefID = UINT32_MAX; 387 minLibToRef = 0; 388 maxLibToRef = UINT32_MAX; 389 390 Kmer_Len = 0; 391 kmerSkipFileName = NULL; 392 Filter_By_Kmer_Count = 0; 393 394 Frag_Olap_Limit = UINT64_MAX; 395 396 Unique_Olap_Per_Pair = true; 397 398 Hash_Mask_Bits = 22; 399 Max_Hash_Load = 0.6; 400 Max_Hash_Data_Len = 100000000; 401 402 Outfile_Name = NULL; 403 Outstat_Name = NULL; 404 405 Num_PThreads = getMaxThreadsAllowed(); 406 407 Min_Olap_Len = 0; 408 409 Use_Hopeless_Check = true; 410 411 Frag_Store_Path = NULL; 412 }; 413 414 double maxErate; 415 double alignNoise; 416 417 // If set true by the G option (G for Granger) 418 // then allow overlaps that do not extend to the end 419 // of either read. 420 bool Doing_Partial_Overlaps; // -G 421 422 uint32 bgnHashID; // -h 423 uint32 endHashID; 424 uint32 minLibToHash; // -H 425 uint32 maxLibToHash; 426 427 428 // The range of reads we need to process 429 430 uint32 frag_segment_lo; 431 uint32 frag_segment_hi; 432 433 uint32 bgnRefID; // -r 434 uint32 curRefID; // When processing, this is where we are at, bgn < cur <= end. 435 uint32 endRefID; 436 uint32 minLibToRef; // -R 437 uint32 maxLibToRef; 438 439 uint32 perThread; // When processing, how many to do per block 440 441 uint64 Kmer_Len; // -k 442 uint64 Filter_By_Kmer_Count; 443 char *kmerSkipFileName; // -k 444 445 // Maximum number of overlaps for end of an old fragment against 446 // a single hash table of frags, in each orientation 447 uint64 Frag_Olap_Limit; // -l 448 449 // If true will allow at most 450 // one overlap output message per oriented fragment pair 451 // Set true by -u command-line option; set false by -m 452 bool Unique_Olap_Per_Pair; // -m and -u 453 454 uint32 Hash_Mask_Bits; // --hashbits 455 456 uint64 Max_Hash_Data_Len; // --hashdatalen 457 double Max_Hash_Load; // --hashload 458 459 // --maxreadlen sets OFFSET_BITS, STRING_NUM_BITS, STRING_NUM_MASK and MAX_STRING_NUM. 460 461 char *Outfile_Name; // -o 462 char *Outstat_Name; // -s 463 464 uint32 Num_PThreads; // -t 465 466 int32 Min_Olap_Len; // --minlength, former -v 467 468 // Determines whether check for absence of kmer matches 469 // at the end of a read is used to abort the overlap before 470 // the extension from a single kmer match is attempted. 471 bool Use_Hopeless_Check; // -z 472 473 char *Frag_Store_Path; 474 }; 475 476 extern oicParameters G; 477 478 479 extern uint64 HSF1; 480 extern uint64 HSF2; 481 extern uint64 SV1; 482 extern uint64 SV2; 483 extern uint64 SV3; 484 485 extern ovFile *Out_BOF; 486 487 488 489 490 void 491 Output_Overlap(uint32 S_ID, int S_Len, Direction_t S_Dir, 492 uint32 T_ID, int T_Len, Olap_Info_t * olap, 493 Work_Area_t *WA); 494 495 void 496 Output_Partial_Overlap(uint32 s_id, uint32 t_id, Direction_t dir, 497 const Olap_Info_t * p, int s_len, int t_len, 498 Work_Area_t *WA); 499 500 501 int 502 Process_String_Olaps (char * S, 503 int Len, 504 uint32 ID, 505 Direction_t Dir, 506 Work_Area_t * WA); 507 508 void 509 Find_Overlaps (char Frag [], int Frag_Len, uint32 Frag_Num, Direction_t Dir, Work_Area_t * WA); 510 511 void * 512 Process_Overlaps (void *); 513 514 int 515 Build_Hash_Index(sqStore *store, uint32 bgnID, uint32 endID); 516 517 #endif // OVERLAPINCORE_H 518