1 static char const rcsid[] = "$Id: lookup.c,v 6.59 2005/07/28 14:57:10 coulouri Exp $";
2
3 /*
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 */
28 /*****************************************************************************
29
30 File name: lookup.c
31
32 Author: Tom Madden
33
34 Contents: Functions to store "hashed" word hits and their positions and
35 in arrays and look these up again.
36
37 Detailed Contents:
38
39 - the number of bits required is given by ln(alphabet size)/ln(2).
40
41 - the number of bits times the wordsize must be less than 32.
42
43 - contiguous words can be of any length, discontiguous words can
44 only have the patten XX0X and be of length three, or word-width of
45 four.
46
47 ******************************************************************************/
48
49 /*******************************************************************************
50 * File Name: lookup.c
51 *
52 * Author: Tom Madden
53 *
54 * Version Creation Date: 10/26/95
55 *
56 * $Revision: 6.59 $
57 *
58 * File Description:
59 * Functions to store "words" from a query and perform lookups against
60 * database sequences.
61 *
62 * Modifications:
63 * --------------------------------------------------------------------------
64 * Date Name Description of modification
65 * ------- ---------- -----------------------------------------------------
66 *
67 * ==========================================================================
68 *
69 *
70 * RCS Modification History:
71 * $Log: lookup.c,v $
72 * Revision 6.59 2005/07/28 14:57:10 coulouri
73 * remove dead code
74 *
75 * Revision 6.58 2004/11/15 22:28:41 kans
76 * in MegaBlastBuildLookupTable, changed total_len (et al.) to Int8 and total_num (et al.) to Int4, from unsigned, which conflicted with pointer arguments to readdb_get_totals_ex
77 *
78 * Revision 6.57 2004/11/15 21:35:35 coulouri
79 *
80 * Manually overriding the effective database length can cause av_len to overflow an Int4; use Uint8 instead.
81 *
82 * Revision 6.56 2003/05/30 17:25:36 coulouri
83 * add rcsid
84 *
85 * Revision 6.55 2002/10/03 14:43:44 dondosha
86 * Bug fix for 2 sequences megablast
87 *
88 * Revision 6.54 2002/08/30 15:49:01 dondosha
89 * MegaBlastWordFinderDeallocate moved from mblast.c
90 *
91 * Revision 6.53 2002/06/07 18:50:43 dondosha
92 * Bug fix for Mega BLAST with two simultaneoue discontiguous word templates of length 21
93 *
94 * Revision 6.52 2002/05/17 21:40:13 dondosha
95 * Added 2 optimal Mega BLAST word templates for length 21
96 *
97 * Revision 6.51 2002/05/14 22:20:20 dondosha
98 * Renamed maximal discontiguous template type into optimal
99 *
100 * Revision 6.50 2002/04/09 18:19:47 dondosha
101 * Changed #ifdefs to conditionals, allowing different discontiguous templates in a single binary
102 *
103 * Revision 6.49 2002/04/01 22:28:29 dondosha
104 * Corrected syntax error for megablast with two-templates case
105 *
106 * Revision 6.48 2002/03/26 21:15:54 dondosha
107 * Added macro definitions to allow template length 21
108 *
109 * Revision 6.47 2002/03/05 17:39:39 dondosha
110 * Use reduced_wordsize instead of wordsize in lookup_find_init
111 *
112 * Revision 6.46 2002/02/19 22:35:06 dondosha
113 * Fixed uninitialized variable bug introduced by the two-template code
114 *
115 * Revision 6.45 2002/02/15 23:35:01 dondosha
116 * Allow hash table and two templates for megablast
117 *
118 * Revision 6.44 2001/12/28 20:46:21 dondosha
119 * 1. Mega BLAST related parameters moved into a separate structure
120 * 2. Environment variables for discontiguous words, etc. changed to options
121 * 3. Extension from discontiguous word with length 18 implemented
122 *
123 * Revision 6.43 2001/12/27 18:12:33 dondosha
124 * Fixed a bug caused by lack of Int4 to Int8 cast
125 *
126 * Revision 6.42 2001/12/10 22:54:27 dondosha
127 * Correction for determining the number of stacks to use in megablast
128 *
129 * Revision 6.41 2001/12/04 17:13:08 dondosha
130 * Made number of stacks for megablast word processing depend on query and database
131 *
132 * Revision 6.40 2001/10/12 21:32:46 dondosha
133 * Added discontiguous word capability to megablast
134 *
135 * Revision 6.39 2001/07/20 19:22:37 dondosha
136 * Added pv_array for megablast
137 *
138 * Revision 6.38 2000/09/29 22:17:42 dondosha
139 * Check if memory allocation fails in MegaBlastBuildLookupTable
140 *
141 * Revision 6.37 2000/08/28 17:13:56 dondosha
142 * Changed the bit indicating mask at hash only for megablast
143 *
144 * Revision 6.36 2000/07/10 17:18:58 dondosha
145 * Use several stacks in MegaBlast for speed up
146 *
147 * Revision 6.35 2000/05/31 14:06:11 dondosha
148 * Added warning messages when 12mers are masked in megablast
149 *
150 * Revision 6.34 2000/05/22 18:06:54 dondosha
151 * Fixed bug in masking at hash for megablast
152 *
153 * Revision 6.33 2000/05/17 17:38:26 dondosha
154 * Removed several debugging statements, introduced by previous revision
155 *
156 * Revision 6.32 2000/05/17 17:15:56 dondosha
157 * In megablast, clean the hash table from patterns with too many positions in query
158 *
159 * Revision 6.31 2000/04/10 17:27:15 madden
160 * Deallocate old lookup table memory before search
161 *
162 * Revision 6.30 2000/04/03 13:07:51 madden
163 * Fixed memory leak
164 *
165 * Revision 6.29 2000/03/07 21:59:53 madden
166 * Add return value of NULL for MegaBlastLookupTableDestruct
167 *
168 * Revision 6.28 2000/03/03 18:07:34 dondosha
169 * Added routine MegaBlastLookupTableDup plus cosmetic changes related to MegaBlast
170 *
171 * Revision 6.27 2000/03/02 17:24:34 dondosha
172 * Fixed handling of lower case masking in MegaBlast
173 *
174 * Revision 6.26 2000/02/23 20:41:39 dondosha
175 * Removed extern declaration for MegaBlastBuildLookupTable - now included from blast.h
176 *
177 * Revision 6.25 2000/02/17 19:02:10 shavirin
178 * Removed all references to absolete theCacheSize variable.
179 *
180 * Revision 6.24 2000/02/12 21:18:57 kans
181 * added prototype for MegaBlastBuildLookupTable - implemented in lookup.c, called from mblast.c
182 *
183 * Revision 6.23 2000/02/11 20:54:28 dondosha
184 * Added functions related to Webb Miller s lookup table
185 *
186 * Revision 6.22 2000/02/03 21:17:03 dondosha
187 * Fixed bug in lookup_new
188 *
189 * Revision 6.21 2000/02/01 21:08:48 dondosha
190 * Added mb_make_mod_lt and mb_lookup_position_aux_destruct - modifications for megablast
191 *
192 * Revision 6.20 2000/01/11 18:37:08 shavirin
193 * Added define DYNAMIC_CACHE to distinduish dynamic and static lookup.
194 *
195 * Revision 6.19 2000/01/11 17:58:50 shavirin
196 * Added new function make_mod_lt_new, which created dynamic lookup table
197 *
198 * Revision 6.18 2000/01/11 17:33:11 shavirin
199 * Added handling of the parameter theCacheSize.
200 *
201 * Revision 6.17 1999/12/21 21:33:23 shavirin
202 * Added parameter mod_lookup_table_memory to the lookup structure.
203 *
204 * Revision 6.15 1999/11/08 21:54:54 madden
205 * Fix typo
206 *
207 * Revision 6.14 1999/10/12 18:00:13 madden
208 * Add one to array lengths for purify errors.
209 *
210 * Revision 6.13 1999/10/06 14:25:20 madden
211 * Only allocate pv_array if needed
212 *
213 * Revision 6.12 1999/09/29 17:20:12 madden
214 * Deallocate ModLookupPositionPtr memory
215 *
216 * Revision 6.11 1999/09/29 13:28:44 madden
217 * Fill in pv_array correctly
218 *
219 * Revision 6.10 1999/09/28 20:14:33 madden
220 * Joerg changes to mimize cache misses
221 *
222 * Revision 6.9 1999/09/16 16:56:39 madden
223 * Changes to lookup_new for long words
224 *
225 * Revision 6.8 1999/09/16 14:16:27 madden
226 * lookup_find_init returns Uint1Ptr instead of CharPtr
227 *
228 * Revision 6.7 1999/04/27 15:40:12 madden
229 * Use aux lookup only for short words
230 *
231 * Revision 6.6 1998/09/22 16:27:47 madden
232 * Added function lookup_position_aux_destruct
233 *
234 * Revision 6.5 1998/07/27 18:14:17 madden
235 * lookup_get_memory replaces call to MemNew
236 *
237 * Revision 6.4 1998/04/15 20:25:58 madden
238 * Auxillary structure added to speed-up saving words for queries
239 *
240 * Revision 6.3 1998/02/26 22:34:39 madden
241 * Changes for 16 bit windows
242 *
243 * Revision 6.2 1998/01/05 20:32:38 kans
244 * AddPositionToLookupTable checks for NULL lookup parameter
245 *
246 * Revision 6.1 1998/01/05 17:38:05 madden
247 * Check that position is non-NULL in lookup_destruct
248 *
249 * Revision 6.0 1997/08/25 18:53:23 madden
250 * Revision changed to 6.0
251 *
252 * Revision 1.7 1997/08/19 18:19:23 madden
253 * Cast arg of log to Nlm_FloatHi
254 *
255 * Revision 1.6 1997/04/03 19:58:27 madden
256 * Added check for NULL lookup->position.
257 *
258 * Revision 1.5 1996/09/25 14:17:30 madden
259 * removed discontiguous options.
260 *
261 * Revision 1.4 1996/09/12 21:12:59 madden
262 * Added new function to save an already computed index, lookup_add_index.
263 *
264 * Revision 1.3 1996/08/21 21:28:31 madden
265 * Added casts to quiet NT compiler warning.s
266 *
267 * Revision 1.2 1996/08/15 18:33:35 madden
268 * Changed Int2 to Int1.
269 *
270 * Revision 1.1 1996/08/05 19:48:50 madden
271 * Initial revision
272 *
273 * Revision 1.11 1996/07/24 12:01:28 madden
274 * Changes for blastx
275 *
276 * Revision 1.10 1996/07/18 22:00:02 madden
277 * Changes for multiple contexts.
278 *
279 * Revision 1.9 1996/06/20 16:52:23 madden
280 * Changed "pow" to "Nlm_Powi".
281 *
282 * Revision 1.8 1996/06/20 16:15:57 madden
283 * Replaced int's with Int4's.
284 *
285 * Revision 1.7 1996/05/22 20:22:01 madden
286 * Removed unused variable lookup_find_discontig.
287 *
288 * Revision 1.6 1996/05/16 19:50:15 madden
289 * Added documentation block.
290 *
291 * Revision 1.5 1996/05/03 19:55:24 madden
292 * Combined two functions.
293 *
294 * Revision 1.4 1996/04/04 20:47:55 madden
295 * Made lookup_find into two functions, that are statics.
296 *
297 * Revision 1.3 1996/02/28 21:37:14 madden
298 * changes for discontiguous words.
299 *
300 * Revision 1.2 1995/12/26 20:28:36 madden
301 * removed lookup_add_word function.
302 *
303 * Revision 1.1 1995/12/08 15:48:23 madden
304 * Initial revision
305 *
306 */
307
308 #include <ncbi.h>
309 #include <blast.h> /* this already includes lookup.h */
310
311 /* mod_lt[] is a modified lookup table. -cfj
312 * It contains the same info that is accessible via lookup->position, but in a form that reduces the
313 * number of cache misses.
314 * Doing a lookup via the original lookup table cost one cache miss per access, and then one miss per
315 * entry as we walk down the hit list.
316 *
317 * mod_lt replaces the linked list with an array of hits, to reduce the number of misses.
318 * Also, if there are 3 or fewer hits, they are placed directly in the table.
319 * if there are more than 3 hits, the first one is placed directly in the table (followed by
320 * a pointer to a contiguous array of all the remaining hits).
321 *
322 * This should probably be allocated and deallocated along with the standard lookup table, but this
323 * is not currently done.
324 *
325 * The make_mod_lt function should be called to build the mod_lt table after all hits have been placed
326 * into the orginal lookup table.
327 */
328
make_mod_lt(LookupTablePtr lookup)329 void make_mod_lt(LookupTablePtr lookup)
330 {
331 Int4 index, j, pv_size;
332 Uint4 len;
333 OrigLookupPositionPtr PNTR lookup_pos;
334 OrigLookupPositionPtr list;
335 ModLookupPositionPtr next_free;
336 PV_ARRAY_TYPE *pv_array=NULL;
337
338
339 ModLAEntry *mod_lt;
340 mod_lt=lookup->mod_lt;
341
342 len = lookup->array_size;
343 lookup_pos = lookup->position;
344
345 /* Add 1 to silence purify messages. */
346 /* Initial allocation - more, than necessary though */
347
348 next_free = MemNew((1+lookup->num_pos_added)*sizeof(ModLookupPosition));
349 lookup->mod_lookup_table_size = 0;
350
351 if (next_free == NULL){
352 ErrPostEx(SEV_ERROR, 0, 0, "Unable to allocate lookup->position");
353 return;
354 }
355 lookup->mod_lookup_table_memory = next_free;
356
357
358 /* pv_array is an array of 'presence bits', one bit per entry
359 in the lookup table, packed 64bits per Uint8 (on a 64-bit binary);
360 otherwise a Uint4 is used. It can be used to quickly see if there
361 is an entry in the lookup table. It's used on short-to-medium
362 length queries since most lookups will be empty. -cfj
363 */
364
365 if (lookup->num_unique_pos_added < PV_ARRAY_FACTOR*lookup->array_size) {
366 pv_size = (lookup->array_size+PV_ARRAY_MASK)/(PV_ARRAY_MASK+1); /* Size measured in PV_ARRAY_TYPE's */
367 pv_array = MemNew(pv_size*PV_ARRAY_BYTES);
368 }
369
370 /* Walk through table, copying info into mod_lt[] */
371 for(index = 0; index < len; index++) {
372 if(lookup_pos[index] == NULL){
373 mod_lt[index].num_used = 0;
374 } else {
375 /* count num entries */
376 int count=0;
377 list = lookup_pos[index];
378
379 if (pv_array)
380 pv_array[(index>>PV_ARRAY_BTS)] |= (((PV_ARRAY_TYPE) 1)<<(index&PV_ARRAY_MASK));
381
382 while (list != NULL){
383 count++;
384 list = list->next;
385 }
386
387 if (count<=3) {
388 list = lookup_pos[index];
389 mod_lt[index].num_used=count;
390 for(j=0;j<count;j++){
391 hinfo_set(&mod_lt[index].entries[j],list->position,list->context);
392 list = list->next;
393 }
394 } else {
395 ModLookupPositionPtr * lpp= (ModLookupPositionPtr *) &mod_lt[index].entries[1];
396 list=lookup_pos[index];
397 mod_lt[index].num_used=count;
398 hinfo_set(&mod_lt[index].entries[0],list->position,list->context);
399 list = list->next;
400 *lpp = next_free;
401 for(j = 1; j < count; j++){
402 hinfo_set(next_free,list->position,list->context);
403 next_free++;
404 lookup->mod_lookup_table_size++;
405 list = list->next;
406 }
407 }
408 }
409 }
410
411 lookup->pv_array = pv_array;
412 }
mb_make_mod_lt(LookupTablePtr lookup)413 void mb_make_mod_lt(LookupTablePtr lookup)
414 {
415 Int4 index, j, pv_size;
416 Uint4 len;
417 OrigLookupPositionPtr PNTR lookup_pos;
418 OrigLookupPositionPtr list;
419 ModLookupPositionPtr next_free;
420 PV_ARRAY_TYPE *pv_array=NULL;
421
422
423 ModLAEntry *mod_lt;
424 mod_lt=lookup->mod_lt;
425
426 len = lookup->array_size;
427 lookup_pos = lookup->position;
428
429 /* Add 1 to silence purify messages. */
430 /* Initial allocation - more, than necessary though */
431
432 next_free = MemNew((1+lookup->num_pos_added)*sizeof(ModLookupPosition));
433 lookup->mod_lookup_table_size = 0;
434
435 if (next_free == NULL){
436 ErrPostEx(SEV_ERROR, 0, 0, "Unable to allocate lookup->position");
437 return;
438 }
439 lookup->mod_lookup_table_memory = next_free;
440
441
442 /* pv_array is an array of 'presence bits', one bit per entry
443 in the lookup table, packed 64bits per Uint8 (on a 64-bit binary);
444 otherwise a Uint4 is used. It can be used to quickly see if there
445 is an entry in the lookup table. It's used on short-to-medium
446 length queries since most lookups will be empty. -cfj
447 */
448
449 if (lookup->num_unique_pos_added < PV_ARRAY_FACTOR*lookup->array_size) {
450 pv_size = (lookup->array_size+PV_ARRAY_MASK)/(PV_ARRAY_MASK+1); /* Size measured in PV_ARRAY_TYPE's */
451 pv_array = MemNew(pv_size*PV_ARRAY_BYTES);
452 }
453
454 /* Walk through table, copying info into mod_lt[] */
455 for(index = 0; index < len; index++) {
456 if(lookup_pos[index] == NULL){
457 mod_lt[index].num_used = 0;
458 } else {
459 /* count num entries */
460 int count=0;
461 list = lookup_pos[index];
462
463 if (pv_array)
464 pv_array[(index>>PV_ARRAY_BTS)] |= (((PV_ARRAY_TYPE) 1)<<(index&PV_ARRAY_MASK));
465
466 while (list != NULL){
467 count++;
468 list = list->next;
469 }
470
471 if (count<=3) {
472 list = lookup_pos[index];
473 mod_lt[index].num_used=count;
474 for(j=0;j<count;j++){
475 mod_lt[index].entries[j] = list->position;
476 list = list->next;
477 }
478 } else {
479 ModLookupPositionPtr * lpp= (ModLookupPositionPtr *) &mod_lt[index].entries[1];
480 list=lookup_pos[index];
481 mod_lt[index].num_used=count;
482 mod_lt[index].entries[0] = list->position;
483 list = list->next;
484 *lpp = next_free;
485 for(j = 1; j < count; j++){
486 *next_free = list->position;
487 next_free++;
488 lookup->mod_lookup_table_size++;
489 list = list->next;
490 }
491 }
492 }
493 }
494
495 lookup->pv_array = pv_array;
496 }
497
498
499 static void AddPositionToLookupTable PROTO((LookupTablePtr lookup, Int4 index, Int4 position, Int1 context));
500
501 #define LOOKUP_MEMORY_MIN 128*1024 /* Minimum block of memory (in bytes) for LookupMemoryPtr. */
502
lookup_deallocate_memory(LookupTablePtr lookup)503 static VoidPtr lookup_deallocate_memory (LookupTablePtr lookup)
504 {
505 LookupMemoryPtr next=NULL, mem_struct;
506
507 if (lookup == NULL)
508 return NULL;
509
510 mem_struct = lookup->mem_struct_start;
511
512 while (mem_struct) {
513 next = mem_struct->next;
514 MemFree(mem_struct->start);
515 MemFree(mem_struct);
516 mem_struct = next;
517 }
518
519 lookup->mem_struct_start = NULL;
520
521 return NULL;
522 }
523
524 /*
525 returns the requested amount of memory. Large blocks are obtained at
526 once and then passed out.
527
528 LookupTablePtr lookup: contains all information on Lookup.
529 size_t required: how much memory has been requested.
530 */
531
lookup_get_memory(LookupTablePtr lookup,size_t required)532 static VoidPtr lookup_get_memory (LookupTablePtr lookup, size_t required)
533 {
534 LookupMemoryPtr last, mem_struct;
535 size_t memory_requested;
536 Uint1Ptr new;
537
538 mem_struct = lookup->mem_struct;
539
540 if (!mem_struct || required > mem_struct->remaining) {
541 if (mem_struct) {
542 last = mem_struct;
543 } else {
544 last = NULL;
545 }
546
547 memory_requested = MAX(lookup->memory_chunk, LOOKUP_MEMORY_MIN);
548 mem_struct = (LookupMemoryPtr) MemNew(sizeof(LookupMemory));
549 mem_struct->start = (Uint1Ptr) MemNew(memory_requested);
550 mem_struct->current = mem_struct->start;
551 mem_struct->remaining = memory_requested;
552
553 /* align to 64Byte block. -cfj */
554 if(1){
555 Uint8 extra = ((Uint8)mem_struct->current) & ((Uint8) 63);
556 if (extra>0){
557 extra=64-extra;
558 mem_struct->current += extra;
559 mem_struct->remaining -= extra;
560 }
561 }
562
563 if (last) {
564 last->next = mem_struct; /* used at end for deallocation. */
565 lookup->mem_struct = mem_struct; /* Used as a shortcut around a long while loop. */
566 } else {
567 lookup->mem_struct = mem_struct;
568 lookup->mem_struct_start = mem_struct;
569 }
570 }
571
572 new = mem_struct->current;
573 mem_struct->current += required;
574 mem_struct->remaining -= required;
575
576 return (VoidPtr) new; /* return pointer to new memory. */
577 }
578
579 /*
580 LookupTablePtr LIBCALL lookup_new(Int2 alphabet_size, Int2 wordsize)
581
582 alphabet_size: number of symbols in alphabet
583
584 wordsize: size of initial word hit (often three)
585
586 Perform the initial setup of the lookup table. "lookup"
587 expects to work with a "zero-offset" alphabet such as
588 NCBIstdaa or NCBI2na.
589
590 Each letter (residue or basepair) in the word goes into it's
591 own set of bits (calculated as num_of_bits below). This is
592 to allow masking and shifting of bits. As an example consider
593 a three-letter word with an alphabet containing 20 residues.
594 The minimum num_of_bits is five (able to represent up to 31)
595 and the total "array_size" is 15.
596 "mask" is used to mask the wordsize-1 lower bits (i.e., to set the
597 bits the highest num_of_bits to zero).
598 */
599 LookupTablePtr LIBCALL
lookup_new(Int2 alphabet_size,Int2 wordsize,Int2 reduced_wordsize)600 lookup_new(Int2 alphabet_size, Int2 wordsize, Int2 reduced_wordsize)
601
602 {
603 Int4 num_of_bits;
604 LookupTablePtr lookup;
605
606 /* How many bits are needed to hold the alphabet? */
607 num_of_bits = Nlm_Nint(log((Nlm_FloatHi)alphabet_size)/NCBIMATH_LN2);
608
609 /* 32 bits is 4 bytes */
610 if (num_of_bits*reduced_wordsize > 32) {
611 ErrPostEx(SEV_ERROR, 0, 0, "alphabet times wordsize > 32");
612 return NULL;
613 }
614
615 lookup = (LookupTablePtr) MemNew(sizeof(LookupTable));
616 if (lookup == NULL)
617 return NULL;
618
619 lookup->char_size = num_of_bits;
620 lookup->wordsize = (Int4) wordsize;
621 if (reduced_wordsize <= 0)
622 reduced_wordsize = wordsize;
623 lookup->reduced_wordsize = (Int4) reduced_wordsize;
624 lookup->array_size = (Int4) Nlm_Powi(2.0, (num_of_bits*reduced_wordsize));
625 lookup->mask = (Int4) Nlm_Powi(2.0, (num_of_bits*(reduced_wordsize-1))) - 1;
626 lookup->mod_lt = NULL; /* will be allocated when built */
627 lookup->theTable = NULL; /* will be allocated when built */
628
629 lookup->position = (OrigLookupPositionPtr PNTR) MemNew((lookup->array_size)*sizeof(OrigLookupPositionPtr));
630 if (lookup->position == NULL) {
631 lookup = lookup_destruct(lookup);
632 ErrPostEx(SEV_ERROR, 0, 0, "Unable to allocate lookup->position");
633 }
634
635 /* Only allocate the auxillary structure if it's less than 1 Meg. */
636 if ((lookup->array_size)*sizeof(OrigLookupPositionPtr) <= (1<<20)) {
637 lookup->position_aux =
638 (OrigLookupPositionPtr PNTR) MemNew((lookup->array_size)*sizeof(OrigLookupPositionPtr));
639 }
640
641 return lookup;
642 }
643
644 LookupTablePtr LIBCALL
lookup_destruct(LookupTablePtr lookup)645 lookup_destruct(LookupTablePtr lookup)
646
647 {
648 if (lookup == NULL)
649 return lookup;
650
651 if(lookup->position)
652 MemFree(lookup->position);
653 if(lookup->position_aux)
654 MemFree(lookup->position_aux);
655 if(lookup->pv_array)
656 MemFree(lookup->pv_array);
657 if(lookup->mod_lt)
658 MemFree(lookup->mod_lt);
659 if(lookup->theTable)
660 MemFree(lookup->theTable);
661 if (lookup->mod_lookup_table_memory)
662 MemFree(lookup->mod_lookup_table_memory);
663 lookup_deallocate_memory(lookup);
664 lookup = MegaBlastLookupTableDestruct(lookup);
665 lookup = MemFree(lookup);
666
667 return lookup;
668 }
669
670 /*
671 Deallocated position_aux, which can be large for
672 large word-sizes, and is no longer needed.
673
674 This now does more than just dealloc lookup->position_aux.
675 This is called afeter the original lookup table is fully built.
676 So this is where we convert it to the new (mod_lt) table.
677 -cfj
678 */
679
680 Boolean
lookup_position_aux_destruct(LookupTablePtr lookup)681 lookup_position_aux_destruct(LookupTablePtr lookup)
682
683 {
684 if (lookup == NULL)
685 return FALSE;
686
687 if(lookup->position_aux)
688 lookup->position_aux = MemFree(lookup->position_aux);
689
690
691 /* create the new lookup table, Add 1 to silence purify messages. */
692
693 lookup->mod_lt = (ModLAEntry PNTR) MemNew((1+lookup->array_size)*sizeof(ModLAEntry));
694
695 make_mod_lt(lookup);
696
697 /* deallocate parts of the old table no longer needed */
698 if(lookup->position)
699 lookup->position = MemFree(lookup->position);
700 lookup_deallocate_memory(lookup);
701
702 return TRUE;
703 }
704
705 Boolean
mb_lookup_position_aux_destruct(LookupTablePtr lookup)706 mb_lookup_position_aux_destruct(LookupTablePtr lookup)
707
708 {
709 if (lookup == NULL)
710 return FALSE;
711
712 if(lookup->position_aux)
713 lookup->position_aux = MemFree(lookup->position_aux);
714
715
716 /* create the new lookup table, Add 1 to silence purify messages. */
717
718 lookup->mod_lt = (ModLAEntry PNTR) MemNew((1+lookup->array_size)*sizeof(ModLAEntry));
719
720 mb_make_mod_lt(lookup);
721
722
723 /* deallocate parts of the old table no longer needed */
724 if(lookup->position) lookup->position = MemFree(lookup->position);
725
726 return TRUE;
727 }
728
729 /*
730 This function adds an (already computed) index to the lookup table
731 for some set of residues. The most likely use of this is for
732 compressed residues, as in blastn.
733 */
734 void LIBCALL
lookup_add_index(LookupTablePtr lookup,Int4 lookup_index,Int4 position,Int1 context)735 lookup_add_index(LookupTablePtr lookup, Int4 lookup_index, Int4 position, Int1 context)
736
737 {
738 AddPositionToLookupTable(lookup, lookup_index, position, context);
739
740 return;
741 }
742
743 /*
744 This function adds an index to the lookup table for any number of
745 contiguous resdiues.
746
747 */
748 void LIBCALL
lookup_add(LookupTablePtr lookup,CharPtr string,Int4 position,Int1 context)749 lookup_add(LookupTablePtr lookup, CharPtr string, Int4 position, Int1 context)
750
751 {
752 Int4 char_size, lookup_index=0, wordsize;
753
754 char_size = lookup->char_size;
755 wordsize = lookup->wordsize;
756
757 lookup_index = *string;
758 wordsize--;
759 while (wordsize > 0) {
760 lookup_index <<= char_size;
761 string++;
762 lookup_index += *string;
763 wordsize--;
764 }
765 AddPositionToLookupTable(lookup, lookup_index, position, context);
766
767 return;
768 }
769
770 static void
AddPositionToLookupTable(LookupTablePtr lookup,Int4 index,Int4 position,Int1 context)771 AddPositionToLookupTable(LookupTablePtr lookup, Int4 index, Int4 position, Int1 context)
772
773 {
774 OrigLookupPositionPtr new, last, PNTR lookup_pos, PNTR lookup_pos_aux;
775
776 if (lookup == NULL) return;
777 lookup->num_pos_added++;
778
779 lookup_pos = lookup->position;
780 lookup_pos_aux = lookup->position_aux;
781 new = (OrigLookupPositionPtr) lookup_get_memory(lookup, sizeof(OrigLookupPosition));
782 if (*(lookup_pos+index) == NULL) {
783 lookup->num_unique_pos_added++;
784 lookup_pos[index] = new;
785 lookup_pos[index]->position = position;
786 lookup_pos[index]->context = context;
787 if (lookup_pos_aux) {
788 lookup_pos_aux[index] = new;
789 }
790 } else { /* Go to last link, use aux struct if it exists. */
791 if (lookup_pos_aux) {
792 last = lookup_pos_aux[index];
793 lookup_pos_aux[index] = new;
794 } else {
795 last = lookup_pos[index];
796 while (last->next)
797 last = last->next;
798 }
799 last->next = new;
800 new->position = position;
801 new->context = context;
802 }
803
804 return;
805 }
806
807 /*
808 Initializes the "lookup_index" for "string". For contiguous word
809 matching the first wordsize-1 letters of string are placed into the
810 lookup_index. The "new" starting point of the string is then returned.
811 */
812
813 Uint1Ptr LIBCALL
lookup_find_init(LookupTablePtr lookup,Int4 PNTR lookup_index,Uint1Ptr string)814 lookup_find_init(LookupTablePtr lookup, Int4 PNTR lookup_index, Uint1Ptr string)
815
816 {
817
818 Int4 char_size, wordsize;
819
820 char_size = lookup->char_size;
821 wordsize = lookup->reduced_wordsize;
822
823 /* Fill in wordsize-1 spaces. */
824 *lookup_index = *string;
825
826 wordsize -= 2;
827 while (wordsize > 0) {
828 string++;
829 *lookup_index <<= char_size;
830 *lookup_index += *string;
831 wordsize--;
832 }
833
834 return string;
835 }
836
837 /*
838 Allocates memory for the BLAST_WordFinder, and lookup table.
839 */
840
841 BLAST_WordFinderPtr
BLAST_WordFinderNew(Int2 alphabet_size,Int2 wordsize,Int2 compression_ratio,Boolean round_down)842 BLAST_WordFinderNew (Int2 alphabet_size, Int2 wordsize, Int2 compression_ratio, Boolean round_down)
843
844 {
845 BLAST_WordFinderPtr wfp;
846 Int2 reduced_wordsize;
847
848 wfp = (BLAST_WordFinderPtr) MemNew(sizeof(BLAST_WordFinder));
849
850 if (wfp != NULL) {
851 /* If the compression_ratio is greater than one and round_down is TRUE, then we want to round
852 the wordsize down, as a multiple of compression_ratio should be three less. round_down is only
853 TRUE in blastn if we want to guarantee that a certain wordsize is used (i.e., 11). */
854 if (compression_ratio > 1) {
855 if (round_down) {
856 reduced_wordsize = (Int2) MIN(2, ((wordsize-3)/compression_ratio));
857 wfp->lookup = lookup_new(alphabet_size, (Int2) ((wordsize-3)/compression_ratio), (Int2) reduced_wordsize);
858 } else {
859 reduced_wordsize = wordsize/compression_ratio;
860 wfp->lookup = lookup_new(alphabet_size, (wordsize/compression_ratio), (Int2) reduced_wordsize);
861 }
862 } else {
863 wfp->lookup = lookup_new(alphabet_size, (Int2) (wordsize/compression_ratio), 0);
864 }
865 wfp->compression_ratio = compression_ratio;
866 wfp->wordsize = wordsize;
867 }
868
869 return wfp;
870 }
871
872 BLAST_WordFinderPtr
BLAST_WordFinderDestruct(BLAST_WordFinderPtr wfp)873 BLAST_WordFinderDestruct (BLAST_WordFinderPtr wfp)
874
875 {
876
877 if (wfp != NULL) {
878 wfp->lookup = lookup_destruct(wfp->lookup);
879 wfp = MemFree(wfp);
880 }
881
882 return wfp;
883 }
884
885 BLAST_WordFinderPtr
MegaBlastWordFinderDeallocate(BLAST_WordFinderPtr wfp)886 MegaBlastWordFinderDeallocate(BLAST_WordFinderPtr wfp)
887 {
888 LookupTablePtr lookup;
889
890 if (wfp == NULL || (lookup = wfp->lookup) == NULL ||
891 lookup->mb_lt == NULL)
892 return wfp;
893 if (lookup->mb_lt->estack) {
894 Int4 index;
895 for (index=0; index<lookup->mb_lt->num_stacks; index++)
896 MemFree(lookup->mb_lt->estack[index]);
897 MemFree(lookup->mb_lt->estack);
898 MemFree(lookup->mb_lt->stack_size);
899 MemFree(lookup->mb_lt->stack_index);
900 }
901 MemFree(lookup->mb_lt);
902 MemFree(lookup);
903 wfp = MemFree(wfp);
904 return wfp;
905 }
906
907 Boolean
MegaBlastBuildLookupTable(BlastSearchBlkPtr search)908 MegaBlastBuildLookupTable(BlastSearchBlkPtr search)
909 {
910 LookupTablePtr lookup = search->wfp->lookup;
911 Int4 query_length = search->context[search->first_context].query->length;
912 register Uint1Ptr seq, pos;
913 register Int4 index;
914 register Int4 ecode, extra_code;
915 register Int4 mask;
916 Int4 ecode1, ecode2;
917 Uint1 val, nuc_mask = 0xfc, at_hash_mask = 0x10;
918 MbLookupTablePtr mb_lt;
919 Int4 masked_word_count = 0;
920 PV_ARRAY_TYPE *pv_array=NULL;
921 Int4 pv_shift, pv_array_bts, pv_size;
922 Boolean discontiguous_word = FALSE;
923 Int2 word_length, extra_length, word_weight;
924 Int4 last_offset;
925 Int4 bit0, no_bit0;
926 Int2 template_length;
927 MegaBlastParameterBlkPtr mb_params = search->pbp->mb_params;
928 MBTemplateType template_type = mb_params->template_type;
929 Boolean amb_cond;
930 Boolean use_two_templates = mb_params->use_two_templates;
931
932 if (lookup == NULL)
933 return FALSE;
934
935 if(lookup->position_aux)
936 lookup->position_aux = MemFree(lookup->position_aux);
937
938 mb_lt = (MbLookupTablePtr) MemNew(sizeof(MbLookupTable));
939
940 /* Increment lookup->wordsize, since it is now equal to (wordsize - 3)/4 */
941 if (++lookup->wordsize < 3)
942 mb_lt->width = 2;
943 else
944 mb_lt->width = 3;
945 mb_lt->lpm = lookup->wordsize * READDB_COMPRESSION_RATIO;
946
947 mb_lt->max_positions = mb_params->max_positions;
948
949 mb_lt->hashsize = (1<<(8*mb_lt->width));
950
951 if (mb_params->disc_word) {
952 discontiguous_word = TRUE;
953 word_length = READDB_COMPRESSION_RATIO*(mb_lt->width+1);
954 template_length = mb_params->template_length;
955 extra_length = template_length - word_length;
956 word_weight = mb_params->word_weight;
957 mask = (1 << (8*(mb_lt->width+1) - 2)) - 1;
958 } else {
959 word_length = READDB_COMPRESSION_RATIO*mb_lt->width;
960 mask = mb_lt->mask = (1 << (8*mb_lt->width - 2)) - 1;
961 }
962
963 if ((mb_lt->hashtable = (Int4Ptr)
964 MemNew(mb_lt->hashsize*sizeof(Int4))) == NULL) {
965 MegaBlastLookupTableDestruct(lookup);
966 return FALSE;
967 }
968
969 if ((mb_lt->next_pos = (Int4Ptr)
970 MemNew(query_length*sizeof(Int4))) == NULL) {
971 MegaBlastLookupTableDestruct(lookup);
972 return FALSE;
973 }
974 if (use_two_templates) {
975 if (word_weight >= 12) {
976 if ((mb_lt->hashtable2 = (Int4Ptr)
977 MemNew(mb_lt->hashsize*sizeof(Int4))) == NULL) {
978 MegaBlastLookupTableDestruct(lookup);
979 return FALSE;
980 }
981 } else {/* For weight 11 no need for extra main table */
982 mb_lt->hashtable2 = mb_lt->hashtable;
983 }
984 if ((mb_lt->next_pos2 = (Int4Ptr)
985 MemNew(query_length*sizeof(Int4))) == NULL) {
986 MegaBlastLookupTableDestruct(lookup);
987 return FALSE;
988 }
989 }
990 seq = search->context[search->first_context].query->sequence_start;
991 pos = seq + word_length;
992
993 ecode = 0;
994
995 if (discontiguous_word) {
996 if (word_weight == 11) {
997 no_bit0 = 0x007fffff;
998 bit0 = 0x00800000;
999 } else {
1000 no_bit0 = 0xffffffff;
1001 bit0 = 0x00000000;
1002 }
1003 if (/*template18*/extra_length) {
1004 last_offset = query_length - extra_length;
1005 for (index = 1; index <= last_offset; index++) {
1006 val = *++seq;
1007 if ((val & nuc_mask) != 0) { /* ambiguity, gap or masked residue */
1008 if (*seq & at_hash_mask) /* Will extend through this residue */
1009 *seq &= 0x0f;
1010 ecode = 0;
1011 pos = seq + word_length;
1012 extra_code = 0;
1013 } else {
1014 /* get next base */
1015 ecode = ((ecode & mask) << 2) + val;
1016 if (seq >= pos) {
1017 switch (template_type) {
1018 case TEMPL_11_18: case TEMPL_12_18:
1019 amb_cond = !GET_AMBIG_CONDITION_18(seq);
1020 break;
1021 case TEMPL_11_18_OPT: case TEMPL_12_18_OPT:
1022 amb_cond = !GET_AMBIG_CONDITION_18_OPT(seq);
1023 break;
1024 case TEMPL_11_21: case TEMPL_12_21:
1025 amb_cond = !GET_AMBIG_CONDITION_21(seq);
1026 break;
1027 case TEMPL_11_21_OPT: case TEMPL_12_21_OPT:
1028 amb_cond = !GET_AMBIG_CONDITION_21_OPT(seq);
1029 break;
1030 default:
1031 amb_cond = FALSE;
1032 break;
1033 }
1034
1035 if (amb_cond) {
1036 switch (template_type) {
1037 case TEMPL_11_18:
1038 ecode1 = (GET_WORD_INDEX_11_18(ecode) |
1039 GET_EXTRA_CODE_18(seq)) & no_bit0;
1040 break;
1041 case TEMPL_12_18:
1042 ecode1 = (GET_WORD_INDEX_12_18(ecode) |
1043 GET_EXTRA_CODE_18(seq)) & no_bit0;
1044 break;
1045 case TEMPL_11_18_OPT:
1046 ecode1 = (GET_WORD_INDEX_11_18_OPT(ecode) |
1047 GET_EXTRA_CODE_18_OPT(seq)) & no_bit0;
1048 break;
1049 case TEMPL_12_18_OPT:
1050 ecode1 = (GET_WORD_INDEX_12_18_OPT(ecode) |
1051 GET_EXTRA_CODE_18_OPT(seq)) & no_bit0;
1052 break;
1053 case TEMPL_11_21:
1054 ecode1 = (GET_WORD_INDEX_11_21(ecode) |
1055 GET_EXTRA_CODE_21(seq)) & no_bit0;
1056 break;
1057 case TEMPL_12_21:
1058 ecode1 = (GET_WORD_INDEX_12_21(ecode) |
1059 GET_EXTRA_CODE_21(seq)) & no_bit0;
1060 break;
1061 case TEMPL_11_21_OPT:
1062 ecode1 = (GET_WORD_INDEX_11_21_OPT(ecode) |
1063 GET_EXTRA_CODE_21_OPT(seq)) & no_bit0;
1064 break;
1065 case TEMPL_12_21_OPT:
1066 ecode1 = (GET_WORD_INDEX_12_21_OPT(ecode) |
1067 GET_EXTRA_CODE_21_OPT(seq)) & no_bit0;
1068 break;
1069 default:
1070 ecode1 = 0;
1071 break;
1072 }
1073
1074 if (use_two_templates) {
1075 switch (template_type) {
1076 case TEMPL_11_18:
1077 ecode2 = GET_WORD_INDEX_11_18_OPT(ecode) |
1078 GET_EXTRA_CODE_18_OPT(seq) | bit0;
1079 break;
1080 case TEMPL_12_18:
1081 ecode2 = GET_WORD_INDEX_12_18_OPT(ecode) |
1082 GET_EXTRA_CODE_18_OPT(seq) | bit0;
1083 break;
1084 case TEMPL_11_21:
1085 ecode2 = GET_WORD_INDEX_11_21_OPT(ecode) |
1086 GET_EXTRA_CODE_21_OPT(seq) | bit0;
1087 break;
1088 case TEMPL_12_21:
1089 ecode2 = GET_WORD_INDEX_12_21_OPT(ecode) |
1090 GET_EXTRA_CODE_21_OPT(seq) | bit0;
1091 break;
1092 default:
1093 ecode2 = 0; break;
1094 }
1095
1096 if (mb_lt->hashtable[ecode1] == 0 ||
1097 mb_lt->hashtable2[ecode2] == 0)
1098 lookup->num_unique_pos_added++;
1099 mb_lt->next_pos2[index] = mb_lt->hashtable2[ecode2];
1100 mb_lt->hashtable2[ecode2] = index;
1101 } else {
1102 if (mb_lt->hashtable[ecode1] == 0)
1103 lookup->num_unique_pos_added++;
1104 }
1105 mb_lt->next_pos[index] = mb_lt->hashtable[ecode1];
1106 mb_lt->hashtable[ecode1] = index;
1107 }
1108 }
1109 }
1110 }
1111 /* Modify the last 2 residues if they are masked only for lookup */
1112 for (index = 0; index < extra_length; index++) {
1113 if (*++seq & at_hash_mask)
1114 *seq &= 0x0f;
1115 }
1116 } else { /* Template length 16 */
1117 for (index = 1; index <= query_length; index++) {
1118 val = *++seq;
1119 if ((val & nuc_mask) != 0) { /* ambiguity, gap or masked residue */
1120 ecode = 0;
1121 pos = seq + word_length;
1122 if (*seq & at_hash_mask) /* Will extend through this residue */
1123 *seq &= 0x0f;
1124 } else {
1125 /* get next base */
1126 ecode = ((ecode & mask) << 2) + val;
1127 if (seq >= pos) {
1128 switch (template_type) {
1129 case TEMPL_11_16:
1130 ecode1 = GET_WORD_INDEX_11_16(ecode) & no_bit0;
1131 break;
1132 case TEMPL_12_16:
1133 ecode1 = GET_WORD_INDEX_12_16(ecode) & no_bit0;
1134 break;
1135 case TEMPL_11_16_OPT:
1136 ecode1 = GET_WORD_INDEX_11_16_OPT(ecode) & no_bit0;
1137 break;
1138 case TEMPL_12_16_OPT:
1139 ecode1 = GET_WORD_INDEX_12_16_OPT(ecode) & no_bit0;
1140 break;
1141 default:
1142 ecode1 = 0;
1143 break;
1144 }
1145
1146 if (use_two_templates) {
1147 switch (template_type) {
1148 case TEMPL_11_16:
1149 ecode2 = GET_WORD_INDEX_11_16_OPT(ecode) | bit0;
1150 break;
1151 case TEMPL_12_16:
1152 ecode2 = GET_WORD_INDEX_12_16_OPT(ecode) | bit0;
1153 break;
1154 default:
1155 ecode2 = 0; break;
1156 }
1157 if (mb_lt->hashtable[ecode1] == 0 ||
1158 mb_lt->hashtable2[ecode2] == 0)
1159 lookup->num_unique_pos_added++;
1160 mb_lt->next_pos2[index] = mb_lt->hashtable2[ecode2];
1161 mb_lt->hashtable2[ecode2] = index;
1162 } else {
1163 if (mb_lt->hashtable[ecode1] == 0)
1164 lookup->num_unique_pos_added++;
1165 }
1166 mb_lt->next_pos[index] = mb_lt->hashtable[ecode1];
1167 mb_lt->hashtable[ecode1] = index;
1168 }
1169 }
1170 }
1171 }
1172 } else { /* Contiguous word */
1173 for (index = 1; index <= query_length; index++) {
1174 val = *++seq;
1175 if ((val & nuc_mask) != 0) { /* ambiguity, gap or masked residue */
1176 ecode = 0;
1177 pos = seq + word_length;
1178 if (*seq & at_hash_mask) /* Will extend through this residue */
1179 *seq &= 0x0f;
1180 } else {
1181 /* get next base */
1182 ecode = ((ecode & mask) << 2) + val;
1183 if (seq >= pos) {
1184 if (mb_lt->hashtable[ecode] == 0)
1185 lookup->num_unique_pos_added++;
1186 mb_lt->next_pos[index] = mb_lt->hashtable[ecode];
1187 mb_lt->hashtable[ecode] = index;
1188 }
1189 }
1190 }
1191 }
1192
1193 /* Now remove the hash entries that have too many positions */
1194 if (mb_lt->max_positions>0) {
1195 for (ecode=0; ecode<mb_lt->hashsize; ecode++) {
1196 Int4 pcount, pcount1=0;
1197 for (index=mb_lt->hashtable[ecode], pcount=0;
1198 index>0; index=mb_lt->next_pos[index], pcount++);
1199 if (use_two_templates) {
1200 for (index=mb_lt->hashtable2[ecode], pcount1=0;
1201 index>0; index=mb_lt->next_pos2[index], pcount1++);
1202 }
1203 if ((pcount=MAX(pcount,pcount1))>mb_lt->max_positions) {
1204 mb_lt->hashtable[ecode] = 0;
1205 if (use_two_templates)
1206 mb_lt->hashtable2[ecode] = 0;
1207 ErrPostEx(SEV_WARNING, 0, 0, "%lx - %d", ecode, pcount);
1208 masked_word_count++;
1209 }
1210 }
1211 if (masked_word_count)
1212 ErrPostEx(SEV_WARNING, 0, 0, "Masked %d %dmers", masked_word_count,
1213 4*mb_lt->width);
1214
1215 }
1216
1217 if (discontiguous_word)
1218 mb_lt->mask = (1 << mb_lt->width*8) - 1;
1219 else
1220 mb_lt->mask = (1 << (mb_lt->width*8 - 8)) - 1;
1221
1222 /* Stacks are allocated for word size >= 11, or if query length is too long,
1223 otherwise will use diagonal array (ewp and ewp_params structures) */
1224 if (query_length > MAX_DIAG_ARRAY ||
1225 search->pbp->mb_params->word_weight >= 11) {
1226 Int8 total_len=0, av_search_space=0, av_len=0;
1227 Int4 total_num=0, stack_size=0, num_stacks=0;
1228 if (search->rdfp) {
1229 if (search->dblen > 0 && search->dbseq_num > 0) {
1230 total_len = search->dblen;
1231 total_num = search->dbseq_num;
1232 } else {
1233 readdb_get_totals_ex(search->rdfp, &total_len, &total_num, TRUE);
1234 }
1235 if (total_num > 0)
1236 av_len = total_len / total_num;
1237 else
1238 av_len = 1000;
1239 } else {
1240 av_len = search->subject->length;
1241 }
1242 av_search_space =
1243 ((Int8) search->context[search->first_context].query->length) * av_len;
1244 num_stacks = MIN(1 + (Int4) sqrt(av_search_space)/100, 500);
1245 stack_size = 5000/num_stacks;
1246 mb_lt->stack_index = (Int4Ptr) MemNew(num_stacks*sizeof(Int4));
1247 mb_lt->stack_size = (Int4Ptr) Malloc(num_stacks*sizeof(Int4));
1248 mb_lt->estack = (MbStackPtr PNTR) Malloc(num_stacks*sizeof(MbStackPtr));
1249 for (index=0; index<num_stacks; index++) {
1250 mb_lt->estack[index] =
1251 (MbStackPtr) Malloc(stack_size*sizeof(MbStack));
1252 mb_lt->stack_size[index] = stack_size;
1253 }
1254 mb_lt->num_stacks = num_stacks;
1255 }
1256
1257 /* For 12-mer based lookup table need to make presense bit array much
1258 smaller, so it stays in cache, even though this allows for collisions */
1259 pv_shift = (mb_lt->width < 3) ? 0 : 5;
1260
1261 pv_array_bts = PV_ARRAY_BTS + pv_shift;
1262
1263 if (lookup->num_unique_pos_added <
1264 (PV_ARRAY_FACTOR*(mb_lt->hashsize>>pv_shift))) {
1265 pv_size = mb_lt->hashsize>>pv_array_bts;
1266 /* Size measured in PV_ARRAY_TYPE's */
1267 pv_array = MemNew(pv_size*PV_ARRAY_BYTES);
1268
1269 for (index=0; index<mb_lt->hashsize; index++) {
1270 if (mb_lt->hashtable[index] != 0 ||
1271 (use_two_templates && mb_lt->hashtable2[index] != 0))
1272 pv_array[index>>pv_array_bts] |=
1273 (((PV_ARRAY_TYPE) 1)<<(index&PV_ARRAY_MASK));
1274 }
1275 lookup->pv_array = pv_array;
1276 }
1277 lookup->mb_lt = mb_lt;
1278 return TRUE;
1279 }
1280
1281 LookupTablePtr
MegaBlastLookupTableDestruct(LookupTablePtr lookup)1282 MegaBlastLookupTableDestruct(LookupTablePtr lookup)
1283 {
1284 Int4 index;
1285
1286 if (!lookup->mb_lt)
1287 return lookup;
1288 if (lookup->mb_lt->hashtable)
1289 MemFree(lookup->mb_lt->hashtable);
1290 if (lookup->mb_lt->next_pos)
1291 MemFree(lookup->mb_lt->next_pos);
1292 if (lookup->mb_lt->next_pos2)
1293 MemFree(lookup->mb_lt->next_pos2);
1294 if (lookup->mb_lt->estack) {
1295 for (index=0; index<lookup->mb_lt->num_stacks; index++)
1296 MemFree(lookup->mb_lt->estack[index]);
1297 MemFree(lookup->mb_lt->estack);
1298 MemFree(lookup->mb_lt->stack_index);
1299 MemFree(lookup->mb_lt->stack_size);
1300 }
1301 lookup->mb_lt = MemFree(lookup->mb_lt);
1302 return lookup;
1303 }
1304
1305
MegaBlastLookupTableDup(LookupTablePtr lookup)1306 LookupTablePtr MegaBlastLookupTableDup(LookupTablePtr lookup)
1307 {
1308 Int4 index;
1309
1310 /* The only piece that actually needs to be duplicated is estack array */
1311 LookupTablePtr new_lookup =
1312 (LookupTablePtr) MemDup(lookup, sizeof(LookupTable));
1313
1314 new_lookup->mb_lt =
1315 (MbLookupTablePtr) MemDup(lookup->mb_lt, sizeof(MbLookupTable));
1316 new_lookup->mb_lt->stack_index = (Int4Ptr) MemNew(lookup->mb_lt->num_stacks*sizeof(Int4));
1317 new_lookup->mb_lt->stack_size = (Int4Ptr) MemDup(lookup->mb_lt->stack_size,
1318 lookup->mb_lt->num_stacks*sizeof(Int4));
1319 if (lookup->mb_lt->estack) {
1320 new_lookup->mb_lt->estack = (MbStackPtr PNTR) Malloc(lookup->mb_lt->num_stacks*sizeof(MbStackPtr));
1321 for (index=0; index<lookup->mb_lt->num_stacks; index++)
1322 new_lookup->mb_lt->estack[index] =
1323 (MbStackPtr) Malloc(lookup->mb_lt->stack_size[index]*sizeof(MbStack));
1324 }
1325 return new_lookup;
1326 }
1327