1 static char const rcsid[] = "$Id: lookup.c,v 6.59 2005/07/28 14:57:10 coulouri Exp $";
2 
3 /*
4 * ===========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 */
28 /*****************************************************************************
29 
30 File name: lookup.c
31 
32 Author: Tom Madden
33 
34 Contents: Functions to store "hashed" word hits and their positions and
35 	in arrays and look these up again.
36 
37 Detailed Contents:
38 
39         - the number of bits required is given by ln(alphabet size)/ln(2).
40 
41 	- the number of bits times the wordsize must be less than 32.
42 
43         - contiguous words can be of any length, discontiguous words can
44 	only have the patten XX0X and be of length three, or word-width of
45 	four.
46 
47 ******************************************************************************/
48 
49 /*******************************************************************************
50 * File Name: lookup.c
51 *
52 * Author: Tom Madden
53 *
54 * Version Creation Date:   10/26/95
55 *
56 * $Revision: 6.59 $
57 *
58 * File Description:
59 *       Functions to store "words" from a query and perform lookups against
60 *	database sequences.
61 *
62 * Modifications:
63 * --------------------------------------------------------------------------
64 * Date     Name        Description of modification
65 * -------  ----------  -----------------------------------------------------
66 *
67 * ==========================================================================
68 *
69 *
70 * RCS Modification History:
71 * $Log: lookup.c,v $
72 * Revision 6.59  2005/07/28 14:57:10  coulouri
73 * remove dead code
74 *
75 * Revision 6.58  2004/11/15 22:28:41  kans
76 * in MegaBlastBuildLookupTable, changed total_len (et al.) to Int8 and total_num (et al.) to Int4, from unsigned, which conflicted with pointer arguments to readdb_get_totals_ex
77 *
78 * Revision 6.57  2004/11/15 21:35:35  coulouri
79 *
80 * Manually overriding the effective database length can cause av_len to overflow an Int4; use Uint8 instead.
81 *
82 * Revision 6.56  2003/05/30 17:25:36  coulouri
83 * add rcsid
84 *
85 * Revision 6.55  2002/10/03 14:43:44  dondosha
86 * Bug fix for 2 sequences megablast
87 *
88 * Revision 6.54  2002/08/30 15:49:01  dondosha
89 * MegaBlastWordFinderDeallocate moved from mblast.c
90 *
91 * Revision 6.53  2002/06/07 18:50:43  dondosha
92 * Bug fix for Mega BLAST with two simultaneoue discontiguous word templates of length 21
93 *
94 * Revision 6.52  2002/05/17 21:40:13  dondosha
95 * Added 2 optimal Mega BLAST word templates for length 21
96 *
97 * Revision 6.51  2002/05/14 22:20:20  dondosha
98 * Renamed maximal discontiguous template type into optimal
99 *
100 * Revision 6.50  2002/04/09 18:19:47  dondosha
101 * Changed #ifdefs to conditionals, allowing different discontiguous templates in a single binary
102 *
103 * Revision 6.49  2002/04/01 22:28:29  dondosha
104 * Corrected syntax error for megablast with two-templates case
105 *
106 * Revision 6.48  2002/03/26 21:15:54  dondosha
107 * Added macro definitions to allow template length 21
108 *
109 * Revision 6.47  2002/03/05 17:39:39  dondosha
110 * Use reduced_wordsize instead of wordsize in lookup_find_init
111 *
112 * Revision 6.46  2002/02/19 22:35:06  dondosha
113 * Fixed uninitialized variable bug introduced by the two-template code
114 *
115 * Revision 6.45  2002/02/15 23:35:01  dondosha
116 * Allow hash table and two templates for megablast
117 *
118 * Revision 6.44  2001/12/28 20:46:21  dondosha
119 * 1. Mega BLAST related parameters moved into a separate structure
120 * 2. Environment variables for discontiguous words, etc. changed to options
121 * 3. Extension from discontiguous word with length 18 implemented
122 *
123 * Revision 6.43  2001/12/27 18:12:33  dondosha
124 * Fixed a bug caused by lack of Int4 to Int8 cast
125 *
126 * Revision 6.42  2001/12/10 22:54:27  dondosha
127 * Correction for determining the number of stacks to use in megablast
128 *
129 * Revision 6.41  2001/12/04 17:13:08  dondosha
130 * Made number of stacks for megablast word processing depend on query and database
131 *
132 * Revision 6.40  2001/10/12 21:32:46  dondosha
133 * Added discontiguous word capability to megablast
134 *
135 * Revision 6.39  2001/07/20 19:22:37  dondosha
136 * Added pv_array for megablast
137 *
138 * Revision 6.38  2000/09/29 22:17:42  dondosha
139 * Check if memory allocation fails in MegaBlastBuildLookupTable
140 *
141 * Revision 6.37  2000/08/28 17:13:56  dondosha
142 * Changed the bit indicating mask at hash only for megablast
143 *
144 * Revision 6.36  2000/07/10 17:18:58  dondosha
145 * Use several stacks in MegaBlast for speed up
146 *
147 * Revision 6.35  2000/05/31 14:06:11  dondosha
148 * Added warning messages when 12mers are masked in megablast
149 *
150 * Revision 6.34  2000/05/22 18:06:54  dondosha
151 * Fixed bug in masking at hash for megablast
152 *
153 * Revision 6.33  2000/05/17 17:38:26  dondosha
154 * Removed several debugging statements, introduced by previous revision
155 *
156 * Revision 6.32  2000/05/17 17:15:56  dondosha
157 * In megablast, clean the hash table from patterns with too many positions in query
158 *
159 * Revision 6.31  2000/04/10 17:27:15  madden
160 * Deallocate old lookup table memory before search
161 *
162 * Revision 6.30  2000/04/03 13:07:51  madden
163 * Fixed memory leak
164 *
165 * Revision 6.29  2000/03/07 21:59:53  madden
166 * Add return value of NULL for MegaBlastLookupTableDestruct
167 *
168 * Revision 6.28  2000/03/03 18:07:34  dondosha
169 * Added routine MegaBlastLookupTableDup plus cosmetic changes related to MegaBlast
170 *
171 * Revision 6.27  2000/03/02 17:24:34  dondosha
172 * Fixed handling of lower case masking in MegaBlast
173 *
174 * Revision 6.26  2000/02/23 20:41:39  dondosha
175 * Removed extern declaration for MegaBlastBuildLookupTable - now included from blast.h
176 *
177 * Revision 6.25  2000/02/17 19:02:10  shavirin
178 * Removed all references to absolete theCacheSize variable.
179 *
180 * Revision 6.24  2000/02/12 21:18:57  kans
181 * added prototype for MegaBlastBuildLookupTable - implemented in lookup.c, called from mblast.c
182 *
183 * Revision 6.23  2000/02/11 20:54:28  dondosha
184 * Added functions related to Webb Miller s lookup table
185 *
186 * Revision 6.22  2000/02/03 21:17:03  dondosha
187 * Fixed bug in lookup_new
188 *
189 * Revision 6.21  2000/02/01 21:08:48  dondosha
190 * Added mb_make_mod_lt and mb_lookup_position_aux_destruct - modifications for megablast
191 *
192 * Revision 6.20  2000/01/11 18:37:08  shavirin
193 * Added define DYNAMIC_CACHE to distinduish dynamic and static lookup.
194 *
195 * Revision 6.19  2000/01/11 17:58:50  shavirin
196 * Added new function make_mod_lt_new, which created dynamic lookup table
197 *
198 * Revision 6.18  2000/01/11 17:33:11  shavirin
199 * Added handling of the parameter theCacheSize.
200 *
201 * Revision 6.17  1999/12/21 21:33:23  shavirin
202 * Added parameter mod_lookup_table_memory to the lookup structure.
203 *
204 * Revision 6.15  1999/11/08 21:54:54  madden
205 * Fix typo
206 *
207 * Revision 6.14  1999/10/12 18:00:13  madden
208 * Add one to array lengths for purify errors.
209 *
210 * Revision 6.13  1999/10/06 14:25:20  madden
211 * Only allocate pv_array if needed
212 *
213 * Revision 6.12  1999/09/29 17:20:12  madden
214 * Deallocate ModLookupPositionPtr memory
215 *
216 * Revision 6.11  1999/09/29 13:28:44  madden
217 * Fill in pv_array correctly
218 *
219 * Revision 6.10  1999/09/28 20:14:33  madden
220 * Joerg changes to mimize cache misses
221 *
222 * Revision 6.9  1999/09/16 16:56:39  madden
223 * Changes to lookup_new for long words
224 *
225 * Revision 6.8  1999/09/16 14:16:27  madden
226 * lookup_find_init returns Uint1Ptr instead of CharPtr
227 *
228 * Revision 6.7  1999/04/27 15:40:12  madden
229 * Use aux lookup only for short words
230 *
231 * Revision 6.6  1998/09/22 16:27:47  madden
232 * Added function lookup_position_aux_destruct
233 *
234 * Revision 6.5  1998/07/27 18:14:17  madden
235 * lookup_get_memory replaces call to MemNew
236 *
237 * Revision 6.4  1998/04/15 20:25:58  madden
238 * Auxillary structure added to speed-up saving words for queries
239 *
240 * Revision 6.3  1998/02/26 22:34:39  madden
241 * Changes for 16 bit windows
242 *
243 * Revision 6.2  1998/01/05 20:32:38  kans
244 * AddPositionToLookupTable checks for NULL lookup parameter
245 *
246 * Revision 6.1  1998/01/05 17:38:05  madden
247 * Check that position is non-NULL in lookup_destruct
248 *
249 * Revision 6.0  1997/08/25 18:53:23  madden
250 * Revision changed to 6.0
251 *
252 * Revision 1.7  1997/08/19 18:19:23  madden
253 * Cast arg of log to Nlm_FloatHi
254 *
255 * Revision 1.6  1997/04/03 19:58:27  madden
256 * Added check for NULL lookup->position.
257 *
258  * Revision 1.5  1996/09/25  14:17:30  madden
259  * removed discontiguous options.
260  *
261  * Revision 1.4  1996/09/12  21:12:59  madden
262  * Added new function to save an already computed index, lookup_add_index.
263  *
264  * Revision 1.3  1996/08/21  21:28:31  madden
265  * Added casts to quiet NT compiler warning.s
266  *
267  * Revision 1.2  1996/08/15  18:33:35  madden
268  * Changed Int2 to Int1.
269  *
270  * Revision 1.1  1996/08/05  19:48:50  madden
271  * Initial revision
272  *
273  * Revision 1.11  1996/07/24  12:01:28  madden
274  * Changes for blastx
275  *
276  * Revision 1.10  1996/07/18  22:00:02  madden
277  * Changes for multiple contexts.
278  *
279  * Revision 1.9  1996/06/20  16:52:23  madden
280  * Changed "pow" to "Nlm_Powi".
281  *
282  * Revision 1.8  1996/06/20  16:15:57  madden
283  * Replaced int's with Int4's.
284  *
285  * Revision 1.7  1996/05/22  20:22:01  madden
286  * Removed unused variable lookup_find_discontig.
287  *
288  * Revision 1.6  1996/05/16  19:50:15  madden
289  * Added documentation block.
290  *
291  * Revision 1.5  1996/05/03  19:55:24  madden
292  * Combined two functions.
293  *
294  * Revision 1.4  1996/04/04  20:47:55  madden
295  * Made lookup_find into two functions, that are statics.
296  *
297  * Revision 1.3  1996/02/28  21:37:14  madden
298  * changes for discontiguous words.
299  *
300  * Revision 1.2  1995/12/26  20:28:36  madden
301  * removed lookup_add_word function.
302  *
303  * Revision 1.1  1995/12/08  15:48:23  madden
304  * Initial revision
305  *
306 */
307 
308 #include <ncbi.h>
309 #include <blast.h> /* this already includes lookup.h */
310 
311 /* mod_lt[] is a modified lookup table.  -cfj
312  * It contains the same info that is accessible via lookup->position, but in a form that reduces the
313  * number of cache misses.
314  * Doing a lookup via the original lookup table cost one cache miss per access, and then one miss per
315  *   entry as we walk down the hit list.
316  *
317  * mod_lt replaces the linked list with an array of hits, to reduce the number of misses.
318  * Also, if there are 3 or fewer hits, they are placed directly in the table.
319  *       if there are more than 3 hits, the first one is placed directly in the table (followed by
320  *       a pointer to a contiguous array of all the remaining hits).
321  *
322  * This should probably be allocated and deallocated along with the standard lookup table, but this
323  * is not currently done.
324  *
325  * The make_mod_lt function should be called to build the mod_lt table after all hits have been placed
326  * into the orginal lookup table.
327  */
328 
make_mod_lt(LookupTablePtr lookup)329 void make_mod_lt(LookupTablePtr lookup)
330 {
331     Int4 index, j, pv_size;
332     Uint4 len;
333     OrigLookupPositionPtr PNTR lookup_pos;
334     OrigLookupPositionPtr list;
335     ModLookupPositionPtr next_free;
336     PV_ARRAY_TYPE *pv_array=NULL;
337 
338 
339     ModLAEntry *mod_lt;
340     mod_lt=lookup->mod_lt;
341 
342     len = lookup->array_size;
343     lookup_pos = lookup->position;
344 
345     /* Add 1 to silence purify messages. */
346     /* Initial allocation - more, than necessary though */
347 
348     next_free = MemNew((1+lookup->num_pos_added)*sizeof(ModLookupPosition));
349     lookup->mod_lookup_table_size = 0;
350 
351     if (next_free == NULL){
352         ErrPostEx(SEV_ERROR, 0, 0, "Unable to allocate lookup->position");
353         return;
354     }
355     lookup->mod_lookup_table_memory = next_free;
356 
357 
358     /* pv_array is an array of 'presence bits', one bit per entry
359        in the lookup table, packed 64bits per Uint8 (on a 64-bit binary);
360        otherwise a Uint4 is used. It can be used to quickly see if there
361        is an entry in the lookup table. It's used on short-to-medium
362        length queries since most lookups will be empty. -cfj
363     */
364 
365     if (lookup->num_unique_pos_added < PV_ARRAY_FACTOR*lookup->array_size) {
366 	pv_size = (lookup->array_size+PV_ARRAY_MASK)/(PV_ARRAY_MASK+1);    /* Size measured in PV_ARRAY_TYPE's */
367 	pv_array = MemNew(pv_size*PV_ARRAY_BYTES);
368     }
369 
370     /* Walk through table, copying info into mod_lt[] */
371     for(index = 0; index < len; index++) {
372         if(lookup_pos[index] == NULL){
373             mod_lt[index].num_used = 0;
374         } else {
375             /* count num entries */
376             int count=0;
377             list = lookup_pos[index];
378 
379             if (pv_array)
380                 pv_array[(index>>PV_ARRAY_BTS)] |= (((PV_ARRAY_TYPE) 1)<<(index&PV_ARRAY_MASK));
381 
382             while (list != NULL){
383                 count++;
384                 list = list->next;
385             }
386 
387             if (count<=3) {
388                 list = lookup_pos[index];
389                 mod_lt[index].num_used=count;
390                 for(j=0;j<count;j++){
391 		    hinfo_set(&mod_lt[index].entries[j],list->position,list->context);
392                     list = list->next;
393                 }
394             } else {
395                 ModLookupPositionPtr * lpp= (ModLookupPositionPtr *) &mod_lt[index].entries[1];
396                 list=lookup_pos[index];
397                 mod_lt[index].num_used=count;
398                 hinfo_set(&mod_lt[index].entries[0],list->position,list->context);
399                 list = list->next;
400                 *lpp  = next_free;
401                 for(j = 1; j < count; j++){
402 		    hinfo_set(next_free,list->position,list->context);
403                     next_free++;
404                     lookup->mod_lookup_table_size++;
405                     list = list->next;
406                 }
407             }
408         }
409     }
410 
411     lookup->pv_array = pv_array;
412 }
mb_make_mod_lt(LookupTablePtr lookup)413 void mb_make_mod_lt(LookupTablePtr lookup)
414 {
415     Int4 index, j, pv_size;
416     Uint4 len;
417     OrigLookupPositionPtr PNTR lookup_pos;
418     OrigLookupPositionPtr list;
419     ModLookupPositionPtr next_free;
420     PV_ARRAY_TYPE *pv_array=NULL;
421 
422 
423     ModLAEntry *mod_lt;
424     mod_lt=lookup->mod_lt;
425 
426     len = lookup->array_size;
427     lookup_pos = lookup->position;
428 
429     /* Add 1 to silence purify messages. */
430     /* Initial allocation - more, than necessary though */
431 
432     next_free = MemNew((1+lookup->num_pos_added)*sizeof(ModLookupPosition));
433     lookup->mod_lookup_table_size = 0;
434 
435     if (next_free == NULL){
436         ErrPostEx(SEV_ERROR, 0, 0, "Unable to allocate lookup->position");
437         return;
438     }
439     lookup->mod_lookup_table_memory = next_free;
440 
441 
442     /* pv_array is an array of 'presence bits', one bit per entry
443        in the lookup table, packed 64bits per Uint8 (on a 64-bit binary);
444        otherwise a Uint4 is used. It can be used to quickly see if there
445        is an entry in the lookup table. It's used on short-to-medium
446        length queries since most lookups will be empty. -cfj
447     */
448 
449     if (lookup->num_unique_pos_added < PV_ARRAY_FACTOR*lookup->array_size) {
450 	pv_size = (lookup->array_size+PV_ARRAY_MASK)/(PV_ARRAY_MASK+1);    /* Size measured in PV_ARRAY_TYPE's */
451 	pv_array = MemNew(pv_size*PV_ARRAY_BYTES);
452     }
453 
454     /* Walk through table, copying info into mod_lt[] */
455     for(index = 0; index < len; index++) {
456         if(lookup_pos[index] == NULL){
457             mod_lt[index].num_used = 0;
458         } else {
459             /* count num entries */
460             int count=0;
461             list = lookup_pos[index];
462 
463             if (pv_array)
464                 pv_array[(index>>PV_ARRAY_BTS)] |= (((PV_ARRAY_TYPE) 1)<<(index&PV_ARRAY_MASK));
465 
466             while (list != NULL){
467                 count++;
468                 list = list->next;
469             }
470 
471             if (count<=3) {
472                 list = lookup_pos[index];
473                 mod_lt[index].num_used=count;
474                 for(j=0;j<count;j++){
475 		    mod_lt[index].entries[j] = list->position;
476                     list = list->next;
477                 }
478             } else {
479                 ModLookupPositionPtr * lpp= (ModLookupPositionPtr *) &mod_lt[index].entries[1];
480                 list=lookup_pos[index];
481                 mod_lt[index].num_used=count;
482                 mod_lt[index].entries[0] = list->position;
483                 list = list->next;
484                 *lpp  = next_free;
485                 for(j = 1; j < count; j++){
486 		   *next_free = list->position;
487                     next_free++;
488                     lookup->mod_lookup_table_size++;
489                     list = list->next;
490                 }
491             }
492         }
493     }
494 
495     lookup->pv_array = pv_array;
496 }
497 
498 
499 static void AddPositionToLookupTable PROTO((LookupTablePtr lookup, Int4 index, Int4 position, Int1 context));
500 
501 #define LOOKUP_MEMORY_MIN 128*1024	/* Minimum block of memory (in bytes) for LookupMemoryPtr. */
502 
lookup_deallocate_memory(LookupTablePtr lookup)503 static VoidPtr lookup_deallocate_memory (LookupTablePtr lookup)
504 {
505     LookupMemoryPtr next=NULL, mem_struct;
506 
507     if (lookup == NULL)
508         return NULL;
509 
510     mem_struct = lookup->mem_struct_start;
511 
512     while (mem_struct) {
513         next = mem_struct->next;
514         MemFree(mem_struct->start);
515         MemFree(mem_struct);
516         mem_struct = next;
517     }
518 
519     lookup->mem_struct_start = NULL;
520 
521     return NULL;
522 }
523 
524 /*
525   returns the requested amount of memory.  Large blocks are obtained at
526   once and then passed out.
527 
528   LookupTablePtr lookup:	contains all information on Lookup.
529   size_t required: how much memory has been requested.
530 */
531 
lookup_get_memory(LookupTablePtr lookup,size_t required)532 static VoidPtr lookup_get_memory (LookupTablePtr lookup, size_t required)
533 {
534     LookupMemoryPtr last, mem_struct;
535     size_t memory_requested;
536     Uint1Ptr new;
537 
538     mem_struct = lookup->mem_struct;
539 
540     if (!mem_struct || required > mem_struct->remaining) {
541         if (mem_struct) {
542             last = mem_struct;
543         } else {
544             last = NULL;
545         }
546 
547         memory_requested = MAX(lookup->memory_chunk, LOOKUP_MEMORY_MIN);
548         mem_struct = (LookupMemoryPtr) MemNew(sizeof(LookupMemory));
549         mem_struct->start = (Uint1Ptr) MemNew(memory_requested);
550         mem_struct->current = mem_struct->start;
551         mem_struct->remaining = memory_requested;
552 
553         /* align to 64Byte block. -cfj */
554         if(1){
555             Uint8 extra = ((Uint8)mem_struct->current) & ((Uint8) 63);
556             if (extra>0){
557                 extra=64-extra;
558                 mem_struct->current += extra;
559                 mem_struct->remaining -= extra;
560             }
561         }
562 
563         if (last) {
564             last->next = mem_struct;	/* used at end for deallocation. */
565             lookup->mem_struct = mem_struct; /* Used as a shortcut around a long while loop. */
566         } else {
567             lookup->mem_struct = mem_struct;
568             lookup->mem_struct_start = mem_struct;
569         }
570     }
571 
572     new = mem_struct->current;
573     mem_struct->current += required;
574     mem_struct->remaining -= required;
575 
576     return (VoidPtr) new;	/* return pointer to new memory. */
577 }
578 
579 /*
580   LookupTablePtr LIBCALL lookup_new(Int2 alphabet_size, Int2 wordsize)
581 
582   alphabet_size: number of symbols in alphabet
583 
584   wordsize: size of initial word hit (often three)
585 
586   Perform the initial setup of the lookup table.  "lookup"
587   expects to work with a "zero-offset" alphabet such as
588   NCBIstdaa or NCBI2na.
589 
590   Each letter (residue or basepair) in the word goes into it's
591   own set of bits (calculated as num_of_bits below).  This is
592   to allow masking and shifting of bits.  As an example consider
593   a three-letter word with an alphabet containing 20 residues.
594   The minimum num_of_bits is five (able to represent up to 31)
595   and the total "array_size" is 15.
596   "mask" is used to mask the wordsize-1 lower bits (i.e., to set the
597   bits the highest num_of_bits to zero).
598 */
599 LookupTablePtr LIBCALL
lookup_new(Int2 alphabet_size,Int2 wordsize,Int2 reduced_wordsize)600 lookup_new(Int2 alphabet_size, Int2 wordsize, Int2 reduced_wordsize)
601 
602 {
603     Int4 num_of_bits;
604     LookupTablePtr lookup;
605 
606     /* How many bits are needed to hold the alphabet? */
607     num_of_bits = Nlm_Nint(log((Nlm_FloatHi)alphabet_size)/NCBIMATH_LN2);
608 
609     /* 32 bits is 4 bytes */
610     if (num_of_bits*reduced_wordsize > 32) {
611         ErrPostEx(SEV_ERROR, 0, 0, "alphabet times wordsize > 32");
612         return NULL;
613     }
614 
615     lookup = (LookupTablePtr) MemNew(sizeof(LookupTable));
616     if (lookup == NULL)
617         return NULL;
618 
619     lookup->char_size = num_of_bits;
620     lookup->wordsize = (Int4) wordsize;
621     if (reduced_wordsize <= 0)
622         reduced_wordsize = wordsize;
623     lookup->reduced_wordsize = (Int4) reduced_wordsize;
624     lookup->array_size = (Int4) Nlm_Powi(2.0, (num_of_bits*reduced_wordsize));
625     lookup->mask = (Int4) Nlm_Powi(2.0, (num_of_bits*(reduced_wordsize-1))) - 1;
626     lookup->mod_lt = NULL;    /* will be allocated when built */
627     lookup->theTable = NULL;  /* will be allocated when built */
628 
629     lookup->position = (OrigLookupPositionPtr PNTR) MemNew((lookup->array_size)*sizeof(OrigLookupPositionPtr));
630     if (lookup->position == NULL) {
631         lookup = lookup_destruct(lookup);
632         ErrPostEx(SEV_ERROR, 0, 0, "Unable to allocate lookup->position");
633     }
634 
635     /* Only allocate the auxillary structure if it's less than 1 Meg. */
636     if ((lookup->array_size)*sizeof(OrigLookupPositionPtr) <= (1<<20)) {
637         lookup->position_aux =
638             (OrigLookupPositionPtr PNTR) MemNew((lookup->array_size)*sizeof(OrigLookupPositionPtr));
639     }
640 
641     return lookup;
642 }
643 
644 LookupTablePtr LIBCALL
lookup_destruct(LookupTablePtr lookup)645 lookup_destruct(LookupTablePtr lookup)
646 
647 {
648     if (lookup == NULL)
649         return lookup;
650 
651     if(lookup->position)
652         MemFree(lookup->position);
653     if(lookup->position_aux)
654         MemFree(lookup->position_aux);
655     if(lookup->pv_array)
656         MemFree(lookup->pv_array);
657     if(lookup->mod_lt)
658         MemFree(lookup->mod_lt);
659     if(lookup->theTable)
660         MemFree(lookup->theTable);
661     if (lookup->mod_lookup_table_memory)
662         MemFree(lookup->mod_lookup_table_memory);
663     lookup_deallocate_memory(lookup);
664     lookup = MegaBlastLookupTableDestruct(lookup);
665     lookup = MemFree(lookup);
666 
667     return lookup;
668 }
669 
670 /*
671   Deallocated position_aux, which can be large for
672   large word-sizes, and is no longer needed.
673 
674   This now does more than just dealloc lookup->position_aux.
675   This is called afeter the original lookup table is fully built.
676   So this is where we convert it to the new (mod_lt) table.
677   -cfj
678 */
679 
680 Boolean
lookup_position_aux_destruct(LookupTablePtr lookup)681 lookup_position_aux_destruct(LookupTablePtr lookup)
682 
683 {
684     if (lookup == NULL)
685         return FALSE;
686 
687     if(lookup->position_aux)
688         lookup->position_aux = MemFree(lookup->position_aux);
689 
690 
691     /* create the new lookup table, Add 1 to silence purify messages. */
692 
693     lookup->mod_lt = (ModLAEntry PNTR) MemNew((1+lookup->array_size)*sizeof(ModLAEntry));
694 
695     make_mod_lt(lookup);
696 
697     /* deallocate parts of the old table no longer needed */
698     if(lookup->position)
699 	lookup->position = MemFree(lookup->position);
700     lookup_deallocate_memory(lookup);
701 
702     return TRUE;
703 }
704 
705 Boolean
mb_lookup_position_aux_destruct(LookupTablePtr lookup)706 mb_lookup_position_aux_destruct(LookupTablePtr lookup)
707 
708 {
709     if (lookup == NULL)
710         return FALSE;
711 
712     if(lookup->position_aux)
713         lookup->position_aux = MemFree(lookup->position_aux);
714 
715 
716     /* create the new lookup table, Add 1 to silence purify messages. */
717 
718     lookup->mod_lt = (ModLAEntry PNTR) MemNew((1+lookup->array_size)*sizeof(ModLAEntry));
719 
720     mb_make_mod_lt(lookup);
721 
722 
723     /* deallocate parts of the old table no longer needed */
724     if(lookup->position) lookup->position = MemFree(lookup->position);
725 
726     return TRUE;
727 }
728 
729 /*
730   This function adds an (already computed) index to the lookup table
731   for some set of residues.  The most likely use of this is for
732   compressed residues, as in blastn.
733 */
734 void LIBCALL
lookup_add_index(LookupTablePtr lookup,Int4 lookup_index,Int4 position,Int1 context)735 lookup_add_index(LookupTablePtr lookup, Int4 lookup_index, Int4 position, Int1 context)
736 
737 {
738     AddPositionToLookupTable(lookup, lookup_index, position, context);
739 
740     return;
741 }
742 
743 /*
744   This function adds an index to the lookup table for any number of
745   contiguous resdiues.
746 
747 */
748 void LIBCALL
lookup_add(LookupTablePtr lookup,CharPtr string,Int4 position,Int1 context)749 lookup_add(LookupTablePtr lookup, CharPtr string, Int4 position, Int1 context)
750 
751 {
752     Int4 char_size, lookup_index=0, wordsize;
753 
754     char_size = lookup->char_size;
755     wordsize = lookup->wordsize;
756 
757     lookup_index = *string;
758     wordsize--;
759     while (wordsize > 0) {
760         lookup_index <<= char_size;
761         string++;
762         lookup_index += *string;
763         wordsize--;
764     }
765     AddPositionToLookupTable(lookup, lookup_index, position, context);
766 
767     return;
768 }
769 
770 static void
AddPositionToLookupTable(LookupTablePtr lookup,Int4 index,Int4 position,Int1 context)771 AddPositionToLookupTable(LookupTablePtr lookup, Int4 index, Int4 position, Int1 context)
772 
773 {
774     OrigLookupPositionPtr new, last, PNTR lookup_pos, PNTR lookup_pos_aux;
775 
776     if (lookup == NULL) return;
777     lookup->num_pos_added++;
778 
779     lookup_pos = lookup->position;
780     lookup_pos_aux = lookup->position_aux;
781     new = (OrigLookupPositionPtr) lookup_get_memory(lookup, sizeof(OrigLookupPosition));
782     if (*(lookup_pos+index) == NULL) {
783         lookup->num_unique_pos_added++;
784         lookup_pos[index] = new;
785         lookup_pos[index]->position = position;
786         lookup_pos[index]->context = context;
787         if (lookup_pos_aux) {
788             lookup_pos_aux[index] = new;
789         }
790     } else {	/* Go to last link, use aux struct if it exists. */
791         if (lookup_pos_aux) {
792             last = lookup_pos_aux[index];
793             lookup_pos_aux[index] = new;
794         } else {
795             last = lookup_pos[index];
796             while (last->next)
797                 last = last->next;
798         }
799         last->next = new;
800         new->position = position;
801         new->context = context;
802     }
803 
804     return;
805 }
806 
807 /*
808   Initializes the "lookup_index" for "string".  For contiguous word
809   matching the first wordsize-1 letters of string are placed into the
810   lookup_index.  The "new" starting point of the string is then returned.
811 */
812 
813 Uint1Ptr LIBCALL
lookup_find_init(LookupTablePtr lookup,Int4 PNTR lookup_index,Uint1Ptr string)814 lookup_find_init(LookupTablePtr lookup, Int4 PNTR lookup_index, Uint1Ptr string)
815 
816 {
817 
818     Int4 char_size, wordsize;
819 
820     char_size = lookup->char_size;
821     wordsize = lookup->reduced_wordsize;
822 
823     /* Fill in wordsize-1 spaces. */
824     *lookup_index = *string;
825 
826     wordsize -= 2;
827     while (wordsize > 0) {
828         string++;
829         *lookup_index <<= char_size;
830         *lookup_index += *string;
831         wordsize--;
832     }
833 
834     return string;
835 }
836 
837 /*
838    Allocates memory for the BLAST_WordFinder, and lookup table.
839 */
840 
841 BLAST_WordFinderPtr
BLAST_WordFinderNew(Int2 alphabet_size,Int2 wordsize,Int2 compression_ratio,Boolean round_down)842 BLAST_WordFinderNew (Int2 alphabet_size, Int2 wordsize, Int2 compression_ratio, Boolean round_down)
843 
844 {
845     BLAST_WordFinderPtr wfp;
846     Int2 reduced_wordsize;
847 
848     wfp = (BLAST_WordFinderPtr) MemNew(sizeof(BLAST_WordFinder));
849 
850     if (wfp != NULL) {
851 	/* If the compression_ratio is greater than one and round_down is TRUE, then we want to round
852            the wordsize down, as a multiple of compression_ratio should be three less.  round_down is only
853            TRUE in blastn if we want to guarantee that a certain wordsize is used (i.e., 11). */
854         if (compression_ratio > 1) {
855             if (round_down) {
856                 reduced_wordsize = (Int2) MIN(2, ((wordsize-3)/compression_ratio));
857                 wfp->lookup = lookup_new(alphabet_size, (Int2) ((wordsize-3)/compression_ratio), (Int2) reduced_wordsize);
858             } else {
859                 reduced_wordsize = wordsize/compression_ratio;
860                 wfp->lookup = lookup_new(alphabet_size, (wordsize/compression_ratio), (Int2) reduced_wordsize);
861             }
862         } else {
863             wfp->lookup = lookup_new(alphabet_size, (Int2) (wordsize/compression_ratio), 0);
864         }
865         wfp->compression_ratio = compression_ratio;
866         wfp->wordsize = wordsize;
867     }
868 
869     return wfp;
870 }
871 
872 BLAST_WordFinderPtr
BLAST_WordFinderDestruct(BLAST_WordFinderPtr wfp)873 BLAST_WordFinderDestruct (BLAST_WordFinderPtr wfp)
874 
875 {
876 
877     if (wfp != NULL) {
878         wfp->lookup = lookup_destruct(wfp->lookup);
879         wfp = MemFree(wfp);
880     }
881 
882     return wfp;
883 }
884 
885 BLAST_WordFinderPtr
MegaBlastWordFinderDeallocate(BLAST_WordFinderPtr wfp)886 MegaBlastWordFinderDeallocate(BLAST_WordFinderPtr wfp)
887 {
888    LookupTablePtr lookup;
889 
890    if (wfp == NULL || (lookup = wfp->lookup) == NULL ||
891        lookup->mb_lt == NULL)
892       return wfp;
893    if (lookup->mb_lt->estack) {
894       Int4 index;
895       for (index=0; index<lookup->mb_lt->num_stacks; index++)
896 	 MemFree(lookup->mb_lt->estack[index]);
897       MemFree(lookup->mb_lt->estack);
898       MemFree(lookup->mb_lt->stack_size);
899       MemFree(lookup->mb_lt->stack_index);
900    }
901    MemFree(lookup->mb_lt);
902    MemFree(lookup);
903    wfp = MemFree(wfp);
904    return wfp;
905 }
906 
907 Boolean
MegaBlastBuildLookupTable(BlastSearchBlkPtr search)908 MegaBlastBuildLookupTable(BlastSearchBlkPtr search)
909 {
910    LookupTablePtr lookup = search->wfp->lookup;
911    Int4 query_length = search->context[search->first_context].query->length;
912    register Uint1Ptr seq, pos;
913    register Int4 index;
914    register Int4 ecode, extra_code;
915    register Int4 mask;
916    Int4 ecode1, ecode2;
917    Uint1 val, nuc_mask = 0xfc, at_hash_mask = 0x10;
918    MbLookupTablePtr mb_lt;
919    Int4 masked_word_count = 0;
920    PV_ARRAY_TYPE *pv_array=NULL;
921    Int4 pv_shift, pv_array_bts, pv_size;
922    Boolean discontiguous_word = FALSE;
923    Int2 word_length, extra_length, word_weight;
924    Int4 last_offset;
925    Int4 bit0, no_bit0;
926    Int2 template_length;
927    MegaBlastParameterBlkPtr mb_params = search->pbp->mb_params;
928    MBTemplateType template_type = mb_params->template_type;
929    Boolean amb_cond;
930    Boolean use_two_templates = mb_params->use_two_templates;
931 
932    if (lookup == NULL)
933       return FALSE;
934 
935    if(lookup->position_aux)
936       lookup->position_aux = MemFree(lookup->position_aux);
937 
938    mb_lt = (MbLookupTablePtr) MemNew(sizeof(MbLookupTable));
939 
940    /* Increment lookup->wordsize, since it is now equal to (wordsize - 3)/4 */
941    if (++lookup->wordsize < 3)
942       mb_lt->width = 2;
943    else
944       mb_lt->width = 3;
945    mb_lt->lpm = lookup->wordsize * READDB_COMPRESSION_RATIO;
946 
947    mb_lt->max_positions = mb_params->max_positions;
948 
949    mb_lt->hashsize = (1<<(8*mb_lt->width));
950 
951    if (mb_params->disc_word) {
952       discontiguous_word = TRUE;
953       word_length = READDB_COMPRESSION_RATIO*(mb_lt->width+1);
954       template_length = mb_params->template_length;
955       extra_length = template_length - word_length;
956       word_weight = mb_params->word_weight;
957       mask = (1 << (8*(mb_lt->width+1) - 2)) - 1;
958    } else {
959       word_length = READDB_COMPRESSION_RATIO*mb_lt->width;
960       mask = mb_lt->mask = (1 << (8*mb_lt->width - 2)) - 1;
961    }
962 
963    if ((mb_lt->hashtable = (Int4Ptr)
964         MemNew(mb_lt->hashsize*sizeof(Int4))) == NULL) {
965       MegaBlastLookupTableDestruct(lookup);
966       return FALSE;
967    }
968 
969    if ((mb_lt->next_pos = (Int4Ptr)
970         MemNew(query_length*sizeof(Int4))) == NULL) {
971       MegaBlastLookupTableDestruct(lookup);
972       return FALSE;
973    }
974    if (use_two_templates) {
975       if (word_weight >= 12) {
976          if ((mb_lt->hashtable2 = (Int4Ptr)
977               MemNew(mb_lt->hashsize*sizeof(Int4))) == NULL) {
978             MegaBlastLookupTableDestruct(lookup);
979             return FALSE;
980          }
981       } else {/* For weight 11 no need for extra main table */
982          mb_lt->hashtable2 = mb_lt->hashtable;
983       }
984       if ((mb_lt->next_pos2 = (Int4Ptr)
985            MemNew(query_length*sizeof(Int4))) == NULL) {
986          MegaBlastLookupTableDestruct(lookup);
987          return FALSE;
988       }
989    }
990    seq = search->context[search->first_context].query->sequence_start;
991    pos = seq + word_length;
992 
993    ecode = 0;
994 
995    if (discontiguous_word) {
996       if (word_weight == 11) {
997          no_bit0 = 0x007fffff;
998          bit0 = 0x00800000;
999       } else {
1000          no_bit0 = 0xffffffff;
1001          bit0 = 0x00000000;
1002       }
1003       if (/*template18*/extra_length) {
1004          last_offset = query_length - extra_length;
1005          for (index = 1; index <= last_offset; index++) {
1006             val = *++seq;
1007             if ((val & nuc_mask) != 0) { /* ambiguity, gap or masked residue */
1008                if (*seq & at_hash_mask) /* Will extend through this residue */
1009                   *seq &= 0x0f;
1010                ecode = 0;
1011                pos = seq + word_length;
1012                extra_code = 0;
1013             } else {
1014                /* get next base */
1015                ecode = ((ecode & mask) << 2) + val;
1016                if (seq >= pos) {
1017                   switch (template_type) {
1018                   case TEMPL_11_18: case TEMPL_12_18:
1019                      amb_cond = !GET_AMBIG_CONDITION_18(seq);
1020                      break;
1021                   case TEMPL_11_18_OPT: case TEMPL_12_18_OPT:
1022                      amb_cond = !GET_AMBIG_CONDITION_18_OPT(seq);
1023                      break;
1024                   case TEMPL_11_21: case TEMPL_12_21:
1025                      amb_cond = !GET_AMBIG_CONDITION_21(seq);
1026                      break;
1027                   case TEMPL_11_21_OPT: case TEMPL_12_21_OPT:
1028                      amb_cond = !GET_AMBIG_CONDITION_21_OPT(seq);
1029                      break;
1030                   default:
1031                      amb_cond = FALSE;
1032                      break;
1033                   }
1034 
1035                   if (amb_cond) {
1036                      switch (template_type) {
1037                      case TEMPL_11_18:
1038                         ecode1 = (GET_WORD_INDEX_11_18(ecode) |
1039                                   GET_EXTRA_CODE_18(seq)) & no_bit0;
1040                         break;
1041                      case TEMPL_12_18:
1042                         ecode1 = (GET_WORD_INDEX_12_18(ecode) |
1043                                   GET_EXTRA_CODE_18(seq)) & no_bit0;
1044                         break;
1045                      case TEMPL_11_18_OPT:
1046                         ecode1 = (GET_WORD_INDEX_11_18_OPT(ecode) |
1047                                   GET_EXTRA_CODE_18_OPT(seq)) & no_bit0;
1048                         break;
1049                      case TEMPL_12_18_OPT:
1050                         ecode1 = (GET_WORD_INDEX_12_18_OPT(ecode) |
1051                                   GET_EXTRA_CODE_18_OPT(seq)) & no_bit0;
1052                         break;
1053                      case TEMPL_11_21:
1054                         ecode1 = (GET_WORD_INDEX_11_21(ecode) |
1055                                   GET_EXTRA_CODE_21(seq)) & no_bit0;
1056                         break;
1057                      case TEMPL_12_21:
1058                         ecode1 = (GET_WORD_INDEX_12_21(ecode) |
1059                                   GET_EXTRA_CODE_21(seq)) & no_bit0;
1060                         break;
1061                      case TEMPL_11_21_OPT:
1062                         ecode1 = (GET_WORD_INDEX_11_21_OPT(ecode) |
1063                                   GET_EXTRA_CODE_21_OPT(seq)) & no_bit0;
1064                         break;
1065                      case TEMPL_12_21_OPT:
1066                         ecode1 = (GET_WORD_INDEX_12_21_OPT(ecode) |
1067                                   GET_EXTRA_CODE_21_OPT(seq)) & no_bit0;
1068                         break;
1069                      default:
1070                         ecode1 = 0;
1071                         break;
1072                      }
1073 
1074                      if (use_two_templates) {
1075                         switch (template_type) {
1076                         case TEMPL_11_18:
1077                            ecode2 = GET_WORD_INDEX_11_18_OPT(ecode) |
1078                               GET_EXTRA_CODE_18_OPT(seq) | bit0;
1079                            break;
1080                         case TEMPL_12_18:
1081                            ecode2 = GET_WORD_INDEX_12_18_OPT(ecode) |
1082                               GET_EXTRA_CODE_18_OPT(seq) | bit0;
1083                            break;
1084                         case TEMPL_11_21:
1085                            ecode2 = GET_WORD_INDEX_11_21_OPT(ecode) |
1086                               GET_EXTRA_CODE_21_OPT(seq) | bit0;
1087                            break;
1088                         case TEMPL_12_21:
1089                            ecode2 = GET_WORD_INDEX_12_21_OPT(ecode) |
1090                               GET_EXTRA_CODE_21_OPT(seq) | bit0;
1091                            break;
1092                         default:
1093                            ecode2 = 0; break;
1094                         }
1095 
1096                         if (mb_lt->hashtable[ecode1] == 0 ||
1097                             mb_lt->hashtable2[ecode2] == 0)
1098                            lookup->num_unique_pos_added++;
1099                         mb_lt->next_pos2[index] = mb_lt->hashtable2[ecode2];
1100                         mb_lt->hashtable2[ecode2] = index;
1101                      } else {
1102                         if (mb_lt->hashtable[ecode1] == 0)
1103                            lookup->num_unique_pos_added++;
1104                      }
1105                      mb_lt->next_pos[index] = mb_lt->hashtable[ecode1];
1106                      mb_lt->hashtable[ecode1] = index;
1107                   }
1108                }
1109             }
1110          }
1111          /* Modify the last 2 residues if they are masked only for lookup */
1112          for (index = 0; index < extra_length; index++) {
1113             if (*++seq & at_hash_mask)
1114                *seq &= 0x0f;
1115          }
1116       } else { /* Template length 16 */
1117          for (index = 1; index <= query_length; index++) {
1118             val = *++seq;
1119             if ((val & nuc_mask) != 0) { /* ambiguity, gap or masked residue */
1120                ecode = 0;
1121                pos = seq + word_length;
1122                if (*seq & at_hash_mask) /* Will extend through this residue */
1123                   *seq &= 0x0f;
1124             } else {
1125                /* get next base */
1126                ecode = ((ecode & mask) << 2) + val;
1127                if (seq >= pos) {
1128                   switch (template_type) {
1129                   case TEMPL_11_16:
1130                      ecode1 = GET_WORD_INDEX_11_16(ecode) & no_bit0;
1131                      break;
1132                   case TEMPL_12_16:
1133                      ecode1 = GET_WORD_INDEX_12_16(ecode) & no_bit0;
1134                      break;
1135                   case TEMPL_11_16_OPT:
1136                      ecode1 = GET_WORD_INDEX_11_16_OPT(ecode) & no_bit0;
1137                      break;
1138                   case TEMPL_12_16_OPT:
1139                      ecode1 = GET_WORD_INDEX_12_16_OPT(ecode) & no_bit0;
1140                      break;
1141                   default:
1142                      ecode1 = 0;
1143                      break;
1144                   }
1145 
1146                   if (use_two_templates) {
1147                      switch (template_type) {
1148                      case TEMPL_11_16:
1149                         ecode2 = GET_WORD_INDEX_11_16_OPT(ecode) | bit0;
1150                         break;
1151                      case TEMPL_12_16:
1152                         ecode2 = GET_WORD_INDEX_12_16_OPT(ecode) | bit0;
1153                         break;
1154                      default:
1155                         ecode2 = 0; break;
1156                      }
1157                      if (mb_lt->hashtable[ecode1] == 0 ||
1158                          mb_lt->hashtable2[ecode2] == 0)
1159                         lookup->num_unique_pos_added++;
1160                      mb_lt->next_pos2[index] = mb_lt->hashtable2[ecode2];
1161                      mb_lt->hashtable2[ecode2] = index;
1162                   } else {
1163                      if (mb_lt->hashtable[ecode1] == 0)
1164                         lookup->num_unique_pos_added++;
1165                   }
1166                   mb_lt->next_pos[index] = mb_lt->hashtable[ecode1];
1167                   mb_lt->hashtable[ecode1] = index;
1168                }
1169             }
1170          }
1171       }
1172    } else { /* Contiguous word */
1173       for (index = 1; index <= query_length; index++) {
1174          val = *++seq;
1175          if ((val & nuc_mask) != 0) { /* ambiguity, gap or masked residue */
1176             ecode = 0;
1177             pos = seq + word_length;
1178             if (*seq & at_hash_mask) /* Will extend through this residue */
1179                *seq &= 0x0f;
1180          } else {
1181             /* get next base */
1182             ecode = ((ecode & mask) << 2) + val;
1183             if (seq >= pos) {
1184                if (mb_lt->hashtable[ecode] == 0)
1185                   lookup->num_unique_pos_added++;
1186                mb_lt->next_pos[index] = mb_lt->hashtable[ecode];
1187                mb_lt->hashtable[ecode] = index;
1188             }
1189          }
1190       }
1191    }
1192 
1193    /* Now remove the hash entries that have too many positions */
1194    if (mb_lt->max_positions>0) {
1195       for (ecode=0; ecode<mb_lt->hashsize; ecode++) {
1196 	 Int4 pcount, pcount1=0;
1197 	 for (index=mb_lt->hashtable[ecode], pcount=0;
1198 	      index>0; index=mb_lt->next_pos[index], pcount++);
1199          if (use_two_templates) {
1200             for (index=mb_lt->hashtable2[ecode], pcount1=0;
1201                  index>0; index=mb_lt->next_pos2[index], pcount1++);
1202          }
1203 	 if ((pcount=MAX(pcount,pcount1))>mb_lt->max_positions) {
1204 	    mb_lt->hashtable[ecode] = 0;
1205             if (use_two_templates)
1206                mb_lt->hashtable2[ecode] = 0;
1207 	    ErrPostEx(SEV_WARNING, 0, 0, "%lx - %d", ecode, pcount);
1208 	    masked_word_count++;
1209 	 }
1210       }
1211       if (masked_word_count)
1212 	 ErrPostEx(SEV_WARNING, 0, 0, "Masked %d %dmers", masked_word_count,
1213 		   4*mb_lt->width);
1214 
1215    }
1216 
1217    if (discontiguous_word)
1218       mb_lt->mask = (1 << mb_lt->width*8) - 1;
1219    else
1220       mb_lt->mask = (1 << (mb_lt->width*8 - 8)) - 1;
1221 
1222    /* Stacks are allocated for word size >= 11, or if query length is too long,
1223       otherwise will use diagonal array (ewp and ewp_params structures) */
1224    if (query_length > MAX_DIAG_ARRAY ||
1225         search->pbp->mb_params->word_weight >= 11) {
1226       Int8 total_len=0, av_search_space=0, av_len=0;
1227       Int4 total_num=0, stack_size=0, num_stacks=0;
1228       if (search->rdfp) {
1229          if (search->dblen > 0 && search->dbseq_num > 0) {
1230             total_len = search->dblen;
1231             total_num = search->dbseq_num;
1232          } else {
1233             readdb_get_totals_ex(search->rdfp, &total_len, &total_num, TRUE);
1234          }
1235          if (total_num > 0)
1236             av_len = total_len / total_num;
1237          else
1238             av_len = 1000;
1239       } else {
1240          av_len = search->subject->length;
1241       }
1242       av_search_space =
1243          ((Int8) search->context[search->first_context].query->length) * av_len;
1244       num_stacks = MIN(1 + (Int4) sqrt(av_search_space)/100, 500);
1245       stack_size = 5000/num_stacks;
1246       mb_lt->stack_index = (Int4Ptr) MemNew(num_stacks*sizeof(Int4));
1247       mb_lt->stack_size = (Int4Ptr) Malloc(num_stacks*sizeof(Int4));
1248       mb_lt->estack = (MbStackPtr PNTR) Malloc(num_stacks*sizeof(MbStackPtr));
1249       for (index=0; index<num_stacks; index++) {
1250          mb_lt->estack[index] =
1251             (MbStackPtr) Malloc(stack_size*sizeof(MbStack));
1252          mb_lt->stack_size[index] = stack_size;
1253       }
1254       mb_lt->num_stacks = num_stacks;
1255    }
1256 
1257    /* For 12-mer based lookup table need to make presense bit array much
1258       smaller, so it stays in cache, even though this allows for collisions */
1259    pv_shift = (mb_lt->width < 3) ? 0 : 5;
1260 
1261    pv_array_bts = PV_ARRAY_BTS + pv_shift;
1262 
1263    if (lookup->num_unique_pos_added <
1264        (PV_ARRAY_FACTOR*(mb_lt->hashsize>>pv_shift))) {
1265       pv_size = mb_lt->hashsize>>pv_array_bts;
1266       /* Size measured in PV_ARRAY_TYPE's */
1267       pv_array = MemNew(pv_size*PV_ARRAY_BYTES);
1268 
1269       for (index=0; index<mb_lt->hashsize; index++) {
1270          if (mb_lt->hashtable[index] != 0 ||
1271              (use_two_templates && mb_lt->hashtable2[index] != 0))
1272             pv_array[index>>pv_array_bts] |=
1273                (((PV_ARRAY_TYPE) 1)<<(index&PV_ARRAY_MASK));
1274       }
1275       lookup->pv_array = pv_array;
1276    }
1277    lookup->mb_lt = mb_lt;
1278    return TRUE;
1279 }
1280 
1281 LookupTablePtr
MegaBlastLookupTableDestruct(LookupTablePtr lookup)1282 MegaBlastLookupTableDestruct(LookupTablePtr lookup)
1283 {
1284    Int4 index;
1285 
1286    if (!lookup->mb_lt)
1287       return lookup;
1288    if (lookup->mb_lt->hashtable)
1289       MemFree(lookup->mb_lt->hashtable);
1290    if (lookup->mb_lt->next_pos)
1291       MemFree(lookup->mb_lt->next_pos);
1292    if (lookup->mb_lt->next_pos2)
1293       MemFree(lookup->mb_lt->next_pos2);
1294    if (lookup->mb_lt->estack) {
1295       for (index=0; index<lookup->mb_lt->num_stacks; index++)
1296 	 MemFree(lookup->mb_lt->estack[index]);
1297       MemFree(lookup->mb_lt->estack);
1298       MemFree(lookup->mb_lt->stack_index);
1299       MemFree(lookup->mb_lt->stack_size);
1300    }
1301    lookup->mb_lt = MemFree(lookup->mb_lt);
1302    return lookup;
1303 }
1304 
1305 
MegaBlastLookupTableDup(LookupTablePtr lookup)1306 LookupTablePtr MegaBlastLookupTableDup(LookupTablePtr lookup)
1307 {
1308    Int4 index;
1309 
1310    /* The only piece that actually needs to be duplicated is estack array */
1311    LookupTablePtr new_lookup =
1312       (LookupTablePtr) MemDup(lookup, sizeof(LookupTable));
1313 
1314    new_lookup->mb_lt =
1315       (MbLookupTablePtr) MemDup(lookup->mb_lt, sizeof(MbLookupTable));
1316    new_lookup->mb_lt->stack_index = (Int4Ptr) MemNew(lookup->mb_lt->num_stacks*sizeof(Int4));
1317    new_lookup->mb_lt->stack_size = (Int4Ptr) MemDup(lookup->mb_lt->stack_size,
1318 						    lookup->mb_lt->num_stacks*sizeof(Int4));
1319    if (lookup->mb_lt->estack) {
1320       new_lookup->mb_lt->estack = (MbStackPtr PNTR) Malloc(lookup->mb_lt->num_stacks*sizeof(MbStackPtr));
1321       for (index=0; index<lookup->mb_lt->num_stacks; index++)
1322          new_lookup->mb_lt->estack[index] =
1323             (MbStackPtr) Malloc(lookup->mb_lt->stack_size[index]*sizeof(MbStack));
1324    }
1325    return new_lookup;
1326 }
1327