1 /* $Id: blast_nalookup.h,v 1.13 2016/12/06 15:29:16 fukanchi Exp $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  */
26 
27 /** @file blast_nalookup.h
28  *  Routines for creating nucleotide BLAST lookup tables.
29  */
30 
31 #ifndef ALGO_BLAST_CORE__BLAST_NTLOOKUP__H
32 #define ALGO_BLAST_CORE__BLAST_NTLOOKUP__H
33 
34 #include <algo/blast/core/ncbi_std.h>
35 #include <algo/blast/core/blast_def.h>
36 #include <algo/blast/core/blast_lookup.h>
37 #include <algo/blast/core/blast_options.h>
38 
39 #include <algo/blast/core/blast_seqsrc.h>
40 
41 #ifdef __cplusplus
42 extern "C" {
43 #endif
44 
45 /** choose the type of nucleotide lookup table to be used
46  * for a blast search
47  * @param lookup_options Options for lookup table creation [in]
48  * @param approx_table_entries An upper bound on the number of words
49  *        that must be added to the lookup table [in]
50  * @param lut_width The number of nucleotides in one lookup table word [out]
51  @param query_length Number of letters in the query [in]
52  * @return the lookup table type chosen
53  */
54 ELookupTableType
55 BlastChooseNaLookupTable(const LookupTableOptions* lookup_options,
56                          Int4 approx_table_entries, Int4 query_length,
57                          Int4 *lut_width);
58 
59 /*------------------------- Small lookup table --------------------------*/
60 
61 /** Lookup table structure for blastn searches with small queries */
62 typedef struct BlastSmallNaLookupTable {
63     Int4 mask;             /**< part of index to mask off, that is, top
64                                 (wordsize*charsize) bits should be discarded. */
65     Int4 word_length;      /**< Length in bases of the full word match
66                                 required to trigger extension */
67     Int4 lut_word_length;  /**< Length in bases of a word indexed by the
68                                 lookup table */
69     Int4 scan_step;        /**< number of bases between successive words */
70     Int4 backbone_size;    /**< number of cells in the backbone */
71     Int4 longest_chain;    /**< length of the longest chain on the backbone */
72     Int2 * final_backbone; /**< backbone structure used when scanning for
73                                  hits */
74     Int2 * overflow;       /**< the overflow array for the compacted
75                                 lookup table */
76     Int4  overflow_size;   /**< Number of elements in the overflow array */
77     void *scansub_callback; /**< function for scanning subject sequences */
78     void *extend_callback;  /**< function for extending hits */
79     BlastSeqLoc* masked_locations; /**< masked locations, only non-NULL for soft-masking. */
80 } BlastSmallNaLookupTable;
81 
82 /** Create a new small nucleotide lookup table.
83  * @param query The query sequence block (if concatenated sequence, the
84  *        individual strands/sequences must be separated by a 0x0f byte)[in]
85  * @param locations The locations to be included in the lookup table,
86  *        e.g. [0,length-1] for full sequence. NULL means no sequence. [in]
87  * @param lut Pointer to the lookup table to be created [out]
88  * @param opt Options for lookup table creation [in]
89  * @param query_options query options used to get filtering options [in]
90  * @param lut_width The number of nucleotides in one lookup table word [in]
91  * @return 0 if successful, nonzero on failure
92  */
93 Int4 BlastSmallNaLookupTableNew(BLAST_SequenceBlk* query,
94                                 BlastSeqLoc* locations,
95                                 BlastSmallNaLookupTable * *lut,
96                                 const LookupTableOptions * opt,
97                                 const QuerySetUpOptions* query_options,
98                                 Int4 lut_width);
99 
100 /** Free a small nucleotide lookup table.
101  *  @param lookup The lookup table structure to be freed
102  *  @return NULL
103  */
104 BlastSmallNaLookupTable* BlastSmallNaLookupTableDestruct(
105                                         BlastSmallNaLookupTable* lookup);
106 
107 /*----------------------- Standard lookup table -------------------------*/
108 
109 #define NA_HITS_PER_CELL 3 /**< maximum number of hits in one lookup
110                                 table cell */
111 
112 /** structure defining one cell of the compacted lookup table */
113 typedef struct NaLookupBackboneCell {
114     Int4 num_used;             /**< number of hits stored for this cell */
115 
116     union {
117       Int4 overflow_cursor;    /**< integer offset into the overflow array
118                                  where the list of hits for this cell begins */
119       Int4 entries[NA_HITS_PER_CELL];  /**< if the number of hits for this
120                                             cell is NA_HITS_PER_CELL or less,
121                                             the hits are all stored directly in
122                                             the cell */
123     } payload;  /**< UNION that specifies either entries stored right
124                      on the backbone if fewer than NA_HITS_PER_CELL
125                      are present or a pointer to where the hits are
126                      stored (off-backbone). */
127 } NaLookupBackboneCell;
128 
129 /** The basic lookup table structure for blastn searches
130  */
131 typedef struct BlastNaLookupTable {
132     Int4 mask;             /**< part of index to mask off, that is, top
133                                 (wordsize*charsize) bits should be discarded. */
134     Int4 word_length;      /**< Length in bases of the full word match
135                                 required to trigger extension */
136     Int4 lut_word_length;  /**< Length in bases of a word indexed by the
137                                 lookup table */
138     Int4 scan_step;        /**< number of bases between successive words */
139     Int4 backbone_size;    /**< number of cells in the backbone */
140     Int4 longest_chain;    /**< length of the longest chain on the backbone */
141     NaLookupBackboneCell * thick_backbone; /**< the "thick" backbone. after
142                                               queries are indexed, compact the
143                                               backbone to put at most
144                                               NA_HITS_PER_CELL hits on the
145                                               backbone, otherwise point to
146                                               some overflow storage */
147     Int4 * overflow;       /**< the overflow array for the compacted
148                                 lookup table */
149     Int4  overflow_size;   /**< Number of elements in the overflow array */
150     PV_ARRAY_TYPE *pv;     /**< Presence vector bitfield; bit positions that
151                                 are set indicate that the corresponding thick
152                                 backbone cell contains hits */
153     void *scansub_callback; /**< function for scanning subject sequences */
154     void *extend_callback;  /**< function for extending hits */
155     BlastSeqLoc* masked_locations; /**< masked locations, only non-NULL for soft-masking. */
156 } BlastNaLookupTable;
157 
158 /** Create a new nucleotide lookup table.
159  * @param query The query sequence block (if concatenated sequence, the
160  *        individual strands/sequences must be separated by a 0x0f byte)[in]
161  * @param locations The locations to be included in the lookup table,
162  *        e.g. [0,length-1] for full sequence. NULL means no sequence. [in]
163  * @param lut Pointer to the lookup table to be created [out]
164  * @param opt Options for lookup table creation [in]
165  * @param query_options query options used to get filtering options [in]
166  * @param lut_width The number of nucleotides in one lookup table word [in]
167  * @return 0 if successful, nonzero on failure
168  */
169 Int4 BlastNaLookupTableNew(BLAST_SequenceBlk* query,
170                            BlastSeqLoc* locations,
171                            BlastNaLookupTable * *lut,
172                            const LookupTableOptions * opt,
173                            const QuerySetUpOptions* query_options,
174                            Int4 lut_width);
175 
176 /** Free a nucleotide lookup table.
177  *  @param lookup The lookup table structure to be freed
178  *  @return NULL
179  */
180 BlastNaLookupTable* BlastNaLookupTableDestruct(BlastNaLookupTable* lookup);
181 
182 /*----------------------- Megablast lookup table -------------------------*/
183 
184 /** General types of discontiguous word templates */
185 typedef enum {
186    eMBWordCoding = 0,
187    eMBWordOptimal = 1,
188    eMBWordTwoTemplates = 2
189 } EDiscWordType;
190 
191 /** Enumeration of all discontiguous word templates; the enumerated values
192  * encode the weight, template length and type information
193  *
194  * <PRE>
195  *  Optimal word templates:
196  * Number of 1's in a template is word size (weight);
197  * total number of 1's and 0's - template length.
198  *   1,110,110,110,110,111      - 12 of 16
199  *   1,110,010,110,110,111      - 11 of 16
200  * 111,010,110,010,110,111      - 12 of 18
201  * 111,010,010,110,010,111      - 11 of 18
202  * 111,010,010,110,010,010,111  - 12 of 21
203  * 111,010,010,100,010,010,111  - 11 of 21
204  *  Coding word templates:
205  *    111,110,110,110,110,1     - 12 of 16
206  *    110,110,110,110,110,1     - 11 of 16
207  * 10,110,110,110,110,110,1     - 12 of 18
208  * 10,110,110,010,110,110,1     - 11 of 18
209  * 10,010,110,110,110,010,110,1 - 12 of 21
210  * 10,010,110,010,110,010,110,1 - 11 of 21
211  * </PRE>
212  *
213  * Sequence data processed by these templates is assumed to be arranged
214  * from left to right
215  *
216  * Index values are calculated by masking the respective pieces of sequence so
217  * only bits corresponding to a contiguous string of 1's in a template are
218  * left, then shifting the masked value to a correct position in the final
219  * 22- or 24-bit lookup table index, which is the sum of such shifts.
220  */
221 typedef enum {
222    eDiscTemplateContiguous = 0,
223    eDiscTemplate_11_16_Coding = 1,
224    eDiscTemplate_11_16_Optimal = 2,
225    eDiscTemplate_12_16_Coding = 3,
226    eDiscTemplate_12_16_Optimal = 4,
227    eDiscTemplate_11_18_Coding = 5,
228    eDiscTemplate_11_18_Optimal = 6,
229    eDiscTemplate_12_18_Coding = 7,
230    eDiscTemplate_12_18_Optimal = 8,
231    eDiscTemplate_11_21_Coding = 9,
232    eDiscTemplate_11_21_Optimal = 10,
233    eDiscTemplate_12_21_Coding = 11,
234    eDiscTemplate_12_21_Optimal = 12
235 } EDiscTemplateType;
236 
237 /** The lookup table structure used for Mega BLAST */
238 typedef struct BlastMBLookupTable {
239     Int4 word_length;      /**< number of exact letter matches that will trigger
240                               an ungapped extension */
241     Int4 lut_word_length;  /**< number of letters in a lookup table word */
242     Int8 hashsize;       /**< = 4^(lut_word_length) */
243     Boolean discontiguous; /**< Are discontiguous words used? */
244     Int4 template_length; /**< Length of the discontiguous word template */
245     EDiscTemplateType template_type; /**< Type of the discontiguous
246                                          word template */
247     Boolean two_templates; /**< Use two templates simultaneously */
248     EDiscTemplateType second_template_type; /**< Type of the second
249                                                 discontiguous word template */
250 
251     Boolean stride;     /**< is lookup table created with a stride */
252     Int4 scan_step;     /**< Step size for scanning the database */
253     Int4* hashtable;   /**< Array of positions              */
254     Int4* hashtable2;  /**< Array of positions for second template */
255     Int4* next_pos;    /**< Extra positions stored here     */
256     Int4* next_pos2;   /**< Extra positions for the second template */
257     PV_ARRAY_TYPE *pv_array;/**< Presence vector, used for quick presence
258                                check */
259     Int4 pv_array_bts; /**< The exponent of 2 by which pv_array is smaller than
260                            the backbone */
261     Int4 longest_chain; /**< Largest number of query positions for a given
262                            word */
263     void *scansub_callback; /**< function for scanning subject sequences */
264     void *extend_callback;  /**< function for extending hits */
265 
266     Int4 num_unique_pos_added; /**< Number of positions added to the l.t. */
267     Int4 num_words_added; /**< Number of words added to the l.t. */
268     BlastSeqLoc* masked_locations; /**< masked locations, only non-NULL for soft-masking. */
269 } BlastMBLookupTable;
270 
271 /**
272  * Create the lookup table for Mega BLAST
273  * @param query The query sequence block (if concatenated sequence, the
274  *        individual strands/sequences must be separated by a 0x0f byte)[in]
275  * @param location The locations to be included in the lookup table,
276  *        e.g. [0,length-1] for full sequence. NULL means no sequence. [in]
277  * @param mb_lt_ptr Pointer to the lookup table to be created [out]
278  * @param lookup_options Options for lookup table creation [in]
279  * @param query_options query options used to get filtering options [in]
280  * @param approx_table_entries An upper bound on the number of words
281  *        that must be added to the lookup table [in]
282  * @param lut_width The number of nucleotides in one lookup table word [in]
283  * @param seqsrc Database sequences [in]
284  */
285 Int2 BlastMBLookupTableNew(BLAST_SequenceBlk* query, BlastSeqLoc* location,
286                            BlastMBLookupTable** mb_lt_ptr,
287                            const LookupTableOptions* lookup_options,
288                            const QuerySetUpOptions* query_options,
289                            Int4 approx_table_entries,
290                            Int4 lut_width,
291                            BlastSeqSrc* seqsrc);
292 
293 /**
294  * Deallocate memory used by the Mega BLAST lookup table
295  */
296 BlastMBLookupTable* BlastMBLookupTableDestruct(BlastMBLookupTable* mb_lt);
297 
298 /*----------------------- Discontiguous Megablast -------------------------*/
299 
300 /** Forms a lookup table index for the 11-of-16 coding template in
301  *  discontiguous megablast
302  * @param accum accumulator containing the 2-bit bases that will
303  *              be used to create the index. Bases most recently
304  *              added to the accumulator are in the low-order bits
305  * @return The 22-bit lookup table index
306  */
DiscontigIndex_11_16_Coding(Uint8 accum)307 static NCBI_INLINE Int4 DiscontigIndex_11_16_Coding(Uint8 accum)
308 {
309     Uint4 lo = (Uint4)accum;
310     return ((lo & 0x00000003)      ) |
311            ((lo & 0x000000f0) >>  2) |
312            ((lo & 0x00003c00) >>  4) |
313            ((lo & 0x000f0000) >>  6) |
314            ((lo & 0x03c00000) >>  8) |
315            ((lo & 0xf0000000) >> 10);
316 }
317 
318 /** Forms a lookup table index for the 11-of-16 optimal template in
319  *  discontiguous megablast
320  * @param accum accumulator containing the 2-bit bases that will
321  *              be used to create the index. Bases most recently
322  *              added to the accumulator are in the low-order bits
323  * @return The 22-bit lookup table index
324  */
DiscontigIndex_11_16_Optimal(Uint8 accum)325 static NCBI_INLINE Int4 DiscontigIndex_11_16_Optimal(Uint8 accum)
326 {
327     Uint4 lo = (Uint4)accum;
328     return ((lo & 0x0000003f)      ) |
329            ((lo & 0x00000f00) >>  2) |
330            ((lo & 0x0003c000) >>  4) |
331            ((lo & 0x00300000) >>  6) |
332            ((lo & 0xfc000000) >> 10);
333 }
334 
335 /** Forms a lookup table index for the 11-of-18 coding template in
336  *  discontiguous megablast
337  * @param accum accumulator containing the 2-bit bases that will
338  *              be used to create the index. Bases most recently
339  *              added to the accumulator are in the low-order bits
340  * @return The 22-bit lookup table index
341  */
DiscontigIndex_11_18_Coding(Uint8 accum)342 static NCBI_INLINE Int4 DiscontigIndex_11_18_Coding(Uint8 accum)
343 {
344     Uint4 lo = (Uint4)accum;
345     Uint4 hi = (Uint4)(accum >> 32);
346     return ((lo & 0x00000003)      ) |
347            ((lo & 0x000000f0) >>  2) |
348            ((lo & 0x00003c00) >>  4) |
349            ((lo & 0x00030000) >>  6) |
350            ((lo & 0x03c00000) >> 10) |
351            ((lo & 0xf0000000) >> 12) |
352            ((hi & 0x0000000c) << 18);
353 }
354 
355 /** Forms a lookup table index for the 11-of-18 optimal template in
356  *  discontiguous megablast
357  * @param accum accumulator containing the 2-bit bases that will
358  *              be used to create the index. Bases most recently
359  *              added to the accumulator are in the low-order bits
360  * @return The 22-bit lookup table index
361  */
DiscontigIndex_11_18_Optimal(Uint8 accum)362 static NCBI_INLINE Int4 DiscontigIndex_11_18_Optimal(Uint8 accum)
363 {
364     Uint4 lo = (Uint4)accum;
365     Uint4 hi = (Uint4)(accum >> 32);
366     return ((lo & 0x0000003f)      ) |
367            ((lo & 0x00000300) >>  2) |
368            ((lo & 0x0003c000) >>  6) |
369            ((lo & 0x00300000) >>  8) |
370            ((lo & 0x0c000000) >> 12) |
371            ((lo & 0xc0000000) >> 14) |
372            ((hi & 0x0000000f) << 18);
373 }
374 
375 /** Forms a lookup table index for the 11-of-21 coding template in
376  *  discontiguous megablast
377  * @param accum accumulator containing the 2-bit bases that will
378  *              be used to create the index. Bases most recently
379  *              added to the accumulator are in the low-order bits
380  * @return The 22-bit lookup table index
381  */
DiscontigIndex_11_21_Coding(Uint8 accum)382 static NCBI_INLINE Int4 DiscontigIndex_11_21_Coding(Uint8 accum)
383 {
384     Uint4 lo = (Uint4)accum;
385     Uint4 hi = (Uint4)(accum >> 32);
386     return ((lo & 0x00000003)      ) |
387            ((lo & 0x000000f0) >>  2) |
388            ((lo & 0x00000c00) >>  4) |
389            ((lo & 0x000f0000) >>  8) |
390            ((lo & 0x00c00000) >> 10) |
391            ((lo & 0xf0000000) >> 14) |
392            ((hi & 0x0000000c) << 16) |
393            ((hi & 0x00000300) << 12);
394 }
395 
396 /** Forms a lookup table index for the 11-of-21 optimal template in
397  *  discontiguous megablast
398  * @param accum accumulator containing the 2-bit bases that will
399  *              be used to create the index. Bases most recently
400  *              added to the accumulator are in the low-order bits
401  * @return The 24-bit lookup table index
402  */
DiscontigIndex_11_21_Optimal(Uint8 accum)403 static NCBI_INLINE Int4 DiscontigIndex_11_21_Optimal(Uint8 accum)
404 {
405     Uint4 lo = (Uint4)accum;
406     Uint4 hi = (Uint4)(accum >> 32);
407     return ((lo & 0x0000003f)      ) |
408            ((lo & 0x00000300) >>  2) |
409            ((lo & 0x0000c000) >>  6) |
410            ((lo & 0x00c00000) >> 12) |
411            ((lo & 0x0c000000) >> 14) |
412            ((hi & 0x00000003) << 14) |
413            ((hi & 0x000003f0) << 12);
414 }
415 
416 /** Forms a lookup table index for the 12-of-16 coding template in
417  *  discontiguous megablast
418  * @param accum accumulator containing the 2-bit bases that will
419  *              be used to create the index. Bases most recently
420  *              added to the accumulator are in the low-order bits
421  * @return The 24-bit lookup table index
422  */
DiscontigIndex_12_16_Coding(Uint8 accum)423 static NCBI_INLINE Int4 DiscontigIndex_12_16_Coding(Uint8 accum)
424 {
425     Uint4 lo = (Uint4)accum;
426     return ((lo & 0x00000003)      ) |
427            ((lo & 0x000000f0) >>  2) |
428            ((lo & 0x00003c00) >>  4) |
429            ((lo & 0x000f0000) >>  6) |
430            ((lo & 0xffc00000) >>  8);
431 }
432 
433 /** Forms a lookup table index for the 12-of-16 optimal template in
434  *  discontiguous megablast
435  * @param accum accumulator containing the 2-bit bases that will
436  *              be used to create the index. Bases most recently
437  *              added to the accumulator are in the low-order bits
438  * @return The 24-bit lookup table index
439  */
DiscontigIndex_12_16_Optimal(Uint8 accum)440 static NCBI_INLINE Int4 DiscontigIndex_12_16_Optimal(Uint8 accum)
441 {
442     Uint4 lo = (Uint4)accum;
443     return ((lo & 0x0000003f)     ) |
444            ((lo & 0x00000f00) >> 2) |
445            ((lo & 0x0003c000) >> 4) |
446            ((lo & 0x00f00000) >> 6) |
447            ((lo & 0xfc000000) >> 8);
448 }
449 
450 /** Forms a lookup table index for the 12-of-18 coding template in
451  *  discontiguous megablast
452  * @param accum accumulator containing the 2-bit bases that will
453  *              be used to create the index. Bases most recently
454  *              added to the accumulator are in the low-order bits
455  * @return The 24-bit lookup table index
456  */
DiscontigIndex_12_18_Coding(Uint8 accum)457 static NCBI_INLINE Int4 DiscontigIndex_12_18_Coding(Uint8 accum)
458 {
459     Uint4 lo = (Uint4)accum;
460     Uint4 hi = (Uint4)(accum >> 32);
461     return ((lo & 0x00000003)      ) |
462            ((lo & 0x000000f0) >>  2) |
463            ((lo & 0x00003c00) >>  4) |
464            ((lo & 0x000f0000) >>  6) |
465            ((lo & 0x03c00000) >>  8) |
466            ((lo & 0xf0000000) >> 10) |
467            ((hi & 0x0000000c) << 20);
468 }
469 
470 /** Forms a lookup table index for the 12-of-18 optimal template in
471  *  discontiguous megablast
472  * @param accum accumulator containing the 2-bit bases that will
473  *              be used to create the index. Bases most recently
474  *              added to the accumulator are in the low-order bits
475  * @return The 24-bit lookup table index
476  */
DiscontigIndex_12_18_Optimal(Uint8 accum)477 static NCBI_INLINE Int4 DiscontigIndex_12_18_Optimal(Uint8 accum)
478 {
479     Uint4 lo = (Uint4)accum;
480     Uint4 hi = (Uint4)(accum >> 32);
481     return ((lo & 0x0000003f)      ) |
482            ((lo & 0x00000f00) >>  2) |
483            ((lo & 0x0000c000) >>  4) |
484            ((lo & 0x00f00000) >>  8) |
485            ((lo & 0x0c000000) >> 10) |
486            ((lo & 0xc0000000) >> 12) |
487            ((hi & 0x0000000f) << 20);
488 }
489 
490 /** Forms a lookup table index for the 12-of-21 coding template in
491  *  discontiguous megablast
492  * @param accum accumulator containing the 2-bit bases that will
493  *              be used to create the index. Bases most recently
494  *              added to the accumulator are in the low-order bits
495  * @return The 24-bit lookup table index
496  */
DiscontigIndex_12_21_Coding(Uint8 accum)497 static NCBI_INLINE Int4 DiscontigIndex_12_21_Coding(Uint8 accum)
498 {
499     Uint4 lo = (Uint4)accum;
500     Uint4 hi = (Uint4)(accum >> 32);
501     return ((lo & 0x00000003)      ) |
502            ((lo & 0x000000f0) >>  2) |
503            ((lo & 0x00000c00) >>  4) |
504            ((lo & 0x000f0000) >>  8) |
505            ((lo & 0x03c00000) >> 10) |
506            ((lo & 0xf0000000) >> 12) |
507            ((hi & 0x0000000c) << 18) |
508            ((hi & 0x00000300) << 14);
509 }
510 
511 /** Forms a lookup table index for the 12-of-21 optimal template in
512  *  discontiguous megablast
513  * @param accum accumulator containing the 2-bit bases that will
514  *              be used to create the index. Bases most recently
515  *              added to the accumulator are in the low-order bits
516  * @return The 24-bit lookup table index
517  */
DiscontigIndex_12_21_Optimal(Uint8 accum)518 static NCBI_INLINE Int4 DiscontigIndex_12_21_Optimal(Uint8 accum)
519 {
520     Uint4 lo = (Uint4)accum;
521     Uint4 hi = (Uint4)(accum >> 32);
522     return ((lo & 0x0000003f)      ) |
523            ((lo & 0x00000300) >>  2) |
524            ((lo & 0x0000c000) >>  6) |
525            ((lo & 0x00f00000) >> 10) |
526            ((lo & 0x0c000000) >> 12) |
527            ((hi & 0x00000003) << 16) |
528            ((hi & 0x000003f0) << 14);
529 }
530 
531 /** Given an accumulator containing packed bases, compute the discontiguous
532  *  word index specified by template_type. Only the low-order (2 *
533  *  template_length) bits of the accumulator are used; the base most recently
534  *  added to the accumulator is in the two lowest bits.
535 *
536  * @param accum The accumulator [in]
537  * @param template_type What type of discontiguous word template to use [in]
538  * @return The lookup table index of the discontiguous word
539  */
ComputeDiscontiguousIndex(Uint8 accum,EDiscTemplateType template_type)540 static NCBI_INLINE Int4 ComputeDiscontiguousIndex(Uint8 accum,
541                                     EDiscTemplateType template_type)
542 {
543    Int4 index;
544 
545    switch (template_type) {
546    case eDiscTemplate_11_16_Coding:
547       index = DiscontigIndex_11_16_Coding(accum);
548       break;
549    case eDiscTemplate_12_16_Coding:
550       index = DiscontigIndex_12_16_Coding(accum);
551       break;
552    case eDiscTemplate_11_16_Optimal:
553       index = DiscontigIndex_11_16_Optimal(accum);
554       break;
555    case eDiscTemplate_12_16_Optimal:
556       index = DiscontigIndex_12_16_Optimal(accum);
557       break;
558    case eDiscTemplate_11_18_Coding:
559       index = DiscontigIndex_11_18_Coding(accum);
560      break;
561    case eDiscTemplate_12_18_Coding:
562       index = DiscontigIndex_12_18_Coding(accum);
563       break;
564    case eDiscTemplate_11_18_Optimal:
565       index = DiscontigIndex_11_18_Optimal(accum);
566       break;
567    case eDiscTemplate_12_18_Optimal:
568       index = DiscontigIndex_12_18_Optimal(accum);
569       break;
570    case eDiscTemplate_11_21_Coding:
571       index = DiscontigIndex_11_21_Coding(accum);
572       break;
573    case eDiscTemplate_12_21_Coding:
574       index = DiscontigIndex_12_21_Coding(accum);
575       break;
576    case eDiscTemplate_11_21_Optimal:
577       index = DiscontigIndex_11_21_Optimal(accum);
578       break;
579    case eDiscTemplate_12_21_Optimal:
580       index = DiscontigIndex_12_21_Optimal(accum);
581       break;
582    default:
583       index = 0;
584       break;
585    }
586 
587    return index;
588 }
589 
590 /** Find an index into a sparse array using pv_array as a bit field
591  * @param index Index to translate [in]
592  * @param pv_array Bit field (lookup table pv_array) [in]
593  * @param pv_array_bts Log2 of pv_array size [in]
594  * @param pv_counts Array of set bit counts, where pv_counts[i] is the number
595  *                   of bits set in pv_array[0..i-1] [in]
596  * @param words_per_bit Number of words represented by one bit in the
597  *                       pv_array [in]
598  * @return Index into the sparse array
599  */
600 Int8 FindSparseIndex(Int8 index, PV_ARRAY_TYPE* pv_array, Int4 pv_array_bts,
601                      Int4* pv_counts, Int4 words_per_bit);
602 
603 
604 /** Find an index into a sparse array using pv_array as a bit field, for a
605  * batch of indices
606  * @param in_array Input array of indices to translate [in]
607  * @param out_array Output array of indices to translate [out]
608  * @param length Number of indices to translate [in]
609  * @param pv_array Bit field (lookup table pv_array) [in]
610  * @param pv_array_bts Log2 of pv_array size [in]
611  * @param pv_counts Array of set bit counts, where pv_counts[i] is the number
612  *                   of bits set in pv_array[0..i-1] [in]
613  * @param words_per_bit Number of words represented by one bit in the
614  *                       pv_array [in]
615  * @return Index into the sparse array
616  */
617 Int2 FindSparseIndices(Int8* NCBI_RESTRICT in_array, Int8* out_array, Int4 length,
618                        PV_ARRAY_TYPE* pv_array, Int4 pv_array_bts,
619                        Int4* counts, Int4 elems_per_bit);
620 
621 
622 /*----------------------- Hashed Na lookup table -------------------------*/
623 
624 /** Structure defining one cell of the compacted lookup table */
625 typedef struct NaHashLookupBackboneCell {
626 
627     Int1 num_words;      /**< number of words stored under the same hash value */
628     Int1 num_offsets[3]; /**< number of offsets for each word if there are
629                               fewer than 3 */
630 
631     Uint4 words[3];      /**< words stored under this hash value */
632     Int4 offsets[9];     /**< offset locations for each word */
633 
634 } NaHashLookupBackboneCell;
635 
636 
637 typedef struct BlastNaHashLookupTable {
638     Int4 mask;             /**< part of index to mask off, that is, top
639                                 (wordsize*charsize) bits should be discarded. */
640     Int4 word_length;      /**< Length in bases of the full word match
641                                 required to trigger extension */
642     Int4 lut_word_length;  /**< Length in bases of a word indexed by the
643                                 lookup table */
644     Int4 scan_step;        /**< number of bases between successive words */
645     Int4 backbone_size;    /**< number of cells in the backbone */
646     Int4 longest_chain;    /**< length of the longest chain on the backbone */
647     NaHashLookupBackboneCell * thick_backbone; /**< the "thick" backbone. after
648                                               queries are indexed, compact the
649                                               backbone to put at most
650                                               NA_HITS_PER_CELL hits on the
651                                               backbone, otherwise point to
652                                               some overflow storage */
653     Int4* overflow;        /**< the overflow array for the compacted
654                                 lookup table */
655     Int4 offsets_size;     /**< Number of elements in the overflow array */
656     PV_ARRAY_TYPE *pv;     /**< Presence vector bitfield; bit positions that
657                                 are set indicate that the corresponding thick
658                                 backbone cell contains hits */
659     Int4 pv_array_bts;     /**< power of 2 by which to divide a word to access
660                                 PV_ARRAY_TYPE element in pv array */
661     void *scansub_callback; /**< function for scanning subject sequences */
662     void *hash_callback;    /**< hash function to be used for hash table */
663     BlastSeqLoc* masked_locations; /**< masked locations, only non-NULL for
664                                       soft-masking. */
665 } BlastNaHashLookupTable;
666 
667 /* Maximum number of words and offsets that can be strored in thich backbone */
668 #define NA_WORDS_PER_HASH 3
669 #define NA_OFFSETS_PER_HASH 9
670 
671 
672 Int4 BlastNaHashLookupTableNew(BLAST_SequenceBlk* query,
673                                BlastSeqLoc* locations,
674                                BlastNaHashLookupTable** lut,
675                                const LookupTableOptions* opt,
676                                const QuerySetUpOptions* query_options,
677                                BlastSeqSrc* seqsrc,
678                                Uint4 num_threads);
679 
680 
681 /** Free a nucleotide lookup table.
682  *  @param lookup The lookup table structure to be freed
683  *  @return NULL
684  */
685 BlastNaHashLookupTable*
686 BlastNaHashLookupTableDestruct(BlastNaHashLookupTable* lookup);
687 
688 
689 
690 #ifdef __cplusplus
691 }
692 #endif
693 
694 #endif /* !ALGO_BLAST_CORE__BLAST_NTLOOKUP__H */
695