1 /* $Id: blast_nalookup.h,v 1.13 2016/12/06 15:29:16 fukanchi Exp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 */
26
27 /** @file blast_nalookup.h
28 * Routines for creating nucleotide BLAST lookup tables.
29 */
30
31 #ifndef ALGO_BLAST_CORE__BLAST_NTLOOKUP__H
32 #define ALGO_BLAST_CORE__BLAST_NTLOOKUP__H
33
34 #include <algo/blast/core/ncbi_std.h>
35 #include <algo/blast/core/blast_def.h>
36 #include <algo/blast/core/blast_lookup.h>
37 #include <algo/blast/core/blast_options.h>
38
39 #include <algo/blast/core/blast_seqsrc.h>
40
41 #ifdef __cplusplus
42 extern "C" {
43 #endif
44
45 /** choose the type of nucleotide lookup table to be used
46 * for a blast search
47 * @param lookup_options Options for lookup table creation [in]
48 * @param approx_table_entries An upper bound on the number of words
49 * that must be added to the lookup table [in]
50 * @param lut_width The number of nucleotides in one lookup table word [out]
51 @param query_length Number of letters in the query [in]
52 * @return the lookup table type chosen
53 */
54 ELookupTableType
55 BlastChooseNaLookupTable(const LookupTableOptions* lookup_options,
56 Int4 approx_table_entries, Int4 query_length,
57 Int4 *lut_width);
58
59 /*------------------------- Small lookup table --------------------------*/
60
61 /** Lookup table structure for blastn searches with small queries */
62 typedef struct BlastSmallNaLookupTable {
63 Int4 mask; /**< part of index to mask off, that is, top
64 (wordsize*charsize) bits should be discarded. */
65 Int4 word_length; /**< Length in bases of the full word match
66 required to trigger extension */
67 Int4 lut_word_length; /**< Length in bases of a word indexed by the
68 lookup table */
69 Int4 scan_step; /**< number of bases between successive words */
70 Int4 backbone_size; /**< number of cells in the backbone */
71 Int4 longest_chain; /**< length of the longest chain on the backbone */
72 Int2 * final_backbone; /**< backbone structure used when scanning for
73 hits */
74 Int2 * overflow; /**< the overflow array for the compacted
75 lookup table */
76 Int4 overflow_size; /**< Number of elements in the overflow array */
77 void *scansub_callback; /**< function for scanning subject sequences */
78 void *extend_callback; /**< function for extending hits */
79 BlastSeqLoc* masked_locations; /**< masked locations, only non-NULL for soft-masking. */
80 } BlastSmallNaLookupTable;
81
82 /** Create a new small nucleotide lookup table.
83 * @param query The query sequence block (if concatenated sequence, the
84 * individual strands/sequences must be separated by a 0x0f byte)[in]
85 * @param locations The locations to be included in the lookup table,
86 * e.g. [0,length-1] for full sequence. NULL means no sequence. [in]
87 * @param lut Pointer to the lookup table to be created [out]
88 * @param opt Options for lookup table creation [in]
89 * @param query_options query options used to get filtering options [in]
90 * @param lut_width The number of nucleotides in one lookup table word [in]
91 * @return 0 if successful, nonzero on failure
92 */
93 Int4 BlastSmallNaLookupTableNew(BLAST_SequenceBlk* query,
94 BlastSeqLoc* locations,
95 BlastSmallNaLookupTable * *lut,
96 const LookupTableOptions * opt,
97 const QuerySetUpOptions* query_options,
98 Int4 lut_width);
99
100 /** Free a small nucleotide lookup table.
101 * @param lookup The lookup table structure to be freed
102 * @return NULL
103 */
104 BlastSmallNaLookupTable* BlastSmallNaLookupTableDestruct(
105 BlastSmallNaLookupTable* lookup);
106
107 /*----------------------- Standard lookup table -------------------------*/
108
109 #define NA_HITS_PER_CELL 3 /**< maximum number of hits in one lookup
110 table cell */
111
112 /** structure defining one cell of the compacted lookup table */
113 typedef struct NaLookupBackboneCell {
114 Int4 num_used; /**< number of hits stored for this cell */
115
116 union {
117 Int4 overflow_cursor; /**< integer offset into the overflow array
118 where the list of hits for this cell begins */
119 Int4 entries[NA_HITS_PER_CELL]; /**< if the number of hits for this
120 cell is NA_HITS_PER_CELL or less,
121 the hits are all stored directly in
122 the cell */
123 } payload; /**< UNION that specifies either entries stored right
124 on the backbone if fewer than NA_HITS_PER_CELL
125 are present or a pointer to where the hits are
126 stored (off-backbone). */
127 } NaLookupBackboneCell;
128
129 /** The basic lookup table structure for blastn searches
130 */
131 typedef struct BlastNaLookupTable {
132 Int4 mask; /**< part of index to mask off, that is, top
133 (wordsize*charsize) bits should be discarded. */
134 Int4 word_length; /**< Length in bases of the full word match
135 required to trigger extension */
136 Int4 lut_word_length; /**< Length in bases of a word indexed by the
137 lookup table */
138 Int4 scan_step; /**< number of bases between successive words */
139 Int4 backbone_size; /**< number of cells in the backbone */
140 Int4 longest_chain; /**< length of the longest chain on the backbone */
141 NaLookupBackboneCell * thick_backbone; /**< the "thick" backbone. after
142 queries are indexed, compact the
143 backbone to put at most
144 NA_HITS_PER_CELL hits on the
145 backbone, otherwise point to
146 some overflow storage */
147 Int4 * overflow; /**< the overflow array for the compacted
148 lookup table */
149 Int4 overflow_size; /**< Number of elements in the overflow array */
150 PV_ARRAY_TYPE *pv; /**< Presence vector bitfield; bit positions that
151 are set indicate that the corresponding thick
152 backbone cell contains hits */
153 void *scansub_callback; /**< function for scanning subject sequences */
154 void *extend_callback; /**< function for extending hits */
155 BlastSeqLoc* masked_locations; /**< masked locations, only non-NULL for soft-masking. */
156 } BlastNaLookupTable;
157
158 /** Create a new nucleotide lookup table.
159 * @param query The query sequence block (if concatenated sequence, the
160 * individual strands/sequences must be separated by a 0x0f byte)[in]
161 * @param locations The locations to be included in the lookup table,
162 * e.g. [0,length-1] for full sequence. NULL means no sequence. [in]
163 * @param lut Pointer to the lookup table to be created [out]
164 * @param opt Options for lookup table creation [in]
165 * @param query_options query options used to get filtering options [in]
166 * @param lut_width The number of nucleotides in one lookup table word [in]
167 * @return 0 if successful, nonzero on failure
168 */
169 Int4 BlastNaLookupTableNew(BLAST_SequenceBlk* query,
170 BlastSeqLoc* locations,
171 BlastNaLookupTable * *lut,
172 const LookupTableOptions * opt,
173 const QuerySetUpOptions* query_options,
174 Int4 lut_width);
175
176 /** Free a nucleotide lookup table.
177 * @param lookup The lookup table structure to be freed
178 * @return NULL
179 */
180 BlastNaLookupTable* BlastNaLookupTableDestruct(BlastNaLookupTable* lookup);
181
182 /*----------------------- Megablast lookup table -------------------------*/
183
184 /** General types of discontiguous word templates */
185 typedef enum {
186 eMBWordCoding = 0,
187 eMBWordOptimal = 1,
188 eMBWordTwoTemplates = 2
189 } EDiscWordType;
190
191 /** Enumeration of all discontiguous word templates; the enumerated values
192 * encode the weight, template length and type information
193 *
194 * <PRE>
195 * Optimal word templates:
196 * Number of 1's in a template is word size (weight);
197 * total number of 1's and 0's - template length.
198 * 1,110,110,110,110,111 - 12 of 16
199 * 1,110,010,110,110,111 - 11 of 16
200 * 111,010,110,010,110,111 - 12 of 18
201 * 111,010,010,110,010,111 - 11 of 18
202 * 111,010,010,110,010,010,111 - 12 of 21
203 * 111,010,010,100,010,010,111 - 11 of 21
204 * Coding word templates:
205 * 111,110,110,110,110,1 - 12 of 16
206 * 110,110,110,110,110,1 - 11 of 16
207 * 10,110,110,110,110,110,1 - 12 of 18
208 * 10,110,110,010,110,110,1 - 11 of 18
209 * 10,010,110,110,110,010,110,1 - 12 of 21
210 * 10,010,110,010,110,010,110,1 - 11 of 21
211 * </PRE>
212 *
213 * Sequence data processed by these templates is assumed to be arranged
214 * from left to right
215 *
216 * Index values are calculated by masking the respective pieces of sequence so
217 * only bits corresponding to a contiguous string of 1's in a template are
218 * left, then shifting the masked value to a correct position in the final
219 * 22- or 24-bit lookup table index, which is the sum of such shifts.
220 */
221 typedef enum {
222 eDiscTemplateContiguous = 0,
223 eDiscTemplate_11_16_Coding = 1,
224 eDiscTemplate_11_16_Optimal = 2,
225 eDiscTemplate_12_16_Coding = 3,
226 eDiscTemplate_12_16_Optimal = 4,
227 eDiscTemplate_11_18_Coding = 5,
228 eDiscTemplate_11_18_Optimal = 6,
229 eDiscTemplate_12_18_Coding = 7,
230 eDiscTemplate_12_18_Optimal = 8,
231 eDiscTemplate_11_21_Coding = 9,
232 eDiscTemplate_11_21_Optimal = 10,
233 eDiscTemplate_12_21_Coding = 11,
234 eDiscTemplate_12_21_Optimal = 12
235 } EDiscTemplateType;
236
237 /** The lookup table structure used for Mega BLAST */
238 typedef struct BlastMBLookupTable {
239 Int4 word_length; /**< number of exact letter matches that will trigger
240 an ungapped extension */
241 Int4 lut_word_length; /**< number of letters in a lookup table word */
242 Int8 hashsize; /**< = 4^(lut_word_length) */
243 Boolean discontiguous; /**< Are discontiguous words used? */
244 Int4 template_length; /**< Length of the discontiguous word template */
245 EDiscTemplateType template_type; /**< Type of the discontiguous
246 word template */
247 Boolean two_templates; /**< Use two templates simultaneously */
248 EDiscTemplateType second_template_type; /**< Type of the second
249 discontiguous word template */
250
251 Boolean stride; /**< is lookup table created with a stride */
252 Int4 scan_step; /**< Step size for scanning the database */
253 Int4* hashtable; /**< Array of positions */
254 Int4* hashtable2; /**< Array of positions for second template */
255 Int4* next_pos; /**< Extra positions stored here */
256 Int4* next_pos2; /**< Extra positions for the second template */
257 PV_ARRAY_TYPE *pv_array;/**< Presence vector, used for quick presence
258 check */
259 Int4 pv_array_bts; /**< The exponent of 2 by which pv_array is smaller than
260 the backbone */
261 Int4 longest_chain; /**< Largest number of query positions for a given
262 word */
263 void *scansub_callback; /**< function for scanning subject sequences */
264 void *extend_callback; /**< function for extending hits */
265
266 Int4 num_unique_pos_added; /**< Number of positions added to the l.t. */
267 Int4 num_words_added; /**< Number of words added to the l.t. */
268 BlastSeqLoc* masked_locations; /**< masked locations, only non-NULL for soft-masking. */
269 } BlastMBLookupTable;
270
271 /**
272 * Create the lookup table for Mega BLAST
273 * @param query The query sequence block (if concatenated sequence, the
274 * individual strands/sequences must be separated by a 0x0f byte)[in]
275 * @param location The locations to be included in the lookup table,
276 * e.g. [0,length-1] for full sequence. NULL means no sequence. [in]
277 * @param mb_lt_ptr Pointer to the lookup table to be created [out]
278 * @param lookup_options Options for lookup table creation [in]
279 * @param query_options query options used to get filtering options [in]
280 * @param approx_table_entries An upper bound on the number of words
281 * that must be added to the lookup table [in]
282 * @param lut_width The number of nucleotides in one lookup table word [in]
283 * @param seqsrc Database sequences [in]
284 */
285 Int2 BlastMBLookupTableNew(BLAST_SequenceBlk* query, BlastSeqLoc* location,
286 BlastMBLookupTable** mb_lt_ptr,
287 const LookupTableOptions* lookup_options,
288 const QuerySetUpOptions* query_options,
289 Int4 approx_table_entries,
290 Int4 lut_width,
291 BlastSeqSrc* seqsrc);
292
293 /**
294 * Deallocate memory used by the Mega BLAST lookup table
295 */
296 BlastMBLookupTable* BlastMBLookupTableDestruct(BlastMBLookupTable* mb_lt);
297
298 /*----------------------- Discontiguous Megablast -------------------------*/
299
300 /** Forms a lookup table index for the 11-of-16 coding template in
301 * discontiguous megablast
302 * @param accum accumulator containing the 2-bit bases that will
303 * be used to create the index. Bases most recently
304 * added to the accumulator are in the low-order bits
305 * @return The 22-bit lookup table index
306 */
DiscontigIndex_11_16_Coding(Uint8 accum)307 static NCBI_INLINE Int4 DiscontigIndex_11_16_Coding(Uint8 accum)
308 {
309 Uint4 lo = (Uint4)accum;
310 return ((lo & 0x00000003) ) |
311 ((lo & 0x000000f0) >> 2) |
312 ((lo & 0x00003c00) >> 4) |
313 ((lo & 0x000f0000) >> 6) |
314 ((lo & 0x03c00000) >> 8) |
315 ((lo & 0xf0000000) >> 10);
316 }
317
318 /** Forms a lookup table index for the 11-of-16 optimal template in
319 * discontiguous megablast
320 * @param accum accumulator containing the 2-bit bases that will
321 * be used to create the index. Bases most recently
322 * added to the accumulator are in the low-order bits
323 * @return The 22-bit lookup table index
324 */
DiscontigIndex_11_16_Optimal(Uint8 accum)325 static NCBI_INLINE Int4 DiscontigIndex_11_16_Optimal(Uint8 accum)
326 {
327 Uint4 lo = (Uint4)accum;
328 return ((lo & 0x0000003f) ) |
329 ((lo & 0x00000f00) >> 2) |
330 ((lo & 0x0003c000) >> 4) |
331 ((lo & 0x00300000) >> 6) |
332 ((lo & 0xfc000000) >> 10);
333 }
334
335 /** Forms a lookup table index for the 11-of-18 coding template in
336 * discontiguous megablast
337 * @param accum accumulator containing the 2-bit bases that will
338 * be used to create the index. Bases most recently
339 * added to the accumulator are in the low-order bits
340 * @return The 22-bit lookup table index
341 */
DiscontigIndex_11_18_Coding(Uint8 accum)342 static NCBI_INLINE Int4 DiscontigIndex_11_18_Coding(Uint8 accum)
343 {
344 Uint4 lo = (Uint4)accum;
345 Uint4 hi = (Uint4)(accum >> 32);
346 return ((lo & 0x00000003) ) |
347 ((lo & 0x000000f0) >> 2) |
348 ((lo & 0x00003c00) >> 4) |
349 ((lo & 0x00030000) >> 6) |
350 ((lo & 0x03c00000) >> 10) |
351 ((lo & 0xf0000000) >> 12) |
352 ((hi & 0x0000000c) << 18);
353 }
354
355 /** Forms a lookup table index for the 11-of-18 optimal template in
356 * discontiguous megablast
357 * @param accum accumulator containing the 2-bit bases that will
358 * be used to create the index. Bases most recently
359 * added to the accumulator are in the low-order bits
360 * @return The 22-bit lookup table index
361 */
DiscontigIndex_11_18_Optimal(Uint8 accum)362 static NCBI_INLINE Int4 DiscontigIndex_11_18_Optimal(Uint8 accum)
363 {
364 Uint4 lo = (Uint4)accum;
365 Uint4 hi = (Uint4)(accum >> 32);
366 return ((lo & 0x0000003f) ) |
367 ((lo & 0x00000300) >> 2) |
368 ((lo & 0x0003c000) >> 6) |
369 ((lo & 0x00300000) >> 8) |
370 ((lo & 0x0c000000) >> 12) |
371 ((lo & 0xc0000000) >> 14) |
372 ((hi & 0x0000000f) << 18);
373 }
374
375 /** Forms a lookup table index for the 11-of-21 coding template in
376 * discontiguous megablast
377 * @param accum accumulator containing the 2-bit bases that will
378 * be used to create the index. Bases most recently
379 * added to the accumulator are in the low-order bits
380 * @return The 22-bit lookup table index
381 */
DiscontigIndex_11_21_Coding(Uint8 accum)382 static NCBI_INLINE Int4 DiscontigIndex_11_21_Coding(Uint8 accum)
383 {
384 Uint4 lo = (Uint4)accum;
385 Uint4 hi = (Uint4)(accum >> 32);
386 return ((lo & 0x00000003) ) |
387 ((lo & 0x000000f0) >> 2) |
388 ((lo & 0x00000c00) >> 4) |
389 ((lo & 0x000f0000) >> 8) |
390 ((lo & 0x00c00000) >> 10) |
391 ((lo & 0xf0000000) >> 14) |
392 ((hi & 0x0000000c) << 16) |
393 ((hi & 0x00000300) << 12);
394 }
395
396 /** Forms a lookup table index for the 11-of-21 optimal template in
397 * discontiguous megablast
398 * @param accum accumulator containing the 2-bit bases that will
399 * be used to create the index. Bases most recently
400 * added to the accumulator are in the low-order bits
401 * @return The 24-bit lookup table index
402 */
DiscontigIndex_11_21_Optimal(Uint8 accum)403 static NCBI_INLINE Int4 DiscontigIndex_11_21_Optimal(Uint8 accum)
404 {
405 Uint4 lo = (Uint4)accum;
406 Uint4 hi = (Uint4)(accum >> 32);
407 return ((lo & 0x0000003f) ) |
408 ((lo & 0x00000300) >> 2) |
409 ((lo & 0x0000c000) >> 6) |
410 ((lo & 0x00c00000) >> 12) |
411 ((lo & 0x0c000000) >> 14) |
412 ((hi & 0x00000003) << 14) |
413 ((hi & 0x000003f0) << 12);
414 }
415
416 /** Forms a lookup table index for the 12-of-16 coding template in
417 * discontiguous megablast
418 * @param accum accumulator containing the 2-bit bases that will
419 * be used to create the index. Bases most recently
420 * added to the accumulator are in the low-order bits
421 * @return The 24-bit lookup table index
422 */
DiscontigIndex_12_16_Coding(Uint8 accum)423 static NCBI_INLINE Int4 DiscontigIndex_12_16_Coding(Uint8 accum)
424 {
425 Uint4 lo = (Uint4)accum;
426 return ((lo & 0x00000003) ) |
427 ((lo & 0x000000f0) >> 2) |
428 ((lo & 0x00003c00) >> 4) |
429 ((lo & 0x000f0000) >> 6) |
430 ((lo & 0xffc00000) >> 8);
431 }
432
433 /** Forms a lookup table index for the 12-of-16 optimal template in
434 * discontiguous megablast
435 * @param accum accumulator containing the 2-bit bases that will
436 * be used to create the index. Bases most recently
437 * added to the accumulator are in the low-order bits
438 * @return The 24-bit lookup table index
439 */
DiscontigIndex_12_16_Optimal(Uint8 accum)440 static NCBI_INLINE Int4 DiscontigIndex_12_16_Optimal(Uint8 accum)
441 {
442 Uint4 lo = (Uint4)accum;
443 return ((lo & 0x0000003f) ) |
444 ((lo & 0x00000f00) >> 2) |
445 ((lo & 0x0003c000) >> 4) |
446 ((lo & 0x00f00000) >> 6) |
447 ((lo & 0xfc000000) >> 8);
448 }
449
450 /** Forms a lookup table index for the 12-of-18 coding template in
451 * discontiguous megablast
452 * @param accum accumulator containing the 2-bit bases that will
453 * be used to create the index. Bases most recently
454 * added to the accumulator are in the low-order bits
455 * @return The 24-bit lookup table index
456 */
DiscontigIndex_12_18_Coding(Uint8 accum)457 static NCBI_INLINE Int4 DiscontigIndex_12_18_Coding(Uint8 accum)
458 {
459 Uint4 lo = (Uint4)accum;
460 Uint4 hi = (Uint4)(accum >> 32);
461 return ((lo & 0x00000003) ) |
462 ((lo & 0x000000f0) >> 2) |
463 ((lo & 0x00003c00) >> 4) |
464 ((lo & 0x000f0000) >> 6) |
465 ((lo & 0x03c00000) >> 8) |
466 ((lo & 0xf0000000) >> 10) |
467 ((hi & 0x0000000c) << 20);
468 }
469
470 /** Forms a lookup table index for the 12-of-18 optimal template in
471 * discontiguous megablast
472 * @param accum accumulator containing the 2-bit bases that will
473 * be used to create the index. Bases most recently
474 * added to the accumulator are in the low-order bits
475 * @return The 24-bit lookup table index
476 */
DiscontigIndex_12_18_Optimal(Uint8 accum)477 static NCBI_INLINE Int4 DiscontigIndex_12_18_Optimal(Uint8 accum)
478 {
479 Uint4 lo = (Uint4)accum;
480 Uint4 hi = (Uint4)(accum >> 32);
481 return ((lo & 0x0000003f) ) |
482 ((lo & 0x00000f00) >> 2) |
483 ((lo & 0x0000c000) >> 4) |
484 ((lo & 0x00f00000) >> 8) |
485 ((lo & 0x0c000000) >> 10) |
486 ((lo & 0xc0000000) >> 12) |
487 ((hi & 0x0000000f) << 20);
488 }
489
490 /** Forms a lookup table index for the 12-of-21 coding template in
491 * discontiguous megablast
492 * @param accum accumulator containing the 2-bit bases that will
493 * be used to create the index. Bases most recently
494 * added to the accumulator are in the low-order bits
495 * @return The 24-bit lookup table index
496 */
DiscontigIndex_12_21_Coding(Uint8 accum)497 static NCBI_INLINE Int4 DiscontigIndex_12_21_Coding(Uint8 accum)
498 {
499 Uint4 lo = (Uint4)accum;
500 Uint4 hi = (Uint4)(accum >> 32);
501 return ((lo & 0x00000003) ) |
502 ((lo & 0x000000f0) >> 2) |
503 ((lo & 0x00000c00) >> 4) |
504 ((lo & 0x000f0000) >> 8) |
505 ((lo & 0x03c00000) >> 10) |
506 ((lo & 0xf0000000) >> 12) |
507 ((hi & 0x0000000c) << 18) |
508 ((hi & 0x00000300) << 14);
509 }
510
511 /** Forms a lookup table index for the 12-of-21 optimal template in
512 * discontiguous megablast
513 * @param accum accumulator containing the 2-bit bases that will
514 * be used to create the index. Bases most recently
515 * added to the accumulator are in the low-order bits
516 * @return The 24-bit lookup table index
517 */
DiscontigIndex_12_21_Optimal(Uint8 accum)518 static NCBI_INLINE Int4 DiscontigIndex_12_21_Optimal(Uint8 accum)
519 {
520 Uint4 lo = (Uint4)accum;
521 Uint4 hi = (Uint4)(accum >> 32);
522 return ((lo & 0x0000003f) ) |
523 ((lo & 0x00000300) >> 2) |
524 ((lo & 0x0000c000) >> 6) |
525 ((lo & 0x00f00000) >> 10) |
526 ((lo & 0x0c000000) >> 12) |
527 ((hi & 0x00000003) << 16) |
528 ((hi & 0x000003f0) << 14);
529 }
530
531 /** Given an accumulator containing packed bases, compute the discontiguous
532 * word index specified by template_type. Only the low-order (2 *
533 * template_length) bits of the accumulator are used; the base most recently
534 * added to the accumulator is in the two lowest bits.
535 *
536 * @param accum The accumulator [in]
537 * @param template_type What type of discontiguous word template to use [in]
538 * @return The lookup table index of the discontiguous word
539 */
ComputeDiscontiguousIndex(Uint8 accum,EDiscTemplateType template_type)540 static NCBI_INLINE Int4 ComputeDiscontiguousIndex(Uint8 accum,
541 EDiscTemplateType template_type)
542 {
543 Int4 index;
544
545 switch (template_type) {
546 case eDiscTemplate_11_16_Coding:
547 index = DiscontigIndex_11_16_Coding(accum);
548 break;
549 case eDiscTemplate_12_16_Coding:
550 index = DiscontigIndex_12_16_Coding(accum);
551 break;
552 case eDiscTemplate_11_16_Optimal:
553 index = DiscontigIndex_11_16_Optimal(accum);
554 break;
555 case eDiscTemplate_12_16_Optimal:
556 index = DiscontigIndex_12_16_Optimal(accum);
557 break;
558 case eDiscTemplate_11_18_Coding:
559 index = DiscontigIndex_11_18_Coding(accum);
560 break;
561 case eDiscTemplate_12_18_Coding:
562 index = DiscontigIndex_12_18_Coding(accum);
563 break;
564 case eDiscTemplate_11_18_Optimal:
565 index = DiscontigIndex_11_18_Optimal(accum);
566 break;
567 case eDiscTemplate_12_18_Optimal:
568 index = DiscontigIndex_12_18_Optimal(accum);
569 break;
570 case eDiscTemplate_11_21_Coding:
571 index = DiscontigIndex_11_21_Coding(accum);
572 break;
573 case eDiscTemplate_12_21_Coding:
574 index = DiscontigIndex_12_21_Coding(accum);
575 break;
576 case eDiscTemplate_11_21_Optimal:
577 index = DiscontigIndex_11_21_Optimal(accum);
578 break;
579 case eDiscTemplate_12_21_Optimal:
580 index = DiscontigIndex_12_21_Optimal(accum);
581 break;
582 default:
583 index = 0;
584 break;
585 }
586
587 return index;
588 }
589
590 /** Find an index into a sparse array using pv_array as a bit field
591 * @param index Index to translate [in]
592 * @param pv_array Bit field (lookup table pv_array) [in]
593 * @param pv_array_bts Log2 of pv_array size [in]
594 * @param pv_counts Array of set bit counts, where pv_counts[i] is the number
595 * of bits set in pv_array[0..i-1] [in]
596 * @param words_per_bit Number of words represented by one bit in the
597 * pv_array [in]
598 * @return Index into the sparse array
599 */
600 Int8 FindSparseIndex(Int8 index, PV_ARRAY_TYPE* pv_array, Int4 pv_array_bts,
601 Int4* pv_counts, Int4 words_per_bit);
602
603
604 /** Find an index into a sparse array using pv_array as a bit field, for a
605 * batch of indices
606 * @param in_array Input array of indices to translate [in]
607 * @param out_array Output array of indices to translate [out]
608 * @param length Number of indices to translate [in]
609 * @param pv_array Bit field (lookup table pv_array) [in]
610 * @param pv_array_bts Log2 of pv_array size [in]
611 * @param pv_counts Array of set bit counts, where pv_counts[i] is the number
612 * of bits set in pv_array[0..i-1] [in]
613 * @param words_per_bit Number of words represented by one bit in the
614 * pv_array [in]
615 * @return Index into the sparse array
616 */
617 Int2 FindSparseIndices(Int8* NCBI_RESTRICT in_array, Int8* out_array, Int4 length,
618 PV_ARRAY_TYPE* pv_array, Int4 pv_array_bts,
619 Int4* counts, Int4 elems_per_bit);
620
621
622 /*----------------------- Hashed Na lookup table -------------------------*/
623
624 /** Structure defining one cell of the compacted lookup table */
625 typedef struct NaHashLookupBackboneCell {
626
627 Int1 num_words; /**< number of words stored under the same hash value */
628 Int1 num_offsets[3]; /**< number of offsets for each word if there are
629 fewer than 3 */
630
631 Uint4 words[3]; /**< words stored under this hash value */
632 Int4 offsets[9]; /**< offset locations for each word */
633
634 } NaHashLookupBackboneCell;
635
636
637 typedef struct BlastNaHashLookupTable {
638 Int4 mask; /**< part of index to mask off, that is, top
639 (wordsize*charsize) bits should be discarded. */
640 Int4 word_length; /**< Length in bases of the full word match
641 required to trigger extension */
642 Int4 lut_word_length; /**< Length in bases of a word indexed by the
643 lookup table */
644 Int4 scan_step; /**< number of bases between successive words */
645 Int4 backbone_size; /**< number of cells in the backbone */
646 Int4 longest_chain; /**< length of the longest chain on the backbone */
647 NaHashLookupBackboneCell * thick_backbone; /**< the "thick" backbone. after
648 queries are indexed, compact the
649 backbone to put at most
650 NA_HITS_PER_CELL hits on the
651 backbone, otherwise point to
652 some overflow storage */
653 Int4* overflow; /**< the overflow array for the compacted
654 lookup table */
655 Int4 offsets_size; /**< Number of elements in the overflow array */
656 PV_ARRAY_TYPE *pv; /**< Presence vector bitfield; bit positions that
657 are set indicate that the corresponding thick
658 backbone cell contains hits */
659 Int4 pv_array_bts; /**< power of 2 by which to divide a word to access
660 PV_ARRAY_TYPE element in pv array */
661 void *scansub_callback; /**< function for scanning subject sequences */
662 void *hash_callback; /**< hash function to be used for hash table */
663 BlastSeqLoc* masked_locations; /**< masked locations, only non-NULL for
664 soft-masking. */
665 } BlastNaHashLookupTable;
666
667 /* Maximum number of words and offsets that can be strored in thich backbone */
668 #define NA_WORDS_PER_HASH 3
669 #define NA_OFFSETS_PER_HASH 9
670
671
672 Int4 BlastNaHashLookupTableNew(BLAST_SequenceBlk* query,
673 BlastSeqLoc* locations,
674 BlastNaHashLookupTable** lut,
675 const LookupTableOptions* opt,
676 const QuerySetUpOptions* query_options,
677 BlastSeqSrc* seqsrc,
678 Uint4 num_threads);
679
680
681 /** Free a nucleotide lookup table.
682 * @param lookup The lookup table structure to be freed
683 * @return NULL
684 */
685 BlastNaHashLookupTable*
686 BlastNaHashLookupTableDestruct(BlastNaHashLookupTable* lookup);
687
688
689
690 #ifdef __cplusplus
691 }
692 #endif
693
694 #endif /* !ALGO_BLAST_CORE__BLAST_NTLOOKUP__H */
695