1 static char const rcsid[] = "$Id: mblast.c,v 6.215 2006/09/21 13:42:37 madden Exp $";
2
3 /* ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================*/
26
27 /*****************************************************************************
28
29 File name: mblast.c
30
31 Author: Ilya Dondoshansky
32
33 Contents: Mega BLAST functions
34
35 Detailed Contents:
36
37 - Mega BLAST versions of some functions from blast.c returning array
38 of SeqAlignPtrs
39
40 - Functions specific to Mega BLAST
41
42 ******************************************************************************
43 * $Revision: 6.215 $
44 *
45 * $Log: mblast.c,v $
46 * Revision 6.215 2006/09/21 13:42:37 madden
47 * BlastProcessGiLists returns a boolean to specify that an attempt was made to process a list of GIs. If no matches were found this can be reported back to the user
48 *
49 * Revision 6.214 2005/08/31 20:33:18 coulouri
50 * From Mike Gertz:
51 * In MegaBlastSetUpSearchWithReadDbInternal, replaced existing code
52 * for adjusting the hitlist size by a call to BlastSingleQueryResultSize.
53 *
54 * Revision 6.213 2005/03/07 16:30:57 dondosha
55 * In reevaluation with ambiguities, if HSP start changed, update starting offsets in edit block
56 *
57 * Revision 6.212 2004/12/07 16:09:04 coulouri
58 * check malloc returncode
59 *
60 * Revision 6.211 2004/11/25 01:59:38 coulouri
61 * reinitialize full_query_length to prevent overflow of query_seq_combined when only minus strands are searched
62 *
63 * Revision 6.210 2004/11/22 20:54:15 coulouri
64 * initialize pointer variables
65 *
66 * Revision 6.209 2004/11/22 15:17:34 dondosha
67 * Do not calculate length adjustment for empty contexts, i.e. ones with null Karlin blocks
68 *
69 * Revision 6.208 2004/11/19 13:22:05 madden
70 * Remove no_check_score completely (from Mike Gertz)
71 *
72 * Revision 6.207 2004/11/05 15:34:53 coulouri
73 * bring in system includes after toolkit includes so that LONG_BIT is defined correctly on amd64
74 *
75 * Revision 6.206 2004/10/29 21:34:45 dondosha
76 * Calculate length adjustment for each context separately; do not allow effective query length to be less than 1
77 *
78 * Revision 6.205 2004/09/13 18:33:01 dondosha
79 * Advance mask pointer in the list when one of the queries in a set is discarded due to being too short, and also has a lower case mask
80 *
81 * Revision 6.204 2004/05/27 17:35:56 dondosha
82 * Do not flag HSPs for deletion in sorting before doing inclusion tests
83 *
84 * Revision 6.203 2004/05/21 13:53:04 dondosha
85 * Use BLAST_HSPFree to free BLAST_HSP structures, hence no need to call GapXEditBlockDelete in multiple places
86 *
87 * Revision 6.202 2004/03/31 17:58:51 papadopo
88 * Mike Gertz' changes for length adjustment calculations
89 *
90 * Revision 6.201 2004/02/26 15:52:30 papadopo
91 * Mike Gertz' modifications to unify handling of gapped Karlin blocks between protein and nucleotide searches
92 *
93 * Revision 6.200 2004/02/24 14:07:01 camacho
94 * Use approximate sequence length calculation for entrez-limited
95 * nucleotide blast databases.
96 *
97 * Revision 6.199 2004/01/27 20:46:06 dondosha
98 * Allow values 0, 1, 2 for no_traceback megablast option
99 *
100 * Revision 6.198 2004/01/23 22:29:11 dondosha
101 * Increase hitlist size for preliminary alignment if traceback is delayed
102 *
103 * Revision 6.197 2004/01/16 23:43:45 dondosha
104 * No more need for special argument for partial search: it is set in options
105 *
106 * Revision 6.196 2004/01/13 15:46:54 dondosha
107 * Changed cast to nearest integer in length adjustment calculation
108 *
109 * Revision 6.195 2004/01/06 22:37:10 dondosha
110 * Use BLAST_HSPfree function
111 *
112 * Revision 6.194 2003/12/10 17:05:28 dondosha
113 * Added function ReevaluateScoreWithAmbiguities to reevaluate score for one HSP; use it after greedy traceback
114 *
115 * Revision 6.193 2003/11/06 19:53:07 dondosha
116 * Fix in MegaBlastSetupSearchInternalByLoc when query submitted as Bioseq instead of SeqLoc; removed unused variables
117 *
118 * Revision 6.192 2003/11/05 21:11:09 dondosha
119 * Fix for traceback with greedy algorithm
120 *
121 * Revision 6.191 2003/10/29 17:46:59 dondosha
122 * Allow 2-stage greedy extension in megablast
123 *
124 * Revision 6.190 2003/05/30 17:25:36 coulouri
125 * add rcsid
126 *
127 * Revision 6.189 2003/05/20 21:56:34 dondosha
128 * Use correct X-dropoff for preliminary dynamic programming gapped extension
129 *
130 * Revision 6.188 2003/05/14 20:35:58 camacho
131 * Allow searching empty databases
132 *
133 * Revision 6.187 2003/05/13 16:02:53 coulouri
134 * make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
135 *
136 * Revision 6.186 2003/05/12 12:23:44 camacho
137 * Sanity check for number of sequences & db length
138 *
139 * Revision 6.185 2003/05/02 23:02:11 dondosha
140 * Fix for dynamic programming gapped extension
141 *
142 * Revision 6.184 2003/04/24 14:58:20 dondosha
143 * Fixed a minor bug in MegaBlastExtendHit found by A. Morgulis
144 *
145 * Revision 6.183 2003/04/04 17:16:31 dondosha
146 * Previous change reversed, was incorrect
147 *
148 * Revision 6.182 2003/04/03 22:35:44 dondosha
149 * When saving hit list, save hsp contexts modulo 2, since multiple query information is no longer needed
150 *
151 * Revision 6.181 2003/03/19 19:16:20 dondosha
152 * Small correction for a rarely happening case in MegaBlastExtendHit
153 *
154 * Revision 6.180 2003/03/13 19:03:10 dondosha
155 * Minor bug fix in MegaBlastExtendHit
156 *
157 * Revision 6.179 2003/03/13 16:53:03 dondosha
158 * Minor bug fixes
159 *
160 * Revision 6.178 2002/12/26 20:27:40 dondosha
161 * Bug fix for e-values of hits whose extent changes when reevaluating with ambiguities
162 *
163 * Revision 6.177 2002/11/25 19:57:29 dondosha
164 * Further fix to the HSP limit (-H) megablast option
165 *
166 * Revision 6.176 2002/11/22 23:30:34 dondosha
167 * 1. Use array of structures instead of array of pointers for initial offset pairs;
168 * 2. Scan subject sequence to the last byte (it wasn't!) when looking for words
169 *
170 * Revision 6.175 2002/11/06 22:26:49 dondosha
171 * Calculate number of identities at the same time when doing reevaluation for ambiguities
172 *
173 * Revision 6.174 2002/11/04 22:57:24 dondosha
174 * Save number of identities in result HSPs
175 *
176 * Revision 6.173 2002/10/10 21:02:20 dondosha
177 * Fixed reevaluation with ambiguities: if score gets negative, check whether best score already qualifies for saving
178 *
179 * Revision 6.172 2002/09/25 18:35:33 dondosha
180 * Fix in MegaBlastExtendHit for contiguous word size 12
181 *
182 * Revision 6.171 2002/08/30 17:53:51 thiessen
183 * fix missing LIBCALL in prototype
184 *
185 * Revision 6.170 2002/08/30 15:54:11 dondosha
186 * 1. Prototypes of internally used functions moved from mblast.h;
187 * 2. Do not allocate ewp_params and ewp structures when stacks are used to save
188 * initial hits;
189 * 3. Cleaned up initial word extension code.
190 *
191 * Revision 6.169 2002/08/22 13:39:45 camacho
192 * Close the header and sequence files only if allocated
193 *
194 * Revision 6.168 2002/08/20 22:06:02 madden
195 * Restore partial results code for megablast
196 *
197 * Revision 6.167 2002/08/06 15:56:50 dondosha
198 * Removed some old unused code
199 *
200 * Revision 6.166 2002/08/06 15:45:21 dondosha
201 * Corrected the sanity check in MegaBlastGapInfoToSeqAlign
202 *
203 * Revision 6.165 2002/08/01 20:48:25 dondosha
204 * Use BLASTPostSearchLogic function: compute SeqAligns after all results are found
205 *
206 * Revision 6.164 2002/07/31 17:51:09 dondosha
207 * Corrected the two-hit logic in case of word size greater than 12
208 *
209 * Revision 6.163 2002/07/18 19:40:45 dondosha
210 * Added an option to restrict number of HSPs per database sequence
211 *
212 * Revision 6.162 2002/07/17 22:12:58 dondosha
213 * Allow return without computing seqalign for partial search
214 *
215 * Revision 6.161 2002/06/26 00:56:30 camacho
216 *
217 * 1. Fixed bug when searching a mixture of real and mask databases.
218 * 2. Clean up of code that calculates the number of sequences and database
219 * length.
220 *
221 * Revision 6.160 2002/06/24 18:50:20 dondosha
222 * Memory leak fixed
223 *
224 * Revision 6.159 2002/06/13 19:49:24 dondosha
225 * Hide previous change limiting number of HSPs for very large queries
226 *
227 * Revision 6.158 2002/06/13 17:44:27 dondosha
228 * 1. Fixed a bug in revision 6.156;
229 * 2. Limit the maximal number of saved HSPs per database sequence if the
230 * search space is too large to 100 times the number of query sequences.
231 *
232 * Revision 6.157 2002/06/11 14:44:47 dondosha
233 * Return status from some functions instead of search block pointer
234 *
235 * Revision 6.156 2002/06/07 22:39:19 dondosha
236 * Do not mask residues on any of the contexts except first - needed for megablast with non-greedy traceback
237 *
238 * Revision 6.155 2002/06/07 19:30:08 dondosha
239 * Shift offsets for non-greedy preliminary extension when 1-base database scanning is used
240 *
241 * Revision 6.154 2002/06/07 16:47:42 dondosha
242 * Re-sort results after traceback in case of non-greedy extension
243 *
244 * Revision 6.153 2002/05/17 21:40:13 dondosha
245 * Added 2 optimal Mega BLAST word templates for length 21
246 *
247 * Revision 6.152 2002/05/14 22:20:20 dondosha
248 * Renamed maximal discontiguous template type into optimal
249 *
250 * Revision 6.151 2002/05/10 22:38:49 dondosha
251 * Always do preliminary and then traceback gapped extension if dynamic programming extension is used
252 *
253 * Revision 6.150 2002/05/09 17:01:23 dondosha
254 * Renamed typedefs dp_ptr and dp_node to GapXDPPtr and GapXDP
255 *
256 * Revision 6.149 2002/05/08 22:48:12 dondosha
257 * Allocate memory for dynamic programming upfront in Mega BLAST case
258 *
259 * Revision 6.148 2002/05/03 21:36:26 dondosha
260 * Do not allocate memory for greedy algorithm if it is not going to be used
261 *
262 * Revision 6.147 2002/04/24 17:50:53 dondosha
263 * Check percent identity cut-off in all cases
264 *
265 * Revision 6.146 2002/04/23 20:41:20 dondosha
266 * In case of non-affine extension in megablast, check percent identity cutoff after the traceback is obtained
267 *
268 * Revision 6.145 2002/04/11 20:33:47 dondosha
269 * Returned the reaping of hitlists by e-value before reevaluation with ambiguities
270 *
271 * Revision 6.144 2002/04/10 18:20:05 dondosha
272 * Do not subtract 4 from wordsize in BlastFillQueryOffsets
273 *
274 * Revision 6.143 2002/04/09 18:21:00 dondosha
275 * Changed #ifdefs to conditionals, allowing different discontiguous templates in a single binary
276 *
277 * Revision 6.142 2002/04/04 21:19:15 dondosha
278 * Corrections for megablast with non-greedy extensions
279 *
280 * Revision 6.141 2002/04/01 22:26:51 dondosha
281 * Correction in the MegaBlastReevaluateWithAmbiguities function: delete hsp if its evalue dropped below threshold after reevaluation
282 *
283 * Revision 6.140 2002/03/26 21:17:42 dondosha
284 * Changed discontiguous word finding code to allow template length 21
285 *
286 * Revision 6.139 2002/03/06 18:34:31 dondosha
287 * Pass the filtered locations back from the megablast engine to use in formatting
288 *
289 * Revision 6.138 2002/03/05 17:58:57 dondosha
290 * Set same offsets for the traceback as for preliminary extension for megablast with non-greedy extensions
291 *
292 * Revision 6.137 2002/02/15 23:34:04 dondosha
293 * 1. Allow use of hash table instead of index table (need -DUSE_HASH_TABLE)
294 * 2. Allow use of two simultaneous word templates (with -DUSE_TWO_TEMPLATES)
295 * 3. Added functionality fo print masked query sequence with lower case masking.
296 *
297 * Revision 6.136 2002/01/07 23:16:00 dondosha
298 * Fixed several memory leaks and allocation/freeing bugs in multithreaded megablast
299 *
300 * Revision 6.135 2002/01/04 20:08:36 dondosha
301 * Allocate filter string when rodent repeat filtering is needed for neighboring
302 *
303 * Revision 6.134 2001/12/28 20:46:21 dondosha
304 * 1. Mega BLAST related parameters moved into a separate structure
305 * 2. Environment variables for discontiguous words, etc. changed to options
306 * 3. Extension from discontiguous word with length 18 implemented
307 *
308 * Revision 6.133 2001/12/13 16:07:17 dondosha
309 * Do not use mb_endpoint_results in neighboring
310 *
311 * Revision 6.132 2001/12/11 19:45:56 dondosha
312 * Repeat inclusion check after reevaluation of HSPs
313 *
314 * Revision 6.131 2001/12/10 22:54:27 dondosha
315 * Correction for determining the number of stacks to use in megablast
316 *
317 * Revision 6.130 2001/12/04 17:13:08 dondosha
318 * Made number of stacks for megablast word processing depend on query and database
319 *
320 * Revision 6.129 2001/11/27 22:17:20 dondosha
321 * Allow dynamic programming gapped extension only with affine gap scores
322 *
323 * Revision 6.128 2001/11/16 16:25:34 dondosha
324 * In MegaBlastReevaluateWithAmbiguities: bug from previous change fixed
325 *
326 * Revision 6.127 2001/11/14 23:45:46 dondosha
327 * 1. Retrieve unpacked subject sequence directly in blastna encoding
328 * 2. Implemented a possibility to use dynamic programming gapped extension
329 * via an environment variable.
330 *
331 * Revision 6.126 2001/11/13 18:23:54 dondosha
332 * Move environment variable checking for discontiguous words to higher level function, so it is done only once
333 *
334 * Revision 6.125 2001/10/12 21:32:46 dondosha
335 * Added discontiguous word capability to megablast
336 *
337 * Revision 6.124 2001/09/18 16:49:25 dondosha
338 * Removed unneeded functions, eliminated mbutils.h header
339 *
340 * Revision 6.123 2001/09/07 14:46:43 dondosha
341 * Roll back removal of threshold_first from functions and structures
342 *
343 * Revision 6.122 2001/09/06 20:24:34 dondosha
344 * Removed threshold_first
345 *
346 * Revision 6.121 2001/09/05 21:30:11 dondosha
347 * Purify error fixed
348 *
349 * Revision 6.120 2001/09/05 20:33:11 dondosha
350 * Fixed a bug in changes from 07/20/2001
351 *
352 * Revision 6.119 2001/08/01 16:23:28 dondosha
353 * Improvement in procedure removing HSPs on close diagonals
354 *
355 * Revision 6.118 2001/07/25 22:10:28 dondosha
356 * Bug fix in hsp inclusion check
357 *
358 * Revision 6.117 2001/07/20 19:27:52 dondosha
359 * 1. Added pv_array and diagonal array when needed.
360 * 2. Added logic to decide when to unpack db sequence depending on its length.
361 * 3. Rewrote reevaluation with ambiguities function to allow gapped alignment.
362 *
363 * Revision 6.116 2001/06/21 21:44:49 dondosha
364 * Free allocated SeqLocs before error return from search set up
365 *
366 * Revision 6.115 2001/06/15 17:35:13 dondosha
367 * Correction to previous changes
368 *
369 * Revision 6.114 2001/06/14 22:09:15 dondosha
370 * Rearranged code for gi lists and oid masks processing to get rid of duplication
371 *
372 * Revision 6.113 2001/06/13 21:46:27 dondosha
373 * Comparison function name in heapsort of gi list changed
374 *
375 * Revision 6.112 2001/06/05 22:15:10 dondosha
376 * If some queries completely mask, destroy respective KarlinBlk
377 *
378 * Revision 6.111 2001/05/16 14:17:09 dondosha
379 * Correction for previous revision
380 *
381 * Revision 6.110 2001/05/15 20:02:27 dondosha
382 * Adjustments to accommodate query in a Bioseq form
383 *
384 * Revision 6.109 2001/05/08 20:18:16 dondosha
385 * Corrections for removal of short queries and score reevaluation for affine gaps
386 *
387 * Revision 6.108 2001/05/04 19:50:45 dondosha
388 * Improved error message when all queries are shorter than word size
389 *
390 * Revision 6.107 2001/05/04 15:55:21 dondosha
391 * Take into account removal of short sequences in function BlastFillQueryOffsets
392 *
393 * Revision 6.106 2001/05/03 21:48:27 dondosha
394 * Handle some cases when memory allocation fails
395 *
396 * Revision 6.105 2001/04/16 16:19:59 dondosha
397 * Remove query sequences with length less than wordsize, print warning
398 *
399 * Revision 6.104 2001/03/22 19:23:09 dondosha
400 * Changed hsp array new size variable Int4 from Int2
401 *
402 * Revision 6.103 2001/03/19 22:39:24 dondosha
403 * Allow location on the first query sequence for megablast
404 *
405 * Revision 6.102 2001/03/12 19:10:10 dondosha
406 * Do not use window size and multiple hits options in megablast
407 *
408 * Revision 6.101 2001/02/07 21:13:30 dondosha
409 * Pass output stream from options block to search
410 *
411 * Revision 6.100 2001/01/24 20:51:50 dondosha
412 * Enabled splitting of the second sequence for 2 sequences with megablast
413 *
414 * Revision 6.99 2001/01/11 18:31:22 dondosha
415 * Changed error level for nonexistent database from ERROR to FATAL
416 *
417 * Revision 6.98 2001/01/03 21:45:31 dondosha
418 * Fixed a memory leak - some edit blocks not freed in megablast
419 *
420 * Revision 6.97 2000/12/28 22:07:20 dondosha
421 * Exit with error if database not found
422 *
423 * Revision 6.96 2000/12/22 19:48:49 dondosha
424 * Corrected handling of percent identity cutoff in MegaBlastSaveCurrentHitlist
425 *
426 * Revision 6.95 2000/12/21 22:30:42 dondosha
427 * Implemented discarding of HSPs below percent identity cutoff
428 *
429 * Revision 6.94 2000/12/14 17:55:25 dondosha
430 * Fixed handling of subsequences masking in MegaBlastMaskTheResidues
431 *
432 * Revision 6.93 2000/12/07 20:30:37 dondosha
433 * Initialize spp to NULL (not done in previous change)
434 *
435 * Revision 6.92 2000/12/07 17:48:31 dondosha
436 * Handle non-whole query SeqLoc correctly in MegaBlastSetUpSearchInternalByLoc
437 *
438 * Revision 6.91 2000/11/29 23:06:24 dondosha
439 * Removed restriction on number of alignments saved for a single query sequence
440 *
441 * Revision 6.90 2000/11/29 16:27:06 dondosha
442 * Get the ncbi4na-encoded subject sequence only once per subject
443 *
444 * Revision 6.89 2000/11/21 20:18:56 dondosha
445 * Look for HSP inclusion based on diagonal instead of offset
446 *
447 * Revision 6.88 2000/11/16 19:14:10 dondosha
448 * Initialize mb_endpoint_results if no traceback
449 *
450 * Revision 6.87 2000/11/03 20:16:54 dondosha
451 * Changed one_line_results parameter to no_traceback
452 *
453 * Revision 6.86 2000/10/31 15:06:15 dondosha
454 * Added Boolean parameter to PrintMaskedSequence function
455 *
456 * Revision 6.85 2000/10/26 18:46:00 dondosha
457 * Check if gi list file is provided from the db alias
458 *
459 * Revision 6.84 2000/10/19 16:02:13 dondosha
460 * Changed MegaBlastGetPercentIdentity to MegaBlastGetNumIdentical
461 *
462 * Revision 6.83 2000/10/18 14:53:25 dondosha
463 * Corrected MegaBlastReevaluateWithAmbiguities function treatment of lower case query regions
464 *
465 * Revision 6.82 2000/10/12 14:55:55 dondosha
466 * Proceed with search even if some, but not all of the queries are completely masked
467 *
468 * Revision 6.81 2000/10/05 19:53:56 dondosha
469 * Save search results in an array of BlastResultsStruct - separate results for each query
470 *
471 * Revision 6.80 2000/09/29 22:21:47 dondosha
472 * Check if memory allocation succeeds to avoid core dumping exit
473 *
474 * Revision 6.79 2000/09/28 18:27:58 dondosha
475 * Do not free lower case mask SeqLocs before the search
476 *
477 * Revision 6.78 2000/09/28 14:59:01 dondosha
478 * Save initial hits in a small structure MegaBlastExactMatch instead of large BLAST_HSP
479 *
480 * Revision 6.77 2000/09/06 18:35:33 dondosha
481 * Corrected percentage of identities computation for -D1 output
482 *
483 * Revision 6.76 2000/09/01 18:29:12 dondosha
484 * Removed calls to ReadDBFreeSharedInfo and ReadDBCloseMHdrAndSeqFiles
485 *
486 * Revision 6.75 2000/08/30 22:08:44 dondosha
487 * Added function BioseqMegaBlastEngineByLoc
488 *
489 * Revision 6.74 2000/08/25 22:37:15 dondosha
490 * Added function MegaBlastReevaluateWithAmbiguities
491 *
492 * Revision 6.73 2000/08/18 20:12:30 dondosha
493 * Do not use search->query_id in megablast, use only qid_array
494 *
495 * Revision 6.72 2000/08/07 16:59:50 dondosha
496 * Correct construction of path for gi list file
497 *
498 * Revision 6.71 2000/08/07 16:16:12 dondosha
499 * Corrected behavior when input contains an empty sequence
500 *
501 * Revision 6.70 2000/08/02 15:21:49 dondosha
502 * Corrected the way Karlin block is computed for megablast
503 *
504 * Revision 6.69 2000/08/01 18:08:24 dondosha
505 * Fixed function MaskSeqLocFromSeqAlign used for masked query output
506 *
507 * Revision 6.68 2000/07/17 14:37:11 dondosha
508 * Fixed a memory leak in sequence masking
509 *
510 * Revision 6.67 2000/07/14 18:00:39 dondosha
511 * Sort db sequences by best hit expect value in traditional output
512 *
513 * Revision 6.66 2000/07/11 16:46:17 dondosha
514 * Fixed memory leak; use options query_lcase_mask variable for lower case masking
515 *
516 * Revision 6.65 2000/07/10 17:20:17 dondosha
517 * Use several stacks for speed up of search with small word sizes
518 *
519 * Revision 6.64 2000/07/06 17:25:49 dondosha
520 * Pass megablast_full_deflines option to search parameters
521 *
522 * Revision 6.63 2000/07/03 21:51:05 dondosha
523 * For small wordsizes look for 2 hits in a window
524 *
525 * Revision 6.62 2000/06/30 15:07:11 dondosha
526 * Fixed single strand search
527 *
528 * Revision 6.61 2000/06/29 21:10:37 dondosha
529 * Sort seqaligns by evalue only among hits from same database sequence
530 *
531 * Revision 6.60 2000/06/29 14:52:10 dondosha
532 * Typo in previous change
533 *
534 * Revision 6.59 2000/06/29 14:45:43 dondosha
535 * Mask queries with N, not lower case for -Q option output
536 *
537 * Revision 6.58 2000/06/27 22:21:53 dondosha
538 * Set no_check_score to TRUE; Added functions for masked query output
539 *
540 * Revision 6.57 2000/06/27 14:48:39 dondosha
541 * Changed the procedure of HSP inclusion checking before gapped extension to avoid quadratic complexity
542 *
543 * Revision 6.56 2000/06/26 18:21:19 dondosha
544 * Set awake_index to FALSE before any return from MegaBlastSetUpSearchInternalByLoc
545 *
546 * Revision 6.55 2000/06/26 17:35:30 dondosha
547 * If some of the queries are completely masked, the search can still proceed
548 *
549 * Revision 6.54 2000/06/22 22:22:19 dondosha
550 * Fixed a memory leak in MegaBlastNtWordExtend
551 *
552 * Revision 6.53 2000/06/21 18:03:20 dondosha
553 * Use the same hsp_array for gapped extension
554 *
555 * Revision 6.52 2000/06/19 20:08:06 madden
556 * Add last parameter to readdb_get_sequence_ex
557 *
558 * Revision 6.51 2000/06/19 19:42:35 dondosha
559 * Fixed a bug causing megablast to crash when database is not found
560 *
561 * Revision 6.50 2000/06/19 19:26:56 dondosha
562 * Do not go through a double loop on HSPs for small wordsize
563 *
564 * Revision 6.49 2000/06/07 17:39:51 dondosha
565 * Fixed bug causing extra progress messages in nucleotide neighboring
566 *
567 * Revision 6.48 2000/05/31 14:07:14 dondosha
568 * Reallocate memory for the hit stack if it is overflowing
569 *
570 * Revision 6.47 2000/05/26 19:22:28 dondosha
571 * For neighboring test HSP length and percent identity before saving
572 *
573 * Revision 6.46 2000/05/25 21:00:34 dondosha
574 * Set redundant HSPs to NULL in BlastSortUniqHspArray
575 *
576 * Revision 6.45 2000/05/24 19:51:03 dondosha
577 * Initialize qid_array during search set-up; do not use readdb when megablast is used for two sequences
578 *
579 * Revision 6.44 2000/05/22 18:48:07 dondosha
580 * Conform to change in ReadDBFILE structure
581 *
582 * Revision 6.43 2000/05/17 21:27:02 dondosha
583 * Small improvement in handling edit script
584 *
585 * Revision 6.42 2000/05/17 17:11:13 dondosha
586 * Removed some unused variables; fixed comparison function for sorting hsps
587 *
588 * Revision 6.41 2000/05/12 19:44:19 dondosha
589 * Do not use SeqPort for retrieving queries; keep query ids in array to allow binary search; use ncbi4na encoding for subject sequences during extension; added routine BinarySearchInt4
590 *
591 * Revision 6.40 2000/05/03 20:23:04 dondosha
592 * Added function MegaBlastGappedAlign to do the gapped extension after all perfect matches are saved - this significantly improves speed
593 *
594 * Revision 6.39 2000/05/01 21:25:54 dondosha
595 * Changed greedy_gapped_align calls to MegaBlastAffineGreedyAlign; bug fix for neighboring; memory leak fix
596 *
597 * Revision 6.38 2000/04/25 21:51:45 dondosha
598 * Little clean-up in MegaBlastNtWordExtend
599 *
600 * Revision 6.37 2000/04/20 15:14:36 dondosha
601 * Fixed MegaBlastGetFirstAndLastContext and BlastFillQueryOffsets for one strand only search
602 *
603 * Revision 6.36 2000/04/19 18:54:33 dondosha
604 * Bug fix: assign values for end-points different from gap or masked characters
605 *
606 * Revision 6.35 2000/04/18 19:09:03 dondosha
607 * Set the X dropoff correctly for the megablast search
608 *
609 * Revision 6.34 2000/04/14 19:22:52 dondosha
610 * Remove HSPs contained in other HSPs
611 *
612 * Revision 6.33 2000/04/12 19:11:28 dondosha
613 * Create index_callback thread right after the creation of BlastSearchBlk
614 *
615 * Revision 6.32 2000/04/12 18:24:28 dondosha
616 * Added MegaBlastGetPercentIdentity; fixed bugs with memory handling
617 *
618 * Revision 6.31 2000/04/11 21:00:19 dondosha
619 * Fixed memory bug in BlastNeedHumanRepeatFiltering
620 *
621 * Revision 6.30 2000/04/11 19:51:50 dondosha
622 * Removed setting of queue_callback to NULL
623 *
624 * Revision 6.29 2000/04/10 18:03:26 dondosha
625 * Fixed a bug in previous change
626 *
627 * Revision 6.28 2000/04/10 17:44:52 dondosha
628 * Find correct path for Homo_sapiens.n.gil, get this gilist only once
629 *
630 * Revision 6.27 2000/04/10 15:22:35 dondosha
631 * Moved call to BlastFillQueryOffsets to MegaBlastSetUpSearchInternalByLoc
632 *
633 * Revision 6.26 2000/04/07 16:51:57 dondosha
634 * Include callback argument for handling results in BioseqMegaBlastEngine, to be initialized in Main
635 *
636 * Revision 6.25 2000/04/05 18:14:48 dondosha
637 * Moved SeqIdSetDup to objloc.c, slightly changed format of BlastSearchHandleResults
638 *
639 * Revision 6.24 2000/04/04 20:50:46 dondosha
640 * Cleaned from non-megablast (non-blastn) stuff
641 *
642 * Revision 6.23 2000/04/04 16:16:35 dondosha
643 * Fixed some memory leaks in MegaBlast traceback
644 *
645 * Revision 6.22 2000/04/03 23:37:08 dondosha
646 * Added test for human repeat filtering for neighboring
647 *
648 * Revision 6.21 2000/03/31 19:10:32 dondosha
649 * Changed some names related to MegaBlast
650 *
651 * Revision 6.20 2000/03/29 23:32:21 dondosha
652 * Fixed minor bugs from previous change
653 *
654 * Revision 6.19 2000/03/29 22:13:43 dondosha
655 * Enabled processing gap information for MegaBlast code
656 *
657 * Revision 6.18 2000/03/27 20:56:43 madden
658 * Set query frame properly in BlastAdjustHitOffsets (for ungapped blast)
659 *
660 * Revision 6.17 2000/03/23 20:02:02 dondosha
661 * BlastSearchHandleResults exits immediately if hspcnt==0
662 *
663 * Revision 6.16 2000/03/22 17:57:12 dondosha
664 * Improved memory management in MegaBlast
665 *
666 * Revision 6.15 2000/03/16 18:14:00 dondosha
667 * Added routine SeqIdSetDup, improved handling of megablast results
668 *
669 * Revision 6.14 2000/03/13 21:06:59 dondosha
670 * Routine printing results rewritten, plus minor improvements
671 *
672 * Revision 6.13 2000/03/08 20:34:53 madden
673 * Remove ungapped psi-blast stuff
674 *
675 * Revision 6.12 2000/03/03 18:12:22 dondosha
676 * Added routine MegaBlastWordFinderDeallocate to fix multithreaded MegaBlast
677 *
678 * Revision 6.11 2000/03/02 18:29:00 dondosha
679 * Minor bug fix in setting context offsets
680 *
681 * Revision 6.10 2000/03/02 17:23:21 dondosha
682 * Added lower case masking plus bug fixes
683 *
684 * Revision 6.9 2000/02/24 23:20:15 dondosha
685 * Fixed a bug in BlastAdjustHitOffsets
686 *
687 * Revision 6.8 2000/02/24 17:53:38 dondosha
688 * Added routine BlastAdjustHitOffsets
689 *
690 * Revision 6.7 2000/02/23 20:28:38 dondosha
691 * Bug fix for ungapped alignment search, plus style improvements
692 *
693 * Revision 6.6 2000/02/17 20:01:53 dondosha
694 * Removed references to theCacheSize
695 *
696 * Revision 6.5 2000/02/12 21:18:57 kans
697 * added prototype for MegaBlastBuildLookupTable - implemented in lookup.c, called from mblast.c
698 *
699 * Revision 6.4 2000/02/11 21:03:36 dondosha
700 * Added new word finder and extension routines to work with new lokup table
701 *
702 * Revision 6.3 2000/02/03 21:21:10 dondosha
703 * Added header; few bug fixes
704 *
705 * Revision 6.2 2000/02/02 15:03:44 dondosha
706 * Removed unused routine ReapHitlistByContext
707 *
708 * */
709
710 #include <ncbi.h>
711 #include <blastpri.h>
712 #include <lookup.h>
713 #include <objcode.h>
714 #include <objseq.h>
715 #include <sequtil.h>
716 #include <tofasta.h>
717 #include <seqport.h>
718 #include <readdb.h>
719 #include <ncbithr.h>
720 #include <gapxdrop.h>
721 #include <dust.h>
722 #include <mbalign.h>
723 #include <mblast.h>
724 #include <time.h>
725
726 Int4
727 MegaBlastWordFinder (BlastSearchBlkPtr search, LookupTablePtr lookup);
728
729 Int4
730 MegaBlastExtendHit (BlastSearchBlkPtr search, LookupTablePtr lookup,
731 Int4 s_off, Int4 q_off);
732 Int2
733 MegaBlastNtWordExtend (BlastSearchBlkPtr search, Uint1Ptr subject0,
734 Int4 q_off, Int4 s_off);
735
736 Int2 MegaBlastGappedAlign (BlastSearchBlkPtr search);
737
738 void MegaBlastMaskTheResidues (Uint1Ptr buffer, Int4 max_length,
739 Uint1 mask_residue, SeqLocPtr mask_slp, Boolean reverse,
740 Int4 offset, Boolean lowercase_mask);
741
742 void BlastLCaseMaskTheResidues (Uint1Ptr buffer, Int4 max_length,
743 SeqLocPtr mask_slp, Boolean reverse, Int4 offset);
744
745 GapXEditScriptPtr MBToGapXEditScript (edit_script_t PNTR ed_script);
746
747 MBTemplateType GetMBTemplateType (Int2 weight, Int2 length,
748 MBDiscWordType type);
749
750 SeqAlignPtr PNTR MegaBlastPackAlignmentsByQuery (BlastSearchBlkPtr search,
751 SeqAlignPtr seqalign);
752
753 Int2 LIBCALL MegaBlastSequenceAddSequence (BlastSequenceBlkPtr sequence_blk,
754 Uint1Ptr sequence, Uint1Ptr sequence_start, Int4 length,
755 Int4 original_length, Int4 effective_length);
756
757 Int4 SeqLocTotalLen (CharPtr prog_name, SeqLocPtr slp);
758
759 Boolean MegaBlastGetFirstAndLastContext (CharPtr prog_name,
760 SeqLocPtr query_slp, Int2Ptr first_context,
761 Int2Ptr last_context, Uint1 strand_options);
762
763 SeqAlignPtr PNTR
BioseqMegaBlastEngine(BioseqPtr PNTR bspp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)(Int4 done,Int4 positives),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,int (LIBCALLBACK * results_callback)PROTO ((VoidPtr Ptr)))764 BioseqMegaBlastEngine (BioseqPtr PNTR bspp, CharPtr progname, CharPtr database,
765 BLAST_OptionsBlkPtr options, ValNodePtr *other_returns,
766 ValNodePtr *error_returns, int (LIBCALLBACK
767 *callback)(Int4 done,
768 Int4
769 positives),
770 SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list,
771 Int4 gi_list_total,
772 int (LIBCALLBACK *results_callback)PROTO((VoidPtr Ptr)))
773 {
774 SeqLocPtr slp, next_slp;
775 Int4 index = 0;
776 SeqAlignPtr PNTR head;
777 Int4 from = options->required_start, to = options->required_end;
778
779 slp = NULL;
780
781 /* If location given, apply it to the first sequence */
782 if (from > 0 || to > 0) {
783 if (to < 0)
784 to = bspp[0]->length - 1;
785 if (from >= bspp[0]->length || to < 0) {
786 ErrPostEx(SEV_FATAL, 1, 0,
787 "Location outside of the query sequence range\n");
788 return NULL;
789 } else if (to <= from) {
790 ErrPostEx(SEV_FATAL, 1, 0, "Empty query location provided\n");
791 return NULL;
792 }
793 slp = SeqLocIntNew(from, to, options->strand_option, bspp[0]->id);
794 index++;
795 }
796
797 for ( ; bspp[index] != NULL; index++)
798 ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdSetDup(bspp[index]->id));
799
800 head = BioseqMegaBlastEngineByLoc(slp, progname, database, options,
801 other_returns, error_returns, callback,
802 seqid_list, gi_list, gi_list_total,
803 results_callback);
804
805 if (slp->choice == SEQLOC_INT) {
806 next_slp = slp->next;
807 SeqLocFree(slp);
808 slp = next_slp;
809 }
810
811 while (slp) {
812 next_slp = slp->next;
813 SeqIdSetFree((SeqIdPtr) slp->data.ptrvalue);
814 MemFree(slp);
815 slp = next_slp;
816 }
817
818 return head;
819 }
820
821 SeqAlignPtr PNTR
BioseqMegaBlastEngineByLoc(SeqLocPtr slp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)(Int4 done,Int4 positives),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,int (LIBCALLBACK * results_callback)PROTO ((VoidPtr Ptr)))822 BioseqMegaBlastEngineByLoc (SeqLocPtr slp, CharPtr progname, CharPtr database,
823 BLAST_OptionsBlkPtr options, ValNodePtr *other_returns,
824 ValNodePtr *error_returns,
825 int (LIBCALLBACK *callback)(Int4 done, Int4 positives),
826 SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list,
827 Int4 gi_list_total,
828 int (LIBCALLBACK *results_callback)PROTO((VoidPtr Ptr)))
829 {
830 Boolean options_allocated=FALSE;
831 BlastSearchBlkPtr search;
832 Int2 status;
833 SeqAlignPtr PNTR head;
834 SeqLocPtr whole_slp=NULL, slp_var;
835 Int4 index;
836
837
838 head = NULL;
839
840 if (error_returns)
841 {
842 *error_returns = NULL;
843 }
844
845 if (other_returns)
846 {
847 *other_returns = NULL;
848 }
849
850 if (progname == NULL)
851 return NULL;
852
853 /* If no options, use default. */
854 if (options == NULL)
855 {
856 options = BLASTOptionNewEx(progname, TRUE, TRUE);
857 options_allocated = TRUE;
858 }
859
860 status = BLASTOptionValidateEx(options, progname, error_returns);
861 if (status != 0)
862 { /* error messages in other_returns? */
863 return NULL;
864 }
865
866 if (slp == NULL || database == NULL)
867 return NULL;
868
869 search = MegaBlastSetUpSearchWithReadDbInternal(slp, NULL, progname,
870 0,
871 database, options, NULL,
872 seqid_list, gi_list,
873 gi_list_total, NULL);
874
875 if (search == NULL) {
876 /* We need to veryfy if database name is wrong and to set error
877 returns correctly */
878 BlastErrorMsgPtr error_msg;
879 CharPtr chptr;
880 ReadDBFILEPtr rdfp=NULL;
881
882 rdfp = readdb_new(database, FALSE);
883 if(rdfp == NULL) {
884 error_msg = MemNew(sizeof(BlastErrorMsg));
885 chptr = MemNew(StringLen(database) + 256);
886 sprintf(chptr, "Database %s was not found or does not exist",
887 database);
888 error_msg->msg = chptr;
889 error_msg->level = 3; /* FATAL */
890 ValNodeAddPointer(error_returns, 0, error_msg);
891 }
892
893 readdb_destruct(rdfp);
894 return NULL;
895 }
896
897 search->thr_info->tick_callback = callback;
898 search->thr_info->star_callback = callback;
899 search->handle_results = results_callback;
900 search->output = options->output;
901
902 head = BioseqMegaBlastEngineCore(search, options);
903
904 if (search->error_return)
905 {
906 ValNodeLink(error_returns, search->error_return);
907 search->error_return = NULL;
908 }
909
910 if (other_returns)
911 { /* format dbinfo etc. */
912 *other_returns = BlastOtherReturnsPrepare(search);
913 }
914
915 if (options_allocated)
916 {
917 options = BLASTOptionDelete(options);
918 }
919
920 search = BlastSearchBlkDestruct(search);
921
922 /* Adjsut the offset if the query does not cover the entire sequence. */
923 slp_var = slp;
924 for (index=0; slp_var!=NULL; index++,slp_var=slp_var->next) {
925 if (slp_var->choice != SEQLOC_WHOLE) {
926 ValNodeAddPointer(&whole_slp, SEQLOC_WHOLE,
927 SeqIdFindBest(SeqLocId(slp_var), SEQID_GI));
928 if (SeqLocAinB(whole_slp, slp_var) != 0)
929 AdjustOffSetsInSeqAlign(head[index], slp_var, NULL);
930 ValNodeFree(whole_slp);
931 }
932 }
933
934 return head;
935 }
936
937 static Int2 mb_two_hit_min_step;
938
939 SeqAlignPtr PNTR
BioseqMegaBlastEngineCore(BlastSearchBlkPtr search,BLAST_OptionsBlkPtr options)940 BioseqMegaBlastEngineCore(BlastSearchBlkPtr search, BLAST_OptionsBlkPtr options)
941 {
942 SeqAlignPtr seqalign, PNTR seqalignp;
943 Int2 num_queries;
944 CharPtr tmpstr;
945
946 seqalign = NULL;
947 seqalignp = NULL;
948 start_timer;
949 if (search == NULL || search->query_invalid)
950 return NULL;
951
952 /* Starting awake thread if multithreaded. */
953 if (search->searchsp_eff > AWAKE_THR_MIN_SIZE)
954 BlastStartAwakeThread(search->thr_info);
955
956 if ((tmpstr = getenv("MB_TWO_HIT_MIN_STEP")) != NULL)
957 mb_two_hit_min_step = atoi(tmpstr);
958 else
959 mb_two_hit_min_step = 0;
960
961 stop_timer("BioseqBlastEngineCore: before do_the_blast_run()");
962 do_the_blast_run(search);
963
964 /* If this is a partial search, we do not do traceback, so can return
965 immediately */
966 if (options->no_traceback == 1 || search->handle_results) {
967 BlastStopAwakeThread(search->thr_info);
968 return NULL;
969 }
970
971 /* So far the length of the first query was set to the total
972 concatenated length. It can be reset to the real length of the
973 first query sequence here. */
974 search->context[search->first_context].query->length =
975 search->query_context_offsets[search->first_context+1] - 1;
976
977
978 start_timer;
979 num_queries = search->last_context / 2 + 1;
980 seqalignp = (SeqAlignPtr PNTR)
981 MemNew(num_queries*sizeof(SeqAlignPtr));
982 BLASTPostSearchLogic(search, options, seqalignp, FALSE);
983
984 /* Stop the awake thread. */
985 BlastStopAwakeThread(search->thr_info);
986
987 stop_timer("MegaBlastEngineCore: after do_the_blast_run()");
988 return seqalignp;
989 }
990
991 #define INDEX_THR_MIN_SIZE 20000
992 BlastSearchBlkPtr
MegaBlastSetUpSearchWithReadDbInternal(SeqLocPtr query_slp,BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,CharPtr dbname,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,ReadDBFILEPtr rdfp)993 MegaBlastSetUpSearchWithReadDbInternal (SeqLocPtr query_slp, BioseqPtr
994 query_bsp, CharPtr prog_name, Int4 qlen,
995 CharPtr dbname, BLAST_OptionsBlkPtr
996 options, int (LIBCALLBACK *callback)
997 PROTO((Int4 done, Int4 positives)),
998 SeqIdPtr seqid_list,
999 BlastDoubleInt4Ptr gi_list, Int4
1000 gi_list_total, ReadDBFILEPtr rdfp)
1001
1002 {
1003
1004 BlastSearchBlkPtr search;
1005 Boolean options_alloc=FALSE;
1006 Boolean looking_for_gis = FALSE;
1007 Int2 status, first_context, last_context;
1008 Int8 dblen;
1009 Int4 query_length;
1010 Nlm_FloatHi searchsp_eff=0;
1011 Int4 hitlist_size;
1012
1013 /* Allocate default options if none are allocated yet. */
1014 if (options == NULL)
1015 {
1016 options = BLASTOptionNew(prog_name, FALSE);
1017 options_alloc = TRUE;
1018 }
1019 hitlist_size = BlastSingleQueryResultSize(options);
1020
1021 /* last context is the total number of contexts in concatenated
1022 queries */
1023
1024 if (query_slp) {
1025 query_length = SeqLocTotalLen(prog_name, query_slp);
1026 MegaBlastGetFirstAndLastContext(prog_name, query_slp, &first_context, &last_context, options->strand_option);
1027 } else {
1028 query_length = 2*query_bsp->length + 3;
1029 BlastGetFirstAndLastContext(prog_name, query_slp, &first_context, &last_context, options->strand_option);
1030 }
1031
1032 qlen = 0;
1033 /* qlen != 0 is a flag for BlastSearchBlkNewExtra to create the
1034 'diagonal array' structure for saving initial hits that will be
1035 used instead of stacks for word sizes < 11 and short to medium
1036 length queries only */
1037 if (options->wordsize < 11 && query_length <= MAX_DIAG_ARRAY)
1038 qlen = query_length;
1039 /* Pass nonzero length only if need to allocate ewp */
1040 search = BlastSearchBlkNewExtra(options->wordsize, qlen,
1041 dbname, FALSE, 0,
1042 options->threshold_second,
1043 hitlist_size, prog_name, NULL,
1044 first_context, last_context, rdfp,
1045 options->window_size);
1046
1047 if (search) {
1048 if (NlmThreadsAvailable() && query_length > INDEX_THR_MIN_SIZE) {
1049 search->thr_info->awake_index = TRUE;
1050 search->thr_info->last_tick = Nlm_GetSecs();
1051 search->thr_info->index_thr =
1052 NlmThreadCreate(index_proc, search->thr_info);
1053 search->thr_info->index_callback = callback;
1054 }
1055 readdb_get_totals_ex(search->rdfp, &(dblen),
1056 &(search->dbseq_num), TRUE);
1057
1058
1059 /* Create virtual database if any of the databases have gi lists or
1060 ordinal id masks, or if gi list is provided from options */
1061 looking_for_gis = BlastProcessGiLists(search, options, gi_list, gi_list_total);
1062
1063 /* search->thr_info->blast_gi_list will be non-NULL if gi_list or
1064 * options->gilist or options->gifile was non-NULL and therefore
1065 * intersected with any oidlists in the search->rdfp(s). If this is the
1066 * case, we need to recalculate the database length and number of
1067 * sequences */
1068 if (search->thr_info->blast_gi_list && !options->use_real_db_size)
1069 readdb_get_totals_ex3(search->rdfp, &dblen, &search->dbseq_num,
1070 FALSE, TRUE, eApproximate);
1071
1072 if (looking_for_gis && search->thr_info->blast_gi_list == NULL)
1073 {
1074 ErrPostEx(SEV_WARNING, 0, 0, "Intersection of gilist and database IDs is empty");
1075 search->query_invalid = TRUE;
1076 }
1077
1078 /* command-line/options trump alias file. */
1079 if (options->db_length > 0)
1080 dblen = options->db_length;
1081 if (options->dbseq_num > 0)
1082 search->dbseq_num = options->dbseq_num;
1083 if (options->searchsp_eff > 0)
1084 searchsp_eff = options->searchsp_eff;
1085
1086 search->dblen = dblen;
1087 search->searchsp_eff = searchsp_eff;
1088 status = MegaBlastSetUpSearchInternalByLoc (search, query_slp, query_bsp,
1089 prog_name, qlen, options,
1090 callback);
1091 /*
1092 Turn off the index thread by setting this flag.
1093 Don't wait for a join, as the search will take much
1094 longer than the one second for this to die.
1095 */
1096 search->thr_info->awake_index = FALSE;
1097
1098 if (search->pbp->mb_params->no_traceback &&
1099 !search->pbp->mb_params->is_neighboring)
1100 search->mb_endpoint_results = ValNodeNew(NULL);
1101
1102 if (status != 0) {
1103 ErrPostEx(SEV_WARNING, 0, 0, "MegaBlastSetUpSearch failed.");
1104 search->query_invalid = TRUE;
1105 }
1106
1107 if (!search->query_invalid && !search->pbp->mb_params->use_dyn_prog) {
1108 search = GreedyAlignMemAlloc(search);
1109 if (search->abmp == NULL) {
1110 BlastConstructErrorMessage("Mega BLAST", "Memory allocation for greedy algorithm failed.", 2, &(search->error_return));
1111 search->query_invalid = TRUE;
1112 }
1113 } else
1114 search->abmp = NULL;
1115 if (search->rdfp->parameters & READDB_CONTENTS_ALLOCATED)
1116 search->rdfp = ReadDBCloseMHdrAndSeqFiles(search->rdfp);
1117 }
1118
1119 if (options_alloc)
1120 options = BLASTOptionDelete(options);
1121
1122 return search;
1123 }
1124
1125 /* Function below assumes that the gi file is sorted in increasing order */
1126 static Boolean
FindGiInGiList(Uint4 query_gi,BlastDoubleInt4Ptr gi_list,Int4 gi_list_size)1127 FindGiInGiList(Uint4 query_gi, BlastDoubleInt4Ptr gi_list,
1128 Int4 gi_list_size)
1129 {
1130 Int4 begin, end, middle;
1131
1132 begin = 0;
1133 end = gi_list_size - 1;
1134 while (begin <= end) {
1135 middle = (begin + end) / 2;
1136 if (gi_list[middle].gi > query_gi)
1137 end = middle - 1;
1138 else if (gi_list[middle].gi < query_gi)
1139 begin = middle + 1;
1140 else {
1141 return TRUE;
1142 }
1143 }
1144 return FALSE;
1145 }
1146
1147 #define MIN_LIMIT_HSP_NUM 1e20
1148 #define BLASTNA_N_RESIDUE 14
1149
1150 Int2
MegaBlastSetUpSearchInternalByLoc(BlastSearchBlkPtr search,SeqLocPtr query_slp,BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)))1151 MegaBlastSetUpSearchInternalByLoc (BlastSearchBlkPtr search, SeqLocPtr
1152 query_slp, BioseqPtr query_bsp, CharPtr
1153 prog_name, Int4 qlen, BLAST_OptionsBlkPtr
1154 options, int (LIBCALLBACK
1155 *callback)PROTO((Int4 done,
1156 Int4
1157 positives)))
1158
1159 {
1160 Boolean mask_at_hash=FALSE, private_slp_delete;
1161 Char buffer[128];
1162 Int2 retval, status;
1163 Int4 effective_query_length, query_length,
1164 index, length, length_adjustment=0,
1165 min_query_length, full_query_length=0;
1166 Int4 context, num_queries;
1167 Nlm_FloatHi avglen;
1168 SeqIdPtr qid=NULL;
1169 SeqLocPtr filter_slp=NULL, private_slp=NULL, private_slp_rev=NULL, slp=NULL, tmp_slp=NULL;
1170 Uint1 residue;
1171 Uint1Ptr query_seq=NULL, query_seq_start=NULL, query_seq_rev=NULL, query_seq_start_rev=NULL;
1172 Uint1Ptr query_seq_combined=NULL;
1173 CharPtr filter_string=NULL;
1174 Uint4 query_gi;
1175 Int4 homo_gilist_size, mouse_gilist_size, rat_gilist_size;
1176 BlastDoubleInt4Ptr homo_gilist=NULL, mouse_gilist=NULL, rat_gilist=NULL;
1177 SeqLocPtr mask_slp=NULL, next_mask_slp=NULL;
1178 SeqPortPtr spp = NULL;
1179
1180 if (options == NULL) {
1181 ErrPostEx(SEV_FATAL, 1, 0, "BLAST_OptionsBlkPtr is NULL\n");
1182 return 1;
1183 }
1184
1185 if (query_slp == NULL && query_bsp == NULL) {
1186 ErrPostEx(SEV_FATAL, 1, 0, "Query is NULL\n");
1187 return 1;
1188 }
1189
1190 context = search->first_context;
1191
1192 num_queries = search->last_context / 2 + 1;
1193 search->qid_array = (SeqIdPtr PNTR) MemNew(num_queries*sizeof(SeqIdPtr));
1194
1195 if (!query_slp) {
1196 query_length = query_bsp->length;
1197 private_slp = SeqLocIntNew(0, query_length-1 , Seq_strand_plus, SeqIdFindBest(query_bsp->id, SEQID_GI));
1198 private_slp_rev = SeqLocIntNew(0, query_length-1 , Seq_strand_minus, SeqIdFindBest(query_bsp->id, SEQID_GI));
1199 private_slp_delete = FALSE;
1200 search->query_slp = private_slp;
1201 search->allocated += BLAST_SEARCH_ALLOC_QUERY_SLP;
1202 /* Now let query_length be the total length of 2 strands together with
1203 sentinel bytes */
1204 query_length = 2*query_length + 3;
1205 } else {
1206 search->query_slp = query_slp;
1207 query_length = SeqLocTotalLen(search->prog_name, query_slp);
1208 }
1209
1210 if ((search->context[search->first_context].query->sequence_start =
1211 (Uint1Ptr) Malloc((query_length+2)*sizeof(Uint1))) == NULL) {
1212 ErrPostEx(SEV_WARNING, 0, 0, "Memory allocation for query sequence failed");
1213 retval = 1;
1214 goto MegaBlastSetUpReturn;
1215 }
1216
1217 search = BlastFillQueryOffsets(search, search->query_slp,
1218 options->wordsize);
1219
1220 search->translation_buffer = NULL;
1221 search->translation_buffer_size = 0;
1222
1223 /*
1224 Set the context_factor, which specifies how many different
1225 ways the query or db is examined (e.g., blastn looks at both
1226 stands of query, context_factor is 2).
1227 */
1228 /* All strands of all queries concatenated into one */
1229 search->context_factor = 1;
1230
1231 /* Set the ambiguous residue before the ScoreBlk is filled. */
1232 if(options->matrix!=NULL) {
1233 search->sbp->read_in_matrix = TRUE;
1234 } else
1235 search->sbp->read_in_matrix = FALSE;
1236 BlastScoreSetAmbigRes(search->sbp, 'N');
1237
1238 search->sbp->penalty = options->penalty;
1239 search->sbp->reward = options->reward;
1240
1241 /* option is to use alignments chosen by user in PSM computation API (used in WWW PSI-Blast); */
1242 search->pbp->use_best_align = options->use_best_align;
1243
1244 /* Should culling be used at all? */
1245 search->pbp->perform_culling = options->perform_culling;
1246 search->pbp->hsp_range_max = options->hsp_range_max;
1247 search->pbp->mb_params = MegaBlastParameterBlkNew(options);
1248
1249 search->pbp->mb_params->max_positions = MIN(query_length, options->block_width);
1250 search->sbp->query_length = query_length;
1251
1252 search->mb_result_struct = (BLASTResultsStructPtr PNTR)
1253 MemNew(num_queries*sizeof(BLASTResultsStructPtr));
1254
1255 if (options->matrix != NULL)
1256 status = BlastScoreBlkMatFill(search->sbp, options->matrix);
1257 else
1258 status = BlastScoreBlkMatFill(search->sbp, "BLOSUM62");
1259 if (status != 0) {
1260 ErrPostEx(SEV_WARNING, 0, 0, "BlastScoreBlkMatFill returned non-zero status");
1261 retval = 1;
1262 goto MegaBlastSetUpReturn;
1263 }
1264
1265 /* This is used right below. */
1266 search->pbp->gapped_calculation = options->gapped_calculation;
1267 search->pbp->do_not_reevaluate = options->do_not_reevaluate;
1268 search->pbp->do_sum_stats = options->do_sum_stats;
1269 search->pbp->first_db_seq = options->first_db_seq;
1270 search->pbp->final_db_seq = options->final_db_seq;
1271
1272 retval = 0;
1273
1274 context = search->first_context;
1275
1276 if (private_slp && private_slp_delete)
1277 private_slp = SeqLocFree(private_slp);
1278 if (private_slp_rev)
1279 private_slp_rev = SeqLocFree(private_slp_rev);
1280
1281 search->query_id = NULL;
1282 slp = search->query_slp;
1283
1284 if (search->pbp->mb_params->is_neighboring) {
1285 CharPtr filename;
1286 filename = FindBlastDBFile("Homo_sapiens.n.gil");
1287 homo_gilist = GetGisFromFile (filename, &homo_gilist_size);
1288 MemFree(filename);
1289 filename = FindBlastDBFile("Mus_musculus.n.gil");
1290 mouse_gilist = GetGisFromFile(filename, &mouse_gilist_size);
1291 MemFree(filename);
1292 filename = FindBlastDBFile("Rattus_norvegicus.n.gil");
1293 rat_gilist = GetGisFromFile(filename, &rat_gilist_size);
1294 MemFree(filename);
1295 }
1296
1297 /* All lower case masking locations are in one list */
1298 next_mask_slp = options->query_lcase_mask;
1299
1300 while(context<=search->last_context && slp != NULL) {
1301 if (search->first_context == 0) {
1302 private_slp = SeqLocIntNew(SeqLocStart(slp), SeqLocStop(slp),
1303 Seq_strand_plus, SeqLocId(slp));
1304 }
1305 if (search->last_context % 2 == 1) {
1306 private_slp_rev = SeqLocIntNew(SeqLocStart(slp), SeqLocStop(slp),
1307 Seq_strand_minus, SeqLocId(slp));
1308 }
1309
1310 if (private_slp)
1311 tmp_slp = private_slp;
1312 else
1313 tmp_slp = private_slp_rev;
1314
1315 search->qid_array[context/2] = SeqIdSetDup(SeqLocId(slp));
1316
1317 query_length = 0;
1318 query_length = SeqLocLen(tmp_slp);
1319 if (query_length < options->wordsize) {
1320 SeqIdWrite(SeqLocId(tmp_slp), buffer, PRINTID_FASTA_LONG, 127);
1321 ErrPostEx(SEV_WARNING, 0, 0,
1322 "Query sequence %s removed: length %ld is less than wordsize %d",
1323 buffer, query_length, options->wordsize);
1324 if (next_mask_slp &&
1325 SeqIdComp(SeqLocId(next_mask_slp), SeqLocId(tmp_slp)) == SIC_YES) {
1326 next_mask_slp = next_mask_slp->next;
1327 }
1328 slp = slp->next;
1329 context += 2;
1330 continue;
1331 }
1332
1333 if (options->filter && !options->filter_string)
1334 options->filter_string = BlastConstructFilterString(options->filter);
1335 if (search->pbp->mb_params->is_neighboring) {
1336 /* Test if we have to do human repeat filtering on this query */
1337 if (private_slp)
1338 qid = SeqLocId(private_slp);
1339 else
1340 qid = SeqLocId(private_slp_rev);
1341 query_gi = GetGIForSeqId(qid);
1342 if (FindGiInGiList(query_gi, homo_gilist, homo_gilist_size)) {
1343 filter_string =
1344 (CharPtr) Malloc(StringLen(options->filter_string)+3);
1345 sprintf(filter_string, "%s%s", options->filter_string, ";R");
1346 } else if (FindGiInGiList(query_gi, mouse_gilist, mouse_gilist_size)
1347 || FindGiInGiList(query_gi, rat_gilist, rat_gilist_size)) {
1348 filter_string =
1349 (CharPtr) Malloc(StringLen(options->filter_string)+18);
1350
1351 sprintf(filter_string, "%s%s", options->filter_string,
1352 ";R -d rodents.lib");
1353 } else
1354 filter_string = StringSave(options->filter_string);
1355 } else
1356 filter_string = options->filter_string;
1357
1358 if (private_slp)
1359 filter_slp = BlastSeqLocFilterEx(private_slp, filter_string, &mask_at_hash);
1360 else if (private_slp_rev)
1361 filter_slp = BlastSeqLocFilterEx(private_slp_rev, filter_string, &mask_at_hash);
1362 if (search->pbp->mb_params->is_neighboring)
1363 MemFree(filter_string);
1364
1365 if (search->first_context==0 && search->last_context % 2 == 1)
1366 query_seq_combined =
1367 (Uint1Ptr) Malloc((2*query_length+3)*sizeof(Uint1));
1368 else
1369 query_seq_combined =
1370 (Uint1Ptr) Malloc((query_length+2)*sizeof(Uint1));
1371
1372 if (query_seq_combined == NULL) {
1373 ErrPostEx(SEV_WARNING, 0, 0, "Memory allocation for query sequence failed");
1374 retval = 1;
1375 goto MegaBlastSetUpReturn;
1376 }
1377
1378 query_seq_combined[0] = 0x0f;
1379 full_query_length = 0;
1380
1381 /* Extract the mask location corresponding to this query */
1382 if (next_mask_slp &&
1383 SeqIdComp(SeqLocId(next_mask_slp), SeqLocId(tmp_slp)) == SIC_YES) {
1384 mask_slp = (ValNodePtr) MemDup(next_mask_slp, sizeof(ValNode));
1385 next_mask_slp = mask_slp->next;
1386 mask_slp->next = NULL;
1387 } else
1388 mask_slp = NULL;
1389
1390 if (private_slp) {
1391 if (private_slp->choice == SEQLOC_INT) {
1392 spp = SeqPortNewByLoc(private_slp, Seq_code_ncbi4na);
1393 SeqPortSet_do_virtual(spp, TRUE);
1394 }
1395
1396
1397 if ((query_seq_start = (Uint1Ptr)
1398 Malloc(((query_length)+2)*sizeof(Char))) == NULL) {
1399 ErrPostEx(SEV_WARNING, 0, 0, "Memory allocation for query sequence failed");
1400 retval = 1;
1401 goto MegaBlastSetUpReturn;
1402 }
1403
1404 query_seq = query_seq_start+1;
1405
1406 index=0;
1407 if (spp) {
1408 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
1409 if (IS_residue(residue)) {
1410 query_seq[index] = ncbi4na_to_blastna[residue];
1411 index++;
1412 }
1413 }
1414 spp = SeqPortFree(spp);
1415 }
1416
1417 query_seq_start[0] = query_seq[query_length] = 0x0f;
1418
1419 filter_slp = blastMergeFilterLocs(filter_slp, mask_slp, FALSE, 0, 0);
1420 mask_slp = MemFree(mask_slp);
1421 /* Mask by the blastna value of 'N' (15 in ncbi4na) */
1422
1423 /* If masking done for lookup table only, mask only the combined
1424 sequence that will go into the first context */
1425 if (filter_slp && !mask_at_hash) {
1426 MegaBlastMaskTheResidues(query_seq, query_length,
1427 BLASTNA_N_RESIDUE, filter_slp, FALSE,
1428 SeqLocStart(private_slp), mask_at_hash);
1429 }
1430 MemCpy(&query_seq_combined[1], query_seq, query_length+1);
1431 if (filter_slp && mask_at_hash) {
1432 MegaBlastMaskTheResidues(query_seq_combined+1, query_length,
1433 BLASTNA_N_RESIDUE, filter_slp, FALSE,
1434 SeqLocStart(private_slp), mask_at_hash);
1435 }
1436
1437 full_query_length = query_length + 1;
1438 }
1439
1440 if (private_slp_rev) {
1441 spp = SeqPortNewByLoc(private_slp_rev, Seq_code_ncbi4na);
1442 SeqPortSet_do_virtual(spp, TRUE);
1443 if ((query_seq_start_rev = (Uint1Ptr)
1444 Malloc(((query_length)+2)*sizeof(Char))) == NULL) {
1445 ErrPostEx(SEV_WARNING, 0, 0, "Memory allocation for query sequence failed");
1446 retval = 1;
1447 goto MegaBlastSetUpReturn;
1448 }
1449 query_seq_rev = query_seq_start_rev+1;
1450 index=0;
1451 if (spp) {
1452 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
1453 if (IS_residue(residue)) {
1454 query_seq_rev[index] = ncbi4na_to_blastna[residue];
1455 index++;
1456 }
1457 }
1458 spp = SeqPortFree(spp);
1459 }
1460
1461 query_seq_start_rev[0] = query_seq_rev[query_length] = 0x0f;
1462 if (filter_slp && !mask_at_hash) {
1463 MegaBlastMaskTheResidues(query_seq_rev, query_length,
1464 BLASTNA_N_RESIDUE, filter_slp, TRUE,
1465 SeqLocStart(private_slp_rev),
1466 mask_at_hash);
1467 }
1468 MemCpy(query_seq_combined+full_query_length+1,
1469 query_seq_rev,query_length+1);
1470 if (filter_slp && mask_at_hash) {
1471 MegaBlastMaskTheResidues(query_seq_combined+full_query_length+1,
1472 query_length, BLASTNA_N_RESIDUE,
1473 filter_slp, TRUE,
1474 SeqLocStart(private_slp_rev),
1475 mask_at_hash);
1476 }
1477 full_query_length += query_length + 1;
1478 }
1479
1480 MegaBlastSequenceAddSequence(search->context[search->first_context].query,
1481 NULL, query_seq_combined, full_query_length,
1482 full_query_length, 0);
1483
1484 query_seq_combined = MemFree(query_seq_combined);
1485 if (context==search->first_context)
1486 query_seq_start = MemFree(query_seq_start);
1487
1488 if (private_slp && context != search->first_context)
1489 MegaBlastSequenceAddSequence(search->context[context].query, query_seq,
1490 query_seq_start, query_length,
1491 query_length, 0);
1492 if (context % 2 == 0)
1493 context++;
1494 if (private_slp_rev && context != search->first_context) {
1495 MegaBlastSequenceAddSequence(search->context[context].query, query_seq_rev,
1496 query_seq_start_rev, query_length,
1497 query_length, 0);
1498 }
1499 context++;
1500
1501 MegaBlastSetUpReturn:
1502 if (filter_slp && !mask_at_hash)
1503 /* Save filtering locations for formatting */
1504 ValNodeAddPointer(&(search->mask), SEQLOC_MASKING_NOTSET, filter_slp);
1505 else /* No longer needed */
1506 SeqLocSetFree(filter_slp);
1507
1508 if (private_slp)
1509 private_slp = SeqLocFree(private_slp);
1510 if (private_slp_rev)
1511 private_slp_rev = SeqLocFree(private_slp_rev);
1512 if (retval)
1513 return retval;
1514 slp = slp->next;
1515 } /* End of loop over query contexts (strands) */
1516
1517 if (search->pbp->mb_params->is_neighboring) {
1518 MemFree(homo_gilist);
1519 MemFree(mouse_gilist);
1520 MemFree(rat_gilist);
1521 }
1522
1523 retval = 1;
1524 for (index=search->first_context; index<=search->last_context; index++) {
1525 length = search->query_context_offsets[index+1] -
1526 search->query_context_offsets[index] - 1;
1527 if (length > 0) {
1528 BLAST_KarlinBlk *kbp_gap;
1529 BLAST_KarlinBlk *kbp;
1530
1531 if (retval)
1532 retval = 2;
1533
1534 status = BlastScoreBlkFill(search->sbp, (CharPtr)
1535 search->context[index].query->sequence,
1536 length, index);
1537 search->sbp->kbp_gap_std[index] = BlastKarlinBlkCreate();
1538 kbp_gap = search->sbp->kbp_gap_std[index];
1539 kbp = search->sbp->kbp_std[index];
1540
1541 kbp_gap->Lambda = kbp->Lambda;
1542 kbp_gap->K = kbp->K;
1543 kbp_gap->logK = kbp->logK;
1544 kbp_gap->H = kbp->H;
1545 kbp_gap->paramC = kbp->paramC;
1546
1547 if (status==0)
1548 retval = 0;
1549 else
1550 search->sbp->kbp_std[index] =
1551 BlastKarlinBlkDestruct(search->sbp->kbp_std[index]);
1552 }
1553 }
1554
1555 /* Error only if all query sequences failed! */
1556 if (retval != 0) {
1557 if (retval == 1)
1558 sprintf(buffer, "All queries are shorter than word size or completely masked, no search performed");
1559 else
1560 sprintf(buffer, "Unable to calculate Karlin-Altschul parameters, check query sequence");
1561 BlastConstructErrorMessage("BLASTSetUpSearch", buffer, 2, &(search->error_return));
1562 }
1563
1564 search->sbp->kbp_gap = search->sbp->kbp_gap_std;
1565 search->sbp->kbp = search->sbp->kbp_std;
1566
1567 /* If retval was set non-zero above (by the routines calculating Karlin-Altschul params),
1568 return here before these values are used.
1569 */
1570 if (retval)
1571 return retval;
1572
1573 /* Calculate length adjustments and effective query lengths for
1574 each query. */
1575 for( context=search->first_context;
1576 context<=search->last_context;
1577 context++ ) {
1578 /* Skip those contexts where sequence is not searched. */
1579 if (!search->sbp->kbp[context])
1580 continue;
1581 BlastComputeLengthAdjustment(search->sbp->kbp[context]->K,
1582 search->sbp->kbp[context]->logK,
1583 1/search->sbp->kbp[context]->H,
1584 0.0,
1585 length,
1586 search->dblen, search->dbseq_num,
1587 &length_adjustment );
1588
1589 length = search->query_context_offsets[context+1] -
1590 search->query_context_offsets[context] - 1;
1591 min_query_length = (Int4) 1/(search->sbp->kbp[context]->K);
1592 effective_query_length =
1593 MAX(min_query_length, length - length_adjustment);
1594 search->context[context].query->effective_length =
1595 effective_query_length;
1596 if (context == search->first_context)
1597 search->length_adjustment = length_adjustment;
1598 }
1599 search->dblen_eff =
1600 search->dblen - search->dbseq_num*search->length_adjustment;
1601 /*if (search->searchsp_eff == 0)
1602 search->searchsp_eff = ((Nlm_FloatHi) search->dblen_eff) *
1603 ((Nlm_FloatHi) effective_query_length); */
1604
1605 /* The default is that cutoff_s was not set and is zero. */
1606 if (options->cutoff_s == 0) {
1607 search->pbp->cutoff_e = options->expect_value;
1608 search->pbp->cutoff_e_set = TRUE;
1609 search->pbp->cutoff_s = options->cutoff_s;
1610 search->pbp->cutoff_s_set = FALSE;
1611 } else {
1612 search->pbp->cutoff_e = options->expect_value;
1613 search->pbp->cutoff_e_set = FALSE;
1614 search->pbp->cutoff_s = options->cutoff_s;
1615 search->pbp->cutoff_s_set = TRUE;
1616 }
1617 /* For now e2 is set to 0.5 and cutoff_e2_set is FALSE. This is then
1618 changed to the proper values in blast_set_parameters. In the final version
1619 of this program (where more blast programs and command-line options are
1620 available) this needs to be set higher up. */
1621 if (options->cutoff_s2 == 0) {
1622 search->pbp->cutoff_e2 = options->e2;
1623 search->pbp->cutoff_e2_set = FALSE;
1624 search->pbp->cutoff_s2 = options->cutoff_s2;
1625 search->pbp->cutoff_s2_set = FALSE;
1626 } else {
1627 search->pbp->cutoff_e2 = options->e2;
1628 search->pbp->cutoff_e2_set = FALSE;
1629 search->pbp->cutoff_s2 = options->cutoff_s2;
1630 search->pbp->cutoff_s2_set = TRUE;
1631 }
1632
1633 search->pbp->discontinuous = options->discontinuous;
1634
1635
1636 /* For postion based blast. */
1637 search->pbp->ethresh = options->ethresh;
1638 search->pbp->maxNumPasses = options->maxNumPasses;
1639 search->pbp->pseudoCountConst = options->pseudoCountConst;
1640
1641 search->pbp->process_num = options->number_of_cpus;
1642 search->pbp->cpu_limit = options->cpu_limit;
1643 search->pbp->gap_decay_rate = options->gap_decay_rate;
1644 search->pbp->gap_size = options->gap_size;
1645 search->pbp->gap_prob = options->gap_prob;
1646 search->pbp->old_stats = options->old_stats;
1647 search->pbp->use_large_gaps = options->use_large_gaps;
1648 search->pbp->number_of_bits = options->number_of_bits;
1649 search->pbp->two_pass_method = options->two_pass_method;
1650 search->pbp->multiple_hits_only = options->multiple_hits_only;
1651 search->pbp->gap_open = options->gap_open;
1652 search->pbp->gap_extend = options->gap_extend;
1653 search->pbp->decline_align = options->decline_align;
1654
1655 if (options->hsp_num_max) {
1656 search->pbp->hsp_num_max = options->hsp_num_max;
1657 if (search->hsp_array_size > options->hsp_num_max)
1658 search->hsp_array_size = options->hsp_num_max;
1659 } else {
1660 FloatHi searchsp_eff = ((FloatHi) search->dblen_eff) *
1661 ((FloatHi) search->context[search->first_context].query->length);
1662 if (searchsp_eff > MIN_LIMIT_HSP_NUM) {
1663 char buffer[256];
1664 search->pbp->hsp_num_max = 50*(search->last_context + 1);
1665 sprintf(buffer,
1666 "Limiting the number of HSPs per database sequence to %d",
1667 search->pbp->hsp_num_max);
1668 ErrPostEx(SEV_WARNING, 0, 0, buffer);
1669 }
1670 }
1671
1672
1673 /*search->pbp->gap_x_dropoff = (BLAST_Score)
1674 (options->gap_x_dropoff*NCBIMATH_LN2 /
1675 search->sbp->kbp[search->first_context]->Lambda);*/
1676 search->pbp->gap_x_dropoff = options->gap_x_dropoff;
1677 /* Ensures that gap_x_dropoff_final is at least as large as gap_x_dropoff. */
1678 search->pbp->gap_x_dropoff_final = MAX(options->gap_x_dropoff_final, search->pbp->gap_x_dropoff);
1679
1680 /* "threshold" (first and second) must be set manually for two-pass right now.*/
1681 search->pbp->threshold_set = TRUE;
1682 search->pbp->threshold_second = options->threshold_second;
1683
1684 search->pbp->window_size = options->window_size;
1685 search->pbp->window_size_set = TRUE;
1686
1687 search->whole_query = TRUE;
1688 if (options->required_start != 0 || options->required_end != -1) {
1689 search->whole_query = FALSE;
1690 search->required_start = options->required_start;
1691 if (options->required_end != -1)
1692 search->required_end = options->required_end;
1693 else
1694 search->required_end = qlen;
1695 }
1696
1697 search->pbp->dropoff_2nd_pass = - options->dropoff_2nd_pass;
1698
1699 avglen = BLAST_NT_AVGLEN;
1700 /* Use only one type of gap for blastn */
1701 search->pbp->ignore_small_gaps = TRUE;
1702
1703 search->wfp = search->wfp_second;
1704 if (!MegaBlastBuildLookupTable(search)) {
1705 ErrPostEx(SEV_WARNING, 0, 0, "Failed to construct a lookup table");
1706 return 1;
1707 }
1708
1709 return 0;
1710 }
1711
1712 Boolean
MegaBlastGetFirstAndLastContext(CharPtr prog_name,SeqLocPtr query_slp,Int2Ptr first_context,Int2Ptr last_context,Uint1 strand_options)1713 MegaBlastGetFirstAndLastContext(CharPtr prog_name, SeqLocPtr query_slp, Int2Ptr first_context, Int2Ptr last_context, Uint1 strand_options)
1714 {
1715 SeqLocPtr tmp_slp = query_slp;
1716 Int2 tmp_first, tmp_last;
1717 Int2 index;
1718
1719 if (StringCmp(prog_name, "blastn"))
1720 return BlastGetFirstAndLastContext(prog_name, query_slp, first_context,
1721 last_context, strand_options);
1722
1723 if (!BlastGetFirstAndLastContext(prog_name, query_slp, first_context,
1724 last_context, strand_options))
1725 return FALSE;
1726
1727 for (index=0; tmp_slp->next != NULL; index++, tmp_slp=tmp_slp->next);
1728
1729 if (!BlastGetFirstAndLastContext(prog_name, tmp_slp, &tmp_first,
1730 &tmp_last, strand_options))
1731 return FALSE;
1732 /* For all intermediate sequences count 2 contexts */
1733 *last_context = 2 * index + tmp_last;
1734 return TRUE;
1735 }
1736
1737
SeqLocTotalLen(CharPtr prog_name,SeqLocPtr slp)1738 Int4 SeqLocTotalLen(CharPtr prog_name, SeqLocPtr slp)
1739 {
1740 Int4 total_length = 0;
1741 SeqLocPtr tmp_slp = slp;
1742
1743 if (StringCmp(prog_name, "blastn"))
1744 return SeqLocLen(slp);
1745
1746 while (tmp_slp != NULL) {
1747 total_length += 2*(SeqLocLen(tmp_slp) + 1);
1748 tmp_slp = tmp_slp->next;
1749 }
1750 return total_length - 1;
1751 }
1752
1753 BlastSearchBlkPtr
BlastFillQueryOffsets(BlastSearchBlkPtr search,SeqLocPtr query_slp,Int4 wordsize)1754 BlastFillQueryOffsets(BlastSearchBlkPtr search, SeqLocPtr query_slp,
1755 Int4 wordsize)
1756 {
1757 SeqLocPtr slp = query_slp;
1758 Int2 num_contexts;
1759 Int2 i = 0;
1760 Int4 length;
1761
1762 if (slp == NULL) return search;
1763 /* Note: the last offset will be equal to the combined query length + 1, for
1764 convenience of computing each individual query length later */
1765 for (slp=query_slp, num_contexts=0; slp; slp=slp->next, num_contexts+=2);
1766
1767 search->query_context_offsets = (Int4Ptr) Malloc((num_contexts+1)*sizeof(Int4));
1768
1769 search->query_context_offsets[0] = 0;
1770 for (slp = query_slp; slp; slp = slp->next) {
1771 length = SeqLocLen(slp) + 1;
1772 if (length <= wordsize)
1773 /* Sequence shorter than word size will be removed */
1774 length = 0;
1775 if (search->first_context == 0)
1776 search->query_context_offsets[i+1] =
1777 search->query_context_offsets[i] + length;
1778 else /* Top strand is not searched */
1779 search->query_context_offsets[i+1] =
1780 search->query_context_offsets[i];
1781 i++;
1782 if (search->last_context % 2 == 1)
1783 search->query_context_offsets[i+1] =
1784 search->query_context_offsets[i] + length;
1785 else
1786 /* Bottom strand is not searched */
1787 search->query_context_offsets[i+1] =
1788 search->query_context_offsets[i];
1789 i++;
1790 }
1791
1792
1793 return search;
1794 }
1795
1796 static int LIBCALLBACK
evalue_compare_seqaligns(VoidPtr v1,VoidPtr v2)1797 evalue_compare_seqaligns(VoidPtr v1, VoidPtr v2)
1798 {
1799 SeqAlignPtr sap1, sap2, PNTR sapp1, PNTR sapp2;
1800
1801 sapp1 = (SeqAlignPtr PNTR) v1;
1802 sapp2 = (SeqAlignPtr PNTR) v2;
1803
1804 sap1 = *sapp1;
1805 sap2 = *sapp2;
1806
1807 if (sap1->score->value.realvalue > sap2->score->value.realvalue)
1808 return -1;
1809 else if (sap1->score->value.realvalue < sap2->score->value.realvalue)
1810 return 1;
1811 else if (sap1->score->value.intvalue < sap2->score->value.intvalue)
1812 return -1;
1813 else if (sap1->score->value.intvalue > sap2->score->value.intvalue)
1814 return 1;
1815 else
1816 return 0;
1817
1818 }
1819
1820 /* Disassemble a list of SeqAligns into an array by query (in case of
1821 multiple queries */
MegaBlastPackAlignmentsByQuery(BlastSearchBlkPtr search,SeqAlignPtr seqalign)1822 SeqAlignPtr PNTR MegaBlastPackAlignmentsByQuery(BlastSearchBlkPtr search,
1823 SeqAlignPtr seqalign)
1824 {
1825 SeqAlignPtr PNTR seqalignp, PNTR sapp, PNTR sapp1;
1826 SeqAlignPtr seqalign_var, sap, seqalign_prev;
1827 Int2 index, i, hit_count, dbseq_count;
1828 SeqIdPtr query_id, seg_query_id;
1829 SeqLocPtr slp;
1830 Int4 num_queries = search->last_context / 2 + 1;
1831 Int4 count = 0;
1832
1833 slp = search->query_slp;
1834
1835 seqalignp = (SeqAlignPtr PNTR)
1836 MemNew(num_queries*sizeof(SeqAlignPtr));
1837
1838 if (!seqalign) {
1839 *seqalignp = NULL;
1840 return seqalignp;
1841 }
1842
1843 for (index=0; index<num_queries; index++) {
1844 query_id = search->qid_array[index];
1845 hit_count = 0;
1846 seqalign_var = seqalign;
1847 seqalign_prev = NULL;
1848 while (seqalign_var != NULL) {
1849 seg_query_id = ((DenseSegPtr)seqalign_var->segs)->ids;
1850 if (SeqIdComp(query_id, seg_query_id) == SIC_YES) {
1851 /* Remove this link from the list - we won't need it any more */
1852 if (seqalign_prev != NULL)
1853 seqalign_prev->next = seqalign_var->next;
1854 else /* first link in the list */
1855 seqalign = seqalign_var->next;
1856
1857 seqalign_var->next = NULL;
1858
1859 /* Add this seqalign to the list for this query */
1860 if (seqalignp[index]==NULL) {
1861 seqalignp[index] = seqalign_var;
1862 sap = seqalignp[index];
1863 } else {
1864 sap->next = seqalign_var;
1865 sap = sap->next;
1866 }
1867 hit_count++;
1868 /* Move to the next link */
1869 if (seqalign_prev != NULL)
1870 seqalign_var = seqalign_prev->next;
1871 else
1872 seqalign_var = seqalign;
1873 } else { /* hit from different query */
1874 seqalign_prev = seqalign_var;
1875 seqalign_var = seqalign_var->next;
1876 }
1877 }
1878
1879 count += hit_count;
1880 /* Now sort seqaligns for this query based on evalue */
1881 if (hit_count>1) {
1882 Int4 start, end;
1883 SeqIdPtr sid;
1884
1885 sapp = Nlm_Malloc(hit_count*sizeof(SeqAlignPtr));
1886 sapp1 = Nlm_Malloc(hit_count*sizeof(SeqAlignPtr));
1887 sapp[0] = seqalignp[index];
1888
1889 /* sapp is an array of pointers to all hits for this query */
1890 /* sapp1 is an array of pointers to best hits for each db */
1891 /* sequence for this query */
1892 for (i=1; i<hit_count; i++)
1893 sapp[i] = sapp[i-1]->next;
1894
1895 dbseq_count = 0;
1896 for (start=0; start<hit_count; ) {
1897 end = start;
1898 sid = ((DenseSegPtr)sapp[start]->segs)->ids->next;
1899 for(end=start+1; end < hit_count &&
1900 SeqIdComp(sid, ((DenseSegPtr)sapp[end]->segs)->ids->next)
1901 ==SIC_YES;
1902 end++);
1903
1904 if (end > start + 1)
1905 HeapSort(&sapp[start], end-start, sizeof(SeqAlignPtr),
1906 evalue_compare_seqaligns);
1907 for (i=start; i<end-1; i++)
1908 sapp[i]->next = sapp[i+1];
1909 sapp[end-1]->next = NULL; /* Separate lists for different db
1910 sequences */
1911 sapp1[dbseq_count++] = sapp[start];
1912 start = end;
1913 }
1914 /* Sort the lists for different db sequences based on their
1915 best hits */
1916 HeapSort(sapp1, dbseq_count, sizeof(SeqAlignPtr),
1917 evalue_compare_seqaligns);
1918 /* Connect the sorted lists */
1919 for (i=0; i<dbseq_count-1; i++) {
1920 for (sap=sapp1[i]; sap->next != NULL; sap = sap->next);
1921 sap->next = sapp1[i+1];
1922 }
1923
1924 seqalignp[index] = sapp1[0];
1925 MemFree(sapp);
1926 MemFree(sapp1);
1927 }
1928 }
1929 /* Shouldn't be necessary, but clean just in case */
1930 SeqAlignSetFree(seqalign);
1931 return seqalignp;
1932 }
1933
1934 /* Attach the "sequence" pointer to the BlastSequenceBlkPtr. sequence_start may be the
1935 actual start of the sequence (this pointer is kept for deallocation purposes). The
1936 sequence may start before "sequence" starts as there may be a sentinel (i.e., NULLB)
1937 before the start of the sequence. When the extension function extends this way it
1938 can tell that there is a NULLB there and stop the extension.
1939
1940 */
1941
1942 Int2 LIBCALL
MegaBlastSequenceAddSequence(BlastSequenceBlkPtr sequence_blk,Uint1Ptr sequence,Uint1Ptr sequence_start,Int4 length,Int4 original_length,Int4 effective_length)1943 MegaBlastSequenceAddSequence (BlastSequenceBlkPtr sequence_blk, Uint1Ptr sequence,
1944 Uint1Ptr sequence_start, Int4 length,
1945 Int4 original_length, Int4 effective_length)
1946 {
1947 Uint1Ptr seqptr;
1948
1949 if (sequence_blk == NULL)
1950 return 1;
1951 /* Assume that memory for sequence_blk->sequence is
1952 allocated elsewhere */
1953 if (sequence == NULL && sequence_start != NULL) {
1954 if (sequence_blk->sequence!=NULL && sequence_blk->length>0) {
1955 seqptr = sequence_blk->sequence_start + sequence_blk->length;
1956 MemCpy(seqptr, sequence_start, length+1);
1957 } else {
1958 MemCpy(sequence_blk->sequence_start, sequence_start, length+1);
1959 sequence_blk->sequence = sequence_blk->sequence_start + 1;
1960 sequence_blk->effective_length = effective_length;
1961 }
1962 }
1963 else if (sequence != NULL) {
1964 sequence_blk->sequence = sequence;
1965 sequence_blk->sequence_start = sequence_start;
1966 }
1967
1968 sequence_blk->length += length;
1969 sequence_blk->original_length += original_length;
1970
1971 return 0;
1972 }
1973
1974 static Boolean
MegaBlastSaveExactMatch(BlastSearchBlkPtr search,Int4 q_off,Int4 s_off)1975 MegaBlastSaveExactMatch(BlastSearchBlkPtr search, Int4 q_off, Int4 s_off)
1976 {
1977 MegaBlastExactMatchPtr match_array;
1978 Int4 num, num_avail;
1979
1980 if (!search || !search->wfp || !search->wfp->lookup ||
1981 !search->wfp->lookup->mb_lt)
1982 return FALSE;
1983
1984 num = search->current_hitlist->hspcnt;
1985 num_avail = search->current_hitlist->exact_match_max;
1986
1987 match_array = search->current_hitlist->exact_match_array;
1988 if (num>=num_avail) {
1989 if (search->current_hitlist->do_not_reallocate)
1990 return FALSE;
1991 num_avail *= 2;
1992 match_array = (MegaBlastExactMatchPtr)
1993 Realloc(match_array, num_avail*sizeof(MegaBlastExactMatch));
1994 if (!match_array) {
1995 ErrPostEx(SEV_WARNING, 0, 0, "UNABLE to reallocate in MegaBlastSaveExactMatch for ordinal id %ld, continuing with fixed array of %ld HSP's", (long) search->subject_id, num_avail);
1996 search->current_hitlist->do_not_reallocate = TRUE;
1997 return FALSE;
1998 } else {
1999 search->current_hitlist->exact_match_max = num_avail;
2000 search->current_hitlist->exact_match_array = match_array;
2001 }
2002 }
2003
2004 match_array[num].q_off = q_off;
2005 match_array[num].s_off = s_off;
2006 search->current_hitlist->hspcnt++;
2007 return TRUE;
2008 }
2009
2010 #define MB_TEMPLATE_LENGTH 18
2011
2012 #ifndef BUFFER_LENGTH
2013 #define BUFFER_LENGTH 255
2014 #endif
2015 #define MAX_LINE 48
2016
2017 /* Discontiguous words, database scanned 4 bases at a time */
2018
2019 static Int4
MegaBlastWordFinder_disc(BlastSearchBlkPtr search,LookupTablePtr lookup)2020 MegaBlastWordFinder_disc(BlastSearchBlkPtr search, LookupTablePtr lookup)
2021 {
2022 register Uint1Ptr subject;
2023 register Int4 s_off, ecode, mask, q_off, extra_code;
2024 Int4 ecode1, ecode2;
2025 Int4 subj_length = search->subject->length;
2026 Int4 min_hit_size;
2027 PV_ARRAY_TYPE *pv_array = lookup->pv_array;
2028 Int4 pv_array_bts;
2029 Int4 bit0, no_bit0;
2030 Int2 word_length;
2031 #ifdef USE_HASH_TABLE
2032 Uint1Ptr hash_buf;
2033 Int4 hash_shift, hash_mask, crc, size;
2034 #endif
2035 MegaBlastParameterBlkPtr mb_params = search->pbp->mb_params;
2036 Int2 template_length = mb_params->template_length;
2037 MBTemplateType template_type = mb_params->template_type;
2038 Boolean use_two_templates = mb_params->use_two_templates;
2039
2040 min_hit_size = lookup->mb_lt->lpm;
2041 if (search->pbp->window_size > 0)
2042 min_hit_size += 4;
2043
2044 if (mb_params->is_neighboring &&
2045 subj_length < MIN_NEIGHBOR_HSP_LENGTH)
2046 return 0;
2047 if (search->current_hitlist == NULL)
2048 search->current_hitlist = BlastHitListNew(search);
2049
2050 if (mb_params->word_weight == 11) {
2051 no_bit0 = 0x007fffff;
2052 bit0 = 0x00800000;
2053 } else {
2054 no_bit0 = 0xffffffff;
2055 bit0 = 0;
2056 }
2057
2058 #ifdef USE_HASH_TABLE
2059 for (hash_shift=-1,size=lookup->mb_lt->hashsize; size;
2060 size=size>>1,hash_shift++);
2061 hash_shift = (32 - hash_shift)/2;
2062 hash_mask = lookup->mb_lt->hashsize - 1;
2063 #endif
2064
2065 word_length = (lookup->mb_lt->width + 1) * 4;
2066 if (template_length > word_length) {
2067 subj_length -= template_length - word_length;
2068 }
2069
2070 mask = lookup->mb_lt->mask;
2071 subject = search->subject->sequence - 1;
2072 ecode = 0;
2073 if (lookup->mb_lt->estack)
2074 MemSet(lookup->mb_lt->stack_index, 0,
2075 lookup->mb_lt->num_stacks*sizeof(Int4));
2076
2077 #ifdef USE_HASH_TABLE
2078 pv_array_bts = PV_ARRAY_BTS;
2079 #else
2080 pv_array_bts = PV_ARRAY_BTS + ((lookup->mb_lt->width < 3) ? 0 : 5);
2081 #endif
2082
2083 for (s_off = 0; s_off < lookup->mb_lt->width*4; s_off += 4) {
2084 ecode = (ecode << 8) + *++subject;
2085 }
2086
2087 s_off += 4;
2088 if (template_length == 16) {
2089 while (s_off<=subj_length) {
2090 ecode = ((ecode & mask) << 8) + *++subject;
2091 if (template_type == TEMPL_11_16)
2092 ecode1 = GET_WORD_INDEX_11_16(ecode) & no_bit0;
2093 else if (template_type == TEMPL_12_16)
2094 ecode1 = GET_WORD_INDEX_12_16(ecode) & no_bit0;
2095 else if (template_type == TEMPL_11_16_OPT)
2096 ecode1 = GET_WORD_INDEX_11_16_OPT(ecode) & no_bit0;
2097 else if (template_type == TEMPL_12_16_OPT)
2098 ecode1 = GET_WORD_INDEX_12_16_OPT(ecode) & no_bit0;
2099 #ifdef USE_HASH_TABLE /* Using hash table */
2100 hash_buf = (Uint1Ptr)&ecode1;
2101 CRC32(crc, hash_buf);
2102 ecode1 = (crc>>hash_shift) & hash_mask;
2103 #endif
2104 if (use_two_templates) {
2105 if (template_type == TEMPL_11_16)
2106 ecode2 = GET_WORD_INDEX_11_16_OPT(ecode) | bit0;
2107 else if (template_type == TEMPL_12_16)
2108 ecode2 = GET_WORD_INDEX_12_16_OPT(ecode) | bit0;
2109 }
2110 #ifdef USE_HASH_TABLE /* Using hash table */
2111 hash_buf = (Uint1Ptr)&ecode2;
2112 CRC32(crc, hash_buf);
2113 ecode2 = (crc>>hash_shift) & hash_mask;
2114 #endif
2115 if (pv_array) {
2116 while ((s_off <= subj_length) && ((pv_array[ecode1>>pv_array_bts]&
2117 (((PV_ARRAY_TYPE) 1)<<(ecode1&PV_ARRAY_MASK))) == 0)
2118 && (!use_two_templates || ((pv_array[ecode2>>pv_array_bts]&
2119 (((PV_ARRAY_TYPE) 1)<<(ecode2&PV_ARRAY_MASK))) == 0))) {
2120 ecode = ((ecode & mask) << 8) + *++subject;
2121 if (template_type == TEMPL_11_16)
2122 ecode1 = GET_WORD_INDEX_11_16(ecode) & no_bit0;
2123 else if (template_type == TEMPL_12_16)
2124 ecode1 = GET_WORD_INDEX_12_16(ecode) & no_bit0;
2125 else if (template_type == TEMPL_11_16_OPT)
2126 ecode1 = GET_WORD_INDEX_11_16_OPT(ecode) & no_bit0;
2127 else if (template_type == TEMPL_12_16_OPT)
2128 ecode1 = GET_WORD_INDEX_12_16_OPT(ecode) & no_bit0;
2129 #ifdef USE_HASH_TABLE
2130 hash_buf = (Uint1Ptr)&ecode1;
2131 CRC32(crc, hash_buf);
2132 ecode1 = (crc>>hash_shift) & hash_mask;
2133 #endif
2134
2135 if (use_two_templates) {
2136 if (template_type == TEMPL_11_16)
2137 ecode2 = GET_WORD_INDEX_11_16_OPT(ecode) | bit0;
2138 else if (template_type == TEMPL_12_16)
2139 ecode2 = GET_WORD_INDEX_12_16_OPT(ecode) | bit0;
2140 }
2141 #ifdef USE_HASH_TABLE
2142 hash_buf = (Uint1Ptr)&ecode2;
2143 CRC32(crc, hash_buf);
2144 ecode2 = (crc>>hash_shift) & hash_mask;
2145 #endif
2146 s_off += 4;
2147 }
2148 if (s_off > subj_length)
2149 break;
2150 }
2151 for (q_off = lookup->mb_lt->hashtable[ecode1]; q_off>0;
2152 q_off = lookup->mb_lt->next_pos[q_off]) {
2153 search->second_pass_hits++;
2154 MegaBlastExtendHit(search, lookup, s_off, q_off);
2155 }
2156 if (use_two_templates) {
2157 for (q_off = lookup->mb_lt->hashtable2[ecode2]; q_off>0;
2158 q_off = lookup->mb_lt->next_pos2[q_off]) {
2159 search->second_pass_hits++;
2160 MegaBlastExtendHit(search, lookup, s_off, q_off);
2161 }
2162 }
2163 s_off += 4;
2164 }
2165
2166 } else { /* template length > 16 */
2167 extra_code = 0;
2168 while (s_off<=subj_length) {
2169 ecode = ((ecode & mask) << 8) + *++subject;
2170 switch (template_type) {
2171 case TEMPL_11_18:
2172 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18(subject);
2173 ecode1 = (GET_WORD_INDEX_11_18(ecode) | extra_code) & no_bit0;
2174 break;
2175 case TEMPL_12_18:
2176 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18(subject);
2177 ecode1 = (GET_WORD_INDEX_12_18(ecode) | extra_code) & no_bit0;
2178 break;
2179 case TEMPL_11_18_OPT:
2180 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2181 ecode1 = (GET_WORD_INDEX_11_18_OPT(ecode) | extra_code) & no_bit0;
2182 break;
2183 case TEMPL_12_18_OPT:
2184 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2185 ecode1 = (GET_WORD_INDEX_12_18_OPT(ecode) | extra_code) & no_bit0;
2186 break;
2187 case TEMPL_11_21:
2188 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21(subject);
2189 ecode1 = (GET_WORD_INDEX_11_21(ecode) | extra_code) & no_bit0;
2190 break;
2191 case TEMPL_12_21:
2192 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21(subject);
2193 ecode1 = (GET_WORD_INDEX_12_21(ecode) | extra_code) & no_bit0;
2194 break;
2195 case TEMPL_11_21_OPT:
2196 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2197 ecode1 = (GET_WORD_INDEX_11_21_OPT(ecode) | extra_code) & no_bit0;
2198 break;
2199 case TEMPL_12_21_OPT:
2200 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2201 ecode1 = (GET_WORD_INDEX_12_21_OPT(ecode) | extra_code) & no_bit0;
2202 break;
2203 default:
2204 extra_code = 0;
2205 ecode1 = 0;
2206 break;
2207 }
2208 #ifdef USE_HASH_TABLE
2209 hash_buf = (Uint1Ptr)&ecode1;
2210 CRC32(crc, hash_buf);
2211 ecode1 = (crc>>hash_shift) & hash_mask;
2212 #endif
2213
2214 if (use_two_templates) {
2215 switch (template_type) {
2216 case TEMPL_11_18:
2217 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2218 ecode2 = GET_WORD_INDEX_11_18_OPT(ecode) |
2219 extra_code | bit0;
2220 break;
2221 case TEMPL_12_18:
2222 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2223 ecode2 = GET_WORD_INDEX_12_18_OPT(ecode) |
2224 extra_code | bit0;
2225 break;
2226 case TEMPL_11_21:
2227 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2228 ecode2 = GET_WORD_INDEX_11_21_OPT(ecode) |
2229 extra_code | bit0;
2230 break;
2231 case TEMPL_12_21:
2232 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2233 ecode2 = GET_WORD_INDEX_12_21_OPT(ecode) |
2234 extra_code | bit0;
2235 break;
2236 default:
2237 ecode2 = 0;
2238 break;
2239 }
2240 #ifdef USE_HASH_TABLE
2241 hash_buf = (Uint1Ptr)&ecode2;
2242 CRC32(crc, hash_buf);
2243 ecode2 = (crc>>hash_shift) & hash_mask;
2244 #endif
2245 }
2246 if (pv_array) {
2247 while ((s_off <= subj_length) && ((pv_array[ecode1>>pv_array_bts]&
2248 (((PV_ARRAY_TYPE) 1)<<(ecode1&PV_ARRAY_MASK))) == 0)
2249 && (!use_two_templates || ((pv_array[ecode2>>pv_array_bts]&
2250 (((PV_ARRAY_TYPE) 1)<<(ecode2&PV_ARRAY_MASK))) == 0))) {
2251 ecode = ((ecode & mask) << 8) + *++subject;
2252
2253 switch (template_type) {
2254 case TEMPL_11_18:
2255 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18(subject);
2256 ecode1 = (GET_WORD_INDEX_11_18(ecode) | extra_code) & no_bit0;
2257 break;
2258 case TEMPL_12_18:
2259 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18(subject);
2260 ecode1 = (GET_WORD_INDEX_12_18(ecode) | extra_code) & no_bit0;
2261 break;
2262 case TEMPL_11_18_OPT:
2263 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2264 ecode1 = (GET_WORD_INDEX_11_18_OPT(ecode) | extra_code) & no_bit0;
2265 break;
2266 case TEMPL_12_18_OPT:
2267 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2268 ecode1 = (GET_WORD_INDEX_12_18_OPT(ecode) | extra_code) & no_bit0;
2269 break;
2270 case TEMPL_11_21:
2271 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21(subject);
2272 ecode1 = (GET_WORD_INDEX_11_21(ecode) | extra_code) & no_bit0;
2273 break;
2274 case TEMPL_12_21:
2275 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21(subject);
2276 ecode1 = (GET_WORD_INDEX_12_21(ecode) | extra_code) & no_bit0;
2277 break;
2278 case TEMPL_11_21_OPT:
2279 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2280 ecode1 = (GET_WORD_INDEX_11_21_OPT(ecode) | extra_code) & no_bit0;
2281 break;
2282 case TEMPL_12_21_OPT:
2283 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2284 ecode1 = (GET_WORD_INDEX_12_21_OPT(ecode) | extra_code) & no_bit0;
2285 break;
2286 default:
2287 extra_code = 0;
2288 ecode1 = 0;
2289 break;
2290 }
2291
2292 #ifdef USE_HASH_TABLE
2293 hash_buf = (Uint1Ptr)&ecode1;
2294 CRC32(crc, hash_buf);
2295 ecode1 = (crc>>hash_shift) & hash_mask;
2296 #endif
2297
2298 if (use_two_templates) {
2299 switch (template_type) {
2300 case TEMPL_11_18:
2301 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2302 ecode2 = GET_WORD_INDEX_11_18_OPT(ecode) |
2303 extra_code | bit0;
2304 break;
2305 case TEMPL_12_18:
2306 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2307 ecode2 = GET_WORD_INDEX_12_18_OPT(ecode) |
2308 extra_code | bit0;
2309 break;
2310 case TEMPL_11_21:
2311 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2312 ecode2 = GET_WORD_INDEX_11_21_OPT(ecode) |
2313 extra_code | bit0;
2314 break;
2315 case TEMPL_12_21:
2316 extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2317 ecode2 = GET_WORD_INDEX_12_21_OPT(ecode) |
2318 extra_code | bit0;
2319 break;
2320 default:
2321 ecode2 = 0; break;
2322 }
2323 #ifdef USE_HASH_TABLE
2324 hash_buf = (Uint1Ptr)&ecode2;
2325 CRC32(crc, hash_buf);
2326 ecode2 = (crc>>hash_shift) & hash_mask;
2327 #endif
2328 }
2329 s_off += 4;
2330 }
2331 if (s_off > subj_length)
2332 break;
2333 }
2334 for (q_off = lookup->mb_lt->hashtable[ecode1]; q_off>0;
2335 q_off = lookup->mb_lt->next_pos[q_off]) {
2336 search->second_pass_hits++;
2337 MegaBlastExtendHit(search, lookup, s_off, q_off);
2338 }
2339 if (use_two_templates) {
2340 for (q_off = lookup->mb_lt->hashtable2[ecode2]; q_off>0;
2341 q_off = lookup->mb_lt->next_pos2[q_off]) {
2342 search->second_pass_hits++;
2343 MegaBlastExtendHit(search, lookup, s_off, q_off);
2344 }
2345 }
2346 s_off += 4;
2347 }
2348 }
2349
2350 if (!lookup->mb_lt->estack)
2351 BlastExtendWordExit(search);
2352
2353 search->current_hitlist->do_not_reallocate = FALSE;
2354
2355 /* Do greedy gapped extension */
2356 if (search->current_hitlist->hspcnt > 0)
2357 if (MegaBlastGappedAlign(search) != 0)
2358 return -1;
2359
2360 return search->current_hitlist->hspcnt;
2361 }
2362
2363 /* Discontiguous words, database scanned 1 base at a time */
2364
2365 #define mask30b 0x3fffffff
2366 #define mask2b 0x00000003
2367
2368 static Int4
MegaBlastWordFinder_disc_1b(BlastSearchBlkPtr search,LookupTablePtr lookup)2369 MegaBlastWordFinder_disc_1b(BlastSearchBlkPtr search, LookupTablePtr lookup)
2370 {
2371 register Uint1Ptr subject;
2372 register Int4 s_off, ecode, mask, q_off, extra_code;
2373 Int4 ecode1, ecode2;
2374 Int4 subj_length = search->subject->length;
2375 Int4 min_hit_size;
2376 PV_ARRAY_TYPE *pv_array = lookup->pv_array;
2377 Int4 pv_array_bts;
2378 Uint1 bit;
2379 Int4 no_bit0, bit0;
2380 Int2 word_length;
2381 #ifdef USE_HASH_TABLE
2382 Uint1Ptr hash_buf;
2383 Int4 hash_shift, hash_mask, crc, size;
2384 #endif
2385 MegaBlastParameterBlkPtr mb_params = search->pbp->mb_params;
2386 Int2 template_length = mb_params->template_length;
2387 MBTemplateType template_type = mb_params->template_type;
2388 Boolean use_two_templates = mb_params->use_two_templates;
2389
2390 min_hit_size = lookup->mb_lt->lpm;
2391 if (search->pbp->window_size > 0)
2392 min_hit_size += 4;
2393
2394 if (mb_params->is_neighboring &&
2395 subj_length < MIN_NEIGHBOR_HSP_LENGTH)
2396 return 0;
2397 if (search->current_hitlist == NULL)
2398 search->current_hitlist = BlastHitListNew(search);
2399
2400 if (mb_params->word_weight == 11) {
2401 no_bit0 = 0x007fffff;
2402 bit0 = 0x00800000;
2403 } else {
2404 no_bit0 = -1;
2405 bit0 = 0;
2406 }
2407
2408
2409 #ifdef USE_HASH_TABLE
2410 for (hash_shift=0,size=lookup->mb_lt->hashsize; size;
2411 size=size>>1,hash_shift++);
2412 hash_shift = (32 - hash_shift)/2;
2413 hash_mask = lookup->mb_lt->hashsize - 1;
2414 #endif
2415
2416 word_length = (lookup->mb_lt->width + 1) * 4;
2417 if (template_length > word_length) {
2418 subj_length -= template_length - word_length;
2419 }
2420
2421 mask = lookup->mb_lt->mask;
2422 subject = search->subject->sequence - 1;
2423 ecode = 0;
2424 if (lookup->mb_lt->estack)
2425 MemSet(lookup->mb_lt->stack_index, 0,
2426 lookup->mb_lt->num_stacks*sizeof(Int4));
2427
2428 pv_array_bts = PV_ARRAY_BTS + ((lookup->mb_lt->width < 3) ? 0 : 5);
2429
2430 for (s_off = 0; s_off < lookup->mb_lt->width*4; s_off += 4) {
2431 ecode = (ecode << 8) + *++subject;
2432 }
2433
2434 bit = 3;
2435 s_off++;
2436 if (template_length == word_length) {
2437 while (s_off<=subj_length) {
2438 if (bit == 3) subject++;
2439 ecode = ((ecode & mask30b) << 2) | (((*subject)>>(2*bit)) & mask2b);
2440 if (template_type == TEMPL_11_16)
2441 ecode1 = GET_WORD_INDEX_11_16(ecode) & no_bit0;
2442 else if (template_type == TEMPL_12_16)
2443 ecode1 = GET_WORD_INDEX_12_16(ecode) & no_bit0;
2444 else if (template_type == TEMPL_11_16_OPT)
2445 ecode1 = GET_WORD_INDEX_11_16_OPT(ecode) & no_bit0;
2446 else if (template_type == TEMPL_12_16_OPT)
2447 ecode1 = GET_WORD_INDEX_12_16_OPT(ecode) & no_bit0;
2448 #ifdef USE_HASH_TABLE
2449 hash_buf = (Uint1Ptr)&ecode1;
2450 CRC32(crc, hash_buf);
2451 ecode1 = (crc>>hash_shift) & hash_mask;
2452 #endif
2453
2454 if (use_two_templates) {
2455 switch (template_type) {
2456 case TEMPL_11_16:
2457 ecode2 = GET_WORD_INDEX_11_16_OPT(ecode) | bit0;
2458 break;
2459 case TEMPL_12_16:
2460 ecode2 = GET_WORD_INDEX_12_16_OPT(ecode) | bit0;
2461 break;
2462 default:
2463 ecode2 = 0; break;
2464 }
2465 #ifdef USE_HASH_TABLE
2466 hash_buf = (Uint1Ptr)&ecode2;
2467 CRC32(crc, hash_buf);
2468 ecode2 = (crc>>hash_shift) & hash_mask;
2469 #endif
2470 }
2471 bit = (bit - 1) & mask2b;
2472
2473 if (pv_array) {
2474 while ((s_off <= subj_length) && ((pv_array[ecode1>>pv_array_bts]&
2475 (((PV_ARRAY_TYPE) 1)<<(ecode1&PV_ARRAY_MASK))) == 0)
2476 && (!use_two_templates || ((pv_array[ecode2>>pv_array_bts]&
2477 (((PV_ARRAY_TYPE) 1)<<(ecode2&PV_ARRAY_MASK))) == 0))) {
2478 if (bit == 3) subject++;
2479 ecode = ((ecode & mask30b) << 2) |
2480 (((*subject)>>(2*bit)) & mask2b);
2481 if (template_type == TEMPL_11_16)
2482 ecode1 = GET_WORD_INDEX_11_16(ecode) & no_bit0;
2483 else if (template_type == TEMPL_12_16)
2484 ecode1 = GET_WORD_INDEX_12_16(ecode) & no_bit0;
2485 else if (template_type == TEMPL_11_16_OPT)
2486 ecode1 = GET_WORD_INDEX_11_16_OPT(ecode) & no_bit0;
2487 else if (template_type == TEMPL_12_16_OPT)
2488 ecode1 = GET_WORD_INDEX_12_16_OPT(ecode) & no_bit0;
2489 #ifdef USE_HASH_TABLE
2490 hash_buf = (Uint1Ptr)&ecode1;
2491 CRC32(crc, hash_buf);
2492 ecode1 = (crc>>hash_shift) & hash_mask;
2493 #endif
2494 if (use_two_templates) {
2495 switch (template_type) {
2496 case TEMPL_11_16:
2497 ecode2 = GET_WORD_INDEX_11_16_OPT(ecode) | bit0;
2498 break;
2499 case TEMPL_12_16:
2500 ecode2 = GET_WORD_INDEX_12_16_OPT(ecode) | bit0;
2501 break;
2502 default:
2503 ecode2 = 0; break;
2504 }
2505 #ifdef USE_HASH_TABLE
2506 hash_buf = (Uint1Ptr)&ecode2;
2507 CRC32(crc, hash_buf);
2508 ecode2 = (crc>>hash_shift) & hash_mask;
2509 #endif
2510 }
2511 bit = (bit - 1) & mask2b;
2512 s_off++;
2513 }
2514 if (s_off > subj_length)
2515 break;
2516 }
2517 for (q_off = lookup->mb_lt->hashtable[ecode1]; q_off>0;
2518 q_off = lookup->mb_lt->next_pos[q_off]) {
2519 search->second_pass_hits++;
2520 MegaBlastExtendHit(search, lookup, s_off, q_off);
2521 }
2522 if (use_two_templates) {
2523 for (q_off = lookup->mb_lt->hashtable2[ecode2]; q_off>0;
2524 q_off = lookup->mb_lt->next_pos2[q_off]) {
2525 search->second_pass_hits++;
2526 MegaBlastExtendHit(search, lookup, s_off, q_off);
2527 }
2528 }
2529 s_off++;
2530 }
2531 } else {
2532 Uint1Ptr seqptr;
2533 Uint1 bitval;
2534 Int4 tmpval;
2535 extra_code = 0;
2536 while (s_off<=subj_length) {
2537 if (bit == 3) subject++;
2538 ecode = ((ecode & mask30b) << 2) | (((*subject)>>(2*bit)) & mask2b);
2539 seqptr = subject;
2540 bitval = bit;
2541 extra_code = 0;
2542 tmpval = 0;
2543 switch (template_type) {
2544 case TEMPL_11_18:
2545 GET_EXTRA_CODE_PACKED_18(seqptr, bitval, tmpval, extra_code);
2546 ecode1 = (GET_WORD_INDEX_11_18(ecode) | extra_code) & no_bit0;
2547 break;
2548 case TEMPL_12_18:
2549 GET_EXTRA_CODE_PACKED_18(seqptr, bitval, tmpval, extra_code);
2550 ecode1 = (GET_WORD_INDEX_12_18(ecode) | extra_code) & no_bit0;
2551 break;
2552 case TEMPL_11_18_OPT:
2553 GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2554 ecode1 = (GET_WORD_INDEX_11_18_OPT(ecode) | extra_code) & no_bit0;
2555 break;
2556 case TEMPL_12_18_OPT:
2557 GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2558 ecode1 = (GET_WORD_INDEX_12_18_OPT(ecode) | extra_code) & no_bit0;
2559 break;
2560 case TEMPL_11_21:
2561 GET_EXTRA_CODE_PACKED_21(seqptr, bitval, tmpval, extra_code);
2562 ecode1 = (GET_WORD_INDEX_11_21(ecode) | extra_code) & no_bit0;
2563 break;
2564 case TEMPL_12_21:
2565 GET_EXTRA_CODE_PACKED_21(seqptr, bitval, tmpval, extra_code);
2566 ecode1 = (GET_WORD_INDEX_12_21(ecode) | extra_code) & no_bit0;
2567 break;
2568 case TEMPL_11_21_OPT:
2569 GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2570 ecode1 = (GET_WORD_INDEX_11_21_OPT(ecode) | extra_code) & no_bit0;
2571 break;
2572 case TEMPL_12_21_OPT:
2573 GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2574 ecode1 = (GET_WORD_INDEX_12_21_OPT(ecode) | extra_code) & no_bit0;
2575 break;
2576 default:
2577 extra_code = 0; ecode1 = 0; break;
2578 }
2579 #ifdef USE_HASH_TABLE
2580 hash_buf = (Uint1Ptr)&ecode1;
2581 CRC32(crc, hash_buf);
2582 ecode1 = (crc>>hash_shift) & hash_mask;
2583 #endif
2584 if (use_two_templates) {
2585 seqptr = subject;
2586 bitval = bit;
2587 extra_code = 0;
2588 tmpval = 0;
2589 switch (template_type) {
2590 case TEMPL_11_18:
2591 GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2592 ecode2 = GET_WORD_INDEX_11_18_OPT(ecode) | extra_code | bit0;
2593 break;
2594 case TEMPL_12_18:
2595 GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2596 ecode2 = GET_WORD_INDEX_12_18_OPT(ecode) | extra_code | bit0;
2597 break;
2598 case TEMPL_11_21:
2599 GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2600 ecode2 = GET_WORD_INDEX_11_21_OPT(ecode) | extra_code | bit0;
2601 break;
2602 case TEMPL_12_21:
2603 GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2604 ecode2 = GET_WORD_INDEX_12_21_OPT(ecode) | extra_code | bit0;
2605 break;
2606 default:
2607 ecode2 = 0; break;
2608 }
2609 #ifdef USE_HASH_TABLE
2610 hash_buf = (Uint1Ptr)&ecode2;
2611 CRC32(crc, hash_buf);
2612 ecode2 = (crc>>hash_shift) & hash_mask;
2613 #endif
2614 }
2615 bit = (bit - 1) & mask2b;
2616 if (pv_array) {
2617 while ((s_off <= subj_length) && ((pv_array[ecode1>>pv_array_bts]&
2618 (((PV_ARRAY_TYPE) 1)<<(ecode1&PV_ARRAY_MASK))) == 0)
2619 && (!use_two_templates || ((pv_array[ecode2>>pv_array_bts]&
2620 (((PV_ARRAY_TYPE) 1)<<(ecode2&PV_ARRAY_MASK))) == 0))) {
2621 if (bit == 3) subject++;
2622 ecode = ((ecode & mask30b) << 2) | (((*subject)>>(2*bit)) & mask2b);
2623 seqptr = subject;
2624 bitval = bit;
2625 extra_code = 0;
2626 tmpval = 0;
2627 switch (template_type) {
2628 case TEMPL_11_18:
2629 GET_EXTRA_CODE_PACKED_18(seqptr, bitval, tmpval, extra_code);
2630 ecode1 = (GET_WORD_INDEX_11_18(ecode) | extra_code) & no_bit0;
2631 break;
2632 case TEMPL_12_18:
2633 GET_EXTRA_CODE_PACKED_18(seqptr, bitval, tmpval, extra_code);
2634 ecode1 = (GET_WORD_INDEX_12_18(ecode) | extra_code) & no_bit0;
2635 break;
2636 case TEMPL_11_18_OPT:
2637 GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2638 ecode1 = (GET_WORD_INDEX_11_18_OPT(ecode) | extra_code) & no_bit0;
2639 break;
2640 case TEMPL_12_18_OPT:
2641 GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2642 ecode1 = (GET_WORD_INDEX_12_18_OPT(ecode) | extra_code) & no_bit0;
2643 break;
2644 case TEMPL_11_21:
2645 GET_EXTRA_CODE_PACKED_21(seqptr, bitval, tmpval, extra_code);
2646 ecode1 = (GET_WORD_INDEX_11_21(ecode) | extra_code) & no_bit0;
2647 break;
2648 case TEMPL_12_21:
2649 GET_EXTRA_CODE_PACKED_21(seqptr, bitval, tmpval, extra_code);
2650 ecode1 = (GET_WORD_INDEX_12_21(ecode) | extra_code) & no_bit0;
2651 break;
2652 case TEMPL_11_21_OPT:
2653 GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2654 ecode1 = (GET_WORD_INDEX_11_21_OPT(ecode) | extra_code) & no_bit0;
2655 break;
2656 case TEMPL_12_21_OPT:
2657 GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2658 ecode1 = (GET_WORD_INDEX_12_21_OPT(ecode) | extra_code) & no_bit0;
2659 break;
2660 default:
2661 extra_code = 0; ecode1 = 0; break;
2662 }
2663 #ifdef USE_HASH_TABLE
2664 hash_buf = (Uint1Ptr)&ecode1;
2665 CRC32(crc, hash_buf);
2666 ecode1 = (crc>>hash_shift) & hash_mask;
2667 #endif
2668 if (use_two_templates) {
2669 seqptr = subject;
2670 bitval = bit;
2671 extra_code = 0;
2672 tmpval = 0;
2673 switch (template_type) {
2674 case TEMPL_11_18:
2675 GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2676 ecode2 = GET_WORD_INDEX_11_18_OPT(ecode) | extra_code | bit0;
2677 break;
2678 case TEMPL_12_18:
2679 GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2680 ecode2 = GET_WORD_INDEX_12_18_OPT(ecode) | extra_code | bit0;
2681 break;
2682 case TEMPL_11_21:
2683 GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2684 ecode2 = GET_WORD_INDEX_11_21_OPT(ecode) | extra_code | bit0;
2685 break;
2686 case TEMPL_12_21:
2687 GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2688 ecode2 = GET_WORD_INDEX_12_21_OPT(ecode) | extra_code | bit0;
2689 break;
2690 default:
2691 ecode2 = 0; break;
2692 }
2693 #ifdef USE_HASH_TABLE
2694 hash_buf = (Uint1Ptr)&ecode2;
2695 CRC32(crc, hash_buf);
2696 ecode2 = (crc>>hash_shift) & hash_mask;
2697 #endif
2698 }
2699 bit = (bit - 1) & mask2b;
2700 s_off++;
2701 }
2702 if (s_off > subj_length)
2703 break;
2704 }
2705 for (q_off = lookup->mb_lt->hashtable[ecode1]; q_off>0;
2706 q_off = lookup->mb_lt->next_pos[q_off]) {
2707 search->second_pass_hits++;
2708 MegaBlastExtendHit(search, lookup, s_off, q_off);
2709 }
2710 if (use_two_templates) {
2711 for (q_off = lookup->mb_lt->hashtable2[ecode2]; q_off>0;
2712 q_off = lookup->mb_lt->next_pos2[q_off]) {
2713 search->second_pass_hits++;
2714 MegaBlastExtendHit(search, lookup, s_off, q_off);
2715 }
2716 }
2717 s_off++;
2718 }
2719 }
2720 if (!lookup->mb_lt->estack)
2721 BlastExtendWordExit(search);
2722
2723 search->current_hitlist->do_not_reallocate = FALSE;
2724
2725 /* Do greedy gapped extension */
2726 if (search->current_hitlist->hspcnt > 0)
2727 if (MegaBlastGappedAlign(search) != 0)
2728 return -1;
2729
2730 if (search->current_hitlist)
2731 return search->current_hitlist->hspcnt;
2732 else return 0;
2733 }
2734
2735 /* Contiguous words, database scanned 4 bases at a time */
2736
2737 Int4
MegaBlastWordFinder(BlastSearchBlkPtr search,LookupTablePtr lookup)2738 MegaBlastWordFinder(BlastSearchBlkPtr search, LookupTablePtr lookup)
2739 {
2740 register Uint1Ptr subject;
2741 register Int4 s_off, ecode, mask, q_off;
2742 Int4 subj_length = search->subject->length;
2743 Int4 min_hit_size;
2744 PV_ARRAY_TYPE *pv_array = lookup->pv_array;
2745 Int4 pv_array_bts;
2746
2747 if (search->pbp->mb_params->disc_word) {
2748 if (search->pbp->mb_params->one_base_step)
2749 return MegaBlastWordFinder_disc_1b(search, lookup);
2750 else
2751 return MegaBlastWordFinder_disc(search, lookup);
2752 }
2753
2754 min_hit_size = lookup->mb_lt->lpm;
2755 if (search->pbp->window_size > 0)
2756 min_hit_size += 4;
2757
2758 if (search->pbp->mb_params->is_neighboring &&
2759 subj_length < MIN_NEIGHBOR_HSP_LENGTH)
2760 return 0;
2761 if (search->current_hitlist == NULL)
2762 search->current_hitlist = BlastHitListNew(search);
2763
2764 mask = lookup->mb_lt->mask;
2765 subject = search->subject->sequence - 1;
2766 ecode = 0;
2767 if (lookup->mb_lt->estack)
2768 MemSet(lookup->mb_lt->stack_index, 0,
2769 lookup->mb_lt->num_stacks*sizeof(Int4));
2770
2771 pv_array_bts = PV_ARRAY_BTS + ((lookup->mb_lt->width < 3) ? 0 : 5);
2772
2773 for (s_off = 0; s_off < (lookup->mb_lt->width - 1)*4; s_off += 4) {
2774 ecode = (ecode << 8) + *++subject;
2775 }
2776
2777 s_off += 4;
2778 while (s_off<=subj_length) {
2779 if (pv_array) {
2780 ecode = ((ecode & mask) << 8) + *++subject;
2781 while ((s_off < subj_length) && ((pv_array[ecode>>pv_array_bts]&
2782 (((PV_ARRAY_TYPE) 1)<<(ecode&PV_ARRAY_MASK))) == 0)) {
2783 ecode = ((ecode & mask) << 8) + *++subject;
2784 s_off += 4;
2785 }
2786 if (s_off > subj_length)
2787 break;
2788 } else {
2789 ecode = ((ecode & mask) << 8) + *++subject;
2790 }
2791 for (q_off = lookup->mb_lt->hashtable[ecode]; q_off>0;
2792 q_off = lookup->mb_lt->next_pos[q_off]) {
2793 search->second_pass_hits++;
2794 MegaBlastExtendHit(search, lookup, s_off, q_off);
2795 }
2796 s_off += 4;
2797 }
2798
2799 if (!lookup->mb_lt->estack)
2800 BlastExtendWordExit(search);
2801
2802 search->current_hitlist->do_not_reallocate = FALSE;
2803 /* Do greedy gapped extension */
2804 if (search->current_hitlist->hspcnt > 0)
2805 if (MegaBlastGappedAlign(search) != 0)
2806 return -1;
2807
2808 if (search->current_hitlist)
2809 return search->current_hitlist->hspcnt;
2810 else return 0;
2811 }
2812
2813 static int LIBCALLBACK
diag_compare_match(VoidPtr v1,VoidPtr v2)2814 diag_compare_match(VoidPtr v1, VoidPtr v2)
2815 {
2816 MegaBlastExactMatchPtr h1, h2;
2817
2818 h1 = *((MegaBlastExactMatchPtr PNTR) v1);
2819 h2 = *((MegaBlastExactMatchPtr PNTR) v2);
2820
2821 return (h1->q_off - h1->s_off) - (h2->q_off - h2->s_off);
2822 }
2823
2824 #define MB_MAX_LENGTH_TO_UNPACK 400
MegaBlastGappedAlign(BlastSearchBlkPtr search)2825 Int2 MegaBlastGappedAlign(BlastSearchBlkPtr search)
2826 {
2827 MegaBlastExactMatchPtr PNTR e_hsp_array;
2828 MegaBlastExactMatchPtr e_hsp;
2829 Int4 index, i, hspcnt, buf_len=0;
2830 Uint1Ptr subject0 = NULL;
2831 Boolean delete_hsp;
2832 GapAlignBlkPtr gap_align;
2833 Boolean use_dyn_prog = search->pbp->mb_params->use_dyn_prog;
2834
2835 hspcnt = search->current_hitlist->hspcnt;
2836
2837 if (hspcnt == 0)
2838 return 0;
2839
2840 /* Make current hitlist available for rewriting of extended hsps
2841 without freeing the hsp_array since it's used in this function */
2842 search->current_hitlist->hspcnt = 0;
2843
2844 e_hsp_array = (MegaBlastExactMatchPtr PNTR)
2845 Malloc(hspcnt*sizeof(MegaBlastExactMatchPtr));
2846
2847 if (e_hsp_array == NULL)
2848 return 1;
2849
2850 for (index=0; index < hspcnt; index++) {
2851 e_hsp_array[index] =
2852 &(search->current_hitlist->exact_match_array[index]);
2853 }
2854
2855 if (use_dyn_prog || (search->rdfp && search->subject->length > MB_MAX_LENGTH_TO_UNPACK)) {
2856 subject0 = search->subject->sequence;
2857 } else {
2858 /* Get subject sequence in Seq_code_ncbi4na format with all
2859 ambiguities information.
2860 If database sequence is split, do it only once and save in
2861 search->subject->sequence_start.
2862 For the 2 sequences engine this is done beforehand.
2863 */
2864 if (search->subject->sequence_start == NULL && search->rdfp) {
2865 readdb_get_sequence_ex(search->rdfp, search->subject_id,
2866 &search->subject->sequence_start,
2867 &buf_len, TRUE);
2868 }
2869 subject0 =
2870 (&search->subject->sequence_start[search->subject->original_length])
2871 + 1;
2872 }
2873
2874 HeapSort(e_hsp_array, hspcnt, sizeof(MegaBlastExactMatchPtr), diag_compare_match);
2875 if (use_dyn_prog) {
2876 if (search->gap_align == NULL) {
2877 search->gap_align = GapAlignBlkNew(1, 1);
2878 search->gap_align->dyn_prog = (GapXDPPtr)
2879 Malloc((search->context[search->first_context].query->length + 2)
2880 *sizeof(GapXDP));
2881 }
2882 gap_align = search->gap_align;
2883 gap_align->gap_open = search->pbp->gap_open;
2884 gap_align->gap_extend = search->pbp->gap_extend;
2885 gap_align->x_parameter = search->pbp->gap_x_dropoff;
2886 gap_align->matrix = search->sbp->matrix;
2887 gap_align->query =
2888 search->context[search->first_context].query->sequence;
2889 gap_align->subject = subject0;
2890 /* Subtract 1 because the concatenated length includes the last
2891 sentinel byte */
2892 gap_align->query_length =
2893 search->context[search->first_context].query->length - 1;
2894 gap_align->subject_length = search->subject->length;
2895 }
2896
2897 for (index=0; index<hspcnt; index++) {
2898 e_hsp = e_hsp_array[index];
2899 delete_hsp = FALSE;
2900 for (i = search->current_hitlist->hspcnt-1;
2901 i >= 0 && MB_HSP_CLOSE(e_hsp->q_off, search->current_hitlist->hsp_array[i]->query.offset, e_hsp->s_off, search->current_hitlist->hsp_array[i]->subject.offset, MB_DIAG_NEAR);
2902 i--) {
2903 if (MB_HSP_CONTAINED(e_hsp->q_off, search->current_hitlist->hsp_array[i]->query.offset, search->current_hitlist->hsp_array[i]->query.end, e_hsp->s_off, search->current_hitlist->hsp_array[i]->subject.offset, search->current_hitlist->hsp_array[i]->subject.end, MB_DIAG_CLOSE)) {
2904 delete_hsp = TRUE;
2905 break;
2906 }
2907 }
2908 if (!delete_hsp) {
2909 if ((search->pbp->mb_params->disc_word &&
2910 (search->pbp->window_size == 0))) {
2911 delete_hsp = BlastNtWordUngappedExtend(search, e_hsp->q_off,
2912 e_hsp->s_off, search->pbp->cutoff_s2);
2913 }
2914 if (!delete_hsp) {
2915 search->second_pass_extends++;
2916
2917 if (use_dyn_prog) {
2918 Uint1 rem = e_hsp->s_off & 0x03;
2919 gap_align->q_start = e_hsp->q_off - 1 - rem;
2920 gap_align->s_start = e_hsp->s_off - 1 - rem;
2921 gap_align->decline_align = INT2_MAX;
2922 if (!PerformNtGappedAlignment(gap_align))
2923 return 1;
2924 if (gap_align->score >= search->pbp->cutoff_s2) {
2925 BlastNtSaveCurrentHspGapped(search, gap_align->score,
2926 gap_align->query_start, gap_align->subject_start,
2927 gap_align->query_stop-gap_align->query_start+1,
2928 gap_align->subject_stop-gap_align->subject_start+1,
2929 e_hsp->q_off, e_hsp->s_off, search->first_context, NULL);
2930 }
2931 } else
2932 MegaBlastNtWordExtend(search, subject0, e_hsp->q_off,
2933 e_hsp->s_off);
2934 }
2935 }
2936 }
2937
2938 MemFree(e_hsp_array);
2939 return 0;
2940 }
2941
2942 Int4
MegaBlastExtendHit(BlastSearchBlkPtr search,LookupTablePtr lookup,Int4 s_off,Int4 q_off)2943 MegaBlastExtendHit(BlastSearchBlkPtr search, LookupTablePtr lookup,
2944 Int4 s_off, Int4 q_off) {
2945 Int4 index, len = 4*lookup->mb_lt->width;
2946 Int4 index1, step;
2947 Int2 min_hit_length;
2948 MbStackPtr estack;
2949 Int4 diag, stack_top;
2950 BLAST_ExtendWordParamsPtr ewp_params = search->ewp_params;
2951 register Int4 window;
2952 Int4 step_unit;
2953 Boolean one_word, hit_ready, two_hits;
2954
2955 window = search->pbp->window_size + len;
2956 two_hits = (search->pbp->window_size > 0);
2957
2958 if (search->pbp->mb_params->one_base_step)
2959 step_unit = 1;
2960 else
2961 step_unit = 4;
2962
2963 min_hit_length = lookup->mb_lt->lpm;
2964
2965 /* For short query/db stacks are not used;
2966 instead use a diagonal array */
2967 if (!lookup->mb_lt->estack) {
2968 CfjModStruct *combo_array =
2969 search->context[search->first_context].ewp->combo_array;
2970 CfjModStruct *combo_array_elem;
2971 register Int4 min_diag_mask;
2972 Int4 offset, s_pos;
2973
2974 min_diag_mask = ewp_params->min_diag_mask;
2975 offset = ewp_params->offset;
2976
2977 s_pos = s_off + offset;
2978 diag = (s_off - q_off) & min_diag_mask;
2979 combo_array_elem = &combo_array[diag];
2980 step = s_pos - combo_array_elem->last_hit;
2981
2982 one_word = (combo_array_elem->diag_level >= min_hit_length);
2983 hit_ready = ((!two_hits && one_word) ||
2984 combo_array_elem->diag_level > min_hit_length);
2985
2986 if (two_hits && step < mb_two_hit_min_step && step > step_unit
2987 && one_word) {
2988 /* Words too close are not allowed */
2989 return 0;
2990 } else if (step == step_unit ||
2991 (step > step_unit && step <= window
2992 && one_word && !hit_ready)) {
2993 /* We had a hit on this diagonal that is extended by current hit */
2994 combo_array_elem->last_hit = s_pos;
2995 combo_array_elem->diag_level += step;
2996 /* Save the hit immediately if:
2997 For single-hit case - it reached the word size
2998 For double-hit case - hit is longer than word size, or
2999 2 words have already been found, and there is another gap.
3000 If hit is longer than the required minimum, it must have already
3001 been saved earlier.
3002 */
3003 if (!hit_ready && ((combo_array_elem->diag_level > min_hit_length) ||
3004 (!two_hits && combo_array_elem->diag_level == min_hit_length)))
3005 {
3006 MegaBlastSaveExactMatch(search, q_off, s_off);
3007 }
3008 } else {
3009 /* Start the new hit, overwriting the diagonal array entry */
3010 combo_array_elem->last_hit = s_pos;
3011 combo_array_elem->diag_level = len;
3012 /* Save the hit if it already qualifies */
3013 if (!two_hits && (len >= min_hit_length))
3014 MegaBlastSaveExactMatch(search, q_off, s_off);
3015 }
3016 return 0;
3017 }
3018
3019 index1 = (s_off - q_off) % lookup->mb_lt->num_stacks;
3020 if (index1<0)
3021 index1 += lookup->mb_lt->num_stacks;
3022 estack = lookup->mb_lt->estack[index1];
3023 stack_top = lookup->mb_lt->stack_index[index1] - 1;
3024
3025 for (index=0; index<=stack_top; ) {
3026 step = s_off - estack[index].level;
3027 one_word = (estack[index].length >= min_hit_length);
3028 if (estack[index].diag == s_off - q_off) {
3029 hit_ready = ((!two_hits && one_word) ||
3030 estack[index].length > min_hit_length);
3031 if (two_hits && step < mb_two_hit_min_step && step > step_unit
3032 && one_word) {
3033 lookup->mb_lt->stack_index[index1] = stack_top + 1;
3034 /* Skip words that are too close to each other (experimental) */
3035 return 0;
3036 } else if (step == step_unit ||
3037 (step > step_unit && step <= window &&
3038 one_word && !hit_ready)) {
3039 estack[index].length += step;
3040 /* We had a hit on this diagonal that is extended by current hit */
3041 /* Save the hit if:
3042 For single-hit case - it has just reached the word size
3043 For double-hit case - second hit has just been found,
3044 after the first reached the word size.
3045 Otherwise, just update the hit length and offset.
3046 */
3047 if (!hit_ready && (estack[index].length > min_hit_length ||
3048 (!two_hits && estack[index].length == min_hit_length))) {
3049 MegaBlastSaveExactMatch(search, q_off, s_off);
3050 }
3051 estack[index].level = s_off;
3052 } else {
3053 /* Start the new hit */
3054 estack[index].length = len;
3055 estack[index].level = s_off;
3056 if (!two_hits && (len >= min_hit_length))
3057 MegaBlastSaveExactMatch(search, q_off, s_off);
3058 }
3059 lookup->mb_lt->stack_index[index1] = stack_top + 1;
3060 return 0;
3061 } else if (step <= step_unit || (step <= window && one_word)) {
3062 /* Hit from a different diagonal, and it can potentially continue */
3063 index++;
3064 } else {
3065 /* Hit from a different diagonal that does not continue: remove it
3066 from the stack */
3067 estack[index] = estack[stack_top--];
3068 }
3069 }
3070
3071 /* Need an extra slot on the stack for this hit */
3072 if (++stack_top>=lookup->mb_lt->stack_size[index1]) {
3073 /* Stack about to overflow - reallocate memory */
3074 MbStackPtr ptr;
3075 if (!(ptr = (MbStackPtr)Realloc(estack,
3076 2*lookup->mb_lt->stack_size[index1]*sizeof(MbStack)))) {
3077 ErrPostEx(SEV_WARNING, 0, 0, "Unable to allocate %ld extra spaces for stack %d",
3078 2*lookup->mb_lt->stack_size[index1], index1);
3079 return 1;
3080 } else {
3081 lookup->mb_lt->stack_size[index1] *= 2;
3082 estack = lookup->mb_lt->estack[index1] = ptr;
3083 }
3084 }
3085 estack[stack_top].diag = s_off - q_off;
3086 estack[stack_top].level = s_off;
3087 estack[stack_top].length = len;
3088 lookup->mb_lt->stack_index[index1] = stack_top + 1;
3089 if (!two_hits && (len >= min_hit_length))
3090 MegaBlastSaveExactMatch(search, q_off, s_off);
3091
3092 return 0;
3093 }
3094
3095 /* subject sequence is compressed 4:1 */
3096 Int2
MegaBlastNtWordExtend(BlastSearchBlkPtr search,Uint1Ptr subject0,Int4 q_off,Int4 s_off)3097 MegaBlastNtWordExtend(BlastSearchBlkPtr search, Uint1Ptr subject0,
3098 Int4 q_off, Int4 s_off)
3099 {
3100 register Uint1Ptr q;
3101 register BLAST_Score score;
3102 Uint1Ptr query0, s;
3103 BLAST_Score X;
3104 BLAST_ParameterBlkPtr pbp;
3105 BLAST_ScoreBlkPtr sbp;
3106 Int4 q_avail, s_avail;
3107 /* Variables for the call to MegaBlastGreedyAlign */
3108 Int4 q_ext_l, q_ext_r, s_ext_l, s_ext_r;
3109 edit_script_t *ed_script_fwd=NULL, *ed_script_rev=NULL;
3110 Boolean good_hit;
3111 Uint1 rem;
3112 GapXEditScriptPtr esp = NULL;
3113
3114 good_hit = TRUE;
3115 sbp=search->sbp;
3116 pbp=search->pbp;
3117
3118 query0 = (Uint1Ptr) search->context[search->first_context].query->sequence;
3119 q_avail = search->context[search->first_context].query->length - q_off;
3120 s_avail = search->subject->length - s_off;
3121
3122 q = query0 + q_off;
3123 if (!search->rdfp || search->subject->length <= MB_MAX_LENGTH_TO_UNPACK) {
3124 s = subject0 + s_off;
3125 rem = 4;
3126 } else {
3127 s = subject0 + s_off/4;
3128 rem = s_off % 4;
3129 }
3130
3131 /* Find where positive scoring starts & ends within the word hit */
3132 score = 0;
3133
3134 X = pbp->gap_x_dropoff;
3135
3136 if (!search->pbp->mb_params->no_traceback) {
3137 ed_script_fwd = edit_script_new();
3138 ed_script_rev = edit_script_new();
3139 }
3140
3141 /* extend to the right */
3142 score = MegaBlastAffineGreedyAlign(s, s_avail, q, q_avail, FALSE, X,
3143 sbp->reward, -sbp->penalty,
3144 pbp->gap_open, pbp->gap_extend,
3145 &s_ext_r, &q_ext_r, search->abmp,
3146 ed_script_fwd, rem);
3147 if (search->rdfp && search->subject->length > MB_MAX_LENGTH_TO_UNPACK)
3148 rem = 0;
3149 /* extend to the left */
3150 score += MegaBlastAffineGreedyAlign(subject0, s_off, query0, q_off,
3151 TRUE, X, sbp->reward,
3152 -sbp->penalty, pbp->gap_open,
3153 pbp->gap_extend, &s_ext_l,
3154 &q_ext_l, search->abmp,
3155 ed_script_rev, rem);
3156 /* For neighboring we have a stricter criterion to keep an HSP */
3157 if (search->pbp->mb_params->is_neighboring) {
3158 FloatHi perc_identity;
3159 Int4 hsp_length;
3160
3161 hsp_length = MIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
3162 perc_identity = 100 * (1 - ((FloatHi) score) / hsp_length);
3163 if (hsp_length < MIN_NEIGHBOR_HSP_LENGTH ||
3164 perc_identity < MIN_NEIGHBOR_PERC_IDENTITY)
3165 good_hit = FALSE;
3166 }
3167
3168 /* In basic case the greedy algorithm returns number of
3169 differences, hence we need to convert it to score */
3170 if (pbp->gap_open==0 && pbp->gap_extend==0)
3171 score =
3172 (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*sbp->reward/2 -
3173 score*(sbp->reward - sbp->penalty);
3174 else if (sbp->reward % 2 == 1)
3175 score /= 2;
3176
3177 if (good_hit && score >= pbp->cutoff_s2) { /* Score is reportable */
3178 if (!search->pbp->mb_params->no_traceback) {
3179 edit_script_append(ed_script_rev, ed_script_fwd);
3180 esp = MBToGapXEditScript(ed_script_rev);
3181 }
3182
3183 search->second_pass_good_extends++;
3184
3185 BlastNtSaveCurrentHspGapped(search, score, q_off-q_ext_l,
3186 s_off-s_ext_l, q_ext_l+q_ext_r,
3187 s_ext_l+s_ext_r, q_off, s_off,
3188 search->first_context, esp);
3189 }
3190 edit_script_free(ed_script_fwd);
3191 edit_script_free(ed_script_rev);
3192
3193 return 0;
3194 }
3195
3196 #define NO_UNPACK_MBALIGN_ARG 4
PerformGreedyAlignmentWithTraceback(GapAlignBlkPtr gap_align,GreedyAlignMemPtr gamp,BLAST_ScoreBlkPtr sbp)3197 void PerformGreedyAlignmentWithTraceback(GapAlignBlkPtr gap_align,
3198 GreedyAlignMemPtr gamp, BLAST_ScoreBlkPtr sbp)
3199 {
3200 Int4 score, s_ext_r, q_ext_r, s_ext_l, q_ext_l;
3201 Uint1Ptr query0, subject0, query, subject;
3202 Int4 X = gap_align->x_parameter;
3203 Int4 reward = sbp->reward;
3204 Int4 penalty = -sbp->penalty;
3205 Int4 q_off, s_off, q_avail, s_avail;
3206 edit_script_t *ed_script_fwd=NULL, *ed_script_rev=NULL;
3207 GapXEditScriptPtr esp = NULL;
3208
3209 query0 = gap_align->query;
3210 subject0 = gap_align->subject;
3211 q_off = gap_align->q_start;
3212 s_off = gap_align->s_start;
3213 query = query0 + q_off;
3214 subject = subject0 + s_off;
3215 q_avail = gap_align->query_length - q_off;
3216 s_avail = gap_align->subject_length - s_off;
3217 ed_script_fwd = edit_script_new();
3218 ed_script_rev = edit_script_new();
3219
3220 /* extend to the right */
3221 score = MegaBlastAffineGreedyAlign(subject, s_avail, query, q_avail,
3222 FALSE, X, reward, penalty, gap_align->gap_open,
3223 gap_align->gap_extend, &s_ext_r, &q_ext_r, gamp,
3224 ed_script_fwd, NO_UNPACK_MBALIGN_ARG);
3225 /* extend to the left */
3226 score += MegaBlastAffineGreedyAlign(subject0, s_off, query0, q_off,
3227 TRUE, X, reward, penalty, gap_align->gap_open,
3228 gap_align->gap_extend, &s_ext_l, &q_ext_l, gamp,
3229 ed_script_rev, NO_UNPACK_MBALIGN_ARG);
3230
3231 /* In basic case the greedy algorithm returns number of
3232 differences, hence we need to convert it to score */
3233 if (gap_align->gap_open==0 && gap_align->gap_extend==0)
3234 score =
3235 (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*sbp->reward/2 -
3236 score*(sbp->reward - sbp->penalty);
3237 else if (sbp->reward % 2 == 1)
3238 score /= 2;
3239
3240 edit_script_append(ed_script_rev, ed_script_fwd);
3241 esp = MBToGapXEditScript(ed_script_rev);
3242
3243 edit_script_free(ed_script_fwd);
3244 edit_script_free(ed_script_rev);
3245
3246 gap_align->score = score;
3247 gap_align->query_start = q_off - q_ext_l;
3248 gap_align->subject_start = s_off - s_ext_l;
3249 gap_align->query_stop = q_off + q_ext_r;
3250 gap_align->subject_stop = s_off + s_ext_r;
3251 gap_align->edit_block = GapXEditBlockNew(gap_align->query_start, gap_align->subject_start);
3252
3253 gap_align->edit_block->start1 = gap_align->query_start;
3254 gap_align->edit_block->start2 = gap_align->subject_start;
3255 gap_align->edit_block->length1 = gap_align->query_length;
3256 gap_align->edit_block->length2 = gap_align->subject_length;
3257 gap_align->edit_block->frame1 = gap_align->query_frame;
3258 gap_align->edit_block->frame2 = gap_align->subject_frame;
3259 gap_align->edit_block->reverse = 0;
3260
3261 gap_align->edit_block->esp = esp;
3262 }
3263
3264 /* A routine to mask the residues from a mask SeqLoc by changing to a given
3265 * value (mask_residue).
3266 * The buffer should contain an already decoded sequence.
3267 *
3268 */
MegaBlastMaskTheResidues(Uint1Ptr buffer,Int4 max_length,Uint1 mask_residue,SeqLocPtr mask_slp,Boolean reverse,Int4 offset,Boolean mask_at_hash)3269 void MegaBlastMaskTheResidues(Uint1Ptr buffer, Int4 max_length, Uint1
3270 mask_residue, SeqLocPtr mask_slp, Boolean reverse,
3271 Int4 offset, Boolean mask_at_hash)
3272
3273 {
3274 SeqLocPtr slp=NULL;
3275 Int4 index, start, stop;
3276
3277 while (mask_slp) {
3278 slp=NULL;
3279 while((slp = SeqLocFindNext(mask_slp, slp))!=NULL) {
3280 if (reverse) {
3281 start = MAX(max_length + offset - 1 - SeqLocStop(slp), 0);
3282 stop = max_length - 1 - MAX(SeqLocStart(slp) - offset, 0);
3283 } else {
3284 start = MAX(SeqLocStart(slp) - offset, 0);
3285 stop = MIN(SeqLocStop(slp) - offset, max_length - 1);
3286 }
3287
3288 if (mask_at_hash)
3289 for (index=start; index<=stop; index++)
3290 /* Set the 5th bit to mark that this is a masked residue */
3291 buffer[index] = buffer[index] | 0x10;
3292 else
3293 for (index=start; index<=stop; index++)
3294 buffer[index] = mask_residue;
3295 }
3296 mask_slp = mask_slp->next;
3297 }
3298 }
3299
3300 /* The following function can only be used for IUPAC encoding */
3301 void
BlastLCaseMaskTheResidues(Uint1Ptr buffer,Int4 max_length,SeqLocPtr mask_slp,Boolean reverse,Int4 offset)3302 BlastLCaseMaskTheResidues(Uint1Ptr buffer, Int4 max_length, SeqLocPtr mask_slp, Boolean reverse, Int4 offset)
3303
3304 {
3305 SeqLocPtr slp=NULL;
3306 Int4 index, start, stop;
3307
3308 while (mask_slp) {
3309 slp=NULL;
3310 while((slp = SeqLocFindNext(mask_slp, slp))!=NULL) {
3311 if (reverse) {
3312 start = max_length - 1 - SeqLocStop(slp);
3313 stop = max_length - 1 - SeqLocStart(slp);
3314 } else {
3315 start = SeqLocStart(slp);
3316 stop = SeqLocStop(slp);
3317 }
3318
3319 start -= offset;
3320 stop -= offset;
3321
3322 for (index=start; index<=stop; index++)
3323 buffer[index] = tolower(buffer[index]);
3324 }
3325 mask_slp = mask_slp->next;
3326 }
3327 }
3328
3329 static int LIBCALLBACK
diag_compare_hsps(VoidPtr v1,VoidPtr v2)3330 diag_compare_hsps(VoidPtr v1, VoidPtr v2)
3331
3332 {
3333 BLAST_HSPPtr h1, h2;
3334 BLAST_HSPPtr PNTR hp1, PNTR hp2;
3335
3336 hp1 = (BLAST_HSPPtr PNTR) v1;
3337 hp2 = (BLAST_HSPPtr PNTR) v2;
3338 h1 = *hp1;
3339 h2 = *hp2;
3340
3341 if (h1==NULL && h2==NULL) return 0;
3342 else if (h1==NULL) return 1;
3343 else if (h2==NULL) return -1;
3344
3345 /* Separate different queries and/or strands */
3346 if (h1->context < h2->context)
3347 return -1;
3348 else if (h1->context > h2->context)
3349 return 1;
3350
3351 return (h1->query.offset - h1->subject.offset) -
3352 (h2->query.offset - h2->subject.offset);
3353 }
3354
3355 typedef enum E_HSPInclusionStatus {
3356 e_Equal = 0, /**< Identical */
3357 e_FirstInSecond, /**< First included in rectangle formed by second */
3358 e_SecondInFirst, /**< Second included in rectangle formed by first */
3359 e_DiagNear, /**< Diagonals are near, but neither HSP is included in
3360 the other. */
3361 e_DiagDistant /**< Diagonals are far apart, or different contexts */
3362 } E_HSPInclusionStatus;
3363
3364 /** HSP inclusion criterion for megablast: one HSP must be included in a
3365 * diagonal strip of a certain width around the other, and also in a rectangle
3366 * formed by the other HSP's endpoints.
3367 */
3368 static E_HSPInclusionStatus
BLAST_HSPInclusionTest(BLAST_HSP * hsp1,BLAST_HSP * hsp2)3369 BLAST_HSPInclusionTest(BLAST_HSP* hsp1, BLAST_HSP* hsp2)
3370 {
3371 if (hsp1->context != hsp2->context ||
3372 !MB_HSP_CLOSE(hsp1->query.offset, hsp2->query.offset,
3373 hsp1->subject.offset, hsp2->subject.offset,
3374 2*MB_DIAG_NEAR))
3375 return e_DiagDistant;
3376
3377 if (hsp1->query.offset == hsp2->query.offset &&
3378 hsp1->query.end == hsp2->query.end &&
3379 hsp1->subject.offset == hsp2->subject.offset &&
3380 hsp1->subject.end == hsp2->subject.end &&
3381 hsp1->score == hsp2->score) {
3382 return e_Equal;
3383 } else if (hsp1->query.offset >= hsp2->query.offset &&
3384 hsp1->query.end <= hsp2->query.end &&
3385 hsp1->subject.offset >= hsp2->subject.offset &&
3386 hsp1->subject.end <= hsp2->subject.end &&
3387 hsp1->score < hsp2->score) {
3388 return e_FirstInSecond;
3389 } else if (hsp1->query.offset <= hsp2->query.offset &&
3390 hsp1->query.end >= hsp2->query.end &&
3391 hsp1->subject.offset <= hsp2->subject.offset &&
3392 hsp1->subject.end >= hsp2->subject.end &&
3393 hsp1->score >= hsp2->score) {
3394 return e_SecondInFirst;
3395 }
3396 return e_DiagNear;
3397 }
3398
3399 /** How many HSPs to check for inclusion for each new HSP? */
3400 #define MAX_NUM_CHECK_INCLUSION 20
3401
3402 static void
BlastSortUniqHspArray(BLAST_HitListPtr hitlist)3403 BlastSortUniqHspArray(BLAST_HitListPtr hitlist)
3404 {
3405 Int4 index, new_hspcnt, index1, index2;
3406 BLAST_HSPPtr PNTR hsp_array = hitlist->hsp_array;
3407 Boolean shift_needed = FALSE;
3408 E_HSPInclusionStatus inclusion_status = e_DiagNear;
3409
3410 HeapSort(hitlist->hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr),
3411 diag_compare_hsps);
3412
3413 for (index=1, new_hspcnt=0; index<hitlist->hspcnt; index++) {
3414 if (hsp_array[index]==NULL)
3415 continue;
3416 inclusion_status = e_DiagNear;
3417 for (index1 = new_hspcnt; inclusion_status != e_DiagDistant &&
3418 index1 >= 0 && new_hspcnt-index1 < MAX_NUM_CHECK_INCLUSION;
3419 index1--) {
3420 inclusion_status =
3421 BLAST_HSPInclusionTest(hsp_array[index], hsp_array[index1]);
3422 if (inclusion_status == e_FirstInSecond ||
3423 inclusion_status == e_Equal) {
3424 /* Free the new HSP and break out of the inclusion test loop */
3425 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
3426 break;
3427 } else if (inclusion_status == e_SecondInFirst) {
3428 hsp_array[index1] = BLAST_HSPFree(hsp_array[index1]);
3429 shift_needed = TRUE;
3430 }
3431 }
3432
3433 /* If some lower indexed HSPs have been removed, shift the subsequent
3434 HSPs */
3435 if (shift_needed) {
3436 /* Find the first non-NULL HSP, going backwards */
3437 while (index1 >= 0 && !hsp_array[index1])
3438 index1--;
3439 /* Go forward, and shift any non-NULL HSPs */
3440 for (index2 = ++index1; index1 <= new_hspcnt; index1++) {
3441 if (hsp_array[index1])
3442 hsp_array[index2++] = hsp_array[index1];
3443 }
3444 new_hspcnt = index2 - 1;
3445 shift_needed = FALSE;
3446 }
3447 if (hsp_array[index] != NULL)
3448 hsp_array[++new_hspcnt] = hsp_array[index];
3449 }
3450 /* Set all HSP pointers that will not be used any more to NULL */
3451 MemSet(&hsp_array[new_hspcnt+1], 0,
3452 (hitlist->hspcnt - new_hspcnt - 1)*sizeof(BLAST_HSPPtr));
3453 hitlist->hspcnt = new_hspcnt + 1;
3454 hitlist->hspcnt_max = hitlist->hspcnt;
3455 }
3456
3457 void
MegaBlastFillHspGapInfo(BLAST_HSPPtr hsp,GapXEditScriptPtr esp)3458 MegaBlastFillHspGapInfo(BLAST_HSPPtr hsp, GapXEditScriptPtr esp)
3459 {
3460 hsp->gap_info = GapXEditBlockNew(hsp->query.offset, hsp->subject.offset);
3461
3462 hsp->gap_info->start1 = hsp->query.offset;
3463 hsp->gap_info->start2 = hsp->subject.offset;
3464 hsp->gap_info->length1 = hsp->query.length;
3465 hsp->gap_info->length2 = hsp->subject.length;
3466 hsp->gap_info->frame1 = hsp->gap_info->frame2 = 1;
3467 hsp->gap_info->reverse = 0;
3468
3469 hsp->gap_info->esp = esp;
3470 }
3471
3472 GapXEditScriptPtr
MBToGapXEditScript(edit_script_t PNTR ed_script)3473 MBToGapXEditScript (edit_script_t PNTR ed_script)
3474 {
3475 GapXEditScriptPtr esp_start, esp, esp_prev;
3476 Int4 i;
3477
3478 if (ed_script==NULL || ed_script->num == 0)
3479 return NULL;
3480
3481 for (i=0; i<ed_script->num; i++) {
3482 esp = (GapXEditScriptPtr) MemNew(sizeof(GapXEditScript));
3483 esp->num = EDIT_VAL(ed_script->op[i]);
3484 esp->op_type = 3 - EDIT_OPC(ed_script->op[i]);
3485 if (esp->op_type == 3)
3486 fprintf(stderr, "op_type = 3\n");
3487 if (i==0)
3488 esp_start = esp_prev = esp;
3489 else {
3490 esp_prev->next = esp;
3491 esp_prev = esp;
3492 }
3493 }
3494
3495 return esp_start;
3496
3497 }
3498
3499
3500 SeqAlignPtr
MegaBlastGapInfoToSeqAlign(BlastSearchBlkPtr search,Int4 subject_index,Int2 query_index)3501 MegaBlastGapInfoToSeqAlign(BlastSearchBlkPtr search, Int4 subject_index,
3502 Int2 query_index)
3503 {
3504 SeqAlignPtr seqalign, seqalign_var, sap;
3505 BLASTResultHitlistPtr result_hitlist;
3506 SeqIdPtr subject_id;
3507 BLASTResultHspPtr hsp_array;
3508 Int4 index;
3509 SeqIdPtr query_id, new_subject_seqid;
3510 StdSegPtr ssp;
3511 ValNodePtr gi_list=NULL;
3512
3513 /* Sanity check */
3514 if (query_index > search->last_context / 2 ||
3515 search->mb_result_struct[query_index] == NULL ||
3516 subject_index >= search->mb_result_struct[query_index]->hitlist_count)
3517 return NULL;
3518
3519 result_hitlist =
3520 search->mb_result_struct[query_index]->results[subject_index];
3521
3522 if (search->rdfp) {
3523 readdb_get_descriptor(search->rdfp,
3524 result_hitlist->subject_id,
3525 &subject_id, NULL);
3526 } else if (result_hitlist->subject_info &&
3527 result_hitlist->subject_info->sip) {
3528 subject_id =
3529 SeqIdDup(result_hitlist->subject_info->sip);
3530 }
3531
3532 hsp_array = result_hitlist->hsp_array;
3533
3534 gi_list = BlastGetAllowedGis(search, result_hitlist->subject_id, &new_subject_seqid);
3535
3536 for (index=0; index<result_hitlist->hspcnt; index++) {
3537 query_id = search->qid_array[hsp_array[index].context/2];
3538 if (index==0) {
3539 if (new_subject_seqid)
3540 seqalign = seqalign_var =
3541 GapXEditBlockToSeqAlign(hsp_array[index].gap_info,
3542 new_subject_seqid, query_id);
3543 else
3544 seqalign = seqalign_var =
3545 GapXEditBlockToSeqAlign(hsp_array[index].gap_info,
3546 subject_id, query_id);
3547 } else {
3548 if (new_subject_seqid)
3549 seqalign_var->next =
3550 GapXEditBlockToSeqAlign(hsp_array[index].gap_info,
3551 new_subject_seqid, query_id);
3552 else
3553 seqalign_var->next =
3554 GapXEditBlockToSeqAlign(hsp_array[index].gap_info,
3555 subject_id, query_id);
3556
3557 seqalign_var = seqalign_var->next;
3558 }
3559 seqalign_var->score = GetScoreSetFromBlastResultHsp(&hsp_array[index], gi_list);
3560 if (seqalign_var->segtype == 3) {
3561 ssp = seqalign_var->segs;
3562 while (ssp) {
3563 ssp->scores = GetScoreSetFromBlastResultHsp(&hsp_array[index], gi_list);
3564 ssp = ssp->next;
3565 }
3566 } else if (seqalign_var->segtype == 5) { /* Discontinuous */
3567 sap = (SeqAlignPtr) seqalign_var->segs;
3568 for(;sap != NULL; sap = sap->next) {
3569 sap->score = GetScoreSetFromBlastResultHsp(&hsp_array[index], gi_list);
3570 }
3571 }
3572 }
3573
3574 SeqIdSetFree(subject_id);
3575 return seqalign;
3576 }
3577
3578 Int4
MegaBlastGetNumIdentical(Uint1Ptr query,Uint1Ptr subject,Int4 q_start,Int4 s_start,Int4 length,Boolean reverse)3579 MegaBlastGetNumIdentical(Uint1Ptr query, Uint1Ptr subject, Int4 q_start,
3580 Int4 s_start, Int4 length, Boolean reverse)
3581 {
3582 Int4 i, ident = 0;
3583 Uint1Ptr q, s;
3584 Uint1 at_hash_mask = 0x0f;
3585
3586 q = &query[q_start];
3587 if (!reverse)
3588 s = &subject[s_start];
3589 else
3590 s = &subject[s_start + length - 1];
3591
3592 for (i=0; i<length; i++) {
3593 if ((*q & at_hash_mask) == *s)
3594 ident++;
3595 q++;
3596 if (!reverse)
3597 s++;
3598 else
3599 s--;
3600 }
3601
3602 return ident;
3603 }
3604
3605 /* Reevaluate score of one HSP, using ambiguity information */
ReevaluateScoreWithAmbiguities(BlastSearchBlkPtr search,Uint1Ptr subject_start,BLAST_HSPPtr hsp)3606 Boolean ReevaluateScoreWithAmbiguities(BlastSearchBlkPtr search,
3607 Uint1Ptr subject_start, BLAST_HSPPtr hsp)
3608 {
3609 BLAST_Score sum, score, gap_open, gap_extend;
3610 BLAST_ScorePtr PNTR matrix;
3611 Uint1Ptr query_start, query, subject;
3612 Uint1Ptr new_q_start, new_s_start, new_q_end, new_s_end;
3613 Int4 i, context, last_esp_num;
3614 Int2 factor;
3615 Uint1 mask = 0x0f;
3616 Boolean delete_hsp;
3617 GapXEditBlockPtr gap_info = hsp->gap_info;
3618 GapXEditScriptPtr esp, last_esp, prev_esp, first_esp;
3619 FloatHi searchsp_eff;
3620 Int4 align_length;
3621
3622 if (search->sbp->reward % 2 == 1)
3623 factor = 2;
3624 else
3625 factor = 1;
3626
3627 if (search->pbp->gap_open == 0 && search->pbp->gap_extend == 0) {
3628 gap_open = 0;
3629 gap_extend =
3630 (search->sbp->reward - 2*search->sbp->penalty) * factor / 2;
3631 } else {
3632 gap_open = factor*search->pbp->gap_open;
3633 gap_extend = factor*search->pbp->gap_extend;
3634 }
3635
3636 matrix = search->sbp->matrix;
3637
3638 context = hsp->context;
3639
3640 searchsp_eff = (FloatHi) search->dblen_eff *
3641 ((FloatHi) search->context[context].query->effective_length);
3642
3643 query_start = search->context[context].query->sequence;
3644 query = query_start + hsp->query.offset;
3645 subject = subject_start + hsp->subject.offset;
3646 score = 0;
3647 sum = 0;
3648 new_q_start = new_q_end = query;
3649 new_s_start = new_s_end = subject;
3650 i = 0;
3651
3652 esp = gap_info->esp;
3653 prev_esp = NULL;
3654 last_esp = first_esp = esp;
3655 last_esp_num = 0;
3656
3657 while (esp) {
3658 if (esp->op_type == GAPALIGN_SUB) {
3659 sum += factor*matrix[*query & mask][*subject];
3660 query++;
3661 subject++;
3662 i++;
3663 } else if (esp->op_type == GAPALIGN_DEL) {
3664 sum -= gap_open + gap_extend * esp->num;
3665 subject += esp->num;
3666 i += esp->num;
3667 } else if (esp->op_type == GAPALIGN_INS) {
3668 sum -= gap_open + gap_extend * esp->num;
3669 query += esp->num;
3670 i += esp->num;
3671 }
3672
3673 if (sum < 0) {
3674 if (score < search->pbp->cutoff_s2 ||
3675 BlastKarlinStoE_simple(score, search->sbp->kbp[context],
3676 searchsp_eff) > search->pbp->cutoff_e) {
3677 /* Start from new offset */
3678 new_q_start = query;
3679 new_s_start = subject;
3680 score = sum = 0;
3681 if (i < esp->num) {
3682 esp->num -= i;
3683 first_esp = esp;
3684 i = 0;
3685 } else {
3686 first_esp = esp->next;
3687 }
3688 /* Unlink the bad part of the esp chain
3689 so it can be freed later */
3690 if (prev_esp)
3691 prev_esp->next = NULL;
3692 last_esp = NULL;
3693 } else {
3694 /* Stop here */
3695 break;
3696 }
3697 } else if (sum > score) {
3698 /* Remember this point as the best scoring end point */
3699 score = sum;
3700 last_esp = esp;
3701 last_esp_num = i;
3702 new_q_end = query;
3703 new_s_end = subject;
3704 }
3705 if (i >= esp->num) {
3706 i = 0;
3707 prev_esp = esp;
3708 esp = esp->next;
3709 }
3710 }
3711
3712 score /= factor;
3713
3714 delete_hsp = FALSE;
3715
3716 if (score >= search->pbp->cutoff_s2) {
3717 hsp->score = score;
3718 searchsp_eff = (FloatHi) search->dblen_eff *
3719 (FloatHi) search->context[context].query->effective_length;
3720 hsp->evalue = BlastKarlinStoE_simple(score,
3721 search->sbp->kbp[context], searchsp_eff);
3722 if (hsp->evalue > search->pbp->cutoff_e) {
3723 delete_hsp = TRUE;
3724 } else {
3725 hsp->query.length = new_q_end - new_q_start;
3726 hsp->subject.length = new_s_end - new_s_start;
3727 hsp->query.offset = hsp->gap_info->start1 = new_q_start - query_start;
3728 hsp->query.end = hsp->query.offset + hsp->query.length;
3729 hsp->subject.offset = hsp->gap_info->start2 =
3730 new_s_start - subject_start;
3731 hsp->subject.end = hsp->subject.offset + hsp->subject.length;
3732 /* Make corrections in edit block and free any parts that
3733 are no longer needed */
3734 if (first_esp != hsp->gap_info->esp) {
3735 GapXEditScriptDelete(hsp->gap_info->esp);
3736 hsp->gap_info->esp = first_esp;
3737 }
3738 if (last_esp->next != NULL) {
3739 last_esp->next = GapXEditScriptDelete(last_esp->next);
3740 }
3741 last_esp->num = last_esp_num;
3742 BlastHSPGetNumIdentical(search, hsp, NULL, &hsp->num_ident,
3743 &align_length);
3744 }
3745 } else {
3746 delete_hsp = TRUE;
3747 }
3748
3749 return delete_hsp;
3750 }
3751
3752 /* In the following function the forward strand is assumed */
3753
3754 Int2
MegaBlastReevaluateWithAmbiguities(BlastSearchBlkPtr search)3755 MegaBlastReevaluateWithAmbiguities(BlastSearchBlkPtr search)
3756 {
3757 BLAST_HitListPtr current_hitlist;
3758 BLAST_HSPPtr PNTR hsp_array, hsp;
3759 Uint1Ptr subject_start = NULL;
3760 Int4 index, context, hspcnt, buf_len=0;
3761 Boolean purge, delete_hsp;
3762 Int4 query_length;
3763 Int2 status;
3764
3765 if (!search->pbp->mb_params)
3766 return 1;
3767
3768 current_hitlist = search->current_hitlist;
3769 if (current_hitlist == NULL || current_hitlist->hspcnt == 0)
3770 return 0;
3771
3772
3773 status = BlastGetNonSumStatsEvalue(search);
3774
3775 /* While query offsets are not yet adjusted, remove the redundant HSPs */
3776 /*if (current_hitlist->hspcnt > 1)
3777 BlastSortUniqHspArray(current_hitlist);*/
3778 hspcnt = current_hitlist->hspcnt;
3779
3780 hsp_array = current_hitlist->hsp_array;
3781 /* In case of greedy extension without traceback,
3782 only adjust the offsets */
3783 if (!search->pbp->mb_params->use_dyn_prog &&
3784 search->pbp->mb_params->no_traceback) {
3785 for (index=0; index<hspcnt; index++) {
3786 if (hsp_array[index] != NULL) {
3787 /* Adjust the query offsets here */
3788 hsp_array[index]->query.offset -=
3789 search->query_context_offsets[hsp_array[index]->context];
3790 hsp_array[index]->query.end -=
3791 search->query_context_offsets[hsp_array[index]->context];
3792 hsp_array[index]->query.gapped_start -=
3793 search->query_context_offsets[hsp_array[index]->context];
3794 }
3795 }
3796 status = BlastReapHitlistByEvalue(search);
3797 return status;
3798 }
3799
3800 status = BlastReapHitlistByEvalue(search);
3801
3802 if (current_hitlist->hspcnt == 0)
3803 /* All HSPs have been deleted */
3804 return 0;
3805
3806 /* Can happen only when subject was being unpacked on the fly
3807 during gapped extensions */
3808 if (!search->subject->sequence_start)
3809 readdb_get_sequence_ex(search->rdfp, search->subject_id,
3810 &search->subject->sequence_start,
3811 &buf_len, TRUE);
3812 /* The sequence in blastna encoding is now stored in sequence_start */
3813 subject_start = search->subject->sequence_start + 1;
3814
3815 purge = FALSE;
3816 for (index=0; index<hspcnt; index++) {
3817 if (hsp_array[index] == NULL)
3818 continue;
3819 else
3820 hsp = hsp_array[index];
3821
3822 context = hsp->context;
3823
3824 /* Adjust the query offsets here */
3825 hsp->query.offset -= search->query_context_offsets[context];
3826 hsp->query.end -= search->query_context_offsets[context];
3827 hsp->query.gapped_start -= search->query_context_offsets[context];
3828
3829 if (search->pbp->mb_params->use_dyn_prog) {
3830 /* Check if the extension jumped through the border between
3831 sequences or sequence strands. If this happens, leave the part
3832 which contains the original offset pair, saved in gapped_start's
3833 */
3834 Int4 extra_length;
3835 query_length = search->query_context_offsets[context+1] -
3836 search->query_context_offsets[context] - 1;
3837
3838 while ((hsp->query.end - 1) / query_length > 0) {
3839 if (hsp->query.gapped_start < query_length) {
3840 extra_length = hsp->query.end - query_length + 1;
3841 hsp->subject.end -= extra_length;
3842 hsp->query.length -= extra_length;
3843 hsp->subject.length -= extra_length;
3844 hsp->query.end = query_length - 1;
3845 } else {
3846 extra_length = query_length + 1 - hsp->query.offset;
3847 hsp->subject.offset += extra_length;
3848 hsp->query.offset = 0;
3849 hsp->query.end -= (query_length + 1);
3850 hsp->query.length -= extra_length;
3851 hsp->subject.length -= extra_length;
3852 hsp->query.gapped_start -= query_length + 1;
3853 context++;
3854 query_length = search->query_context_offsets[context+1] -
3855 search->query_context_offsets[context] - 1;
3856 }
3857 }
3858 hsp->context = context;
3859 /* Gap information not available here, so no reevaluation
3860 with ambiguities can be done - it will be done with the final
3861 extension with traceback */
3862 continue;
3863 }
3864
3865 delete_hsp =
3866 ReevaluateScoreWithAmbiguities(search, subject_start, hsp);
3867
3868 if (delete_hsp) { /* This HSP is now below the cutoff */
3869 hsp_array[index] = BLAST_HSPFree(hsp);
3870 purge = TRUE;
3871 }
3872 }
3873 if (purge) {
3874 index = HspArrayPurge(hsp_array, hspcnt, TRUE);
3875 current_hitlist->hspcnt = index;
3876 current_hitlist->hspcnt_max = index;
3877 }
3878
3879 if (search->pbp->mb_params->use_dyn_prog) {
3880 status = BlastReapHitlistByEvalue(search);
3881 }
3882
3883 if (current_hitlist->hspcnt > 1)
3884 BlastSortUniqHspArray(current_hitlist);
3885
3886 if (search->pbp->hsp_num_max &&
3887 search->pbp->hsp_num_max < search->current_hitlist->hspcnt &&
3888 !search->pbp->mb_params->use_dyn_prog && search->last_context <= 1) {
3889 /* Number of HSPs to be saved is restricted by user;
3890 delete extra HSPs unless gapped alignments are done later, or
3891 there are multiple query sequences - then this is done elsewhere.
3892 */
3893 for (index=search->pbp->hsp_num_max;
3894 index < search->current_hitlist->hspcnt; ++index) {
3895 current_hitlist->hsp_array[index] =
3896 BLAST_HSPFree(current_hitlist->hsp_array[index]);
3897 }
3898 search->current_hitlist->hspcnt = search->pbp->hsp_num_max;
3899 }
3900
3901 return 0;
3902 }
3903
3904 /* The following binary search routine assumes that array A is filled. */
BinarySearchInt4(Int4 n,Int4Ptr A,Int4 size)3905 Int4 BinarySearchInt4(Int4 n, Int4Ptr A, Int4 size)
3906 {
3907 Int4 m, b, e;
3908
3909 b = 0;
3910 e = size;
3911 while (b < e - 1) {
3912 m = (b + e) / 2;
3913 if (A[m] > n)
3914 e = m;
3915 else
3916 b = m;
3917 }
3918 return b;
3919 }
3920
3921 /* Create a SeqLoc chain of query segments covered by all HSPs from a given
3922 SeqAlign - needed for masked query output only */
MaskSeqLocFromSeqAlign(SeqAlignPtr seqalign)3923 SeqLocPtr MaskSeqLocFromSeqAlign(SeqAlignPtr seqalign)
3924 {
3925 SeqLocPtr slp = NULL, new_slp, slp_var;
3926 SeqAlignPtr sap;
3927 DenseSegPtr dsp;
3928 Int4 from, to;
3929
3930 for (sap = seqalign; sap; sap = sap->next) {
3931 dsp = (DenseSegPtr) sap->segs;
3932 if (dsp->strands[0] == Seq_strand_plus) {
3933 from = dsp->starts[0];
3934 to = dsp->starts[2*(dsp->numseg - 1)] +
3935 dsp->lens[dsp->numseg - 1] - 1;
3936 } else {
3937 to = dsp->starts[0] + dsp->lens[0] - 1;
3938 from = dsp->starts[2*(dsp->numseg - 1)];
3939 }
3940 new_slp = SeqLocIntNew(from, to, Seq_strand_plus, dsp->ids);
3941 if (slp==NULL)
3942 slp = slp_var = new_slp;
3943 else {
3944 slp_var->next = new_slp;
3945 slp_var = slp_var->next;
3946 }
3947 }
3948
3949 return slp;
3950 }
3951
PrintMaskedSequence(BioseqPtr query_bsp,SeqLocPtr mask_slp,FILE * fp,Int2 line_len,Boolean lcase_masking)3952 void PrintMaskedSequence(BioseqPtr query_bsp, SeqLocPtr mask_slp,
3953 FILE *fp, Int2 line_len, Boolean lcase_masking)
3954 {
3955 SeqLocPtr query_slp = NULL;
3956 SeqPortPtr spp;
3957 Uint1Ptr query_seq, seq_line;
3958 Int4 index, id_length = 0, defline_len;
3959 Uint1 residue;
3960 CharPtr buffer = NULL;
3961
3962 ValNodeAddPointer(&query_slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(query_bsp->id, SEQID_GI)));
3963
3964
3965 spp = SeqPortNewByLoc(query_slp, Seq_code_iupacna);
3966 SeqPortSet_do_virtual(spp, TRUE);
3967 query_seq = (Uint1Ptr) MemNew((query_bsp->length + 1)*sizeof(Uint1));
3968 index=0;
3969 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
3970 if (IS_residue(residue)) {
3971 query_seq[index] = residue;
3972 index++;
3973 }
3974 }
3975 if (lcase_masking)
3976 BlastLCaseMaskTheResidues(query_seq, query_bsp->length, mask_slp, FALSE,
3977 SeqLocStart(query_slp));
3978 else
3979 BlastMaskTheResidues(query_seq, query_bsp->length, 'N', mask_slp, FALSE,
3980 SeqLocStart(query_slp));
3981
3982 SeqLocFree(query_slp);
3983 if (query_bsp->descr)
3984 defline_len = StringLen(BioseqGetTitle(query_bsp));
3985 else
3986 defline_len = 0;
3987 defline_len += 255; /* Sufficient for an ID. */
3988 buffer = MemNew((defline_len+1)*sizeof(Char));
3989 if (query_bsp->id->choice != SEQID_LOCAL) {
3990 SeqIdWrite(query_bsp->id, buffer, PRINTID_FASTA_LONG, BUFFER_LENGTH);
3991 id_length = StringLen(buffer);
3992 buffer[id_length] = ' ';
3993 id_length++;
3994 }
3995 StringCpy(&buffer[id_length], BioseqGetTitle(query_bsp));
3996 fprintf(fp, ">%s\n", buffer);
3997
3998 seq_line = query_seq;
3999 for (index=0; index<=(query_bsp->length-1)/line_len; index++) {
4000 fprintf(fp, "%.*s\n", line_len, seq_line);
4001 seq_line += line_len;
4002 }
4003
4004 spp = SeqPortFree(spp);
4005 MemFree(buffer);
4006 query_seq = MemFree(query_seq);
4007 }
4008
4009 #define MIN_HSP_ARRAY_SIZE 20
4010 Int4 LIBCALL
MegaBlastSaveCurrentHitlist(BlastSearchBlkPtr search)4011 MegaBlastSaveCurrentHitlist(BlastSearchBlkPtr search)
4012 {
4013 BLASTResultHitlistPtr result_hitlist, PNTR results, PNTR mb_results;
4014 BLASTResultsStructPtr result_struct;
4015 BLAST_HitListPtr current_hitlist;
4016 BLAST_HSPPtr hsp;
4017 BLAST_KarlinBlkPtr kbp;
4018 BLASTResultHspPtr hsp_array;
4019 Int4 hspcnt, index, index1, new_index, old_index, low_index, high_index;
4020 Int4 hitlist_count, hitlist_max, hspmax, high_score=0, retval;
4021 Nlm_FloatHi current_evalue=DBL_MAX;
4022 Int4 query_length, hsp_index, new_size;
4023 Int4Ptr hsp_array_sizes;
4024 Int2 num_queries, query_index;
4025 Boolean PNTR do_not_reallocate;
4026 Boolean use_dyn_prog = search->pbp->mb_params->use_dyn_prog;
4027
4028 if (search == NULL)
4029 return 0;
4030
4031 if (search->current_hitlist == NULL || search->current_hitlist->hspcnt <= 0) /* No hits to save. */
4032 {
4033 search->subject_info = BLASTSubjectInfoDestruct(search->subject_info);
4034 return 0;
4035 }
4036
4037 current_hitlist = search->current_hitlist;
4038
4039 retval = current_hitlist->hspcnt;
4040
4041 num_queries = search->last_context / 2 + 1;
4042 mb_results = (BLASTResultHitlistPtr PNTR)
4043 MemNew(num_queries*sizeof(BLASTResultHitlistPtr));
4044 hsp_array_sizes = (Int4Ptr) MemNew(num_queries*sizeof(Int4));
4045 do_not_reallocate = (Boolean PNTR) MemNew(num_queries*sizeof(Boolean));
4046
4047 /* The HSPs in the hsp_array are sorted previously by score, therefore
4048 for each individual query they already come in the increasing order
4049 of evalue, since linking of HSPs is not done in megablast */
4050 index1 = 0;
4051 hspcnt = current_hitlist->hspcnt;
4052 hspmax = current_hitlist->hspcnt_max;
4053
4054 HeapSort(current_hitlist->hsp_array, hspcnt,
4055 sizeof(BLAST_HSPPtr), score_compare_hsps);
4056
4057 hsp = current_hitlist->hsp_array[0];
4058 for (index=0; index<hspcnt; index++) {
4059 while (hsp == NULL && index1 < hspmax) {
4060 index1++;
4061 hsp = current_hitlist->hsp_array[index1];
4062 }
4063 if (index1==hspmax) break;
4064 if (search->pbp->mb_params->perc_identity > 0) {
4065 if (MegaBlastGetHspPercentIdentity(search, hsp) <
4066 search->pbp->mb_params->perc_identity) {
4067 index1++;
4068 if (index1 >= hspmax)
4069 break;
4070 hsp = current_hitlist->hsp_array[index1];
4071 continue;
4072 }
4073 }
4074 query_index = hsp->context / 2;
4075
4076 result_hitlist = mb_results[query_index];
4077 if (!result_hitlist) {
4078 hsp_array_sizes[query_index] = MIN_HSP_ARRAY_SIZE;
4079 if (search->pbp->hsp_num_max &&
4080 search->pbp->hsp_num_max < MIN_HSP_ARRAY_SIZE)
4081 hsp_array_sizes[query_index] = search->pbp->hsp_num_max;
4082 result_hitlist = mb_results[query_index] =
4083 BLASTResultHitlistNew(hsp_array_sizes[query_index]);
4084 result_hitlist->hspcnt = 1;
4085 /* This is the first HSP for this query, so it is the
4086 best one */
4087 result_hitlist->best_evalue = hsp->evalue;
4088 result_hitlist->high_score = hsp->score;
4089 } else if (result_hitlist->hspcnt == hsp_array_sizes[query_index]) {
4090 if (!do_not_reallocate[query_index]) {
4091 new_size = 2*hsp_array_sizes[query_index];
4092 if (search->pbp->hsp_num_max &&
4093 search->pbp->hsp_num_max < new_size && !use_dyn_prog) {
4094 /* Number of HSPs to be saved is restricted by user;
4095 stop at this limit if gapped alignment has already
4096 been completed */
4097 new_size = search->pbp->hsp_num_max;
4098 if (new_size <= hsp_array_sizes[query_index]) {
4099 do_not_reallocate[query_index] = TRUE;
4100 continue;
4101 }
4102 }
4103
4104 if (!(hsp_array = Realloc(result_hitlist->hsp_array,
4105 new_size*sizeof(BLASTResultHsp)))) {
4106 ErrPostEx(SEV_WARNING, 0, 0,
4107 "Cannot reallocate hsp_array for a result hitlist, size %d", new_size);
4108 do_not_reallocate[query_index] = TRUE;
4109 } else {
4110 result_hitlist->hsp_array = hsp_array;
4111 result_hitlist->hspcnt++;
4112 hsp_array_sizes[query_index] = new_size;
4113 }
4114 } else {
4115 /* hsp_array is already full and reallocation not allowed */
4116 continue;
4117 }
4118 } else
4119 result_hitlist->hspcnt++;
4120
4121 if (result_hitlist == NULL) /* Should never happen */
4122 continue;
4123
4124 result_hitlist->subject_id = search->subject_id;
4125 result_hitlist->subject_info = search->subject_info;
4126
4127 hsp_array = result_hitlist->hsp_array;
4128 hsp_index = result_hitlist->hspcnt - 1;
4129
4130 hsp_array[hsp_index].ordering_method = hsp->ordering_method;
4131 hsp_array[hsp_index].number = hsp->num;
4132 hsp_array[hsp_index].score = hsp->score;
4133 hsp_array[hsp_index].e_value = hsp->evalue;
4134 hsp_array[hsp_index].num_ident = hsp->num_ident;
4135 kbp = search->sbp->kbp[hsp->context];
4136 hsp_array[hsp_index].bit_score = ((hsp->score*kbp->Lambda) -
4137 kbp->logK)/NCBIMATH_LN2;
4138
4139 if (hsp->context & 1)
4140 hsp_array[hsp_index].query_frame = -1;
4141 else
4142 hsp_array[hsp_index].query_frame = 1;
4143 query_length =
4144 search->query_context_offsets[hsp->context+1] -
4145 search->query_context_offsets[hsp->context] - 1;
4146 if (!use_dyn_prog && !search->pbp->mb_params->no_traceback) {
4147 hsp->gap_info->start1 = hsp->query.offset;
4148 hsp->gap_info->start2 = hsp->subject.offset;
4149 hsp->gap_info->frame1 = hsp_array[hsp_index].query_frame;
4150 hsp->gap_info->length1 = query_length;
4151 hsp_array[hsp_index].query_gapped_start = hsp->query.offset + 1;
4152 hsp_array[hsp_index].subject_gapped_start =
4153 hsp->subject.offset + 1;
4154 } else {
4155 hsp_array[hsp_index].query_gapped_start =
4156 hsp->query.gapped_start;
4157 hsp_array[hsp_index].subject_gapped_start =
4158 hsp->subject.gapped_start;
4159 }
4160 hsp_array[hsp_index].gap_info = hsp->gap_info;
4161 /* Edit block pointer has been copied; remove it from hsp to avoid
4162 double freeing */
4163 hsp->gap_info = NULL;
4164 hsp_array[hsp_index].context = hsp->context;
4165 hsp_array[hsp_index].query_offset = hsp->query.offset;
4166 hsp_array[hsp_index].query_length = hsp->query.length;
4167 hsp_array[hsp_index].subject_offset = hsp->subject.offset;
4168 hsp_array[hsp_index].subject_length = hsp->subject.length;
4169 hsp_array[hsp_index].subject_frame = hsp->subject.frame;;
4170 hsp_array[hsp_index].point_back = result_hitlist;
4171 hsp_array[hsp_index].back_left =
4172 hsp_array[hsp_index].back_right = NULL;
4173 hsp_array[hsp_index].hspset_cnt = -1;
4174
4175 index1++;
4176 if (index1 >= hspmax)
4177 break;
4178 hsp = current_hitlist->hsp_array[index1];
4179 }
4180
4181 search->subject_info = NULL;
4182
4183 /* For MT BLAST we check that no other thread is attempting to insert results. */
4184 if (search->thr_info->results_mutex)
4185 NlmMutexLock(search->thr_info->results_mutex);
4186
4187 for (query_index=0; query_index<num_queries; query_index++) {
4188 result_hitlist = mb_results[query_index];
4189 if (!result_hitlist)
4190 continue;
4191
4192 /* This is the structure that is identical on every thread. */
4193 result_struct = search->mb_result_struct[query_index];
4194
4195 if (!result_struct)
4196 result_struct = search->mb_result_struct[query_index] =
4197 BLASTResultsStructNew(search->result_size, 0, 0);
4198
4199 hitlist_count = result_struct->hitlist_count;
4200 hitlist_max = result_struct->hitlist_max;
4201 results = result_struct->results;
4202
4203 /* Record the worst evalue for ReevaluateWithAmbiguities. */
4204 if (hitlist_count == hitlist_max)
4205 search->worst_evalue = results[hitlist_count-1]->best_evalue;
4206
4207 current_evalue = result_hitlist->best_evalue;
4208 high_score = result_hitlist->high_score;
4209 /* New hit is less significant than all the other hits. */
4210 if (hitlist_count > 0 &&
4211 (current_evalue > results[hitlist_count-1]->best_evalue ||
4212 (current_evalue == results[hitlist_count-1]->best_evalue &&
4213 high_score < results[hitlist_count-1]->high_score))) {
4214 if (hitlist_count == hitlist_max) {
4215 /* Array is full, delete the entry. */
4216 result_hitlist = BLASTResultHitlistFreeEx(search, result_hitlist);
4217 continue;
4218 } else /* Add to end of array. */
4219 new_index = hitlist_count;
4220 } else if (hitlist_count > 0) {
4221 /* The array is all NULL's if hitlist_count==0 */
4222 high_index=0;
4223 low_index=hitlist_count-1;
4224 new_index = (high_index+low_index)/2;
4225 old_index = new_index;
4226 for (index=0; index<BLAST_SAVE_ITER_MAX; index++) {
4227 if (results[new_index]->best_evalue > current_evalue)
4228 low_index = new_index;
4229 else if (results[new_index]->best_evalue < current_evalue)
4230 high_index = new_index;
4231 else { /* If e-values are the same, use high score. */
4232 /* If scores are the same, use ordinal number. */
4233 if (results[new_index]->high_score < high_score)
4234 low_index = new_index;
4235 else if (results[new_index]->high_score > high_score)
4236 high_index = new_index;
4237 else if (results[new_index]->subject_id < search->subject_id)
4238 low_index = new_index;
4239 else
4240 high_index = new_index;
4241 }
4242
4243 new_index = (high_index+low_index)/2;
4244 if (old_index == new_index) {
4245 if (results[new_index]->best_evalue < current_evalue)
4246 /* Perform this check as new_index get rounded DOWN above.*/
4247 new_index++;
4248 else if (results[new_index]->best_evalue == current_evalue && results[new_index]->high_score > high_score)
4249 new_index++;
4250 break;
4251 }
4252 old_index = new_index;
4253 }
4254 if (hitlist_count == hitlist_max) {
4255 /* The list is full, delete the last entry. */
4256 if (results[hitlist_max-1]->seqalign)
4257 SeqAlignSetFree(results[hitlist_max-1]->seqalign);
4258 results[hitlist_max-1] =
4259 BLASTResultHitlistFreeEx(search, results[hitlist_max-1]);
4260 result_struct->hitlist_count--;
4261 hitlist_count = result_struct->hitlist_count;
4262 }
4263 if (hitlist_max > 1)
4264 Nlm_MemMove((results+new_index+1), (results+new_index),
4265 (hitlist_count-new_index)*sizeof(results[0]));
4266 } else /* First hit to be stored. */
4267 new_index = 0;
4268
4269 if (new_index < hitlist_max) {
4270 results[new_index] = result_hitlist;
4271 result_struct->hitlist_count++;
4272 }
4273 }
4274 MemFree(do_not_reallocate);
4275 MemFree(hsp_array_sizes);
4276 MemFree(mb_results);
4277 /* Free mutex. */
4278 if (search->thr_info->results_mutex)
4279 NlmMutexUnlock(search->thr_info->results_mutex);
4280 return retval;
4281 }
4282
4283 FloatLo
MegaBlastGetHspPercentIdentity(BlastSearchBlkPtr search,BLAST_HSPPtr hsp)4284 MegaBlastGetHspPercentIdentity(BlastSearchBlkPtr search, BLAST_HSPPtr hsp)
4285 {
4286 FloatLo perc_ident;
4287 Int4 numseg, align_length, i, q_off, s_off;
4288 Int2 context;
4289 Int4Ptr length, start;
4290 Uint1Ptr strands;
4291 Uint1Ptr query_seq, subject_seq = NULL;
4292 GapXEditScriptPtr esp;
4293
4294 if (!hsp->gap_info)
4295 /* Cannot do any meaningful check at this time, return 100 so nothing is
4296 discarded */
4297 return 100.0;
4298
4299 context = hsp->context;
4300 query_seq = search->context[context].query->sequence;
4301 subject_seq = search->subject->sequence_start + 1;
4302
4303 esp = hsp->gap_info->esp;
4304
4305 for (numseg=0; esp; esp = esp->next, numseg++);
4306
4307 /* GXECollectDataForSeqalign might change the query offset, so need an extra
4308 variable */
4309 q_off = hsp->query.offset;
4310 s_off = hsp->subject.offset;
4311 GXECollectDataForSeqalign(hsp->gap_info, hsp->gap_info->esp, numseg,
4312 &start, &length, &strands,
4313 &q_off, &s_off);
4314
4315 perc_ident = 0;
4316 align_length = 0;
4317 for (i=0; i<numseg; i++) {
4318 align_length += length[i];
4319 if (start[2*i] != -1 && start[2*i+1] != -1) {
4320 perc_ident +=
4321 (FloatLo) MegaBlastGetNumIdentical(query_seq, subject_seq,
4322 start[2*i], start[2*i+1],
4323 length[i], FALSE);
4324 }
4325 }
4326 perc_ident = perc_ident / align_length * 100;
4327 MemFree(start);
4328 MemFree(length);
4329 MemFree(strands);
4330 return perc_ident;
4331 }
4332
MegaBlastParameterBlkNew(BLAST_OptionsBlkPtr options)4333 MegaBlastParameterBlkPtr MegaBlastParameterBlkNew(BLAST_OptionsBlkPtr options)
4334 {
4335 MegaBlastParameterBlkPtr mb_params = (MegaBlastParameterBlkPtr)
4336 MemNew(sizeof(MegaBlastParameterBlk));
4337
4338 mb_params->no_traceback = options->no_traceback;
4339 mb_params->is_neighboring = options->is_neighboring;
4340 mb_params->full_seqids = options->megablast_full_deflines;
4341 mb_params->perc_identity = options->perc_identity;
4342 mb_params->max_positions = options->block_width;
4343 if (options->mb_template_length > 0) {
4344 mb_params->template_length = options->mb_template_length;
4345 mb_params->disc_word = TRUE;
4346 mb_params->word_weight = options->wordsize;
4347 mb_params->template_type =
4348 GetMBTemplateType(mb_params->word_weight, mb_params->template_length,
4349 options->mb_disc_type);
4350 if (options->mb_disc_type == MB_TWO_TEMPLATES)
4351 mb_params->use_two_templates = TRUE;
4352 } else {
4353 mb_params->template_length = MIN(options->wordsize, 12);
4354 mb_params->word_weight = options->wordsize;
4355 }
4356 mb_params->one_base_step = options->mb_one_base_step;
4357 mb_params->use_dyn_prog = options->mb_use_dyn_prog;
4358
4359 return mb_params;
4360 }
4361
GetMBTemplateType(Int2 weight,Int2 length,MBDiscWordType type)4362 MBTemplateType GetMBTemplateType(Int2 weight, Int2 length, MBDiscWordType type)
4363 {
4364 if (weight == 11) {
4365 if (length == 16) {
4366 if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES)
4367 return TEMPL_11_16;
4368 else if (type == MB_WORD_OPTIMAL)
4369 return TEMPL_11_16_OPT;
4370 } else if (length == 18) {
4371 if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES)
4372 return TEMPL_11_18;
4373 else if (type == MB_WORD_OPTIMAL)
4374 return TEMPL_11_18_OPT;
4375 } else if (length == 21) {
4376 if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES)
4377 return TEMPL_11_21;
4378 else if (type == MB_WORD_OPTIMAL)
4379 return TEMPL_11_21_OPT;
4380 }
4381 } else if (weight == 12) {
4382 if (length == 16) {
4383 if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES)
4384 return TEMPL_12_16;
4385 else if (type == MB_WORD_OPTIMAL)
4386 return TEMPL_12_16_OPT;
4387 } else if (length == 18) {
4388 if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES)
4389 return TEMPL_12_18;
4390 else if (type == MB_WORD_OPTIMAL)
4391 return TEMPL_12_18_OPT;
4392 } else if (length == 21) {
4393 if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES)
4394 return TEMPL_12_21;
4395 else if (type == MB_WORD_OPTIMAL)
4396 return TEMPL_12_21_OPT;
4397 }
4398 }
4399 return TEMPL_ERROR;
4400 }
4401