1 static char const rcsid[] = "$Id: mblast.c,v 6.215 2006/09/21 13:42:37 madden Exp $";
2 
3 /* ===========================================================================
4 *
5 *                            PUBLIC DOMAIN NOTICE
6 *               National Center for Biotechnology Information
7 *
8 *  This software/database is a "United States Government Work" under the
9 *  terms of the United States Copyright Act.  It was written as part of
10 *  the author's official duties as a United States Government employee and
11 *  thus cannot be copyrighted.  This software/database is freely available
12 *  to the public for use. The National Library of Medicine and the U.S.
13 *  Government have not placed any restriction on its use or reproduction.
14 *
15 *  Although all reasonable efforts have been taken to ensure the accuracy
16 *  and reliability of the software and data, the NLM and the U.S.
17 *  Government do not and cannot warrant the performance or results that
18 *  may be obtained by using this software or data. The NLM and the U.S.
19 *  Government disclaim all warranties, express or implied, including
20 *  warranties of performance, merchantability or fitness for any particular
21 *  purpose.
22 *
23 *  Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================*/
26 
27 /*****************************************************************************
28 
29 File name: mblast.c
30 
31 Author: Ilya Dondoshansky
32 
33 Contents: Mega BLAST functions
34 
35 Detailed Contents:
36 
37         - Mega BLAST versions of some functions from blast.c returning array
38 	of SeqAlignPtrs
39 
40 	- Functions specific to Mega BLAST
41 
42 ******************************************************************************
43  * $Revision: 6.215 $
44  *
45  * $Log: mblast.c,v $
46  * Revision 6.215  2006/09/21 13:42:37  madden
47  * BlastProcessGiLists returns a boolean to specify that an attempt was made to process a list of GIs.  If no matches were found this can be reported back to the user
48  *
49  * Revision 6.214  2005/08/31 20:33:18  coulouri
50  * From Mike Gertz:
51  *     In MegaBlastSetUpSearchWithReadDbInternal, replaced existing code
52  *     for adjusting the hitlist size by a call to BlastSingleQueryResultSize.
53  *
54  * Revision 6.213  2005/03/07 16:30:57  dondosha
55  * In reevaluation with ambiguities, if HSP start changed, update starting offsets in edit block
56  *
57  * Revision 6.212  2004/12/07 16:09:04  coulouri
58  * check malloc returncode
59  *
60  * Revision 6.211  2004/11/25 01:59:38  coulouri
61  * reinitialize full_query_length to prevent overflow of query_seq_combined when only minus strands are searched
62  *
63  * Revision 6.210  2004/11/22 20:54:15  coulouri
64  * initialize pointer variables
65  *
66  * Revision 6.209  2004/11/22 15:17:34  dondosha
67  * Do not calculate length adjustment for empty contexts, i.e. ones with null Karlin blocks
68  *
69  * Revision 6.208  2004/11/19 13:22:05  madden
70  * Remove no_check_score completely (from Mike Gertz)
71  *
72  * Revision 6.207  2004/11/05 15:34:53  coulouri
73  * bring in system includes after toolkit includes so that LONG_BIT is defined correctly on amd64
74  *
75  * Revision 6.206  2004/10/29 21:34:45  dondosha
76  * Calculate length adjustment for each context separately; do not allow effective query length to be less than 1
77  *
78  * Revision 6.205  2004/09/13 18:33:01  dondosha
79  * Advance mask pointer in the list when one of the queries in a set is discarded due to being too short, and also has a lower case mask
80  *
81  * Revision 6.204  2004/05/27 17:35:56  dondosha
82  * Do not flag HSPs for deletion in sorting before doing inclusion tests
83  *
84  * Revision 6.203  2004/05/21 13:53:04  dondosha
85  * Use BLAST_HSPFree to free BLAST_HSP structures, hence no need to call GapXEditBlockDelete in multiple places
86  *
87  * Revision 6.202  2004/03/31 17:58:51  papadopo
88  * Mike Gertz' changes for length adjustment calculations
89  *
90  * Revision 6.201  2004/02/26 15:52:30  papadopo
91  * Mike Gertz' modifications to unify handling of gapped Karlin blocks between protein and nucleotide searches
92  *
93  * Revision 6.200  2004/02/24 14:07:01  camacho
94  * Use approximate sequence length calculation for entrez-limited
95  * nucleotide blast databases.
96  *
97  * Revision 6.199  2004/01/27 20:46:06  dondosha
98  * Allow values 0, 1, 2 for no_traceback megablast option
99  *
100  * Revision 6.198  2004/01/23 22:29:11  dondosha
101  * Increase hitlist size for preliminary alignment if traceback is delayed
102  *
103  * Revision 6.197  2004/01/16 23:43:45  dondosha
104  * No more need for special argument for partial search: it is set in options
105  *
106  * Revision 6.196  2004/01/13 15:46:54  dondosha
107  * Changed cast to nearest integer in length adjustment calculation
108  *
109  * Revision 6.195  2004/01/06 22:37:10  dondosha
110  * Use BLAST_HSPfree function
111  *
112  * Revision 6.194  2003/12/10 17:05:28  dondosha
113  * Added function ReevaluateScoreWithAmbiguities to reevaluate score for one HSP; use it after greedy traceback
114  *
115  * Revision 6.193  2003/11/06 19:53:07  dondosha
116  * Fix in MegaBlastSetupSearchInternalByLoc when query submitted as Bioseq instead of SeqLoc; removed unused variables
117  *
118  * Revision 6.192  2003/11/05 21:11:09  dondosha
119  * Fix for traceback with greedy algorithm
120  *
121  * Revision 6.191  2003/10/29 17:46:59  dondosha
122  * Allow 2-stage greedy extension in megablast
123  *
124  * Revision 6.190  2003/05/30 17:25:36  coulouri
125  * add rcsid
126  *
127  * Revision 6.189  2003/05/20 21:56:34  dondosha
128  * Use correct X-dropoff for preliminary dynamic programming gapped extension
129  *
130  * Revision 6.188  2003/05/14 20:35:58  camacho
131  * Allow searching empty databases
132  *
133  * Revision 6.187  2003/05/13 16:02:53  coulouri
134  * make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
135  *
136  * Revision 6.186  2003/05/12 12:23:44  camacho
137  * Sanity check for number of sequences & db length
138  *
139  * Revision 6.185  2003/05/02 23:02:11  dondosha
140  * Fix for dynamic programming gapped extension
141  *
142  * Revision 6.184  2003/04/24 14:58:20  dondosha
143  * Fixed a minor bug in MegaBlastExtendHit found by A. Morgulis
144  *
145  * Revision 6.183  2003/04/04 17:16:31  dondosha
146  * Previous change reversed, was incorrect
147  *
148  * Revision 6.182  2003/04/03 22:35:44  dondosha
149  * When saving hit list, save hsp contexts modulo 2, since multiple query information is no longer needed
150  *
151  * Revision 6.181  2003/03/19 19:16:20  dondosha
152  * Small correction for a rarely happening case in MegaBlastExtendHit
153  *
154  * Revision 6.180  2003/03/13 19:03:10  dondosha
155  * Minor bug fix in MegaBlastExtendHit
156  *
157  * Revision 6.179  2003/03/13 16:53:03  dondosha
158  * Minor bug fixes
159  *
160  * Revision 6.178  2002/12/26 20:27:40  dondosha
161  * Bug fix for e-values of hits whose extent changes when reevaluating with ambiguities
162  *
163  * Revision 6.177  2002/11/25 19:57:29  dondosha
164  * Further fix to the HSP limit (-H) megablast option
165  *
166  * Revision 6.176  2002/11/22 23:30:34  dondosha
167  * 1. Use array of structures instead of array of pointers for initial offset pairs;
168  * 2. Scan subject sequence to the last byte (it wasn't!) when looking for words
169  *
170  * Revision 6.175  2002/11/06 22:26:49  dondosha
171  * Calculate number of identities at the same time when doing reevaluation for ambiguities
172  *
173  * Revision 6.174  2002/11/04 22:57:24  dondosha
174  * Save number of identities in result HSPs
175  *
176  * Revision 6.173  2002/10/10 21:02:20  dondosha
177  * Fixed reevaluation with ambiguities: if score gets negative, check whether best score already qualifies for saving
178  *
179  * Revision 6.172  2002/09/25 18:35:33  dondosha
180  * Fix in MegaBlastExtendHit for contiguous word size 12
181  *
182  * Revision 6.171  2002/08/30 17:53:51  thiessen
183  * fix missing LIBCALL in prototype
184  *
185  * Revision 6.170  2002/08/30 15:54:11  dondosha
186  * 1. Prototypes of internally used functions moved from mblast.h;
187  * 2. Do not allocate ewp_params and ewp structures when stacks are used to save
188  *    initial hits;
189  * 3. Cleaned up initial word extension code.
190  *
191  * Revision 6.169  2002/08/22 13:39:45  camacho
192  * Close the header and sequence files only if allocated
193  *
194  * Revision 6.168  2002/08/20 22:06:02  madden
195  * Restore partial results code for megablast
196  *
197  * Revision 6.167  2002/08/06 15:56:50  dondosha
198  * Removed some old unused code
199  *
200  * Revision 6.166  2002/08/06 15:45:21  dondosha
201  * Corrected the sanity check in MegaBlastGapInfoToSeqAlign
202  *
203  * Revision 6.165  2002/08/01 20:48:25  dondosha
204  * Use BLASTPostSearchLogic function: compute SeqAligns after all results are found
205  *
206  * Revision 6.164  2002/07/31 17:51:09  dondosha
207  * Corrected the two-hit logic in case of word size greater than 12
208  *
209  * Revision 6.163  2002/07/18 19:40:45  dondosha
210  * Added an option to restrict number of HSPs per database sequence
211  *
212  * Revision 6.162  2002/07/17 22:12:58  dondosha
213  * Allow return without computing seqalign for partial search
214  *
215  * Revision 6.161  2002/06/26 00:56:30  camacho
216  *
217  * 1. Fixed bug when searching a mixture of real and mask databases.
218  * 2. Clean up of code that calculates the number of sequences and database
219  *    length.
220  *
221  * Revision 6.160  2002/06/24 18:50:20  dondosha
222  * Memory leak fixed
223  *
224  * Revision 6.159  2002/06/13 19:49:24  dondosha
225  * Hide previous change limiting number of HSPs for very large queries
226  *
227  * Revision 6.158  2002/06/13 17:44:27  dondosha
228  * 1. Fixed a bug in revision 6.156;
229  * 2. Limit the maximal number of saved HSPs per database sequence if the
230  *    search space is too large to 100 times the number of query sequences.
231  *
232  * Revision 6.157  2002/06/11 14:44:47  dondosha
233  * Return status from some functions instead of search block pointer
234  *
235  * Revision 6.156  2002/06/07 22:39:19  dondosha
236  * Do not mask residues on any of the contexts except first - needed for megablast with non-greedy traceback
237  *
238  * Revision 6.155  2002/06/07 19:30:08  dondosha
239  * Shift offsets for non-greedy preliminary extension when 1-base database scanning is used
240  *
241  * Revision 6.154  2002/06/07 16:47:42  dondosha
242  * Re-sort results after traceback in case of non-greedy extension
243  *
244  * Revision 6.153  2002/05/17 21:40:13  dondosha
245  * Added 2 optimal Mega BLAST word templates for length 21
246  *
247  * Revision 6.152  2002/05/14 22:20:20  dondosha
248  * Renamed maximal discontiguous template type into optimal
249  *
250  * Revision 6.151  2002/05/10 22:38:49  dondosha
251  * Always do preliminary and then traceback gapped extension if dynamic programming extension is used
252  *
253  * Revision 6.150  2002/05/09 17:01:23  dondosha
254  * Renamed typedefs dp_ptr and dp_node to GapXDPPtr and GapXDP
255  *
256  * Revision 6.149  2002/05/08 22:48:12  dondosha
257  * Allocate memory for dynamic programming upfront in Mega BLAST case
258  *
259  * Revision 6.148  2002/05/03 21:36:26  dondosha
260  * Do not allocate memory for greedy algorithm if it is not going to be used
261  *
262  * Revision 6.147  2002/04/24 17:50:53  dondosha
263  * Check percent identity cut-off in all cases
264  *
265  * Revision 6.146  2002/04/23 20:41:20  dondosha
266  * In case of non-affine extension in megablast, check percent identity cutoff after the traceback is obtained
267  *
268  * Revision 6.145  2002/04/11 20:33:47  dondosha
269  * Returned the reaping of hitlists by e-value before reevaluation with ambiguities
270  *
271  * Revision 6.144  2002/04/10 18:20:05  dondosha
272  * Do not subtract 4 from wordsize in BlastFillQueryOffsets
273  *
274  * Revision 6.143  2002/04/09 18:21:00  dondosha
275  * Changed #ifdefs to conditionals, allowing different discontiguous templates in a single binary
276  *
277  * Revision 6.142  2002/04/04 21:19:15  dondosha
278  * Corrections for megablast with non-greedy extensions
279  *
280  * Revision 6.141  2002/04/01 22:26:51  dondosha
281  * Correction in the MegaBlastReevaluateWithAmbiguities function: delete hsp if its evalue dropped below threshold after reevaluation
282  *
283  * Revision 6.140  2002/03/26 21:17:42  dondosha
284  * Changed discontiguous word finding code to allow template length 21
285  *
286  * Revision 6.139  2002/03/06 18:34:31  dondosha
287  * Pass the filtered locations back from the megablast engine to use in formatting
288  *
289  * Revision 6.138  2002/03/05 17:58:57  dondosha
290  * Set same offsets for the traceback as for preliminary extension for megablast with non-greedy extensions
291  *
292  * Revision 6.137  2002/02/15 23:34:04  dondosha
293  * 1. Allow use of hash table instead of index table (need -DUSE_HASH_TABLE)
294  * 2. Allow use of two simultaneous word templates (with -DUSE_TWO_TEMPLATES)
295  * 3. Added functionality fo print masked query sequence with lower case masking.
296  *
297  * Revision 6.136  2002/01/07 23:16:00  dondosha
298  * Fixed several memory leaks and allocation/freeing bugs in multithreaded megablast
299  *
300  * Revision 6.135  2002/01/04 20:08:36  dondosha
301  * Allocate filter string when rodent repeat filtering is needed for neighboring
302  *
303  * Revision 6.134  2001/12/28 20:46:21  dondosha
304  * 1. Mega BLAST related parameters moved into a separate structure
305  * 2. Environment variables for discontiguous words, etc. changed to options
306  * 3. Extension from discontiguous word with length 18 implemented
307  *
308  * Revision 6.133  2001/12/13 16:07:17  dondosha
309  * Do not use mb_endpoint_results in neighboring
310  *
311  * Revision 6.132  2001/12/11 19:45:56  dondosha
312  * Repeat inclusion check after reevaluation of HSPs
313  *
314  * Revision 6.131  2001/12/10 22:54:27  dondosha
315  * Correction for determining the number of stacks to use in megablast
316  *
317  * Revision 6.130  2001/12/04 17:13:08  dondosha
318  * Made number of stacks for megablast word processing depend on query and database
319  *
320  * Revision 6.129  2001/11/27 22:17:20  dondosha
321  * Allow dynamic programming gapped extension only with affine gap scores
322  *
323  * Revision 6.128  2001/11/16 16:25:34  dondosha
324  * In MegaBlastReevaluateWithAmbiguities: bug from previous change fixed
325  *
326  * Revision 6.127  2001/11/14 23:45:46  dondosha
327  * 1. Retrieve unpacked subject sequence directly in blastna encoding
328  * 2. Implemented a possibility to use dynamic programming gapped extension
329  *    via an environment variable.
330  *
331  * Revision 6.126  2001/11/13 18:23:54  dondosha
332  * Move environment variable checking for discontiguous words to higher level function, so it is done only once
333  *
334  * Revision 6.125  2001/10/12 21:32:46  dondosha
335  * Added discontiguous word capability to megablast
336  *
337  * Revision 6.124  2001/09/18 16:49:25  dondosha
338  * Removed unneeded functions, eliminated mbutils.h header
339  *
340  * Revision 6.123  2001/09/07 14:46:43  dondosha
341  * Roll back removal of threshold_first from functions and structures
342  *
343  * Revision 6.122  2001/09/06 20:24:34  dondosha
344  * Removed threshold_first
345  *
346  * Revision 6.121  2001/09/05 21:30:11  dondosha
347  * Purify error fixed
348  *
349  * Revision 6.120  2001/09/05 20:33:11  dondosha
350  * Fixed a bug in changes from 07/20/2001
351  *
352  * Revision 6.119  2001/08/01 16:23:28  dondosha
353  * Improvement in procedure removing HSPs on close diagonals
354  *
355  * Revision 6.118  2001/07/25 22:10:28  dondosha
356  * Bug fix in hsp inclusion check
357  *
358  * Revision 6.117  2001/07/20 19:27:52  dondosha
359  * 1. Added pv_array and diagonal array when needed.
360  * 2. Added logic to decide when to unpack db sequence depending on its length.
361  * 3. Rewrote reevaluation with ambiguities function to allow gapped alignment.
362  *
363  * Revision 6.116  2001/06/21 21:44:49  dondosha
364  * Free allocated SeqLocs before error return from search set up
365  *
366  * Revision 6.115  2001/06/15 17:35:13  dondosha
367  * Correction to previous changes
368  *
369  * Revision 6.114  2001/06/14 22:09:15  dondosha
370  * Rearranged code for gi lists and oid masks processing to get rid of duplication
371  *
372  * Revision 6.113  2001/06/13 21:46:27  dondosha
373  * Comparison function name in heapsort of gi list changed
374  *
375  * Revision 6.112  2001/06/05 22:15:10  dondosha
376  * If some queries completely mask, destroy respective KarlinBlk
377  *
378  * Revision 6.111  2001/05/16 14:17:09  dondosha
379  * Correction for previous revision
380  *
381  * Revision 6.110  2001/05/15 20:02:27  dondosha
382  * Adjustments to accommodate query in a Bioseq form
383  *
384  * Revision 6.109  2001/05/08 20:18:16  dondosha
385  * Corrections for removal of short queries and score reevaluation for affine gaps
386  *
387  * Revision 6.108  2001/05/04 19:50:45  dondosha
388  * Improved error message when all queries are shorter than word size
389  *
390  * Revision 6.107  2001/05/04 15:55:21  dondosha
391  * Take into account removal of short sequences in function BlastFillQueryOffsets
392  *
393  * Revision 6.106  2001/05/03 21:48:27  dondosha
394  * Handle some cases when memory allocation fails
395  *
396  * Revision 6.105  2001/04/16 16:19:59  dondosha
397  * Remove query sequences with length less than wordsize, print warning
398  *
399  * Revision 6.104  2001/03/22 19:23:09  dondosha
400  * Changed hsp array new size variable Int4 from Int2
401  *
402  * Revision 6.103  2001/03/19 22:39:24  dondosha
403  * Allow location on the first query sequence for megablast
404  *
405  * Revision 6.102  2001/03/12 19:10:10  dondosha
406  * Do not use window size and multiple hits options in megablast
407  *
408  * Revision 6.101  2001/02/07 21:13:30  dondosha
409  * Pass output stream from options block to search
410  *
411  * Revision 6.100  2001/01/24 20:51:50  dondosha
412  * Enabled splitting of the second sequence for 2 sequences with megablast
413  *
414  * Revision 6.99  2001/01/11 18:31:22  dondosha
415  * Changed error level for nonexistent database from ERROR to FATAL
416  *
417  * Revision 6.98  2001/01/03 21:45:31  dondosha
418  * Fixed a memory leak - some edit blocks not freed in megablast
419  *
420  * Revision 6.97  2000/12/28 22:07:20  dondosha
421  * Exit with error if database not found
422  *
423  * Revision 6.96  2000/12/22 19:48:49  dondosha
424  * Corrected handling of percent identity cutoff in MegaBlastSaveCurrentHitlist
425  *
426  * Revision 6.95  2000/12/21 22:30:42  dondosha
427  * Implemented discarding of HSPs below percent identity cutoff
428  *
429  * Revision 6.94  2000/12/14 17:55:25  dondosha
430  * Fixed handling of subsequences masking in MegaBlastMaskTheResidues
431  *
432  * Revision 6.93  2000/12/07 20:30:37  dondosha
433  * Initialize spp to NULL (not done in previous change)
434  *
435  * Revision 6.92  2000/12/07 17:48:31  dondosha
436  * Handle non-whole query SeqLoc correctly in MegaBlastSetUpSearchInternalByLoc
437  *
438  * Revision 6.91  2000/11/29 23:06:24  dondosha
439  * Removed restriction on number of alignments saved for a single query sequence
440  *
441  * Revision 6.90  2000/11/29 16:27:06  dondosha
442  * Get the ncbi4na-encoded subject sequence only once per subject
443  *
444  * Revision 6.89  2000/11/21 20:18:56  dondosha
445  * Look for HSP inclusion based on diagonal instead of offset
446  *
447  * Revision 6.88  2000/11/16 19:14:10  dondosha
448  * Initialize mb_endpoint_results if no traceback
449  *
450  * Revision 6.87  2000/11/03 20:16:54  dondosha
451  * Changed one_line_results parameter to no_traceback
452  *
453  * Revision 6.86  2000/10/31 15:06:15  dondosha
454  * Added Boolean parameter to PrintMaskedSequence function
455  *
456  * Revision 6.85  2000/10/26 18:46:00  dondosha
457  * Check if gi list file is provided from the db alias
458  *
459  * Revision 6.84  2000/10/19 16:02:13  dondosha
460  * Changed MegaBlastGetPercentIdentity to MegaBlastGetNumIdentical
461  *
462  * Revision 6.83  2000/10/18 14:53:25  dondosha
463  * Corrected MegaBlastReevaluateWithAmbiguities function treatment of lower case query regions
464  *
465  * Revision 6.82  2000/10/12 14:55:55  dondosha
466  * Proceed with search even if some, but not all of the queries are completely masked
467  *
468  * Revision 6.81  2000/10/05 19:53:56  dondosha
469  * Save search results in an array of BlastResultsStruct - separate results for each query
470  *
471  * Revision 6.80  2000/09/29 22:21:47  dondosha
472  * Check if memory allocation succeeds to avoid core dumping exit
473  *
474  * Revision 6.79  2000/09/28 18:27:58  dondosha
475  * Do not free lower case mask SeqLocs before the search
476  *
477  * Revision 6.78  2000/09/28 14:59:01  dondosha
478  * Save initial hits in a small structure MegaBlastExactMatch instead of large BLAST_HSP
479  *
480  * Revision 6.77  2000/09/06 18:35:33  dondosha
481  * Corrected percentage of identities computation for -D1 output
482  *
483  * Revision 6.76  2000/09/01 18:29:12  dondosha
484  * Removed calls to ReadDBFreeSharedInfo and ReadDBCloseMHdrAndSeqFiles
485  *
486  * Revision 6.75  2000/08/30 22:08:44  dondosha
487  * Added function BioseqMegaBlastEngineByLoc
488  *
489  * Revision 6.74  2000/08/25 22:37:15  dondosha
490  * Added function MegaBlastReevaluateWithAmbiguities
491  *
492  * Revision 6.73  2000/08/18 20:12:30  dondosha
493  * Do not use search->query_id in megablast, use only qid_array
494  *
495  * Revision 6.72  2000/08/07 16:59:50  dondosha
496  * Correct construction of path for gi list file
497  *
498  * Revision 6.71  2000/08/07 16:16:12  dondosha
499  * Corrected behavior when input contains an empty sequence
500  *
501  * Revision 6.70  2000/08/02 15:21:49  dondosha
502  * Corrected the way Karlin block is computed for megablast
503  *
504  * Revision 6.69  2000/08/01 18:08:24  dondosha
505  * Fixed function MaskSeqLocFromSeqAlign used for masked query output
506  *
507  * Revision 6.68  2000/07/17 14:37:11  dondosha
508  * Fixed a memory leak in sequence masking
509  *
510  * Revision 6.67  2000/07/14 18:00:39  dondosha
511  * Sort db sequences by best hit expect value in traditional output
512  *
513  * Revision 6.66  2000/07/11 16:46:17  dondosha
514  * Fixed memory leak; use options query_lcase_mask variable for lower case masking
515  *
516  * Revision 6.65  2000/07/10 17:20:17  dondosha
517  * Use several stacks for speed up of search with small word sizes
518  *
519  * Revision 6.64  2000/07/06 17:25:49  dondosha
520  * Pass megablast_full_deflines option to search parameters
521  *
522  * Revision 6.63  2000/07/03 21:51:05  dondosha
523  * For small wordsizes look for 2 hits in a window
524  *
525  * Revision 6.62  2000/06/30 15:07:11  dondosha
526  * Fixed single strand search
527  *
528  * Revision 6.61  2000/06/29 21:10:37  dondosha
529  * Sort seqaligns by evalue only among hits from same database sequence
530  *
531  * Revision 6.60  2000/06/29 14:52:10  dondosha
532  * Typo in previous change
533  *
534  * Revision 6.59  2000/06/29 14:45:43  dondosha
535  * Mask queries with N, not lower case for -Q option output
536  *
537  * Revision 6.58  2000/06/27 22:21:53  dondosha
538  * Set no_check_score to TRUE; Added functions for masked query output
539  *
540  * Revision 6.57  2000/06/27 14:48:39  dondosha
541  * Changed the procedure of HSP inclusion checking before gapped extension to avoid quadratic complexity
542  *
543  * Revision 6.56  2000/06/26 18:21:19  dondosha
544  * Set awake_index to FALSE before any return from MegaBlastSetUpSearchInternalByLoc
545  *
546  * Revision 6.55  2000/06/26 17:35:30  dondosha
547  * If some of the queries are completely masked, the search can still proceed
548  *
549  * Revision 6.54  2000/06/22 22:22:19  dondosha
550  * Fixed a memory leak in MegaBlastNtWordExtend
551  *
552  * Revision 6.53  2000/06/21 18:03:20  dondosha
553  * Use the same hsp_array for gapped extension
554  *
555  * Revision 6.52  2000/06/19 20:08:06  madden
556  * Add last parameter to readdb_get_sequence_ex
557  *
558  * Revision 6.51  2000/06/19 19:42:35  dondosha
559  * Fixed a bug causing megablast to crash when database is not found
560  *
561  * Revision 6.50  2000/06/19 19:26:56  dondosha
562  * Do not go through a double loop on HSPs for small wordsize
563  *
564  * Revision 6.49  2000/06/07 17:39:51  dondosha
565  * Fixed bug causing extra progress messages in nucleotide neighboring
566  *
567  * Revision 6.48  2000/05/31 14:07:14  dondosha
568  * Reallocate memory for the hit stack if it is overflowing
569  *
570  * Revision 6.47  2000/05/26 19:22:28  dondosha
571  * For neighboring test HSP length and percent identity before saving
572  *
573  * Revision 6.46  2000/05/25 21:00:34  dondosha
574  * Set redundant HSPs to NULL in BlastSortUniqHspArray
575  *
576  * Revision 6.45  2000/05/24 19:51:03  dondosha
577  * Initialize qid_array during search set-up; do not use readdb when megablast is used for two sequences
578  *
579  * Revision 6.44  2000/05/22 18:48:07  dondosha
580  * Conform to change in ReadDBFILE structure
581  *
582  * Revision 6.43  2000/05/17 21:27:02  dondosha
583  * Small improvement in handling edit script
584  *
585  * Revision 6.42  2000/05/17 17:11:13  dondosha
586  * Removed some unused variables; fixed comparison function for sorting hsps
587  *
588  * Revision 6.41  2000/05/12 19:44:19  dondosha
589  * Do not use SeqPort for retrieving queries; keep query ids in array to allow binary search; use ncbi4na encoding for subject sequences during extension; added routine BinarySearchInt4
590  *
591  * Revision 6.40  2000/05/03 20:23:04  dondosha
592  * Added function MegaBlastGappedAlign to do the gapped extension after all perfect matches are saved - this significantly improves speed
593  *
594  * Revision 6.39  2000/05/01 21:25:54  dondosha
595  * Changed greedy_gapped_align calls to MegaBlastAffineGreedyAlign; bug fix for neighboring; memory leak fix
596  *
597  * Revision 6.38  2000/04/25 21:51:45  dondosha
598  * Little clean-up in MegaBlastNtWordExtend
599  *
600  * Revision 6.37  2000/04/20 15:14:36  dondosha
601  * Fixed MegaBlastGetFirstAndLastContext and BlastFillQueryOffsets for one strand only search
602  *
603  * Revision 6.36  2000/04/19 18:54:33  dondosha
604  * Bug fix: assign values for end-points different from gap or masked characters
605  *
606  * Revision 6.35  2000/04/18 19:09:03  dondosha
607  * Set the X dropoff correctly for the megablast search
608  *
609  * Revision 6.34  2000/04/14 19:22:52  dondosha
610  * Remove HSPs contained in other HSPs
611  *
612  * Revision 6.33  2000/04/12 19:11:28  dondosha
613  * Create index_callback thread right after the creation of BlastSearchBlk
614  *
615  * Revision 6.32  2000/04/12 18:24:28  dondosha
616  * Added MegaBlastGetPercentIdentity; fixed bugs with memory handling
617  *
618  * Revision 6.31  2000/04/11 21:00:19  dondosha
619  * Fixed memory bug in BlastNeedHumanRepeatFiltering
620  *
621  * Revision 6.30  2000/04/11 19:51:50  dondosha
622  * Removed setting of queue_callback to NULL
623  *
624  * Revision 6.29  2000/04/10 18:03:26  dondosha
625  * Fixed a bug in previous change
626  *
627  * Revision 6.28  2000/04/10 17:44:52  dondosha
628  * Find correct path for Homo_sapiens.n.gil, get this gilist only once
629  *
630  * Revision 6.27  2000/04/10 15:22:35  dondosha
631  * Moved call to BlastFillQueryOffsets to MegaBlastSetUpSearchInternalByLoc
632  *
633  * Revision 6.26  2000/04/07 16:51:57  dondosha
634  * Include callback argument for handling results in BioseqMegaBlastEngine, to be initialized in Main
635  *
636  * Revision 6.25  2000/04/05 18:14:48  dondosha
637  * Moved SeqIdSetDup to objloc.c, slightly changed format of BlastSearchHandleResults
638  *
639  * Revision 6.24  2000/04/04 20:50:46  dondosha
640  * Cleaned from non-megablast (non-blastn) stuff
641  *
642  * Revision 6.23  2000/04/04 16:16:35  dondosha
643  * Fixed some memory leaks in MegaBlast traceback
644  *
645  * Revision 6.22  2000/04/03 23:37:08  dondosha
646  * Added test for human repeat filtering for neighboring
647  *
648  * Revision 6.21  2000/03/31 19:10:32  dondosha
649  * Changed some names related to MegaBlast
650  *
651  * Revision 6.20  2000/03/29 23:32:21  dondosha
652  * Fixed minor bugs from previous change
653  *
654  * Revision 6.19  2000/03/29 22:13:43  dondosha
655  * Enabled processing gap information for MegaBlast code
656  *
657  * Revision 6.18  2000/03/27 20:56:43  madden
658  * Set query frame properly in BlastAdjustHitOffsets (for ungapped blast)
659  *
660  * Revision 6.17  2000/03/23 20:02:02  dondosha
661  * BlastSearchHandleResults exits immediately if hspcnt==0
662  *
663  * Revision 6.16  2000/03/22 17:57:12  dondosha
664  * Improved memory management in MegaBlast
665  *
666  * Revision 6.15  2000/03/16 18:14:00  dondosha
667  * Added routine SeqIdSetDup, improved handling of megablast results
668  *
669  * Revision 6.14  2000/03/13 21:06:59  dondosha
670  * Routine printing results rewritten, plus minor improvements
671  *
672  * Revision 6.13  2000/03/08 20:34:53  madden
673  * Remove ungapped psi-blast stuff
674  *
675  * Revision 6.12  2000/03/03 18:12:22  dondosha
676  * Added routine MegaBlastWordFinderDeallocate to fix multithreaded MegaBlast
677  *
678  * Revision 6.11  2000/03/02 18:29:00  dondosha
679  * Minor bug fix in setting context offsets
680  *
681  * Revision 6.10  2000/03/02 17:23:21  dondosha
682  * Added lower case masking plus bug fixes
683  *
684  * Revision 6.9  2000/02/24 23:20:15  dondosha
685  * Fixed a bug in BlastAdjustHitOffsets
686  *
687  * Revision 6.8  2000/02/24 17:53:38  dondosha
688  * Added routine BlastAdjustHitOffsets
689  *
690  * Revision 6.7  2000/02/23 20:28:38  dondosha
691  * Bug fix for ungapped alignment search, plus style improvements
692  *
693  * Revision 6.6  2000/02/17 20:01:53  dondosha
694  * Removed references to theCacheSize
695  *
696  * Revision 6.5  2000/02/12 21:18:57  kans
697  * added prototype for MegaBlastBuildLookupTable - implemented in lookup.c, called from mblast.c
698  *
699  * Revision 6.4  2000/02/11 21:03:36  dondosha
700  * Added new word finder and extension routines to work with new lokup table
701  *
702  * Revision 6.3  2000/02/03 21:21:10  dondosha
703  * Added header; few bug fixes
704  *
705  * Revision 6.2  2000/02/02  15:03:44  dondosha
706  * Removed unused routine ReapHitlistByContext
707  *
708  * */
709 
710 #include <ncbi.h>
711 #include <blastpri.h>
712 #include <lookup.h>
713 #include <objcode.h>
714 #include <objseq.h>
715 #include <sequtil.h>
716 #include <tofasta.h>
717 #include <seqport.h>
718 #include <readdb.h>
719 #include <ncbithr.h>
720 #include <gapxdrop.h>
721 #include <dust.h>
722 #include <mbalign.h>
723 #include <mblast.h>
724 #include <time.h>
725 
726 Int4
727 MegaBlastWordFinder (BlastSearchBlkPtr search, LookupTablePtr lookup);
728 
729 Int4
730 MegaBlastExtendHit (BlastSearchBlkPtr search, LookupTablePtr lookup,
731                     Int4 s_off, Int4 q_off);
732 Int2
733 MegaBlastNtWordExtend (BlastSearchBlkPtr search, Uint1Ptr subject0,
734                        Int4 q_off, Int4 s_off);
735 
736 Int2 MegaBlastGappedAlign (BlastSearchBlkPtr search);
737 
738 void MegaBlastMaskTheResidues (Uint1Ptr buffer, Int4 max_length,
739          Uint1 mask_residue, SeqLocPtr mask_slp, Boolean reverse,
740 	 Int4 offset, Boolean lowercase_mask);
741 
742 void BlastLCaseMaskTheResidues (Uint1Ptr buffer, Int4 max_length,
743          SeqLocPtr mask_slp, Boolean reverse, Int4 offset);
744 
745 GapXEditScriptPtr MBToGapXEditScript (edit_script_t PNTR ed_script);
746 
747 MBTemplateType GetMBTemplateType (Int2 weight, Int2 length,
748                                   MBDiscWordType type);
749 
750 SeqAlignPtr PNTR MegaBlastPackAlignmentsByQuery (BlastSearchBlkPtr search,
751 		     SeqAlignPtr seqalign);
752 
753 Int2 LIBCALL MegaBlastSequenceAddSequence (BlastSequenceBlkPtr sequence_blk,
754          Uint1Ptr sequence, Uint1Ptr sequence_start, Int4 length,
755          Int4 original_length, Int4 effective_length);
756 
757 Int4 SeqLocTotalLen (CharPtr prog_name, SeqLocPtr slp);
758 
759 Boolean MegaBlastGetFirstAndLastContext (CharPtr prog_name,
760             SeqLocPtr query_slp, Int2Ptr first_context,
761             Int2Ptr last_context, Uint1 strand_options);
762 
763 SeqAlignPtr PNTR
BioseqMegaBlastEngine(BioseqPtr PNTR bspp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)(Int4 done,Int4 positives),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,int (LIBCALLBACK * results_callback)PROTO ((VoidPtr Ptr)))764 BioseqMegaBlastEngine (BioseqPtr PNTR bspp, CharPtr progname, CharPtr database,
765 		       BLAST_OptionsBlkPtr options, ValNodePtr *other_returns,
766 		       ValNodePtr *error_returns, int (LIBCALLBACK
767 						       *callback)(Int4 done,
768 								  Int4
769 								  positives),
770 		       SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list,
771 		       Int4 gi_list_total,
772 		       int (LIBCALLBACK *results_callback)PROTO((VoidPtr Ptr)))
773 {
774    SeqLocPtr slp, next_slp;
775    Int4 index = 0;
776    SeqAlignPtr PNTR head;
777    Int4 from = options->required_start, to = options->required_end;
778 
779    slp = NULL;
780 
781    /* If location given, apply it to the first sequence */
782    if (from > 0 || to > 0) {
783       if (to < 0)
784          to = bspp[0]->length - 1;
785       if (from >= bspp[0]->length || to < 0) {
786          ErrPostEx(SEV_FATAL, 1, 0,
787                    "Location outside of the query sequence range\n");
788          return NULL;
789       } else if (to <= from) {
790          ErrPostEx(SEV_FATAL, 1, 0, "Empty query location provided\n");
791          return NULL;
792       }
793       slp = SeqLocIntNew(from, to, options->strand_option, bspp[0]->id);
794       index++;
795    }
796 
797    for ( ; bspp[index] != NULL; index++)
798       ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdSetDup(bspp[index]->id));
799 
800    head = BioseqMegaBlastEngineByLoc(slp, progname, database, options,
801                                      other_returns, error_returns, callback,
802                                      seqid_list, gi_list, gi_list_total,
803                                      results_callback);
804 
805    if (slp->choice == SEQLOC_INT) {
806      next_slp = slp->next;
807      SeqLocFree(slp);
808      slp = next_slp;
809    }
810 
811    while (slp) {
812      next_slp = slp->next;
813      SeqIdSetFree((SeqIdPtr) slp->data.ptrvalue);
814      MemFree(slp);
815      slp = next_slp;
816    }
817 
818    return head;
819 }
820 
821 SeqAlignPtr PNTR
BioseqMegaBlastEngineByLoc(SeqLocPtr slp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)(Int4 done,Int4 positives),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,int (LIBCALLBACK * results_callback)PROTO ((VoidPtr Ptr)))822 BioseqMegaBlastEngineByLoc (SeqLocPtr slp, CharPtr progname, CharPtr database,
823                             BLAST_OptionsBlkPtr options, ValNodePtr *other_returns,
824                             ValNodePtr *error_returns,
825                             int (LIBCALLBACK *callback)(Int4 done, Int4 positives),
826                             SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list,
827                             Int4 gi_list_total,
828                             int (LIBCALLBACK *results_callback)PROTO((VoidPtr Ptr)))
829 {
830 	Boolean options_allocated=FALSE;
831 	BlastSearchBlkPtr search;
832 	Int2 status;
833 	SeqAlignPtr PNTR head;
834 	SeqLocPtr whole_slp=NULL, slp_var;
835 	Int4 index;
836 
837 
838 	head = NULL;
839 
840 	if (error_returns)
841 	{
842 		*error_returns = NULL;
843 	}
844 
845 	if (other_returns)
846 	{
847 		*other_returns = NULL;
848 	}
849 
850 	if (progname == NULL)
851 		return NULL;
852 
853 	/* If no options, use default. */
854         if (options == NULL)
855 	{
856 		options = BLASTOptionNewEx(progname, TRUE, TRUE);
857 		options_allocated = TRUE;
858 	}
859 
860 	status = BLASTOptionValidateEx(options, progname, error_returns);
861 	if (status != 0)
862 	{	/* error messages in other_returns? */
863 		return NULL;
864 	}
865 
866 	if (slp == NULL || database == NULL)
867 		return NULL;
868 
869 	search = MegaBlastSetUpSearchWithReadDbInternal(slp, NULL, progname,
870 							0,
871 							database, options, NULL,
872 							seqid_list, gi_list,
873 							gi_list_total, NULL);
874 
875         if (search == NULL) {
876            /* We need to veryfy if database name is wrong and to set error
877                returns correctly */
878             BlastErrorMsgPtr error_msg;
879             CharPtr chptr;
880             ReadDBFILEPtr rdfp=NULL;
881 
882             rdfp = readdb_new(database, FALSE);
883             if(rdfp == NULL) {
884                 error_msg = MemNew(sizeof(BlastErrorMsg));
885                 chptr = MemNew(StringLen(database) + 256);
886                 sprintf(chptr, "Database %s was not found or does not exist",
887                         database);
888                 error_msg->msg = chptr;
889                 error_msg->level = 3; /* FATAL */
890                 ValNodeAddPointer(error_returns, 0, error_msg);
891             }
892 
893             readdb_destruct(rdfp);
894             return NULL;
895         }
896 
897 	search->thr_info->tick_callback = callback;
898 	search->thr_info->star_callback = callback;
899 	search->handle_results = results_callback;
900         search->output = options->output;
901 
902 	head = BioseqMegaBlastEngineCore(search, options);
903 
904 	if (search->error_return)
905 	{
906 		ValNodeLink(error_returns, search->error_return);
907 		search->error_return = NULL;
908 	}
909 
910 	if (other_returns)
911 	{ /* format dbinfo etc.  */
912 		*other_returns = BlastOtherReturnsPrepare(search);
913 	}
914 
915 	if (options_allocated)
916 	{
917 		options = BLASTOptionDelete(options);
918 	}
919 
920 	search = BlastSearchBlkDestruct(search);
921 
922 	/* Adjsut the offset if the query does not cover the entire sequence. */
923 	slp_var = slp;
924 	for (index=0; slp_var!=NULL; index++,slp_var=slp_var->next) {
925            if (slp_var->choice != SEQLOC_WHOLE)	{
926               ValNodeAddPointer(&whole_slp, SEQLOC_WHOLE,
927                                 SeqIdFindBest(SeqLocId(slp_var), SEQID_GI));
928               if (SeqLocAinB(whole_slp, slp_var) != 0)
929                  AdjustOffSetsInSeqAlign(head[index], slp_var, NULL);
930               ValNodeFree(whole_slp);
931            }
932 	}
933 
934 	return head;
935 }
936 
937 static Int2 mb_two_hit_min_step;
938 
939 SeqAlignPtr PNTR
BioseqMegaBlastEngineCore(BlastSearchBlkPtr search,BLAST_OptionsBlkPtr options)940 BioseqMegaBlastEngineCore(BlastSearchBlkPtr search, BLAST_OptionsBlkPtr options)
941 {
942 	SeqAlignPtr seqalign, PNTR seqalignp;
943         Int2 num_queries;
944         CharPtr tmpstr;
945 
946 	seqalign = NULL;
947 	seqalignp = NULL;
948 start_timer;
949 	if (search == NULL || search->query_invalid)
950 		return NULL;
951 
952 	/* Starting awake thread if multithreaded. */
953 	if (search->searchsp_eff > AWAKE_THR_MIN_SIZE)
954 		BlastStartAwakeThread(search->thr_info);
955 
956         if ((tmpstr = getenv("MB_TWO_HIT_MIN_STEP")) != NULL)
957            mb_two_hit_min_step = atoi(tmpstr);
958         else
959            mb_two_hit_min_step = 0;
960 
961 	stop_timer("BioseqBlastEngineCore: before do_the_blast_run()");
962 	do_the_blast_run(search);
963 
964    /* If this is a partial search, we do not do traceback, so can return
965       immediately */
966    if (options->no_traceback == 1 || search->handle_results) {
967       BlastStopAwakeThread(search->thr_info);
968       return NULL;
969    }
970 
971    /* So far the length of the first query was set to the total
972       concatenated length. It can be reset to the real length of the
973       first query sequence here. */
974    search->context[search->first_context].query->length =
975       search->query_context_offsets[search->first_context+1] - 1;
976 
977 
978 start_timer;
979         num_queries = search->last_context / 2 + 1;
980         seqalignp = (SeqAlignPtr PNTR)
981            MemNew(num_queries*sizeof(SeqAlignPtr));
982         BLASTPostSearchLogic(search, options, seqalignp, FALSE);
983 
984         /* Stop the awake thread. */
985         BlastStopAwakeThread(search->thr_info);
986 
987  stop_timer("MegaBlastEngineCore: after do_the_blast_run()");
988         return seqalignp;
989 }
990 
991 #define INDEX_THR_MIN_SIZE 20000
992 BlastSearchBlkPtr
MegaBlastSetUpSearchWithReadDbInternal(SeqLocPtr query_slp,BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,CharPtr dbname,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,ReadDBFILEPtr rdfp)993 MegaBlastSetUpSearchWithReadDbInternal (SeqLocPtr query_slp, BioseqPtr
994 					query_bsp, CharPtr prog_name, Int4 qlen,
995 					CharPtr dbname, BLAST_OptionsBlkPtr
996 					options, int (LIBCALLBACK *callback)
997 					PROTO((Int4 done, Int4 positives)),
998 					SeqIdPtr seqid_list,
999 					BlastDoubleInt4Ptr gi_list, Int4
1000 					gi_list_total, ReadDBFILEPtr rdfp)
1001 
1002 {
1003 
1004 	BlastSearchBlkPtr search;
1005 	Boolean options_alloc=FALSE;
1006         Boolean looking_for_gis = FALSE;
1007 	Int2 status, first_context, last_context;
1008         Int8	dblen;
1009 	Int4	query_length;
1010 	Nlm_FloatHi	searchsp_eff=0;
1011         Int4 hitlist_size;
1012 
1013 	/* Allocate default options if none are allocated yet. */
1014 	if (options == NULL)
1015 	{
1016 		options = BLASTOptionNew(prog_name, FALSE);
1017 		options_alloc = TRUE;
1018 	}
1019         hitlist_size = BlastSingleQueryResultSize(options);
1020 
1021         /* last context is the total number of contexts in concatenated
1022            queries */
1023 
1024 	if (query_slp) {
1025 	   query_length = SeqLocTotalLen(prog_name, query_slp);
1026            MegaBlastGetFirstAndLastContext(prog_name, query_slp, &first_context, &last_context, options->strand_option);
1027 	} else {
1028 	   query_length = 2*query_bsp->length + 3;
1029            BlastGetFirstAndLastContext(prog_name, query_slp, &first_context, &last_context, options->strand_option);
1030         }
1031 
1032         qlen = 0;
1033         /* qlen != 0 is a flag for BlastSearchBlkNewExtra to create the
1034            'diagonal array' structure for saving initial hits that will be
1035            used instead of stacks for word sizes < 11 and short to medium
1036            length queries only */
1037         if (options->wordsize < 11 && query_length <= MAX_DIAG_ARRAY)
1038            qlen = query_length;
1039 	/* Pass nonzero length only if need to allocate ewp */
1040 	search = BlastSearchBlkNewExtra(options->wordsize, qlen,
1041 					dbname, FALSE, 0,
1042 					options->threshold_second,
1043 					hitlist_size, prog_name, NULL,
1044 					first_context, last_context, rdfp,
1045 					options->window_size);
1046 
1047 	if (search) {
1048 	   if (NlmThreadsAvailable() && query_length > INDEX_THR_MIN_SIZE) {
1049 	      search->thr_info->awake_index = TRUE;
1050 	      search->thr_info->last_tick = Nlm_GetSecs();
1051 	      search->thr_info->index_thr =
1052 		 NlmThreadCreate(index_proc, search->thr_info);
1053 	      search->thr_info->index_callback = callback;
1054 	   }
1055 	   readdb_get_totals_ex(search->rdfp, &(dblen),
1056                                 &(search->dbseq_num), TRUE);
1057 
1058 
1059        /* Create virtual database if any of the databases have gi lists or
1060           ordinal id masks, or if gi list is provided from options */
1061        looking_for_gis = BlastProcessGiLists(search, options, gi_list, gi_list_total);
1062 
1063        /* search->thr_info->blast_gi_list will be non-NULL if gi_list or
1064         * options->gilist or options->gifile was non-NULL and therefore
1065         * intersected with any oidlists in the search->rdfp(s). If this is the
1066         * case, we need to recalculate the database length and number of
1067         * sequences */
1068        if (search->thr_info->blast_gi_list && !options->use_real_db_size)
1069            readdb_get_totals_ex3(search->rdfp, &dblen, &search->dbseq_num,
1070                                  FALSE, TRUE, eApproximate);
1071 
1072        if (looking_for_gis && search->thr_info->blast_gi_list == NULL)
1073        {
1074               ErrPostEx(SEV_WARNING, 0, 0, "Intersection of gilist and database IDs is empty");
1075               search->query_invalid = TRUE;
1076        }
1077 
1078            /* command-line/options trump alias file. */
1079            if (options->db_length > 0)
1080               dblen = options->db_length;
1081            if (options->dbseq_num > 0)
1082               search->dbseq_num = options->dbseq_num;
1083            if (options->searchsp_eff > 0)
1084               searchsp_eff = options->searchsp_eff;
1085 
1086            search->dblen = dblen;
1087            search->searchsp_eff = searchsp_eff;
1088            status = MegaBlastSetUpSearchInternalByLoc (search, query_slp, query_bsp,
1089                                                        prog_name, qlen, options,
1090                                                        callback);
1091            /*
1092               Turn off the index thread by setting this flag.
1093               Don't wait for a join, as the search will take much
1094               longer than the one second for this to die.
1095            */
1096            search->thr_info->awake_index = FALSE;
1097 
1098            if (search->pbp->mb_params->no_traceback &&
1099                !search->pbp->mb_params->is_neighboring)
1100               search->mb_endpoint_results = ValNodeNew(NULL);
1101 
1102            if (status != 0) {
1103               ErrPostEx(SEV_WARNING, 0, 0, "MegaBlastSetUpSearch failed.");
1104               search->query_invalid = TRUE;
1105            }
1106 
1107            if (!search->query_invalid && !search->pbp->mb_params->use_dyn_prog) {
1108               search = GreedyAlignMemAlloc(search);
1109               if (search->abmp == NULL) {
1110                  BlastConstructErrorMessage("Mega BLAST", "Memory allocation for greedy algorithm failed.", 2, &(search->error_return));
1111                  search->query_invalid = TRUE;
1112               }
1113            } else
1114               search->abmp = NULL;
1115            if (search->rdfp->parameters & READDB_CONTENTS_ALLOCATED)
1116                search->rdfp = ReadDBCloseMHdrAndSeqFiles(search->rdfp);
1117 	}
1118 
1119 	if (options_alloc)
1120            options = BLASTOptionDelete(options);
1121 
1122 	return search;
1123 }
1124 
1125 /* Function below assumes that the gi file is sorted in increasing order */
1126 static Boolean
FindGiInGiList(Uint4 query_gi,BlastDoubleInt4Ptr gi_list,Int4 gi_list_size)1127 FindGiInGiList(Uint4 query_gi, BlastDoubleInt4Ptr gi_list,
1128                Int4 gi_list_size)
1129 {
1130    Int4 begin, end, middle;
1131 
1132    begin = 0;
1133    end = gi_list_size - 1;
1134    while (begin <= end) {
1135       middle = (begin + end) / 2;
1136       if (gi_list[middle].gi > query_gi)
1137 	 end = middle - 1;
1138       else if (gi_list[middle].gi < query_gi)
1139 	 begin = middle + 1;
1140       else {
1141 	 return TRUE;
1142       }
1143    }
1144    return FALSE;
1145 }
1146 
1147 #define MIN_LIMIT_HSP_NUM 1e20
1148 #define BLASTNA_N_RESIDUE 14
1149 
1150 Int2
MegaBlastSetUpSearchInternalByLoc(BlastSearchBlkPtr search,SeqLocPtr query_slp,BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)))1151 MegaBlastSetUpSearchInternalByLoc (BlastSearchBlkPtr search, SeqLocPtr
1152 				   query_slp, BioseqPtr query_bsp, CharPtr
1153 				   prog_name, Int4 qlen, BLAST_OptionsBlkPtr
1154 				   options, int (LIBCALLBACK
1155 						 *callback)PROTO((Int4 done,
1156 								  Int4
1157 								  positives)))
1158 
1159 {
1160    Boolean mask_at_hash=FALSE, private_slp_delete;
1161    Char buffer[128];
1162    Int2 retval, status;
1163    Int4 effective_query_length, query_length,
1164       index, length, length_adjustment=0,
1165       min_query_length, full_query_length=0;
1166    Int4 context, num_queries;
1167    Nlm_FloatHi avglen;
1168    SeqIdPtr qid=NULL;
1169    SeqLocPtr filter_slp=NULL, private_slp=NULL, private_slp_rev=NULL, slp=NULL, tmp_slp=NULL;
1170    Uint1 residue;
1171    Uint1Ptr query_seq=NULL, query_seq_start=NULL, query_seq_rev=NULL, query_seq_start_rev=NULL;
1172    Uint1Ptr query_seq_combined=NULL;
1173    CharPtr filter_string=NULL;
1174    Uint4 query_gi;
1175    Int4 homo_gilist_size, mouse_gilist_size, rat_gilist_size;
1176    BlastDoubleInt4Ptr homo_gilist=NULL, mouse_gilist=NULL, rat_gilist=NULL;
1177    SeqLocPtr mask_slp=NULL, next_mask_slp=NULL;
1178    SeqPortPtr spp = NULL;
1179 
1180    if (options == NULL) {
1181       ErrPostEx(SEV_FATAL, 1, 0, "BLAST_OptionsBlkPtr is NULL\n");
1182       return 1;
1183    }
1184 
1185    if (query_slp == NULL && query_bsp == NULL) {
1186       ErrPostEx(SEV_FATAL, 1, 0, "Query is NULL\n");
1187       return 1;
1188    }
1189 
1190    context = search->first_context;
1191 
1192    num_queries = search->last_context / 2 + 1;
1193    search->qid_array = (SeqIdPtr PNTR) MemNew(num_queries*sizeof(SeqIdPtr));
1194 
1195    if (!query_slp) {
1196       query_length = query_bsp->length;
1197       private_slp = SeqLocIntNew(0, query_length-1 , Seq_strand_plus, SeqIdFindBest(query_bsp->id, SEQID_GI));
1198       private_slp_rev = SeqLocIntNew(0, query_length-1 , Seq_strand_minus, SeqIdFindBest(query_bsp->id, SEQID_GI));
1199       private_slp_delete = FALSE;
1200       search->query_slp = private_slp;
1201       search->allocated += BLAST_SEARCH_ALLOC_QUERY_SLP;
1202       /* Now let query_length be the total length of 2 strands together with
1203          sentinel bytes */
1204       query_length = 2*query_length + 3;
1205    } else {
1206       search->query_slp = query_slp;
1207       query_length = SeqLocTotalLen(search->prog_name, query_slp);
1208    }
1209 
1210    if ((search->context[search->first_context].query->sequence_start =
1211       (Uint1Ptr) Malloc((query_length+2)*sizeof(Uint1))) == NULL) {
1212       ErrPostEx(SEV_WARNING, 0, 0, "Memory allocation for query sequence failed");
1213       retval = 1;
1214       goto MegaBlastSetUpReturn;
1215    }
1216 
1217    search = BlastFillQueryOffsets(search, search->query_slp,
1218                                   options->wordsize);
1219 
1220    search->translation_buffer = NULL;
1221    search->translation_buffer_size = 0;
1222 
1223    /*
1224      Set the context_factor, which specifies how many different
1225      ways the query or db is examined (e.g., blastn looks at both
1226      stands of query, context_factor is 2).
1227    */
1228    /* All strands of all queries concatenated into one */
1229    search->context_factor = 1;
1230 
1231    /* Set the ambiguous residue before the ScoreBlk is filled. */
1232    if(options->matrix!=NULL) {
1233       search->sbp->read_in_matrix = TRUE;
1234    } else
1235       search->sbp->read_in_matrix = FALSE;
1236    BlastScoreSetAmbigRes(search->sbp, 'N');
1237 
1238    search->sbp->penalty = options->penalty;
1239    search->sbp->reward = options->reward;
1240 
1241    /* option is to use alignments chosen by user in PSM computation API (used in WWW PSI-Blast); */
1242    search->pbp->use_best_align = options->use_best_align;
1243 
1244    /* Should culling be used at all? */
1245    search->pbp->perform_culling = options->perform_culling;
1246    search->pbp->hsp_range_max = options->hsp_range_max;
1247    search->pbp->mb_params = MegaBlastParameterBlkNew(options);
1248 
1249    search->pbp->mb_params->max_positions = MIN(query_length, options->block_width);
1250    search->sbp->query_length = query_length;
1251 
1252    search->mb_result_struct = (BLASTResultsStructPtr PNTR)
1253       MemNew(num_queries*sizeof(BLASTResultsStructPtr));
1254 
1255    if (options->matrix != NULL)
1256       status = BlastScoreBlkMatFill(search->sbp, options->matrix);
1257    else
1258       status = BlastScoreBlkMatFill(search->sbp, "BLOSUM62");
1259    if (status != 0) {
1260       ErrPostEx(SEV_WARNING, 0, 0, "BlastScoreBlkMatFill returned non-zero status");
1261       retval = 1;
1262       goto MegaBlastSetUpReturn;
1263    }
1264 
1265    /* This is used right below. */
1266    search->pbp->gapped_calculation = options->gapped_calculation;
1267    search->pbp->do_not_reevaluate = options->do_not_reevaluate;
1268    search->pbp->do_sum_stats = options->do_sum_stats;
1269    search->pbp->first_db_seq = options->first_db_seq;
1270    search->pbp->final_db_seq = options->final_db_seq;
1271 
1272    retval = 0;
1273 
1274    context = search->first_context;
1275 
1276    if (private_slp && private_slp_delete)
1277       private_slp = SeqLocFree(private_slp);
1278    if (private_slp_rev)
1279       private_slp_rev = SeqLocFree(private_slp_rev);
1280 
1281    search->query_id = NULL;
1282    slp = search->query_slp;
1283 
1284    if (search->pbp->mb_params->is_neighboring) {
1285       CharPtr filename;
1286       filename = FindBlastDBFile("Homo_sapiens.n.gil");
1287       homo_gilist = GetGisFromFile (filename, &homo_gilist_size);
1288       MemFree(filename);
1289       filename = FindBlastDBFile("Mus_musculus.n.gil");
1290       mouse_gilist = GetGisFromFile(filename, &mouse_gilist_size);
1291       MemFree(filename);
1292       filename = FindBlastDBFile("Rattus_norvegicus.n.gil");
1293       rat_gilist = GetGisFromFile(filename, &rat_gilist_size);
1294       MemFree(filename);
1295    }
1296 
1297    /* All lower case masking locations are in one list */
1298    next_mask_slp = options->query_lcase_mask;
1299 
1300    while(context<=search->last_context && slp != NULL) {
1301       if (search->first_context == 0) {
1302 	 private_slp = SeqLocIntNew(SeqLocStart(slp), SeqLocStop(slp),
1303 				    Seq_strand_plus, SeqLocId(slp));
1304       }
1305       if (search->last_context % 2 == 1) {
1306 	 private_slp_rev = SeqLocIntNew(SeqLocStart(slp), SeqLocStop(slp),
1307 					Seq_strand_minus, SeqLocId(slp));
1308       }
1309 
1310       if (private_slp)
1311 	 tmp_slp = private_slp;
1312       else
1313 	 tmp_slp = private_slp_rev;
1314 
1315       search->qid_array[context/2] = SeqIdSetDup(SeqLocId(slp));
1316 
1317       query_length = 0;
1318       query_length = SeqLocLen(tmp_slp);
1319       if (query_length < options->wordsize) {
1320          SeqIdWrite(SeqLocId(tmp_slp), buffer, PRINTID_FASTA_LONG, 127);
1321          ErrPostEx(SEV_WARNING, 0, 0,
1322                    "Query sequence %s removed: length %ld is less than wordsize %d",
1323                    buffer, query_length, options->wordsize);
1324          if (next_mask_slp &&
1325             SeqIdComp(SeqLocId(next_mask_slp), SeqLocId(tmp_slp)) == SIC_YES) {
1326             next_mask_slp = next_mask_slp->next;
1327          }
1328          slp = slp->next;
1329          context += 2;
1330 	 continue;
1331       }
1332 
1333       if (options->filter && !options->filter_string)
1334 	 options->filter_string = BlastConstructFilterString(options->filter);
1335       if (search->pbp->mb_params->is_neighboring) {
1336 	 /* Test if we have to do human repeat filtering on this query */
1337 	 if (private_slp)
1338 	    qid = SeqLocId(private_slp);
1339 	 else
1340 	    qid = SeqLocId(private_slp_rev);
1341 	 query_gi = GetGIForSeqId(qid);
1342 	 if (FindGiInGiList(query_gi, homo_gilist, homo_gilist_size)) {
1343 	    filter_string =
1344 	       (CharPtr) Malloc(StringLen(options->filter_string)+3);
1345 	    sprintf(filter_string, "%s%s", options->filter_string, ";R");
1346          } else if (FindGiInGiList(query_gi, mouse_gilist, mouse_gilist_size)
1347                     || FindGiInGiList(query_gi, rat_gilist, rat_gilist_size)) {
1348             filter_string =
1349                (CharPtr) Malloc(StringLen(options->filter_string)+18);
1350 
1351             sprintf(filter_string, "%s%s", options->filter_string,
1352                     ";R -d rodents.lib");
1353 	 } else
1354 	    filter_string = StringSave(options->filter_string);
1355       } else
1356 	 filter_string = options->filter_string;
1357 
1358       if (private_slp)
1359 	 filter_slp = BlastSeqLocFilterEx(private_slp, filter_string, &mask_at_hash);
1360       else if (private_slp_rev)
1361 	 filter_slp = BlastSeqLocFilterEx(private_slp_rev, filter_string, &mask_at_hash);
1362       if (search->pbp->mb_params->is_neighboring)
1363 	 MemFree(filter_string);
1364 
1365       if (search->first_context==0 && search->last_context % 2 == 1)
1366 	 query_seq_combined =
1367 	    (Uint1Ptr) Malloc((2*query_length+3)*sizeof(Uint1));
1368       else
1369 	 query_seq_combined =
1370 	    (Uint1Ptr) Malloc((query_length+2)*sizeof(Uint1));
1371 
1372       if (query_seq_combined == NULL) {
1373          ErrPostEx(SEV_WARNING, 0, 0, "Memory allocation for query sequence failed");
1374          retval = 1;
1375          goto MegaBlastSetUpReturn;
1376       }
1377 
1378       query_seq_combined[0] = 0x0f;
1379       full_query_length = 0;
1380 
1381       /* Extract the mask location corresponding to this query */
1382       if (next_mask_slp &&
1383 	  SeqIdComp(SeqLocId(next_mask_slp), SeqLocId(tmp_slp)) == SIC_YES) {
1384 	 mask_slp = (ValNodePtr) MemDup(next_mask_slp, sizeof(ValNode));
1385 	 next_mask_slp = mask_slp->next;
1386 	 mask_slp->next = NULL;
1387       } else
1388 	 mask_slp = NULL;
1389 
1390       if (private_slp) {
1391          if (private_slp->choice == SEQLOC_INT) {
1392             spp = SeqPortNewByLoc(private_slp, Seq_code_ncbi4na);
1393             SeqPortSet_do_virtual(spp, TRUE);
1394          }
1395 
1396 
1397 	 if ((query_seq_start = (Uint1Ptr)
1398               Malloc(((query_length)+2)*sizeof(Char))) == NULL) {
1399             ErrPostEx(SEV_WARNING, 0, 0, "Memory allocation for query sequence failed");
1400             retval = 1;
1401             goto MegaBlastSetUpReturn;
1402          }
1403 
1404 	 query_seq = query_seq_start+1;
1405 
1406 	 index=0;
1407          if (spp) {
1408             while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
1409                if (IS_residue(residue)) {
1410                   query_seq[index] = ncbi4na_to_blastna[residue];
1411                   index++;
1412                }
1413             }
1414             spp = SeqPortFree(spp);
1415          }
1416 
1417 	 query_seq_start[0] = query_seq[query_length] = 0x0f;
1418 
1419          filter_slp = blastMergeFilterLocs(filter_slp, mask_slp, FALSE, 0, 0);
1420          mask_slp = MemFree(mask_slp);
1421          /* Mask by the blastna value of 'N' (15 in ncbi4na) */
1422 
1423          /* If masking done for lookup table only, mask only the combined
1424             sequence that will go into the first context */
1425          if (filter_slp && !mask_at_hash) {
1426             MegaBlastMaskTheResidues(query_seq, query_length,
1427 				     BLASTNA_N_RESIDUE, filter_slp, FALSE,
1428 				     SeqLocStart(private_slp), mask_at_hash);
1429          }
1430          MemCpy(&query_seq_combined[1], query_seq, query_length+1);
1431 	 if (filter_slp && mask_at_hash) {
1432             MegaBlastMaskTheResidues(query_seq_combined+1, query_length,
1433 				     BLASTNA_N_RESIDUE, filter_slp, FALSE,
1434 				     SeqLocStart(private_slp), mask_at_hash);
1435          }
1436 
1437 	 full_query_length = query_length + 1;
1438       }
1439 
1440       if (private_slp_rev) {
1441          spp = SeqPortNewByLoc(private_slp_rev, Seq_code_ncbi4na);
1442          SeqPortSet_do_virtual(spp, TRUE);
1443 	 if ((query_seq_start_rev = (Uint1Ptr)
1444 	    Malloc(((query_length)+2)*sizeof(Char))) == NULL) {
1445             ErrPostEx(SEV_WARNING, 0, 0, "Memory allocation for query sequence failed");
1446             retval = 1;
1447             goto MegaBlastSetUpReturn;
1448          }
1449 	 query_seq_rev = query_seq_start_rev+1;
1450 	 index=0;
1451          if (spp) {
1452             while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
1453                if (IS_residue(residue)) {
1454                   query_seq_rev[index] = ncbi4na_to_blastna[residue];
1455                   index++;
1456                }
1457             }
1458             spp = SeqPortFree(spp);
1459          }
1460 
1461 	 query_seq_start_rev[0] = query_seq_rev[query_length] = 0x0f;
1462          if (filter_slp && !mask_at_hash) {
1463             MegaBlastMaskTheResidues(query_seq_rev, query_length,
1464                                      BLASTNA_N_RESIDUE, filter_slp, TRUE,
1465                                      SeqLocStart(private_slp_rev),
1466                                      mask_at_hash);
1467          }
1468          MemCpy(query_seq_combined+full_query_length+1,
1469                 query_seq_rev,query_length+1);
1470 	 if (filter_slp && mask_at_hash) {
1471             MegaBlastMaskTheResidues(query_seq_combined+full_query_length+1,
1472                                      query_length, BLASTNA_N_RESIDUE,
1473                                      filter_slp, TRUE,
1474                                      SeqLocStart(private_slp_rev),
1475                                      mask_at_hash);
1476          }
1477 	 full_query_length += query_length + 1;
1478       }
1479 
1480       MegaBlastSequenceAddSequence(search->context[search->first_context].query,
1481 				   NULL, query_seq_combined, full_query_length,
1482 				   full_query_length, 0);
1483 
1484       query_seq_combined = MemFree(query_seq_combined);
1485       if (context==search->first_context)
1486 	 query_seq_start = MemFree(query_seq_start);
1487 
1488       if (private_slp && context != search->first_context)
1489 	 MegaBlastSequenceAddSequence(search->context[context].query, query_seq,
1490 				      query_seq_start, query_length,
1491 				      query_length, 0);
1492       if (context % 2 == 0)
1493 	 context++;
1494       if (private_slp_rev && context != search->first_context) {
1495 	 MegaBlastSequenceAddSequence(search->context[context].query, query_seq_rev,
1496 				      query_seq_start_rev, query_length,
1497 				      query_length, 0);
1498       }
1499       context++;
1500 
1501    MegaBlastSetUpReturn:
1502       if (filter_slp && !mask_at_hash)
1503          /* Save filtering locations for formatting */
1504          ValNodeAddPointer(&(search->mask), SEQLOC_MASKING_NOTSET, filter_slp);
1505       else /* No longer needed */
1506          SeqLocSetFree(filter_slp);
1507 
1508       if (private_slp)
1509 	 private_slp = SeqLocFree(private_slp);
1510       if (private_slp_rev)
1511 	 private_slp_rev = SeqLocFree(private_slp_rev);
1512       if (retval)
1513          return retval;
1514       slp = slp->next;
1515    } /* End of loop over query contexts (strands) */
1516 
1517    if (search->pbp->mb_params->is_neighboring) {
1518       MemFree(homo_gilist);
1519       MemFree(mouse_gilist);
1520       MemFree(rat_gilist);
1521    }
1522 
1523    retval = 1;
1524    for (index=search->first_context; index<=search->last_context; index++) {
1525       length = search->query_context_offsets[index+1] -
1526          search->query_context_offsets[index] - 1;
1527       if (length > 0) {
1528          BLAST_KarlinBlk *kbp_gap;
1529          BLAST_KarlinBlk *kbp;
1530 
1531          if (retval)
1532             retval = 2;
1533 
1534          status = BlastScoreBlkFill(search->sbp, (CharPtr)
1535                                     search->context[index].query->sequence,
1536                                     length, index);
1537          search->sbp->kbp_gap_std[index] = BlastKarlinBlkCreate();
1538          kbp_gap = search->sbp->kbp_gap_std[index];
1539          kbp     = search->sbp->kbp_std[index];
1540 
1541          kbp_gap->Lambda = kbp->Lambda;
1542          kbp_gap->K      = kbp->K;
1543          kbp_gap->logK   = kbp->logK;
1544          kbp_gap->H      = kbp->H;
1545          kbp_gap->paramC = kbp->paramC;
1546 
1547          if (status==0)
1548             retval = 0;
1549          else
1550             search->sbp->kbp_std[index] =
1551                BlastKarlinBlkDestruct(search->sbp->kbp_std[index]);
1552       }
1553    }
1554 
1555    /* Error only if all query sequences failed! */
1556    if (retval != 0) {
1557       if (retval == 1)
1558          sprintf(buffer, "All queries are shorter than word size or completely masked, no search performed");
1559       else
1560          sprintf(buffer, "Unable to calculate Karlin-Altschul parameters, check query sequence");
1561       BlastConstructErrorMessage("BLASTSetUpSearch", buffer, 2, &(search->error_return));
1562    }
1563 
1564    search->sbp->kbp_gap = search->sbp->kbp_gap_std;
1565    search->sbp->kbp = search->sbp->kbp_std;
1566 
1567    /* If retval was set non-zero above (by the routines calculating Karlin-Altschul params),
1568       return here before these values are used.
1569    */
1570    if (retval)
1571       return retval;
1572 
1573    /* Calculate length adjustments and effective query lengths for
1574       each query. */
1575    for( context=search->first_context;
1576         context<=search->last_context;
1577         context++ ) {
1578       /* Skip those contexts where sequence is not searched. */
1579       if (!search->sbp->kbp[context])
1580          continue;
1581       BlastComputeLengthAdjustment(search->sbp->kbp[context]->K,
1582                                    search->sbp->kbp[context]->logK,
1583                                    1/search->sbp->kbp[context]->H,
1584                                    0.0,
1585                                    length,
1586                                    search->dblen, search->dbseq_num,
1587                                    &length_adjustment );
1588 
1589        length = search->query_context_offsets[context+1] -
1590            search->query_context_offsets[context] - 1;
1591        min_query_length = (Int4) 1/(search->sbp->kbp[context]->K);
1592        effective_query_length =
1593           MAX(min_query_length, length - length_adjustment);
1594        search->context[context].query->effective_length =
1595            effective_query_length;
1596        if (context == search->first_context)
1597           search->length_adjustment = length_adjustment;
1598    }
1599    search->dblen_eff =
1600      search->dblen - search->dbseq_num*search->length_adjustment;
1601    /*if (search->searchsp_eff == 0)
1602       search->searchsp_eff = ((Nlm_FloatHi) search->dblen_eff) *
1603       ((Nlm_FloatHi) effective_query_length); */
1604 
1605    /* The default is that cutoff_s was not set and is zero. */
1606    if (options->cutoff_s == 0)	{
1607       search->pbp->cutoff_e = options->expect_value;
1608       search->pbp->cutoff_e_set = TRUE;
1609       search->pbp->cutoff_s = options->cutoff_s;
1610       search->pbp->cutoff_s_set = FALSE;
1611    } else {
1612       search->pbp->cutoff_e = options->expect_value;
1613       search->pbp->cutoff_e_set = FALSE;
1614       search->pbp->cutoff_s = options->cutoff_s;
1615       search->pbp->cutoff_s_set = TRUE;
1616    }
1617 /* For now e2 is set to 0.5 and cutoff_e2_set is FALSE.  This is then
1618 changed to the proper values in blast_set_parameters.  In the final version
1619 of this program (where more blast programs and command-line options are
1620 available) this needs to be set higher up. */
1621    if (options->cutoff_s2 == 0)	{
1622       search->pbp->cutoff_e2 = options->e2;
1623       search->pbp->cutoff_e2_set = FALSE;
1624       search->pbp->cutoff_s2 = options->cutoff_s2;
1625       search->pbp->cutoff_s2_set = FALSE;
1626    } else {
1627       search->pbp->cutoff_e2 = options->e2;
1628       search->pbp->cutoff_e2_set = FALSE;
1629       search->pbp->cutoff_s2 = options->cutoff_s2;
1630       search->pbp->cutoff_s2_set = TRUE;
1631    }
1632 
1633    search->pbp->discontinuous = options->discontinuous;
1634 
1635 
1636    /* For postion based blast. */
1637    search->pbp->ethresh = options->ethresh;
1638    search->pbp->maxNumPasses = options->maxNumPasses;
1639    search->pbp->pseudoCountConst = options->pseudoCountConst;
1640 
1641    search->pbp->process_num = options->number_of_cpus;
1642    search->pbp->cpu_limit = options->cpu_limit;
1643    search->pbp->gap_decay_rate = options->gap_decay_rate;
1644    search->pbp->gap_size = options->gap_size;
1645    search->pbp->gap_prob = options->gap_prob;
1646    search->pbp->old_stats = options->old_stats;
1647    search->pbp->use_large_gaps = options->use_large_gaps;
1648    search->pbp->number_of_bits = options->number_of_bits;
1649    search->pbp->two_pass_method = options->two_pass_method;
1650    search->pbp->multiple_hits_only = options->multiple_hits_only;
1651    search->pbp->gap_open = options->gap_open;
1652    search->pbp->gap_extend = options->gap_extend;
1653    search->pbp->decline_align = options->decline_align;
1654 
1655    if (options->hsp_num_max) {
1656       search->pbp->hsp_num_max = options->hsp_num_max;
1657       if (search->hsp_array_size > options->hsp_num_max)
1658          search->hsp_array_size = options->hsp_num_max;
1659    } else {
1660       FloatHi searchsp_eff = ((FloatHi) search->dblen_eff) *
1661          ((FloatHi) search->context[search->first_context].query->length);
1662       if (searchsp_eff > MIN_LIMIT_HSP_NUM) {
1663          char buffer[256];
1664          search->pbp->hsp_num_max = 50*(search->last_context + 1);
1665          sprintf(buffer,
1666                  "Limiting the number of HSPs per database sequence to %d",
1667                  search->pbp->hsp_num_max);
1668          ErrPostEx(SEV_WARNING, 0, 0, buffer);
1669       }
1670    }
1671 
1672 
1673    /*search->pbp->gap_x_dropoff = (BLAST_Score)
1674      (options->gap_x_dropoff*NCBIMATH_LN2 /
1675      search->sbp->kbp[search->first_context]->Lambda);*/
1676    search->pbp->gap_x_dropoff = options->gap_x_dropoff;
1677    /* Ensures that gap_x_dropoff_final is at least as large as gap_x_dropoff. */
1678    search->pbp->gap_x_dropoff_final = MAX(options->gap_x_dropoff_final, search->pbp->gap_x_dropoff);
1679 
1680    /* "threshold" (first and second) must be set manually for two-pass right now.*/
1681    search->pbp->threshold_set = TRUE;
1682    search->pbp->threshold_second = options->threshold_second;
1683 
1684    search->pbp->window_size = options->window_size;
1685    search->pbp->window_size_set = TRUE;
1686 
1687    search->whole_query = TRUE;
1688    if (options->required_start != 0 || options->required_end != -1) {
1689       search->whole_query = FALSE;
1690       search->required_start = options->required_start;
1691       if (options->required_end != -1)
1692          search->required_end = options->required_end;
1693       else
1694          search->required_end = qlen;
1695    }
1696 
1697    search->pbp->dropoff_2nd_pass = - options->dropoff_2nd_pass;
1698 
1699    avglen = BLAST_NT_AVGLEN;
1700    /* Use only one type of gap for blastn */
1701    search->pbp->ignore_small_gaps = TRUE;
1702 
1703    search->wfp = search->wfp_second;
1704    if (!MegaBlastBuildLookupTable(search)) {
1705       ErrPostEx(SEV_WARNING, 0, 0, "Failed to construct a lookup table");
1706       return 1;
1707    }
1708 
1709    return 0;
1710 }
1711 
1712 Boolean
MegaBlastGetFirstAndLastContext(CharPtr prog_name,SeqLocPtr query_slp,Int2Ptr first_context,Int2Ptr last_context,Uint1 strand_options)1713 MegaBlastGetFirstAndLastContext(CharPtr prog_name, SeqLocPtr query_slp, Int2Ptr first_context, Int2Ptr last_context, Uint1 strand_options)
1714 {
1715    SeqLocPtr tmp_slp = query_slp;
1716    Int2 tmp_first, tmp_last;
1717    Int2 index;
1718 
1719    if (StringCmp(prog_name, "blastn"))
1720       return BlastGetFirstAndLastContext(prog_name, query_slp, first_context,
1721 					 last_context, strand_options);
1722 
1723    if (!BlastGetFirstAndLastContext(prog_name, query_slp, first_context,
1724 				    last_context, strand_options))
1725       return FALSE;
1726 
1727    for (index=0; tmp_slp->next != NULL; index++, tmp_slp=tmp_slp->next);
1728 
1729    if (!BlastGetFirstAndLastContext(prog_name, tmp_slp, &tmp_first,
1730 				    &tmp_last, strand_options))
1731       return FALSE;
1732    /* For all intermediate sequences count 2 contexts */
1733    *last_context = 2 * index + tmp_last;
1734    return TRUE;
1735 }
1736 
1737 
SeqLocTotalLen(CharPtr prog_name,SeqLocPtr slp)1738 Int4 SeqLocTotalLen(CharPtr prog_name, SeqLocPtr slp)
1739 {
1740    Int4 total_length = 0;
1741    SeqLocPtr tmp_slp = slp;
1742 
1743    if (StringCmp(prog_name, "blastn"))
1744       return SeqLocLen(slp);
1745 
1746    while (tmp_slp != NULL) {
1747       total_length += 2*(SeqLocLen(tmp_slp) + 1);
1748       tmp_slp = tmp_slp->next;
1749    }
1750    return total_length - 1;
1751 }
1752 
1753 BlastSearchBlkPtr
BlastFillQueryOffsets(BlastSearchBlkPtr search,SeqLocPtr query_slp,Int4 wordsize)1754 BlastFillQueryOffsets(BlastSearchBlkPtr search, SeqLocPtr query_slp,
1755                       Int4 wordsize)
1756 {
1757    SeqLocPtr slp = query_slp;
1758    Int2 num_contexts;
1759    Int2 i = 0;
1760    Int4 length;
1761 
1762    if (slp == NULL) return search;
1763    /* Note: the last offset will be equal to the combined query length + 1, for
1764       convenience of computing each individual query length later */
1765    for (slp=query_slp, num_contexts=0; slp; slp=slp->next, num_contexts+=2);
1766 
1767    search->query_context_offsets = (Int4Ptr) Malloc((num_contexts+1)*sizeof(Int4));
1768 
1769    search->query_context_offsets[0] = 0;
1770    for (slp = query_slp; slp; slp = slp->next) {
1771       length = SeqLocLen(slp) + 1;
1772       if (length <= wordsize)
1773          /* Sequence shorter than word size will be removed */
1774          length = 0;
1775       if (search->first_context == 0)
1776 	 search->query_context_offsets[i+1] =
1777 	    search->query_context_offsets[i] + length;
1778       else /* Top strand is not searched */
1779 	 search->query_context_offsets[i+1] =
1780 	    search->query_context_offsets[i];
1781       i++;
1782       if (search->last_context % 2 == 1)
1783 	    search->query_context_offsets[i+1] =
1784 	       search->query_context_offsets[i] + length;
1785       else
1786 	 /* Bottom strand is not searched */
1787 	 search->query_context_offsets[i+1] =
1788 	    search->query_context_offsets[i];
1789       i++;
1790    }
1791 
1792 
1793    return search;
1794 }
1795 
1796 static int LIBCALLBACK
evalue_compare_seqaligns(VoidPtr v1,VoidPtr v2)1797 evalue_compare_seqaligns(VoidPtr v1, VoidPtr v2)
1798 {
1799    SeqAlignPtr sap1, sap2, PNTR sapp1, PNTR sapp2;
1800 
1801    sapp1 = (SeqAlignPtr PNTR) v1;
1802    sapp2 = (SeqAlignPtr PNTR) v2;
1803 
1804    sap1 = *sapp1;
1805    sap2 = *sapp2;
1806 
1807    if (sap1->score->value.realvalue > sap2->score->value.realvalue)
1808       return -1;
1809    else if (sap1->score->value.realvalue < sap2->score->value.realvalue)
1810       return 1;
1811    else if (sap1->score->value.intvalue < sap2->score->value.intvalue)
1812       return -1;
1813    else if (sap1->score->value.intvalue > sap2->score->value.intvalue)
1814       return 1;
1815    else
1816       return 0;
1817 
1818 }
1819 
1820 /* Disassemble a list of SeqAligns into an array by query (in case of
1821    multiple queries */
MegaBlastPackAlignmentsByQuery(BlastSearchBlkPtr search,SeqAlignPtr seqalign)1822 SeqAlignPtr PNTR MegaBlastPackAlignmentsByQuery(BlastSearchBlkPtr search,
1823 						SeqAlignPtr seqalign)
1824 {
1825    SeqAlignPtr PNTR seqalignp, PNTR sapp, PNTR sapp1;
1826    SeqAlignPtr seqalign_var, sap, seqalign_prev;
1827    Int2 index, i, hit_count, dbseq_count;
1828    SeqIdPtr query_id, seg_query_id;
1829    SeqLocPtr slp;
1830    Int4 num_queries = search->last_context / 2 + 1;
1831    Int4 count = 0;
1832 
1833    slp = search->query_slp;
1834 
1835    seqalignp = (SeqAlignPtr PNTR)
1836       MemNew(num_queries*sizeof(SeqAlignPtr));
1837 
1838    if (!seqalign) {
1839       *seqalignp = NULL;
1840       return seqalignp;
1841    }
1842 
1843    for (index=0; index<num_queries; index++) {
1844       query_id = search->qid_array[index];
1845       hit_count = 0;
1846       seqalign_var = seqalign;
1847       seqalign_prev = NULL;
1848       while (seqalign_var != NULL) {
1849 	 seg_query_id = ((DenseSegPtr)seqalign_var->segs)->ids;
1850 	 if (SeqIdComp(query_id, seg_query_id) == SIC_YES) {
1851 	    /* Remove this link from the list - we won't need it any more */
1852 	    if (seqalign_prev != NULL)
1853 	       seqalign_prev->next = seqalign_var->next;
1854 	    else /* first link in the list */
1855 	       seqalign = seqalign_var->next;
1856 
1857 	    seqalign_var->next = NULL;
1858 
1859 	    /* Add this seqalign to the list for this query */
1860 	    if (seqalignp[index]==NULL) {
1861 	       seqalignp[index] = seqalign_var;
1862 	       sap = seqalignp[index];
1863 	    } else {
1864 	       sap->next = seqalign_var;
1865 	       sap = sap->next;
1866 	    }
1867 	    hit_count++;
1868 	    /* Move to the next link */
1869 	    if (seqalign_prev != NULL)
1870 	       seqalign_var = seqalign_prev->next;
1871 	    else
1872 	       seqalign_var = seqalign;
1873 	 } else { /* hit from different query */
1874 	    seqalign_prev = seqalign_var;
1875 	    seqalign_var = seqalign_var->next;
1876 	 }
1877       }
1878 
1879       count += hit_count;
1880       /* Now sort seqaligns for this query based on evalue */
1881       if (hit_count>1) {
1882 	 Int4 start, end;
1883 	 SeqIdPtr sid;
1884 
1885 	 sapp = Nlm_Malloc(hit_count*sizeof(SeqAlignPtr));
1886 	 sapp1 = Nlm_Malloc(hit_count*sizeof(SeqAlignPtr));
1887 	 sapp[0] = seqalignp[index];
1888 
1889 	 /* sapp is an array of pointers to all hits for this query */
1890 	 /* sapp1 is an array of pointers to best hits for each db  */
1891 	 /* sequence for this query                                 */
1892 	 for (i=1; i<hit_count; i++)
1893 	    sapp[i] = sapp[i-1]->next;
1894 
1895 	 dbseq_count = 0;
1896 	 for (start=0; start<hit_count; ) {
1897 	    end = start;
1898 	    sid = ((DenseSegPtr)sapp[start]->segs)->ids->next;
1899 	    for(end=start+1; end < hit_count &&
1900 		SeqIdComp(sid, ((DenseSegPtr)sapp[end]->segs)->ids->next)
1901 		   ==SIC_YES;
1902 		end++);
1903 
1904 	    if (end > start + 1)
1905 	       HeapSort(&sapp[start], end-start, sizeof(SeqAlignPtr),
1906 			evalue_compare_seqaligns);
1907 	    for (i=start; i<end-1; i++)
1908 	       sapp[i]->next = sapp[i+1];
1909 	    sapp[end-1]->next = NULL; /* Separate lists for different db
1910 					 sequences */
1911 	    sapp1[dbseq_count++] = sapp[start];
1912 	    start = end;
1913 	 }
1914 	 /* Sort the lists for different db sequences based on their
1915 	    best hits */
1916 	 HeapSort(sapp1, dbseq_count, sizeof(SeqAlignPtr),
1917 		  evalue_compare_seqaligns);
1918 	 /* Connect the sorted lists */
1919 	 for (i=0; i<dbseq_count-1; i++) {
1920 	    for (sap=sapp1[i]; sap->next != NULL; sap = sap->next);
1921 	    sap->next = sapp1[i+1];
1922 	 }
1923 
1924 	 seqalignp[index] = sapp1[0];
1925 	 MemFree(sapp);
1926 	 MemFree(sapp1);
1927       }
1928    }
1929    /* Shouldn't be necessary, but clean just in case */
1930    SeqAlignSetFree(seqalign);
1931    return seqalignp;
1932 }
1933 
1934 /* Attach the "sequence" pointer to the BlastSequenceBlkPtr. sequence_start may be the
1935 actual start of the sequence (this pointer is kept for deallocation purposes). The
1936 sequence may start before "sequence" starts as there may be a sentinel (i.e., NULLB)
1937 before the start of the sequence.  When the extension function extends this way it
1938 can tell that there is a NULLB there and stop the extension.
1939 
1940 */
1941 
1942 Int2 LIBCALL
MegaBlastSequenceAddSequence(BlastSequenceBlkPtr sequence_blk,Uint1Ptr sequence,Uint1Ptr sequence_start,Int4 length,Int4 original_length,Int4 effective_length)1943 MegaBlastSequenceAddSequence (BlastSequenceBlkPtr sequence_blk, Uint1Ptr sequence,
1944 			      Uint1Ptr sequence_start, Int4 length,
1945 			      Int4 original_length, Int4 effective_length)
1946 {
1947    Uint1Ptr seqptr;
1948 
1949    if (sequence_blk == NULL)
1950       return 1;
1951    /* Assume that memory for sequence_blk->sequence is
1952       allocated elsewhere */
1953    if (sequence == NULL && sequence_start != NULL) {
1954       if (sequence_blk->sequence!=NULL && sequence_blk->length>0) {
1955 	 seqptr = sequence_blk->sequence_start + sequence_blk->length;
1956 	 MemCpy(seqptr, sequence_start, length+1);
1957       } else {
1958 	 MemCpy(sequence_blk->sequence_start, sequence_start, length+1);
1959 	 sequence_blk->sequence = sequence_blk->sequence_start + 1;
1960 	 sequence_blk->effective_length = effective_length;
1961       }
1962    }
1963    else if (sequence != NULL) {
1964       sequence_blk->sequence = sequence;
1965       sequence_blk->sequence_start = sequence_start;
1966    }
1967 
1968    sequence_blk->length += length;
1969    sequence_blk->original_length += original_length;
1970 
1971    return 0;
1972 }
1973 
1974 static Boolean
MegaBlastSaveExactMatch(BlastSearchBlkPtr search,Int4 q_off,Int4 s_off)1975 MegaBlastSaveExactMatch(BlastSearchBlkPtr search, Int4 q_off, Int4 s_off)
1976 {
1977    MegaBlastExactMatchPtr match_array;
1978    Int4 num, num_avail;
1979 
1980    if (!search || !search->wfp || !search->wfp->lookup ||
1981        !search->wfp->lookup->mb_lt)
1982       return FALSE;
1983 
1984    num = search->current_hitlist->hspcnt;
1985    num_avail = search->current_hitlist->exact_match_max;
1986 
1987    match_array = search->current_hitlist->exact_match_array;
1988    if (num>=num_avail) {
1989       if (search->current_hitlist->do_not_reallocate)
1990          return FALSE;
1991       num_avail *= 2;
1992       match_array = (MegaBlastExactMatchPtr)
1993          Realloc(match_array, num_avail*sizeof(MegaBlastExactMatch));
1994       if (!match_array) {
1995          ErrPostEx(SEV_WARNING, 0, 0, "UNABLE to reallocate in MegaBlastSaveExactMatch for ordinal id %ld, continuing with fixed array of %ld HSP's", (long) search->subject_id, num_avail);
1996          search->current_hitlist->do_not_reallocate = TRUE;
1997          return FALSE;
1998       } else {
1999          search->current_hitlist->exact_match_max = num_avail;
2000          search->current_hitlist->exact_match_array = match_array;
2001       }
2002    }
2003 
2004    match_array[num].q_off = q_off;
2005    match_array[num].s_off = s_off;
2006    search->current_hitlist->hspcnt++;
2007    return TRUE;
2008 }
2009 
2010 #define MB_TEMPLATE_LENGTH 18
2011 
2012 #ifndef BUFFER_LENGTH
2013 #define BUFFER_LENGTH 255
2014 #endif
2015 #define MAX_LINE 48
2016 
2017 /* Discontiguous words, database scanned 4 bases at a time */
2018 
2019 static Int4
MegaBlastWordFinder_disc(BlastSearchBlkPtr search,LookupTablePtr lookup)2020 MegaBlastWordFinder_disc(BlastSearchBlkPtr search, LookupTablePtr lookup)
2021 {
2022    register Uint1Ptr subject;
2023    register Int4 s_off, ecode, mask, q_off, extra_code;
2024    Int4 ecode1, ecode2;
2025    Int4 subj_length = search->subject->length;
2026    Int4 min_hit_size;
2027    PV_ARRAY_TYPE *pv_array = lookup->pv_array;
2028    Int4 pv_array_bts;
2029    Int4 bit0, no_bit0;
2030    Int2 word_length;
2031 #ifdef USE_HASH_TABLE
2032    Uint1Ptr hash_buf;
2033    Int4 hash_shift, hash_mask, crc, size;
2034 #endif
2035    MegaBlastParameterBlkPtr mb_params = search->pbp->mb_params;
2036    Int2 template_length = mb_params->template_length;
2037    MBTemplateType template_type = mb_params->template_type;
2038    Boolean use_two_templates = mb_params->use_two_templates;
2039 
2040    min_hit_size = lookup->mb_lt->lpm;
2041    if (search->pbp->window_size > 0)
2042       min_hit_size += 4;
2043 
2044    if (mb_params->is_neighboring &&
2045        subj_length < MIN_NEIGHBOR_HSP_LENGTH)
2046       return 0;
2047    if (search->current_hitlist == NULL)
2048       search->current_hitlist = BlastHitListNew(search);
2049 
2050    if (mb_params->word_weight == 11) {
2051       no_bit0 = 0x007fffff;
2052       bit0 = 0x00800000;
2053    } else {
2054       no_bit0 = 0xffffffff;
2055       bit0 = 0;
2056    }
2057 
2058 #ifdef USE_HASH_TABLE
2059    for (hash_shift=-1,size=lookup->mb_lt->hashsize; size;
2060         size=size>>1,hash_shift++);
2061    hash_shift = (32 - hash_shift)/2;
2062    hash_mask = lookup->mb_lt->hashsize - 1;
2063 #endif
2064 
2065    word_length = (lookup->mb_lt->width + 1) * 4;
2066    if (template_length > word_length) {
2067       subj_length -= template_length - word_length;
2068    }
2069 
2070    mask = lookup->mb_lt->mask;
2071    subject = search->subject->sequence - 1;
2072    ecode = 0;
2073    if (lookup->mb_lt->estack)
2074       MemSet(lookup->mb_lt->stack_index, 0,
2075              lookup->mb_lt->num_stacks*sizeof(Int4));
2076 
2077 #ifdef USE_HASH_TABLE
2078    pv_array_bts = PV_ARRAY_BTS;
2079 #else
2080    pv_array_bts = PV_ARRAY_BTS + ((lookup->mb_lt->width < 3) ? 0 : 5);
2081 #endif
2082 
2083    for (s_off = 0; s_off < lookup->mb_lt->width*4; s_off += 4) {
2084       ecode = (ecode << 8) + *++subject;
2085    }
2086 
2087    s_off += 4;
2088    if (template_length == 16) {
2089       while (s_off<=subj_length) {
2090          ecode = ((ecode & mask) << 8) + *++subject;
2091          if (template_type == TEMPL_11_16)
2092             ecode1 = GET_WORD_INDEX_11_16(ecode) & no_bit0;
2093          else if (template_type == TEMPL_12_16)
2094             ecode1 = GET_WORD_INDEX_12_16(ecode) & no_bit0;
2095          else if (template_type == TEMPL_11_16_OPT)
2096             ecode1 = GET_WORD_INDEX_11_16_OPT(ecode) & no_bit0;
2097          else if (template_type == TEMPL_12_16_OPT)
2098             ecode1 = GET_WORD_INDEX_12_16_OPT(ecode) & no_bit0;
2099 #ifdef USE_HASH_TABLE /* Using hash table */
2100          hash_buf = (Uint1Ptr)&ecode1;
2101          CRC32(crc, hash_buf);
2102          ecode1 = (crc>>hash_shift) & hash_mask;
2103 #endif
2104          if (use_two_templates) {
2105             if (template_type == TEMPL_11_16)
2106                ecode2 = GET_WORD_INDEX_11_16_OPT(ecode) | bit0;
2107             else if (template_type == TEMPL_12_16)
2108                ecode2 = GET_WORD_INDEX_12_16_OPT(ecode) | bit0;
2109          }
2110 #ifdef USE_HASH_TABLE /* Using hash table */
2111          hash_buf = (Uint1Ptr)&ecode2;
2112          CRC32(crc, hash_buf);
2113          ecode2 = (crc>>hash_shift) & hash_mask;
2114 #endif
2115          if (pv_array) {
2116             while ((s_off <= subj_length) && ((pv_array[ecode1>>pv_array_bts]&
2117                      (((PV_ARRAY_TYPE) 1)<<(ecode1&PV_ARRAY_MASK))) == 0)
2118                    && (!use_two_templates || ((pv_array[ecode2>>pv_array_bts]&
2119                      (((PV_ARRAY_TYPE) 1)<<(ecode2&PV_ARRAY_MASK))) == 0))) {
2120                ecode = ((ecode & mask) << 8) + *++subject;
2121                if (template_type == TEMPL_11_16)
2122                   ecode1 = GET_WORD_INDEX_11_16(ecode) & no_bit0;
2123                else if (template_type == TEMPL_12_16)
2124                   ecode1 = GET_WORD_INDEX_12_16(ecode) & no_bit0;
2125                else if (template_type == TEMPL_11_16_OPT)
2126                   ecode1 = GET_WORD_INDEX_11_16_OPT(ecode) & no_bit0;
2127                else if (template_type == TEMPL_12_16_OPT)
2128                   ecode1 = GET_WORD_INDEX_12_16_OPT(ecode) & no_bit0;
2129 #ifdef USE_HASH_TABLE
2130                hash_buf = (Uint1Ptr)&ecode1;
2131                CRC32(crc, hash_buf);
2132                ecode1 = (crc>>hash_shift) & hash_mask;
2133 #endif
2134 
2135                if (use_two_templates) {
2136                   if (template_type == TEMPL_11_16)
2137                      ecode2 = GET_WORD_INDEX_11_16_OPT(ecode) | bit0;
2138                   else if (template_type == TEMPL_12_16)
2139                      ecode2 = GET_WORD_INDEX_12_16_OPT(ecode) | bit0;
2140                }
2141 #ifdef USE_HASH_TABLE
2142                hash_buf = (Uint1Ptr)&ecode2;
2143                CRC32(crc, hash_buf);
2144                ecode2 = (crc>>hash_shift) & hash_mask;
2145 #endif
2146                s_off += 4;
2147             }
2148             if (s_off > subj_length)
2149                break;
2150          }
2151          for (q_off = lookup->mb_lt->hashtable[ecode1]; q_off>0;
2152               q_off = lookup->mb_lt->next_pos[q_off]) {
2153             search->second_pass_hits++;
2154             MegaBlastExtendHit(search, lookup, s_off, q_off);
2155          }
2156          if (use_two_templates) {
2157             for (q_off = lookup->mb_lt->hashtable2[ecode2]; q_off>0;
2158                  q_off = lookup->mb_lt->next_pos2[q_off]) {
2159                search->second_pass_hits++;
2160                MegaBlastExtendHit(search, lookup, s_off, q_off);
2161             }
2162          }
2163          s_off += 4;
2164       }
2165 
2166    } else { /* template length > 16 */
2167       extra_code = 0;
2168       while (s_off<=subj_length) {
2169          ecode = ((ecode & mask) << 8) + *++subject;
2170          switch (template_type) {
2171          case TEMPL_11_18:
2172             extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18(subject);
2173             ecode1 = (GET_WORD_INDEX_11_18(ecode) | extra_code) & no_bit0;
2174             break;
2175          case TEMPL_12_18:
2176             extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18(subject);
2177             ecode1 = (GET_WORD_INDEX_12_18(ecode) | extra_code) & no_bit0;
2178             break;
2179          case TEMPL_11_18_OPT:
2180             extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2181             ecode1 = (GET_WORD_INDEX_11_18_OPT(ecode) | extra_code) & no_bit0;
2182             break;
2183          case TEMPL_12_18_OPT:
2184             extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2185             ecode1 = (GET_WORD_INDEX_12_18_OPT(ecode) | extra_code) & no_bit0;
2186             break;
2187          case TEMPL_11_21:
2188             extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21(subject);
2189             ecode1 = (GET_WORD_INDEX_11_21(ecode) | extra_code) & no_bit0;
2190             break;
2191          case TEMPL_12_21:
2192             extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21(subject);
2193             ecode1 = (GET_WORD_INDEX_12_21(ecode) | extra_code) & no_bit0;
2194             break;
2195          case TEMPL_11_21_OPT:
2196             extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2197             ecode1 = (GET_WORD_INDEX_11_21_OPT(ecode) | extra_code) & no_bit0;
2198             break;
2199          case TEMPL_12_21_OPT:
2200             extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2201             ecode1 = (GET_WORD_INDEX_12_21_OPT(ecode) | extra_code) & no_bit0;
2202             break;
2203          default:
2204             extra_code = 0;
2205             ecode1 = 0;
2206             break;
2207          }
2208 #ifdef USE_HASH_TABLE
2209          hash_buf = (Uint1Ptr)&ecode1;
2210          CRC32(crc, hash_buf);
2211          ecode1 = (crc>>hash_shift) & hash_mask;
2212 #endif
2213 
2214          if (use_two_templates) {
2215             switch (template_type) {
2216             case TEMPL_11_18:
2217                extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2218                ecode2 = GET_WORD_INDEX_11_18_OPT(ecode) |
2219                   extra_code | bit0;
2220                break;
2221             case TEMPL_12_18:
2222                extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2223                ecode2 = GET_WORD_INDEX_12_18_OPT(ecode) |
2224                   extra_code | bit0;
2225                break;
2226             case TEMPL_11_21:
2227                extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2228                ecode2 = GET_WORD_INDEX_11_21_OPT(ecode) |
2229                   extra_code | bit0;
2230                break;
2231             case TEMPL_12_21:
2232                extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2233                ecode2 = GET_WORD_INDEX_12_21_OPT(ecode) |
2234                   extra_code | bit0;
2235                break;
2236             default:
2237                ecode2 = 0;
2238                break;
2239             }
2240 #ifdef USE_HASH_TABLE
2241             hash_buf = (Uint1Ptr)&ecode2;
2242             CRC32(crc, hash_buf);
2243             ecode2 = (crc>>hash_shift) & hash_mask;
2244 #endif
2245          }
2246          if (pv_array) {
2247             while ((s_off <= subj_length) && ((pv_array[ecode1>>pv_array_bts]&
2248                      (((PV_ARRAY_TYPE) 1)<<(ecode1&PV_ARRAY_MASK))) == 0)
2249                    && (!use_two_templates || ((pv_array[ecode2>>pv_array_bts]&
2250                        (((PV_ARRAY_TYPE) 1)<<(ecode2&PV_ARRAY_MASK))) == 0))) {
2251                ecode = ((ecode & mask) << 8) + *++subject;
2252 
2253                switch (template_type) {
2254                case TEMPL_11_18:
2255                   extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18(subject);
2256                   ecode1 = (GET_WORD_INDEX_11_18(ecode) | extra_code) & no_bit0;
2257                   break;
2258                case TEMPL_12_18:
2259                   extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18(subject);
2260                   ecode1 = (GET_WORD_INDEX_12_18(ecode) | extra_code) & no_bit0;
2261                   break;
2262                case TEMPL_11_18_OPT:
2263                   extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2264                   ecode1 = (GET_WORD_INDEX_11_18_OPT(ecode) | extra_code) & no_bit0;
2265                   break;
2266                case TEMPL_12_18_OPT:
2267                   extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2268                   ecode1 = (GET_WORD_INDEX_12_18_OPT(ecode) | extra_code) & no_bit0;
2269                   break;
2270                case TEMPL_11_21:
2271                   extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21(subject);
2272                   ecode1 = (GET_WORD_INDEX_11_21(ecode) | extra_code) & no_bit0;
2273                   break;
2274                case TEMPL_12_21:
2275                   extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21(subject);
2276                   ecode1 = (GET_WORD_INDEX_12_21(ecode) | extra_code) & no_bit0;
2277                   break;
2278                case TEMPL_11_21_OPT:
2279                   extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2280                   ecode1 = (GET_WORD_INDEX_11_21_OPT(ecode) | extra_code) & no_bit0;
2281                   break;
2282                case TEMPL_12_21_OPT:
2283                   extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2284                   ecode1 = (GET_WORD_INDEX_12_21_OPT(ecode) | extra_code) & no_bit0;
2285                   break;
2286                default:
2287                   extra_code = 0;
2288                   ecode1 = 0;
2289                   break;
2290                }
2291 
2292 #ifdef USE_HASH_TABLE
2293                hash_buf = (Uint1Ptr)&ecode1;
2294                CRC32(crc, hash_buf);
2295                ecode1 = (crc>>hash_shift) & hash_mask;
2296 #endif
2297 
2298                if (use_two_templates) {
2299                   switch (template_type) {
2300                   case TEMPL_11_18:
2301                      extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2302                      ecode2 = GET_WORD_INDEX_11_18_OPT(ecode) |
2303                         extra_code | bit0;
2304                      break;
2305                   case TEMPL_12_18:
2306                      extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_18_OPT(subject);
2307                      ecode2 = GET_WORD_INDEX_12_18_OPT(ecode) |
2308                         extra_code | bit0;
2309                      break;
2310                    case TEMPL_11_21:
2311                      extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2312                      ecode2 = GET_WORD_INDEX_11_21_OPT(ecode) |
2313                         extra_code | bit0;
2314                      break;
2315                   case TEMPL_12_21:
2316                      extra_code = (Int4) GET_EXTRA_CODE_PACKED_4_21_OPT(subject);
2317                      ecode2 = GET_WORD_INDEX_12_21_OPT(ecode) |
2318                         extra_code | bit0;
2319                      break;
2320                  default:
2321                      ecode2 = 0; break;
2322                   }
2323 #ifdef USE_HASH_TABLE
2324                   hash_buf = (Uint1Ptr)&ecode2;
2325                   CRC32(crc, hash_buf);
2326                   ecode2 = (crc>>hash_shift) & hash_mask;
2327 #endif
2328                }
2329                s_off += 4;
2330             }
2331             if (s_off > subj_length)
2332                break;
2333          }
2334          for (q_off = lookup->mb_lt->hashtable[ecode1]; q_off>0;
2335               q_off = lookup->mb_lt->next_pos[q_off]) {
2336             search->second_pass_hits++;
2337             MegaBlastExtendHit(search, lookup, s_off, q_off);
2338          }
2339          if (use_two_templates) {
2340             for (q_off = lookup->mb_lt->hashtable2[ecode2]; q_off>0;
2341                  q_off = lookup->mb_lt->next_pos2[q_off]) {
2342                search->second_pass_hits++;
2343                MegaBlastExtendHit(search, lookup, s_off, q_off);
2344             }
2345          }
2346          s_off += 4;
2347       }
2348    }
2349 
2350    if (!lookup->mb_lt->estack)
2351       BlastExtendWordExit(search);
2352 
2353    search->current_hitlist->do_not_reallocate = FALSE;
2354 
2355    /* Do greedy gapped extension */
2356    if (search->current_hitlist->hspcnt > 0)
2357       if (MegaBlastGappedAlign(search) != 0)
2358          return -1;
2359 
2360    return search->current_hitlist->hspcnt;
2361 }
2362 
2363 /* Discontiguous words, database scanned 1 base at a time */
2364 
2365 #define mask30b 0x3fffffff
2366 #define mask2b  0x00000003
2367 
2368 static Int4
MegaBlastWordFinder_disc_1b(BlastSearchBlkPtr search,LookupTablePtr lookup)2369 MegaBlastWordFinder_disc_1b(BlastSearchBlkPtr search, LookupTablePtr lookup)
2370 {
2371    register Uint1Ptr subject;
2372    register Int4 s_off, ecode, mask, q_off, extra_code;
2373    Int4 ecode1, ecode2;
2374    Int4 subj_length = search->subject->length;
2375    Int4 min_hit_size;
2376    PV_ARRAY_TYPE *pv_array = lookup->pv_array;
2377    Int4 pv_array_bts;
2378    Uint1 bit;
2379    Int4 no_bit0, bit0;
2380    Int2 word_length;
2381 #ifdef USE_HASH_TABLE
2382    Uint1Ptr hash_buf;
2383    Int4 hash_shift, hash_mask, crc, size;
2384 #endif
2385    MegaBlastParameterBlkPtr mb_params = search->pbp->mb_params;
2386    Int2 template_length = mb_params->template_length;
2387    MBTemplateType template_type = mb_params->template_type;
2388    Boolean use_two_templates = mb_params->use_two_templates;
2389 
2390    min_hit_size = lookup->mb_lt->lpm;
2391    if (search->pbp->window_size > 0)
2392       min_hit_size += 4;
2393 
2394    if (mb_params->is_neighboring &&
2395        subj_length < MIN_NEIGHBOR_HSP_LENGTH)
2396       return 0;
2397    if (search->current_hitlist == NULL)
2398       search->current_hitlist = BlastHitListNew(search);
2399 
2400    if (mb_params->word_weight == 11) {
2401       no_bit0  = 0x007fffff;
2402       bit0     = 0x00800000;
2403    } else {
2404       no_bit0 = -1;
2405       bit0 = 0;
2406    }
2407 
2408 
2409 #ifdef USE_HASH_TABLE
2410    for (hash_shift=0,size=lookup->mb_lt->hashsize; size;
2411         size=size>>1,hash_shift++);
2412    hash_shift = (32 - hash_shift)/2;
2413    hash_mask = lookup->mb_lt->hashsize - 1;
2414 #endif
2415 
2416    word_length = (lookup->mb_lt->width + 1) * 4;
2417    if (template_length > word_length) {
2418       subj_length -= template_length - word_length;
2419    }
2420 
2421    mask = lookup->mb_lt->mask;
2422    subject = search->subject->sequence - 1;
2423    ecode = 0;
2424    if (lookup->mb_lt->estack)
2425       MemSet(lookup->mb_lt->stack_index, 0,
2426              lookup->mb_lt->num_stacks*sizeof(Int4));
2427 
2428    pv_array_bts = PV_ARRAY_BTS + ((lookup->mb_lt->width < 3) ? 0 : 5);
2429 
2430    for (s_off = 0; s_off < lookup->mb_lt->width*4; s_off += 4) {
2431       ecode = (ecode << 8) + *++subject;
2432    }
2433 
2434    bit = 3;
2435    s_off++;
2436    if (template_length == word_length) {
2437       while (s_off<=subj_length) {
2438          if (bit == 3) subject++;
2439          ecode = ((ecode & mask30b) << 2) | (((*subject)>>(2*bit)) & mask2b);
2440          if (template_type == TEMPL_11_16)
2441             ecode1 = GET_WORD_INDEX_11_16(ecode) & no_bit0;
2442          else if (template_type == TEMPL_12_16)
2443             ecode1 = GET_WORD_INDEX_12_16(ecode) & no_bit0;
2444          else if (template_type == TEMPL_11_16_OPT)
2445             ecode1 = GET_WORD_INDEX_11_16_OPT(ecode) & no_bit0;
2446          else if (template_type == TEMPL_12_16_OPT)
2447             ecode1 = GET_WORD_INDEX_12_16_OPT(ecode) & no_bit0;
2448 #ifdef USE_HASH_TABLE
2449          hash_buf = (Uint1Ptr)&ecode1;
2450          CRC32(crc, hash_buf);
2451          ecode1 = (crc>>hash_shift) & hash_mask;
2452 #endif
2453 
2454          if (use_two_templates) {
2455             switch (template_type) {
2456             case TEMPL_11_16:
2457                ecode2 = GET_WORD_INDEX_11_16_OPT(ecode) | bit0;
2458                break;
2459             case TEMPL_12_16:
2460                ecode2 = GET_WORD_INDEX_12_16_OPT(ecode) | bit0;
2461                break;
2462             default:
2463                ecode2 = 0; break;
2464             }
2465 #ifdef USE_HASH_TABLE
2466             hash_buf = (Uint1Ptr)&ecode2;
2467             CRC32(crc, hash_buf);
2468             ecode2 = (crc>>hash_shift) & hash_mask;
2469 #endif
2470          }
2471          bit = (bit - 1) & mask2b;
2472 
2473          if (pv_array) {
2474             while ((s_off <= subj_length) && ((pv_array[ecode1>>pv_array_bts]&
2475                       (((PV_ARRAY_TYPE) 1)<<(ecode1&PV_ARRAY_MASK))) == 0)
2476                    && (!use_two_templates || ((pv_array[ecode2>>pv_array_bts]&
2477                         (((PV_ARRAY_TYPE) 1)<<(ecode2&PV_ARRAY_MASK))) == 0))) {
2478                if (bit == 3) subject++;
2479                ecode = ((ecode & mask30b) << 2) |
2480                   (((*subject)>>(2*bit)) & mask2b);
2481                if (template_type == TEMPL_11_16)
2482                   ecode1 = GET_WORD_INDEX_11_16(ecode) & no_bit0;
2483                else if (template_type == TEMPL_12_16)
2484                   ecode1 = GET_WORD_INDEX_12_16(ecode) & no_bit0;
2485                else if (template_type == TEMPL_11_16_OPT)
2486                   ecode1 = GET_WORD_INDEX_11_16_OPT(ecode) & no_bit0;
2487                else if (template_type == TEMPL_12_16_OPT)
2488                   ecode1 = GET_WORD_INDEX_12_16_OPT(ecode) & no_bit0;
2489 #ifdef USE_HASH_TABLE
2490                hash_buf = (Uint1Ptr)&ecode1;
2491                CRC32(crc, hash_buf);
2492                ecode1 = (crc>>hash_shift) & hash_mask;
2493 #endif
2494                if (use_two_templates) {
2495                   switch (template_type) {
2496                   case TEMPL_11_16:
2497                      ecode2 = GET_WORD_INDEX_11_16_OPT(ecode) | bit0;
2498                      break;
2499                   case TEMPL_12_16:
2500                      ecode2 = GET_WORD_INDEX_12_16_OPT(ecode) | bit0;
2501                      break;
2502                   default:
2503                      ecode2 = 0; break;
2504                   }
2505 #ifdef USE_HASH_TABLE
2506                   hash_buf = (Uint1Ptr)&ecode2;
2507                   CRC32(crc, hash_buf);
2508                   ecode2 = (crc>>hash_shift) & hash_mask;
2509 #endif
2510                }
2511                bit = (bit - 1) & mask2b;
2512                s_off++;
2513             }
2514             if (s_off > subj_length)
2515                break;
2516          }
2517          for (q_off = lookup->mb_lt->hashtable[ecode1]; q_off>0;
2518               q_off = lookup->mb_lt->next_pos[q_off]) {
2519             search->second_pass_hits++;
2520             MegaBlastExtendHit(search, lookup, s_off, q_off);
2521          }
2522          if (use_two_templates) {
2523             for (q_off = lookup->mb_lt->hashtable2[ecode2]; q_off>0;
2524                  q_off = lookup->mb_lt->next_pos2[q_off]) {
2525                search->second_pass_hits++;
2526                MegaBlastExtendHit(search, lookup, s_off, q_off);
2527             }
2528          }
2529          s_off++;
2530       }
2531    } else {
2532       Uint1Ptr seqptr;
2533       Uint1 bitval;
2534       Int4 tmpval;
2535       extra_code = 0;
2536       while (s_off<=subj_length) {
2537          if (bit == 3) subject++;
2538          ecode = ((ecode & mask30b) << 2) | (((*subject)>>(2*bit)) & mask2b);
2539          seqptr = subject;
2540          bitval = bit;
2541          extra_code = 0;
2542          tmpval = 0;
2543          switch (template_type) {
2544          case TEMPL_11_18:
2545             GET_EXTRA_CODE_PACKED_18(seqptr, bitval, tmpval, extra_code);
2546             ecode1 = (GET_WORD_INDEX_11_18(ecode) | extra_code) & no_bit0;
2547             break;
2548          case TEMPL_12_18:
2549             GET_EXTRA_CODE_PACKED_18(seqptr, bitval, tmpval, extra_code);
2550             ecode1 = (GET_WORD_INDEX_12_18(ecode) | extra_code) & no_bit0;
2551             break;
2552          case TEMPL_11_18_OPT:
2553             GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2554             ecode1 = (GET_WORD_INDEX_11_18_OPT(ecode) | extra_code) & no_bit0;
2555             break;
2556          case TEMPL_12_18_OPT:
2557             GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2558             ecode1 = (GET_WORD_INDEX_12_18_OPT(ecode) | extra_code) & no_bit0;
2559             break;
2560          case TEMPL_11_21:
2561             GET_EXTRA_CODE_PACKED_21(seqptr, bitval, tmpval, extra_code);
2562             ecode1 = (GET_WORD_INDEX_11_21(ecode) | extra_code) & no_bit0;
2563             break;
2564          case TEMPL_12_21:
2565             GET_EXTRA_CODE_PACKED_21(seqptr, bitval, tmpval, extra_code);
2566             ecode1 = (GET_WORD_INDEX_12_21(ecode) | extra_code) & no_bit0;
2567             break;
2568          case TEMPL_11_21_OPT:
2569             GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2570             ecode1 = (GET_WORD_INDEX_11_21_OPT(ecode) | extra_code) & no_bit0;
2571             break;
2572          case TEMPL_12_21_OPT:
2573             GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2574             ecode1 = (GET_WORD_INDEX_12_21_OPT(ecode) | extra_code) & no_bit0;
2575             break;
2576          default:
2577             extra_code = 0; ecode1 = 0; break;
2578          }
2579 #ifdef USE_HASH_TABLE
2580          hash_buf = (Uint1Ptr)&ecode1;
2581          CRC32(crc, hash_buf);
2582          ecode1 = (crc>>hash_shift) & hash_mask;
2583 #endif
2584          if (use_two_templates) {
2585             seqptr = subject;
2586             bitval = bit;
2587             extra_code = 0;
2588             tmpval = 0;
2589             switch (template_type) {
2590             case TEMPL_11_18:
2591                GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2592                ecode2 = GET_WORD_INDEX_11_18_OPT(ecode) | extra_code | bit0;
2593                break;
2594             case TEMPL_12_18:
2595                GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2596                ecode2 = GET_WORD_INDEX_12_18_OPT(ecode) | extra_code | bit0;
2597                break;
2598             case TEMPL_11_21:
2599                GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2600                ecode2 = GET_WORD_INDEX_11_21_OPT(ecode) | extra_code | bit0;
2601                break;
2602             case TEMPL_12_21:
2603                GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2604                ecode2 = GET_WORD_INDEX_12_21_OPT(ecode) | extra_code | bit0;
2605                break;
2606             default:
2607                ecode2 = 0; break;
2608             }
2609 #ifdef USE_HASH_TABLE
2610             hash_buf = (Uint1Ptr)&ecode2;
2611             CRC32(crc, hash_buf);
2612             ecode2 = (crc>>hash_shift) & hash_mask;
2613 #endif
2614          }
2615          bit = (bit - 1) & mask2b;
2616          if (pv_array) {
2617             while ((s_off <= subj_length) && ((pv_array[ecode1>>pv_array_bts]&
2618                       (((PV_ARRAY_TYPE) 1)<<(ecode1&PV_ARRAY_MASK))) == 0)
2619                    && (!use_two_templates || ((pv_array[ecode2>>pv_array_bts]&
2620                         (((PV_ARRAY_TYPE) 1)<<(ecode2&PV_ARRAY_MASK))) == 0))) {
2621                if (bit == 3) subject++;
2622                ecode = ((ecode & mask30b) << 2) | (((*subject)>>(2*bit)) & mask2b);
2623                seqptr = subject;
2624                bitval = bit;
2625                extra_code = 0;
2626                tmpval = 0;
2627                switch (template_type) {
2628                case TEMPL_11_18:
2629                   GET_EXTRA_CODE_PACKED_18(seqptr, bitval, tmpval, extra_code);
2630                   ecode1 = (GET_WORD_INDEX_11_18(ecode) | extra_code) & no_bit0;
2631                   break;
2632                case TEMPL_12_18:
2633                   GET_EXTRA_CODE_PACKED_18(seqptr, bitval, tmpval, extra_code);
2634                   ecode1 = (GET_WORD_INDEX_12_18(ecode) | extra_code) & no_bit0;
2635                   break;
2636                case TEMPL_11_18_OPT:
2637                   GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2638                   ecode1 = (GET_WORD_INDEX_11_18_OPT(ecode) | extra_code) & no_bit0;
2639                   break;
2640                case TEMPL_12_18_OPT:
2641                   GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2642                   ecode1 = (GET_WORD_INDEX_12_18_OPT(ecode) | extra_code) & no_bit0;
2643                   break;
2644                case TEMPL_11_21:
2645                   GET_EXTRA_CODE_PACKED_21(seqptr, bitval, tmpval, extra_code);
2646                   ecode1 = (GET_WORD_INDEX_11_21(ecode) | extra_code) & no_bit0;
2647                   break;
2648                case TEMPL_12_21:
2649                   GET_EXTRA_CODE_PACKED_21(seqptr, bitval, tmpval, extra_code);
2650                   ecode1 = (GET_WORD_INDEX_12_21(ecode) | extra_code) & no_bit0;
2651                   break;
2652                case TEMPL_11_21_OPT:
2653                   GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2654                   ecode1 = (GET_WORD_INDEX_11_21_OPT(ecode) | extra_code) & no_bit0;
2655                   break;
2656                case TEMPL_12_21_OPT:
2657                   GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2658                   ecode1 = (GET_WORD_INDEX_12_21_OPT(ecode) | extra_code) & no_bit0;
2659                   break;
2660                default:
2661                   extra_code = 0; ecode1 = 0; break;
2662                }
2663 #ifdef USE_HASH_TABLE
2664                hash_buf = (Uint1Ptr)&ecode1;
2665                CRC32(crc, hash_buf);
2666                ecode1 = (crc>>hash_shift) & hash_mask;
2667 #endif
2668                if (use_two_templates) {
2669                   seqptr = subject;
2670                   bitval = bit;
2671                   extra_code = 0;
2672                   tmpval = 0;
2673                   switch (template_type) {
2674                   case TEMPL_11_18:
2675                      GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2676                      ecode2 = GET_WORD_INDEX_11_18_OPT(ecode) | extra_code | bit0;
2677                      break;
2678                   case TEMPL_12_18:
2679                      GET_EXTRA_CODE_PACKED_18_OPT(seqptr, bitval, tmpval, extra_code);
2680                      ecode2 = GET_WORD_INDEX_12_18_OPT(ecode) | extra_code | bit0;
2681                      break;
2682                   case TEMPL_11_21:
2683                      GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2684                      ecode2 = GET_WORD_INDEX_11_21_OPT(ecode) | extra_code | bit0;
2685                      break;
2686                   case TEMPL_12_21:
2687                      GET_EXTRA_CODE_PACKED_21_OPT(seqptr, bitval, tmpval, extra_code);
2688                      ecode2 = GET_WORD_INDEX_12_21_OPT(ecode) | extra_code | bit0;
2689                      break;
2690                   default:
2691                      ecode2 = 0; break;
2692                   }
2693 #ifdef USE_HASH_TABLE
2694                   hash_buf = (Uint1Ptr)&ecode2;
2695                   CRC32(crc, hash_buf);
2696                   ecode2 = (crc>>hash_shift) & hash_mask;
2697 #endif
2698                }
2699                bit = (bit - 1) & mask2b;
2700                s_off++;
2701             }
2702             if (s_off > subj_length)
2703                break;
2704          }
2705          for (q_off = lookup->mb_lt->hashtable[ecode1]; q_off>0;
2706               q_off = lookup->mb_lt->next_pos[q_off]) {
2707             search->second_pass_hits++;
2708             MegaBlastExtendHit(search, lookup, s_off, q_off);
2709          }
2710          if (use_two_templates) {
2711             for (q_off = lookup->mb_lt->hashtable2[ecode2]; q_off>0;
2712                  q_off = lookup->mb_lt->next_pos2[q_off]) {
2713                search->second_pass_hits++;
2714                MegaBlastExtendHit(search, lookup, s_off, q_off);
2715             }
2716          }
2717          s_off++;
2718       }
2719    }
2720    if (!lookup->mb_lt->estack)
2721       BlastExtendWordExit(search);
2722 
2723    search->current_hitlist->do_not_reallocate = FALSE;
2724 
2725    /* Do greedy gapped extension */
2726    if (search->current_hitlist->hspcnt > 0)
2727       if (MegaBlastGappedAlign(search) != 0)
2728          return -1;
2729 
2730    if (search->current_hitlist)
2731       return search->current_hitlist->hspcnt;
2732    else return 0;
2733 }
2734 
2735 /* Contiguous words, database scanned 4 bases at a time */
2736 
2737 Int4
MegaBlastWordFinder(BlastSearchBlkPtr search,LookupTablePtr lookup)2738 MegaBlastWordFinder(BlastSearchBlkPtr search, LookupTablePtr lookup)
2739 {
2740    register Uint1Ptr subject;
2741    register Int4 s_off, ecode, mask, q_off;
2742    Int4 subj_length = search->subject->length;
2743    Int4 min_hit_size;
2744    PV_ARRAY_TYPE *pv_array = lookup->pv_array;
2745    Int4 pv_array_bts;
2746 
2747    if (search->pbp->mb_params->disc_word) {
2748       if (search->pbp->mb_params->one_base_step)
2749          return MegaBlastWordFinder_disc_1b(search, lookup);
2750       else
2751          return MegaBlastWordFinder_disc(search, lookup);
2752    }
2753 
2754    min_hit_size = lookup->mb_lt->lpm;
2755    if (search->pbp->window_size > 0)
2756       min_hit_size += 4;
2757 
2758    if (search->pbp->mb_params->is_neighboring &&
2759        subj_length < MIN_NEIGHBOR_HSP_LENGTH)
2760       return 0;
2761    if (search->current_hitlist == NULL)
2762       search->current_hitlist = BlastHitListNew(search);
2763 
2764    mask = lookup->mb_lt->mask;
2765    subject = search->subject->sequence - 1;
2766    ecode = 0;
2767    if (lookup->mb_lt->estack)
2768       MemSet(lookup->mb_lt->stack_index, 0,
2769              lookup->mb_lt->num_stacks*sizeof(Int4));
2770 
2771    pv_array_bts = PV_ARRAY_BTS + ((lookup->mb_lt->width < 3) ? 0 : 5);
2772 
2773    for (s_off = 0; s_off < (lookup->mb_lt->width - 1)*4; s_off += 4) {
2774       ecode = (ecode << 8) + *++subject;
2775    }
2776 
2777    s_off += 4;
2778    while (s_off<=subj_length) {
2779       if (pv_array) {
2780          ecode = ((ecode & mask) << 8) + *++subject;
2781          while ((s_off < subj_length) && ((pv_array[ecode>>pv_array_bts]&
2782                  (((PV_ARRAY_TYPE) 1)<<(ecode&PV_ARRAY_MASK))) == 0)) {
2783             ecode = ((ecode & mask) << 8) + *++subject;
2784             s_off += 4;
2785          }
2786          if (s_off > subj_length)
2787             break;
2788       } else {
2789          ecode = ((ecode & mask) << 8) + *++subject;
2790       }
2791       for (q_off = lookup->mb_lt->hashtable[ecode]; q_off>0;
2792 	   q_off = lookup->mb_lt->next_pos[q_off]) {
2793          search->second_pass_hits++;
2794 	 MegaBlastExtendHit(search, lookup, s_off, q_off);
2795       }
2796       s_off += 4;
2797    }
2798 
2799    if (!lookup->mb_lt->estack)
2800       BlastExtendWordExit(search);
2801 
2802    search->current_hitlist->do_not_reallocate = FALSE;
2803    /* Do greedy gapped extension */
2804    if (search->current_hitlist->hspcnt > 0)
2805       if (MegaBlastGappedAlign(search) != 0)
2806          return -1;
2807 
2808    if (search->current_hitlist)
2809       return search->current_hitlist->hspcnt;
2810    else return 0;
2811 }
2812 
2813 static int LIBCALLBACK
diag_compare_match(VoidPtr v1,VoidPtr v2)2814 diag_compare_match(VoidPtr v1, VoidPtr v2)
2815 {
2816    MegaBlastExactMatchPtr h1, h2;
2817 
2818    h1 = *((MegaBlastExactMatchPtr PNTR) v1);
2819    h2 = *((MegaBlastExactMatchPtr PNTR) v2);
2820 
2821    return (h1->q_off - h1->s_off) - (h2->q_off - h2->s_off);
2822 }
2823 
2824 #define MB_MAX_LENGTH_TO_UNPACK 400
MegaBlastGappedAlign(BlastSearchBlkPtr search)2825 Int2 MegaBlastGappedAlign(BlastSearchBlkPtr search)
2826 {
2827    MegaBlastExactMatchPtr PNTR e_hsp_array;
2828    MegaBlastExactMatchPtr e_hsp;
2829    Int4 index, i, hspcnt, buf_len=0;
2830    Uint1Ptr subject0 = NULL;
2831    Boolean delete_hsp;
2832    GapAlignBlkPtr gap_align;
2833    Boolean use_dyn_prog = search->pbp->mb_params->use_dyn_prog;
2834 
2835    hspcnt = search->current_hitlist->hspcnt;
2836 
2837    if (hspcnt == 0)
2838      return 0;
2839 
2840    /* Make current hitlist available for rewriting of extended hsps
2841       without freeing the hsp_array since it's used in this function */
2842    search->current_hitlist->hspcnt = 0;
2843 
2844    e_hsp_array = (MegaBlastExactMatchPtr PNTR)
2845       Malloc(hspcnt*sizeof(MegaBlastExactMatchPtr));
2846 
2847    if (e_hsp_array == NULL)
2848       return 1;
2849 
2850    for (index=0; index < hspcnt; index++) {
2851       e_hsp_array[index] =
2852          &(search->current_hitlist->exact_match_array[index]);
2853    }
2854 
2855    if (use_dyn_prog || (search->rdfp && search->subject->length > MB_MAX_LENGTH_TO_UNPACK)) {
2856       subject0 = search->subject->sequence;
2857    } else {
2858       /* Get subject sequence in Seq_code_ncbi4na format with all
2859          ambiguities information.
2860          If database sequence is split, do it only once and save in
2861          search->subject->sequence_start.
2862          For the 2 sequences engine this is done beforehand.
2863       */
2864       if (search->subject->sequence_start == NULL && search->rdfp) {
2865          readdb_get_sequence_ex(search->rdfp, search->subject_id,
2866                                 &search->subject->sequence_start,
2867                                 &buf_len, TRUE);
2868       }
2869       subject0 =
2870          (&search->subject->sequence_start[search->subject->original_length])
2871          + 1;
2872    }
2873 
2874    HeapSort(e_hsp_array, hspcnt, sizeof(MegaBlastExactMatchPtr), diag_compare_match);
2875    if (use_dyn_prog) {
2876       if (search->gap_align == NULL) {
2877          search->gap_align = GapAlignBlkNew(1, 1);
2878          search->gap_align->dyn_prog = (GapXDPPtr)
2879             Malloc((search->context[search->first_context].query->length + 2)
2880                    *sizeof(GapXDP));
2881       }
2882       gap_align = search->gap_align;
2883       gap_align->gap_open = search->pbp->gap_open;
2884       gap_align->gap_extend = search->pbp->gap_extend;
2885       gap_align->x_parameter = search->pbp->gap_x_dropoff;
2886       gap_align->matrix = search->sbp->matrix;
2887       gap_align->query =
2888          search->context[search->first_context].query->sequence;
2889       gap_align->subject = subject0;
2890       /* Subtract 1 because the concatenated length includes the last
2891          sentinel byte */
2892       gap_align->query_length =
2893          search->context[search->first_context].query->length - 1;
2894       gap_align->subject_length = search->subject->length;
2895    }
2896 
2897    for (index=0; index<hspcnt; index++) {
2898       e_hsp = e_hsp_array[index];
2899       delete_hsp = FALSE;
2900       for (i = search->current_hitlist->hspcnt-1;
2901 	   i >= 0 && MB_HSP_CLOSE(e_hsp->q_off, search->current_hitlist->hsp_array[i]->query.offset, e_hsp->s_off, search->current_hitlist->hsp_array[i]->subject.offset, MB_DIAG_NEAR);
2902 	   i--) {
2903 	if (MB_HSP_CONTAINED(e_hsp->q_off, search->current_hitlist->hsp_array[i]->query.offset, search->current_hitlist->hsp_array[i]->query.end, e_hsp->s_off, search->current_hitlist->hsp_array[i]->subject.offset, search->current_hitlist->hsp_array[i]->subject.end, MB_DIAG_CLOSE)) {
2904 	  delete_hsp = TRUE;
2905 	  break;
2906 	}
2907       }
2908       if (!delete_hsp) {
2909          if ((search->pbp->mb_params->disc_word &&
2910              (search->pbp->window_size == 0))) {
2911             delete_hsp = BlastNtWordUngappedExtend(search, e_hsp->q_off,
2912                               e_hsp->s_off, search->pbp->cutoff_s2);
2913          }
2914          if (!delete_hsp) {
2915             search->second_pass_extends++;
2916 
2917             if (use_dyn_prog) {
2918                Uint1 rem = e_hsp->s_off & 0x03;
2919                gap_align->q_start = e_hsp->q_off - 1 - rem;
2920                gap_align->s_start = e_hsp->s_off - 1 - rem;
2921                gap_align->decline_align = INT2_MAX;
2922                if (!PerformNtGappedAlignment(gap_align))
2923                   return 1;
2924                if (gap_align->score >= search->pbp->cutoff_s2) {
2925                   BlastNtSaveCurrentHspGapped(search, gap_align->score,
2926                       gap_align->query_start, gap_align->subject_start,
2927                       gap_align->query_stop-gap_align->query_start+1,
2928                       gap_align->subject_stop-gap_align->subject_start+1,
2929                                               e_hsp->q_off, e_hsp->s_off, search->first_context, NULL);
2930                }
2931             } else
2932                MegaBlastNtWordExtend(search, subject0, e_hsp->q_off,
2933                                      e_hsp->s_off);
2934          }
2935       }
2936    }
2937 
2938    MemFree(e_hsp_array);
2939    return 0;
2940 }
2941 
2942 Int4
MegaBlastExtendHit(BlastSearchBlkPtr search,LookupTablePtr lookup,Int4 s_off,Int4 q_off)2943 MegaBlastExtendHit(BlastSearchBlkPtr search, LookupTablePtr lookup,
2944 		   Int4 s_off, Int4 q_off) {
2945    Int4 index, len = 4*lookup->mb_lt->width;
2946    Int4 index1, step;
2947    Int2 min_hit_length;
2948    MbStackPtr estack;
2949    Int4 diag, stack_top;
2950    BLAST_ExtendWordParamsPtr ewp_params = search->ewp_params;
2951    register Int4 window;
2952    Int4 step_unit;
2953    Boolean one_word, hit_ready, two_hits;
2954 
2955    window = search->pbp->window_size + len;
2956    two_hits = (search->pbp->window_size > 0);
2957 
2958    if (search->pbp->mb_params->one_base_step)
2959       step_unit = 1;
2960    else
2961       step_unit = 4;
2962 
2963    min_hit_length = lookup->mb_lt->lpm;
2964 
2965    /* For short query/db stacks are not used;
2966       instead use a diagonal array */
2967    if (!lookup->mb_lt->estack) {
2968       CfjModStruct *combo_array =
2969          search->context[search->first_context].ewp->combo_array;
2970       CfjModStruct *combo_array_elem;
2971       register Int4 min_diag_mask;
2972       Int4 offset, s_pos;
2973 
2974       min_diag_mask = ewp_params->min_diag_mask;
2975       offset = ewp_params->offset;
2976 
2977       s_pos = s_off + offset;
2978       diag = (s_off - q_off) & min_diag_mask;
2979       combo_array_elem = &combo_array[diag];
2980       step = s_pos - combo_array_elem->last_hit;
2981 
2982       one_word = (combo_array_elem->diag_level >= min_hit_length);
2983       hit_ready = ((!two_hits && one_word) ||
2984                     combo_array_elem->diag_level > min_hit_length);
2985 
2986       if (two_hits && step < mb_two_hit_min_step && step > step_unit
2987           && one_word) {
2988          /* Words too close are not allowed */
2989          return 0;
2990       } else if (step == step_unit ||
2991                (step > step_unit && step <= window
2992                 && one_word && !hit_ready)) {
2993          /* We had a hit on this diagonal that is extended by current hit */
2994          combo_array_elem->last_hit = s_pos;
2995          combo_array_elem->diag_level += step;
2996          /* Save the hit immediately if:
2997                For single-hit case - it reached the word size
2998                For double-hit case - hit is longer than word size, or
2999                2 words have already been found, and there is another gap.
3000             If hit is longer than the required minimum, it must have already
3001             been saved earlier.
3002          */
3003          if (!hit_ready && ((combo_array_elem->diag_level > min_hit_length) ||
3004               (!two_hits && combo_array_elem->diag_level == min_hit_length)))
3005          {
3006             MegaBlastSaveExactMatch(search, q_off, s_off);
3007          }
3008       } else {
3009          /* Start the new hit, overwriting the diagonal array entry */
3010          combo_array_elem->last_hit = s_pos;
3011          combo_array_elem->diag_level = len;
3012          /* Save the hit if it already qualifies */
3013          if (!two_hits && (len >= min_hit_length))
3014             MegaBlastSaveExactMatch(search, q_off, s_off);
3015       }
3016       return 0;
3017    }
3018 
3019    index1 = (s_off - q_off) % lookup->mb_lt->num_stacks;
3020    if (index1<0)
3021       index1 += lookup->mb_lt->num_stacks;
3022    estack = lookup->mb_lt->estack[index1];
3023    stack_top = lookup->mb_lt->stack_index[index1] - 1;
3024 
3025    for (index=0; index<=stack_top; ) {
3026       step = s_off - estack[index].level;
3027       one_word = (estack[index].length >= min_hit_length);
3028       if (estack[index].diag == s_off - q_off) {
3029          hit_ready = ((!two_hits && one_word) ||
3030                               estack[index].length > min_hit_length);
3031          if (two_hits && step < mb_two_hit_min_step && step > step_unit
3032              && one_word) {
3033             lookup->mb_lt->stack_index[index1] = stack_top + 1;
3034             /* Skip words that are too close to each other (experimental) */
3035             return 0;
3036          } else if (step == step_unit ||
3037                   (step > step_unit && step <= window &&
3038                    one_word && !hit_ready)) {
3039             estack[index].length += step;
3040             /* We had a hit on this diagonal that is extended by current hit */
3041             /* Save the hit if:
3042                For single-hit case - it has just reached the word size
3043                For double-hit case - second hit has just been found,
3044                   after the first reached the word size.
3045                Otherwise, just update the hit length and offset.
3046             */
3047             if (!hit_ready && (estack[index].length > min_hit_length ||
3048                 (!two_hits && estack[index].length == min_hit_length))) {
3049                MegaBlastSaveExactMatch(search, q_off, s_off);
3050             }
3051             estack[index].level = s_off;
3052          } else {
3053             /* Start the new hit */
3054             estack[index].length = len;
3055             estack[index].level = s_off;
3056             if (!two_hits && (len >= min_hit_length))
3057                MegaBlastSaveExactMatch(search, q_off, s_off);
3058          }
3059          lookup->mb_lt->stack_index[index1] = stack_top + 1;
3060          return 0;
3061       } else if (step <= step_unit || (step <= window && one_word)) {
3062          /* Hit from a different diagonal, and it can potentially continue */
3063          index++;
3064       } else {
3065          /* Hit from a different diagonal that does not continue: remove it
3066             from the stack */
3067          estack[index] = estack[stack_top--];
3068       }
3069    }
3070 
3071    /* Need an extra slot on the stack for this hit */
3072    if (++stack_top>=lookup->mb_lt->stack_size[index1]) {
3073       /* Stack about to overflow - reallocate memory */
3074       MbStackPtr ptr;
3075       if (!(ptr = (MbStackPtr)Realloc(estack,
3076                                       2*lookup->mb_lt->stack_size[index1]*sizeof(MbStack)))) {
3077          ErrPostEx(SEV_WARNING, 0, 0, "Unable to allocate %ld extra spaces for stack %d",
3078                    2*lookup->mb_lt->stack_size[index1], index1);
3079          return 1;
3080       } else {
3081          lookup->mb_lt->stack_size[index1] *= 2;
3082          estack = lookup->mb_lt->estack[index1] = ptr;
3083       }
3084    }
3085    estack[stack_top].diag = s_off - q_off;
3086    estack[stack_top].level = s_off;
3087    estack[stack_top].length = len;
3088    lookup->mb_lt->stack_index[index1] = stack_top + 1;
3089    if (!two_hits && (len >= min_hit_length))
3090       MegaBlastSaveExactMatch(search, q_off, s_off);
3091 
3092    return 0;
3093 }
3094 
3095 /* subject sequence is compressed 4:1 */
3096 Int2
MegaBlastNtWordExtend(BlastSearchBlkPtr search,Uint1Ptr subject0,Int4 q_off,Int4 s_off)3097 MegaBlastNtWordExtend(BlastSearchBlkPtr search, Uint1Ptr subject0,
3098 		      Int4 q_off, Int4 s_off)
3099 {
3100 	register Uint1Ptr	q;
3101 	register BLAST_Score    score;
3102 	Uint1Ptr query0, s;
3103 	BLAST_Score	X;
3104         BLAST_ParameterBlkPtr   pbp;
3105         BLAST_ScoreBlkPtr       sbp;
3106 	Int4 q_avail, s_avail;
3107 	/* Variables for the call to MegaBlastGreedyAlign */
3108 	Int4 q_ext_l, q_ext_r, s_ext_l, s_ext_r;
3109 	edit_script_t *ed_script_fwd=NULL, *ed_script_rev=NULL;
3110 	Boolean good_hit;
3111         Uint1 rem;
3112         GapXEditScriptPtr esp = NULL;
3113 
3114 	good_hit = TRUE;
3115         sbp=search->sbp;
3116         pbp=search->pbp;
3117 
3118 	query0 = (Uint1Ptr) search->context[search->first_context].query->sequence;
3119         q_avail = search->context[search->first_context].query->length - q_off;
3120         s_avail = search->subject->length - s_off;
3121 
3122 	q = query0 + q_off;
3123         if (!search->rdfp || search->subject->length <= MB_MAX_LENGTH_TO_UNPACK) {
3124            s = subject0 + s_off;
3125            rem = 4;
3126         } else {
3127            s = subject0 + s_off/4;
3128            rem = s_off % 4;
3129         }
3130 
3131 	/* Find where positive scoring starts & ends within the word hit */
3132 	score = 0;
3133 
3134 	X = pbp->gap_x_dropoff;
3135 
3136 	if (!search->pbp->mb_params->no_traceback) {
3137 	   ed_script_fwd = edit_script_new();
3138 	   ed_script_rev = edit_script_new();
3139 	}
3140 
3141 	/* extend to the right */
3142 	score = MegaBlastAffineGreedyAlign(s, s_avail, q, q_avail, FALSE, X,
3143 					   sbp->reward, -sbp->penalty,
3144 					   pbp->gap_open, pbp->gap_extend,
3145 					   &s_ext_r, &q_ext_r, search->abmp,
3146 					   ed_script_fwd, rem);
3147 	if (search->rdfp && search->subject->length > MB_MAX_LENGTH_TO_UNPACK)
3148            rem = 0;
3149 	/* extend to the left */
3150 	score += MegaBlastAffineGreedyAlign(subject0, s_off, query0, q_off,
3151 					    TRUE, X, sbp->reward,
3152 					    -sbp->penalty, pbp->gap_open,
3153 					    pbp->gap_extend, &s_ext_l,
3154 					    &q_ext_l, search->abmp,
3155 					    ed_script_rev, rem);
3156 	/* For neighboring we have a stricter criterion to keep an HSP */
3157 	if (search->pbp->mb_params->is_neighboring) {
3158 	   FloatHi perc_identity;
3159 	   Int4 hsp_length;
3160 
3161 	   hsp_length = MIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
3162 	   perc_identity = 100 * (1 - ((FloatHi) score) / hsp_length);
3163 	   if (hsp_length < MIN_NEIGHBOR_HSP_LENGTH ||
3164 	       perc_identity < MIN_NEIGHBOR_PERC_IDENTITY)
3165 	      good_hit = FALSE;
3166 	}
3167 
3168 	/* In basic case the greedy algorithm returns number of
3169 	   differences, hence we need to convert it to score */
3170 	if (pbp->gap_open==0 && pbp->gap_extend==0)
3171 	   score =
3172 	      (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*sbp->reward/2 -
3173 	      score*(sbp->reward - sbp->penalty);
3174 	else if (sbp->reward % 2 == 1)
3175 	   score /= 2;
3176 
3177 	if (good_hit && score >= pbp->cutoff_s2) { /* Score is reportable */
3178 	   if (!search->pbp->mb_params->no_traceback) {
3179 	      edit_script_append(ed_script_rev, ed_script_fwd);
3180               esp = MBToGapXEditScript(ed_script_rev);
3181 	   }
3182 
3183 	   search->second_pass_good_extends++;
3184 
3185 	   BlastNtSaveCurrentHspGapped(search, score, q_off-q_ext_l,
3186 				     s_off-s_ext_l, q_ext_l+q_ext_r,
3187 				     s_ext_l+s_ext_r, q_off, s_off,
3188                                      search->first_context, esp);
3189 	}
3190         edit_script_free(ed_script_fwd);
3191         edit_script_free(ed_script_rev);
3192 
3193 	return 0;
3194 }
3195 
3196 #define NO_UNPACK_MBALIGN_ARG 4
PerformGreedyAlignmentWithTraceback(GapAlignBlkPtr gap_align,GreedyAlignMemPtr gamp,BLAST_ScoreBlkPtr sbp)3197 void PerformGreedyAlignmentWithTraceback(GapAlignBlkPtr gap_align,
3198         GreedyAlignMemPtr gamp, BLAST_ScoreBlkPtr sbp)
3199 {
3200     Int4 score, s_ext_r, q_ext_r, s_ext_l, q_ext_l;
3201     Uint1Ptr query0, subject0, query, subject;
3202     Int4 X = gap_align->x_parameter;
3203     Int4 reward = sbp->reward;
3204     Int4 penalty = -sbp->penalty;
3205     Int4 q_off, s_off, q_avail, s_avail;
3206     edit_script_t *ed_script_fwd=NULL, *ed_script_rev=NULL;
3207     GapXEditScriptPtr esp = NULL;
3208 
3209     query0 = gap_align->query;
3210     subject0 = gap_align->subject;
3211     q_off = gap_align->q_start;
3212     s_off = gap_align->s_start;
3213     query = query0 + q_off;
3214     subject = subject0 + s_off;
3215     q_avail = gap_align->query_length - q_off;
3216     s_avail = gap_align->subject_length - s_off;
3217     ed_script_fwd = edit_script_new();
3218     ed_script_rev = edit_script_new();
3219 
3220     /* extend to the right */
3221     score = MegaBlastAffineGreedyAlign(subject, s_avail, query, q_avail,
3222                FALSE, X, reward, penalty, gap_align->gap_open,
3223                gap_align->gap_extend, &s_ext_r, &q_ext_r, gamp,
3224                ed_script_fwd, NO_UNPACK_MBALIGN_ARG);
3225     /* extend to the left */
3226     score += MegaBlastAffineGreedyAlign(subject0, s_off, query0, q_off,
3227                 TRUE, X, reward, penalty, gap_align->gap_open,
3228                 gap_align->gap_extend, &s_ext_l, &q_ext_l, gamp,
3229                 ed_script_rev, NO_UNPACK_MBALIGN_ARG);
3230 
3231     /* In basic case the greedy algorithm returns number of
3232        differences, hence we need to convert it to score */
3233     if (gap_align->gap_open==0 && gap_align->gap_extend==0)
3234        score =
3235           (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*sbp->reward/2 -
3236           score*(sbp->reward - sbp->penalty);
3237     else if (sbp->reward % 2 == 1)
3238        score /= 2;
3239 
3240     edit_script_append(ed_script_rev, ed_script_fwd);
3241     esp = MBToGapXEditScript(ed_script_rev);
3242 
3243     edit_script_free(ed_script_fwd);
3244     edit_script_free(ed_script_rev);
3245 
3246     gap_align->score = score;
3247     gap_align->query_start = q_off - q_ext_l;
3248     gap_align->subject_start = s_off - s_ext_l;
3249     gap_align->query_stop = q_off + q_ext_r;
3250     gap_align->subject_stop = s_off + s_ext_r;
3251     gap_align->edit_block = GapXEditBlockNew(gap_align->query_start, gap_align->subject_start);
3252 
3253     gap_align->edit_block->start1 = gap_align->query_start;
3254     gap_align->edit_block->start2 = gap_align->subject_start;
3255     gap_align->edit_block->length1 = gap_align->query_length;
3256     gap_align->edit_block->length2 = gap_align->subject_length;
3257     gap_align->edit_block->frame1 = gap_align->query_frame;
3258     gap_align->edit_block->frame2 = gap_align->subject_frame;
3259     gap_align->edit_block->reverse = 0;
3260 
3261     gap_align->edit_block->esp = esp;
3262 }
3263 
3264 /* A routine to mask the residues from a mask SeqLoc by changing to a given
3265  * value (mask_residue).
3266  * The buffer should contain an already decoded sequence.
3267  *
3268  */
MegaBlastMaskTheResidues(Uint1Ptr buffer,Int4 max_length,Uint1 mask_residue,SeqLocPtr mask_slp,Boolean reverse,Int4 offset,Boolean mask_at_hash)3269 void MegaBlastMaskTheResidues(Uint1Ptr buffer, Int4 max_length, Uint1
3270 			      mask_residue, SeqLocPtr mask_slp, Boolean reverse,
3271 			      Int4 offset, Boolean mask_at_hash)
3272 
3273 {
3274    SeqLocPtr slp=NULL;
3275    Int4 index, start, stop;
3276 
3277    while (mask_slp) {
3278       slp=NULL;
3279       while((slp = SeqLocFindNext(mask_slp, slp))!=NULL) {
3280 	 if (reverse) {
3281 	    start = MAX(max_length + offset - 1 - SeqLocStop(slp), 0);
3282 	    stop = max_length - 1 - MAX(SeqLocStart(slp) - offset, 0);
3283 	 } else {
3284 	    start = MAX(SeqLocStart(slp) - offset, 0);
3285 	    stop = MIN(SeqLocStop(slp) - offset, max_length - 1);
3286 	 }
3287 
3288 	 if (mask_at_hash)
3289 	    for (index=start; index<=stop; index++)
3290 	       /* Set the 5th bit to mark that this is a masked residue */
3291 	       buffer[index] = buffer[index] | 0x10;
3292 	 else
3293 	    for (index=start; index<=stop; index++)
3294 	       buffer[index] = mask_residue;
3295       }
3296       mask_slp = mask_slp->next;
3297    }
3298 }
3299 
3300 /* The following function can only be used for IUPAC encoding */
3301 void
BlastLCaseMaskTheResidues(Uint1Ptr buffer,Int4 max_length,SeqLocPtr mask_slp,Boolean reverse,Int4 offset)3302 BlastLCaseMaskTheResidues(Uint1Ptr buffer, Int4 max_length, SeqLocPtr mask_slp, Boolean reverse, Int4 offset)
3303 
3304 {
3305    SeqLocPtr slp=NULL;
3306    Int4 index, start, stop;
3307 
3308    while (mask_slp) {
3309       slp=NULL;
3310       while((slp = SeqLocFindNext(mask_slp, slp))!=NULL) {
3311 	 if (reverse) {
3312 	    start = max_length - 1 - SeqLocStop(slp);
3313 	    stop = max_length - 1 - SeqLocStart(slp);
3314 	 }	else {
3315 	    start = SeqLocStart(slp);
3316 	    stop = SeqLocStop(slp);
3317 	 }
3318 
3319 	 start -= offset;
3320 	 stop  -= offset;
3321 
3322 	 for (index=start; index<=stop; index++)
3323 	    buffer[index] = tolower(buffer[index]);
3324       }
3325       mask_slp = mask_slp->next;
3326    }
3327 }
3328 
3329 static int LIBCALLBACK
diag_compare_hsps(VoidPtr v1,VoidPtr v2)3330 diag_compare_hsps(VoidPtr v1, VoidPtr v2)
3331 
3332 {
3333    BLAST_HSPPtr h1, h2;
3334    BLAST_HSPPtr PNTR hp1, PNTR hp2;
3335 
3336    hp1 = (BLAST_HSPPtr PNTR) v1;
3337    hp2 = (BLAST_HSPPtr PNTR) v2;
3338    h1 = *hp1;
3339    h2 = *hp2;
3340 
3341    if (h1==NULL && h2==NULL) return 0;
3342    else if (h1==NULL) return 1;
3343    else if (h2==NULL) return -1;
3344 
3345    /* Separate different queries and/or strands */
3346    if (h1->context < h2->context)
3347       return -1;
3348    else if (h1->context > h2->context)
3349       return 1;
3350 
3351    return (h1->query.offset - h1->subject.offset) -
3352       (h2->query.offset - h2->subject.offset);
3353 }
3354 
3355 typedef enum E_HSPInclusionStatus {
3356    e_Equal = 0,      /**< Identical */
3357    e_FirstInSecond,  /**< First included in rectangle formed by second */
3358    e_SecondInFirst,  /**< Second included in rectangle formed by first */
3359    e_DiagNear,       /**< Diagonals are near, but neither HSP is included in
3360                        the other. */
3361    e_DiagDistant     /**< Diagonals are far apart, or different contexts */
3362 } E_HSPInclusionStatus;
3363 
3364 /** HSP inclusion criterion for megablast: one HSP must be included in a
3365  * diagonal strip of a certain width around the other, and also in a rectangle
3366  * formed by the other HSP's endpoints.
3367  */
3368 static E_HSPInclusionStatus
BLAST_HSPInclusionTest(BLAST_HSP * hsp1,BLAST_HSP * hsp2)3369 BLAST_HSPInclusionTest(BLAST_HSP* hsp1, BLAST_HSP* hsp2)
3370 {
3371    if (hsp1->context != hsp2->context ||
3372        !MB_HSP_CLOSE(hsp1->query.offset, hsp2->query.offset,
3373                      hsp1->subject.offset, hsp2->subject.offset,
3374                      2*MB_DIAG_NEAR))
3375       return e_DiagDistant;
3376 
3377    if (hsp1->query.offset == hsp2->query.offset &&
3378        hsp1->query.end == hsp2->query.end &&
3379        hsp1->subject.offset == hsp2->subject.offset &&
3380        hsp1->subject.end == hsp2->subject.end &&
3381        hsp1->score == hsp2->score) {
3382       return e_Equal;
3383    } else if (hsp1->query.offset >= hsp2->query.offset &&
3384        hsp1->query.end <= hsp2->query.end &&
3385        hsp1->subject.offset >= hsp2->subject.offset &&
3386        hsp1->subject.end <= hsp2->subject.end &&
3387        hsp1->score < hsp2->score) {
3388       return e_FirstInSecond;
3389    } else if (hsp1->query.offset <= hsp2->query.offset &&
3390               hsp1->query.end >= hsp2->query.end &&
3391               hsp1->subject.offset <= hsp2->subject.offset &&
3392               hsp1->subject.end >= hsp2->subject.end &&
3393               hsp1->score >= hsp2->score) {
3394       return e_SecondInFirst;
3395    }
3396    return e_DiagNear;
3397 }
3398 
3399 /** How many HSPs to check for inclusion for each new HSP? */
3400 #define MAX_NUM_CHECK_INCLUSION 20
3401 
3402 static void
BlastSortUniqHspArray(BLAST_HitListPtr hitlist)3403 BlastSortUniqHspArray(BLAST_HitListPtr hitlist)
3404 {
3405    Int4 index, new_hspcnt, index1, index2;
3406    BLAST_HSPPtr PNTR hsp_array = hitlist->hsp_array;
3407    Boolean shift_needed = FALSE;
3408    E_HSPInclusionStatus inclusion_status = e_DiagNear;
3409 
3410    HeapSort(hitlist->hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr),
3411             diag_compare_hsps);
3412 
3413    for (index=1, new_hspcnt=0; index<hitlist->hspcnt; index++) {
3414       if (hsp_array[index]==NULL)
3415 	 continue;
3416       inclusion_status = e_DiagNear;
3417       for (index1 = new_hspcnt; inclusion_status != e_DiagDistant &&
3418            index1 >= 0 && new_hspcnt-index1 < MAX_NUM_CHECK_INCLUSION;
3419            index1--) {
3420          inclusion_status =
3421             BLAST_HSPInclusionTest(hsp_array[index], hsp_array[index1]);
3422          if (inclusion_status == e_FirstInSecond ||
3423              inclusion_status == e_Equal) {
3424             /* Free the new HSP and break out of the inclusion test loop */
3425             hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
3426             break;
3427          } else if (inclusion_status == e_SecondInFirst) {
3428             hsp_array[index1] = BLAST_HSPFree(hsp_array[index1]);
3429             shift_needed = TRUE;
3430          }
3431       }
3432 
3433       /* If some lower indexed HSPs have been removed, shift the subsequent
3434          HSPs */
3435       if (shift_needed) {
3436          /* Find the first non-NULL HSP, going backwards */
3437          while (index1 >= 0 && !hsp_array[index1])
3438             index1--;
3439          /* Go forward, and shift any non-NULL HSPs */
3440          for (index2 = ++index1; index1 <= new_hspcnt; index1++) {
3441             if (hsp_array[index1])
3442                hsp_array[index2++] = hsp_array[index1];
3443          }
3444          new_hspcnt = index2 - 1;
3445          shift_needed = FALSE;
3446       }
3447       if (hsp_array[index] != NULL)
3448          hsp_array[++new_hspcnt] = hsp_array[index];
3449    }
3450    /* Set all HSP pointers that will not be used any more to NULL */
3451    MemSet(&hsp_array[new_hspcnt+1], 0,
3452 	  (hitlist->hspcnt - new_hspcnt - 1)*sizeof(BLAST_HSPPtr));
3453    hitlist->hspcnt = new_hspcnt + 1;
3454    hitlist->hspcnt_max = hitlist->hspcnt;
3455 }
3456 
3457 void
MegaBlastFillHspGapInfo(BLAST_HSPPtr hsp,GapXEditScriptPtr esp)3458 MegaBlastFillHspGapInfo(BLAST_HSPPtr hsp, GapXEditScriptPtr esp)
3459 {
3460    hsp->gap_info = GapXEditBlockNew(hsp->query.offset, hsp->subject.offset);
3461 
3462    hsp->gap_info->start1 = hsp->query.offset;
3463    hsp->gap_info->start2 = hsp->subject.offset;
3464    hsp->gap_info->length1 = hsp->query.length;
3465    hsp->gap_info->length2 = hsp->subject.length;
3466    hsp->gap_info->frame1 = hsp->gap_info->frame2 = 1;
3467    hsp->gap_info->reverse = 0;
3468 
3469    hsp->gap_info->esp = esp;
3470 }
3471 
3472 GapXEditScriptPtr
MBToGapXEditScript(edit_script_t PNTR ed_script)3473 MBToGapXEditScript (edit_script_t PNTR ed_script)
3474 {
3475    GapXEditScriptPtr esp_start, esp, esp_prev;
3476    Int4 i;
3477 
3478    if (ed_script==NULL || ed_script->num == 0)
3479       return NULL;
3480 
3481    for (i=0; i<ed_script->num; i++) {
3482       esp = (GapXEditScriptPtr) MemNew(sizeof(GapXEditScript));
3483       esp->num = EDIT_VAL(ed_script->op[i]);
3484       esp->op_type = 3 - EDIT_OPC(ed_script->op[i]);
3485       if (esp->op_type == 3)
3486          fprintf(stderr, "op_type = 3\n");
3487       if (i==0)
3488 	 esp_start = esp_prev = esp;
3489       else {
3490 	 esp_prev->next = esp;
3491 	 esp_prev = esp;
3492       }
3493    }
3494 
3495    return esp_start;
3496 
3497 }
3498 
3499 
3500 SeqAlignPtr
MegaBlastGapInfoToSeqAlign(BlastSearchBlkPtr search,Int4 subject_index,Int2 query_index)3501 MegaBlastGapInfoToSeqAlign(BlastSearchBlkPtr search, Int4 subject_index,
3502                            Int2 query_index)
3503 {
3504    SeqAlignPtr seqalign, seqalign_var, sap;
3505    BLASTResultHitlistPtr result_hitlist;
3506    SeqIdPtr subject_id;
3507    BLASTResultHspPtr hsp_array;
3508    Int4 index;
3509    SeqIdPtr query_id, new_subject_seqid;
3510    StdSegPtr ssp;
3511    ValNodePtr gi_list=NULL;
3512 
3513    /* Sanity check */
3514    if (query_index > search->last_context / 2 ||
3515        search->mb_result_struct[query_index] == NULL ||
3516        subject_index >= search->mb_result_struct[query_index]->hitlist_count)
3517       return NULL;
3518 
3519    result_hitlist =
3520       search->mb_result_struct[query_index]->results[subject_index];
3521 
3522    if (search->rdfp) {
3523       readdb_get_descriptor(search->rdfp,
3524                             result_hitlist->subject_id,
3525                             &subject_id, NULL);
3526    } else if (result_hitlist->subject_info &&
3527               result_hitlist->subject_info->sip) {
3528       subject_id =
3529          SeqIdDup(result_hitlist->subject_info->sip);
3530    }
3531 
3532    hsp_array = result_hitlist->hsp_array;
3533 
3534    gi_list = BlastGetAllowedGis(search, result_hitlist->subject_id, &new_subject_seqid);
3535 
3536    for (index=0; index<result_hitlist->hspcnt; index++) {
3537       query_id = search->qid_array[hsp_array[index].context/2];
3538       if (index==0) {
3539 	 if (new_subject_seqid)
3540 	    seqalign = seqalign_var =
3541 	       GapXEditBlockToSeqAlign(hsp_array[index].gap_info,
3542 				       new_subject_seqid, query_id);
3543 	 else
3544 	    seqalign = seqalign_var =
3545 	       GapXEditBlockToSeqAlign(hsp_array[index].gap_info,
3546 				       subject_id, query_id);
3547       } else {
3548 	 if (new_subject_seqid)
3549 	    seqalign_var->next =
3550 	       GapXEditBlockToSeqAlign(hsp_array[index].gap_info,
3551 				       new_subject_seqid, query_id);
3552 	 else
3553 	    seqalign_var->next =
3554 	       GapXEditBlockToSeqAlign(hsp_array[index].gap_info,
3555 				       subject_id, query_id);
3556 
3557 	 seqalign_var = seqalign_var->next;
3558       }
3559       seqalign_var->score = GetScoreSetFromBlastResultHsp(&hsp_array[index], gi_list);
3560       if (seqalign_var->segtype == 3) {
3561 	 ssp = seqalign_var->segs;
3562 	 while (ssp) {
3563 	    ssp->scores = GetScoreSetFromBlastResultHsp(&hsp_array[index], gi_list);
3564 	    ssp = ssp->next;
3565 	 }
3566       } else if (seqalign_var->segtype == 5) { /* Discontinuous */
3567 	 sap = (SeqAlignPtr) seqalign_var->segs;
3568 	 for(;sap != NULL; sap = sap->next) {
3569 	    sap->score = GetScoreSetFromBlastResultHsp(&hsp_array[index], gi_list);
3570 	 }
3571       }
3572    }
3573 
3574    SeqIdSetFree(subject_id);
3575    return seqalign;
3576 }
3577 
3578 Int4
MegaBlastGetNumIdentical(Uint1Ptr query,Uint1Ptr subject,Int4 q_start,Int4 s_start,Int4 length,Boolean reverse)3579 MegaBlastGetNumIdentical(Uint1Ptr query, Uint1Ptr subject, Int4 q_start,
3580                          Int4 s_start, Int4 length, Boolean reverse)
3581 {
3582    Int4 i, ident = 0;
3583    Uint1Ptr q, s;
3584    Uint1 at_hash_mask = 0x0f;
3585 
3586    q = &query[q_start];
3587    if (!reverse)
3588       s = &subject[s_start];
3589    else
3590       s = &subject[s_start + length - 1];
3591 
3592    for (i=0; i<length; i++) {
3593       if ((*q & at_hash_mask) == *s)
3594 	 ident++;
3595       q++;
3596       if (!reverse)
3597          s++;
3598       else
3599          s--;
3600    }
3601 
3602    return ident;
3603 }
3604 
3605 /* Reevaluate score of one HSP, using ambiguity information */
ReevaluateScoreWithAmbiguities(BlastSearchBlkPtr search,Uint1Ptr subject_start,BLAST_HSPPtr hsp)3606 Boolean ReevaluateScoreWithAmbiguities(BlastSearchBlkPtr search,
3607            Uint1Ptr subject_start, BLAST_HSPPtr hsp)
3608 {
3609    BLAST_Score	sum, score, gap_open, gap_extend;
3610    BLAST_ScorePtr PNTR    matrix;
3611    Uint1Ptr query_start, query, subject;
3612    Uint1Ptr new_q_start, new_s_start, new_q_end, new_s_end;
3613    Int4 i, context, last_esp_num;
3614    Int2 factor;
3615    Uint1 mask = 0x0f;
3616    Boolean delete_hsp;
3617    GapXEditBlockPtr gap_info = hsp->gap_info;
3618    GapXEditScriptPtr esp, last_esp, prev_esp, first_esp;
3619    FloatHi searchsp_eff;
3620    Int4 align_length;
3621 
3622    if (search->sbp->reward % 2 == 1)
3623       factor = 2;
3624    else
3625       factor = 1;
3626 
3627    if (search->pbp->gap_open == 0 && search->pbp->gap_extend == 0) {
3628       gap_open = 0;
3629       gap_extend =
3630          (search->sbp->reward - 2*search->sbp->penalty) * factor / 2;
3631    } else {
3632       gap_open = factor*search->pbp->gap_open;
3633       gap_extend = factor*search->pbp->gap_extend;
3634    }
3635 
3636    matrix = search->sbp->matrix;
3637 
3638    context = hsp->context;
3639 
3640    searchsp_eff = (FloatHi) search->dblen_eff *
3641       ((FloatHi) search->context[context].query->effective_length);
3642 
3643    query_start = search->context[context].query->sequence;
3644    query = query_start + hsp->query.offset;
3645    subject = subject_start + hsp->subject.offset;
3646    score = 0;
3647    sum = 0;
3648    new_q_start = new_q_end = query;
3649    new_s_start = new_s_end = subject;
3650    i = 0;
3651 
3652    esp = gap_info->esp;
3653    prev_esp = NULL;
3654    last_esp = first_esp = esp;
3655    last_esp_num = 0;
3656 
3657    while (esp) {
3658       if (esp->op_type == GAPALIGN_SUB) {
3659          sum += factor*matrix[*query & mask][*subject];
3660          query++;
3661          subject++;
3662          i++;
3663       } else if (esp->op_type == GAPALIGN_DEL) {
3664          sum -= gap_open + gap_extend * esp->num;
3665          subject += esp->num;
3666          i += esp->num;
3667       } else if (esp->op_type == GAPALIGN_INS) {
3668          sum -= gap_open + gap_extend * esp->num;
3669          query += esp->num;
3670          i += esp->num;
3671       }
3672 
3673       if (sum < 0) {
3674          if (score < search->pbp->cutoff_s2 ||
3675              BlastKarlinStoE_simple(score, search->sbp->kbp[context],
3676                                     searchsp_eff) > search->pbp->cutoff_e) {
3677             /* Start from new offset */
3678             new_q_start = query;
3679             new_s_start = subject;
3680             score = sum = 0;
3681             if (i < esp->num) {
3682                esp->num -= i;
3683                first_esp = esp;
3684                i = 0;
3685             } else {
3686                first_esp = esp->next;
3687             }
3688             /* Unlink the bad part of the esp chain
3689                so it can be freed later */
3690             if (prev_esp)
3691                prev_esp->next = NULL;
3692             last_esp = NULL;
3693          } else {
3694             /* Stop here */
3695             break;
3696          }
3697       } else if (sum > score) {
3698          /* Remember this point as the best scoring end point */
3699          score = sum;
3700          last_esp = esp;
3701          last_esp_num = i;
3702          new_q_end = query;
3703          new_s_end = subject;
3704       }
3705       if (i >= esp->num) {
3706          i = 0;
3707          prev_esp = esp;
3708          esp = esp->next;
3709       }
3710    }
3711 
3712    score /= factor;
3713 
3714    delete_hsp = FALSE;
3715 
3716    if (score >= search->pbp->cutoff_s2) {
3717       hsp->score = score;
3718       searchsp_eff = (FloatHi) search->dblen_eff *
3719          (FloatHi) search->context[context].query->effective_length;
3720       hsp->evalue = BlastKarlinStoE_simple(score,
3721                        search->sbp->kbp[context], searchsp_eff);
3722       if (hsp->evalue > search->pbp->cutoff_e) {
3723          delete_hsp = TRUE;
3724       } else {
3725          hsp->query.length = new_q_end - new_q_start;
3726          hsp->subject.length = new_s_end - new_s_start;
3727          hsp->query.offset = hsp->gap_info->start1 = new_q_start - query_start;
3728          hsp->query.end = hsp->query.offset + hsp->query.length;
3729          hsp->subject.offset = hsp->gap_info->start2 =
3730              new_s_start - subject_start;
3731          hsp->subject.end = hsp->subject.offset + hsp->subject.length;
3732          /* Make corrections in edit block and free any parts that
3733             are no longer needed */
3734          if (first_esp != hsp->gap_info->esp) {
3735             GapXEditScriptDelete(hsp->gap_info->esp);
3736             hsp->gap_info->esp = first_esp;
3737          }
3738          if (last_esp->next != NULL) {
3739             last_esp->next = GapXEditScriptDelete(last_esp->next);
3740          }
3741          last_esp->num = last_esp_num;
3742          BlastHSPGetNumIdentical(search, hsp, NULL, &hsp->num_ident,
3743                                  &align_length);
3744       }
3745    } else {
3746       delete_hsp = TRUE;
3747    }
3748 
3749    return delete_hsp;
3750 }
3751 
3752 /* In the following function the forward strand is assumed */
3753 
3754 Int2
MegaBlastReevaluateWithAmbiguities(BlastSearchBlkPtr search)3755 MegaBlastReevaluateWithAmbiguities(BlastSearchBlkPtr search)
3756 {
3757    BLAST_HitListPtr current_hitlist;
3758    BLAST_HSPPtr PNTR hsp_array, hsp;
3759    Uint1Ptr subject_start = NULL;
3760    Int4 index, context, hspcnt, buf_len=0;
3761    Boolean purge, delete_hsp;
3762    Int4 query_length;
3763    Int2 status;
3764 
3765    if (!search->pbp->mb_params)
3766       return 1;
3767 
3768    current_hitlist = search->current_hitlist;
3769    if (current_hitlist == NULL || current_hitlist->hspcnt == 0)
3770       return 0;
3771 
3772 
3773    status = BlastGetNonSumStatsEvalue(search);
3774 
3775    /* While query offsets are not yet adjusted, remove the redundant HSPs */
3776    /*if (current_hitlist->hspcnt > 1)
3777      BlastSortUniqHspArray(current_hitlist);*/
3778    hspcnt = current_hitlist->hspcnt;
3779 
3780    hsp_array = current_hitlist->hsp_array;
3781    /* In case of greedy extension without traceback,
3782       only adjust the offsets */
3783    if (!search->pbp->mb_params->use_dyn_prog &&
3784        search->pbp->mb_params->no_traceback) {
3785       for (index=0; index<hspcnt; index++) {
3786          if (hsp_array[index] != NULL) {
3787             /* Adjust the query offsets here */
3788             hsp_array[index]->query.offset -=
3789                search->query_context_offsets[hsp_array[index]->context];
3790             hsp_array[index]->query.end -=
3791                search->query_context_offsets[hsp_array[index]->context];
3792             hsp_array[index]->query.gapped_start -=
3793                search->query_context_offsets[hsp_array[index]->context];
3794          }
3795       }
3796       status = BlastReapHitlistByEvalue(search);
3797       return status;
3798    }
3799 
3800    status = BlastReapHitlistByEvalue(search);
3801 
3802    if (current_hitlist->hspcnt == 0)
3803       /* All HSPs have been deleted */
3804       return 0;
3805 
3806    /* Can happen only when subject was being unpacked on the fly
3807       during gapped extensions */
3808    if (!search->subject->sequence_start)
3809       readdb_get_sequence_ex(search->rdfp, search->subject_id,
3810                              &search->subject->sequence_start,
3811                              &buf_len, TRUE);
3812    /* The sequence in blastna encoding is now stored in sequence_start */
3813    subject_start = search->subject->sequence_start + 1;
3814 
3815    purge = FALSE;
3816    for (index=0; index<hspcnt; index++) {
3817       if (hsp_array[index] == NULL)
3818          continue;
3819       else
3820          hsp = hsp_array[index];
3821 
3822       context = hsp->context;
3823 
3824       /* Adjust the query offsets here */
3825       hsp->query.offset -= search->query_context_offsets[context];
3826       hsp->query.end -= search->query_context_offsets[context];
3827       hsp->query.gapped_start -= search->query_context_offsets[context];
3828 
3829       if (search->pbp->mb_params->use_dyn_prog) {
3830          /* Check if the extension jumped through the border between
3831             sequences or sequence strands. If this happens, leave the part
3832             which contains the original offset pair, saved in gapped_start's
3833          */
3834          Int4 extra_length;
3835          query_length = search->query_context_offsets[context+1] -
3836             search->query_context_offsets[context] - 1;
3837 
3838          while ((hsp->query.end - 1) / query_length > 0) {
3839             if (hsp->query.gapped_start < query_length) {
3840                extra_length = hsp->query.end - query_length + 1;
3841                hsp->subject.end -= extra_length;
3842                hsp->query.length -= extra_length;
3843                hsp->subject.length -= extra_length;
3844                hsp->query.end = query_length - 1;
3845             } else {
3846                extra_length = query_length + 1 - hsp->query.offset;
3847                hsp->subject.offset += extra_length;
3848                hsp->query.offset = 0;
3849                hsp->query.end -= (query_length + 1);
3850                hsp->query.length -= extra_length;
3851                hsp->subject.length -= extra_length;
3852                hsp->query.gapped_start -= query_length + 1;
3853                context++;
3854                query_length = search->query_context_offsets[context+1] -
3855                   search->query_context_offsets[context] - 1;
3856             }
3857          }
3858          hsp->context = context;
3859          /* Gap information not available here, so no reevaluation
3860             with ambiguities can be done - it will be done with the final
3861             extension with traceback */
3862          continue;
3863       }
3864 
3865       delete_hsp =
3866          ReevaluateScoreWithAmbiguities(search, subject_start, hsp);
3867 
3868       if (delete_hsp) { /* This HSP is now below the cutoff */
3869          hsp_array[index] = BLAST_HSPFree(hsp);
3870          purge = TRUE;
3871       }
3872    }
3873    if (purge) {
3874       index = HspArrayPurge(hsp_array, hspcnt, TRUE);
3875       current_hitlist->hspcnt = index;
3876       current_hitlist->hspcnt_max = index;
3877    }
3878 
3879    if (search->pbp->mb_params->use_dyn_prog) {
3880       status = BlastReapHitlistByEvalue(search);
3881    }
3882 
3883    if (current_hitlist->hspcnt > 1)
3884       BlastSortUniqHspArray(current_hitlist);
3885 
3886    if (search->pbp->hsp_num_max &&
3887        search->pbp->hsp_num_max < search->current_hitlist->hspcnt &&
3888        !search->pbp->mb_params->use_dyn_prog && search->last_context <= 1) {
3889       /* Number of HSPs to be saved is restricted by user;
3890          delete extra HSPs unless gapped alignments are done later, or
3891          there are multiple query sequences - then this is done elsewhere.
3892       */
3893       for (index=search->pbp->hsp_num_max;
3894            index < search->current_hitlist->hspcnt; ++index) {
3895          current_hitlist->hsp_array[index] =
3896             BLAST_HSPFree(current_hitlist->hsp_array[index]);
3897       }
3898       search->current_hitlist->hspcnt = search->pbp->hsp_num_max;
3899    }
3900 
3901    return 0;
3902 }
3903 
3904 /* The following binary search routine assumes that array A is filled. */
BinarySearchInt4(Int4 n,Int4Ptr A,Int4 size)3905 Int4 BinarySearchInt4(Int4 n, Int4Ptr A, Int4 size)
3906 {
3907     Int4 m, b, e;
3908 
3909     b = 0;
3910     e = size;
3911     while (b < e - 1) {
3912 	m = (b + e) / 2;
3913 	if (A[m] > n)
3914 	    e = m;
3915 	else
3916 	    b = m;
3917     }
3918     return b;
3919 }
3920 
3921 /* Create a SeqLoc chain of query segments covered by all HSPs from a given
3922    SeqAlign - needed for masked query output only */
MaskSeqLocFromSeqAlign(SeqAlignPtr seqalign)3923 SeqLocPtr MaskSeqLocFromSeqAlign(SeqAlignPtr seqalign)
3924 {
3925    SeqLocPtr slp = NULL, new_slp, slp_var;
3926    SeqAlignPtr sap;
3927    DenseSegPtr dsp;
3928    Int4 from, to;
3929 
3930    for (sap = seqalign; sap; sap = sap->next) {
3931       dsp = (DenseSegPtr) sap->segs;
3932       if (dsp->strands[0] == Seq_strand_plus) {
3933          from = dsp->starts[0];
3934          to = dsp->starts[2*(dsp->numseg - 1)] +
3935             dsp->lens[dsp->numseg - 1] - 1;
3936       } else {
3937          to = dsp->starts[0] + dsp->lens[0] - 1;
3938          from = dsp->starts[2*(dsp->numseg - 1)];
3939       }
3940       new_slp = SeqLocIntNew(from, to, Seq_strand_plus, dsp->ids);
3941       if (slp==NULL)
3942 	 slp = slp_var = new_slp;
3943       else {
3944 	 slp_var->next = new_slp;
3945 	 slp_var = slp_var->next;
3946       }
3947    }
3948 
3949    return slp;
3950 }
3951 
PrintMaskedSequence(BioseqPtr query_bsp,SeqLocPtr mask_slp,FILE * fp,Int2 line_len,Boolean lcase_masking)3952 void PrintMaskedSequence(BioseqPtr query_bsp, SeqLocPtr mask_slp,
3953                          FILE *fp, Int2 line_len, Boolean lcase_masking)
3954 {
3955    SeqLocPtr query_slp = NULL;
3956    SeqPortPtr spp;
3957    Uint1Ptr query_seq, seq_line;
3958    Int4 index, id_length = 0, defline_len;
3959    Uint1 residue;
3960    CharPtr buffer = NULL;
3961 
3962    ValNodeAddPointer(&query_slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(query_bsp->id, SEQID_GI)));
3963 
3964 
3965    spp = SeqPortNewByLoc(query_slp, Seq_code_iupacna);
3966    SeqPortSet_do_virtual(spp, TRUE);
3967    query_seq = (Uint1Ptr) MemNew((query_bsp->length + 1)*sizeof(Uint1));
3968    index=0;
3969    while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
3970       if (IS_residue(residue)) {
3971 	 query_seq[index] = residue;
3972 	 index++;
3973       }
3974    }
3975    if (lcase_masking)
3976       BlastLCaseMaskTheResidues(query_seq, query_bsp->length, mask_slp, FALSE,
3977                                 SeqLocStart(query_slp));
3978    else
3979       BlastMaskTheResidues(query_seq, query_bsp->length, 'N', mask_slp, FALSE,
3980                            SeqLocStart(query_slp));
3981 
3982    SeqLocFree(query_slp);
3983    if (query_bsp->descr)
3984       defline_len = StringLen(BioseqGetTitle(query_bsp));
3985    else
3986       defline_len = 0;
3987    defline_len += 255;     /* Sufficient for an ID. */
3988    buffer = MemNew((defline_len+1)*sizeof(Char));
3989    if (query_bsp->id->choice != SEQID_LOCAL) {
3990       SeqIdWrite(query_bsp->id, buffer, PRINTID_FASTA_LONG, BUFFER_LENGTH);
3991       id_length = StringLen(buffer);
3992       buffer[id_length] = ' ';
3993       id_length++;
3994    }
3995    StringCpy(&buffer[id_length], BioseqGetTitle(query_bsp));
3996    fprintf(fp, ">%s\n", buffer);
3997 
3998    seq_line = query_seq;
3999    for (index=0; index<=(query_bsp->length-1)/line_len; index++) {
4000       fprintf(fp, "%.*s\n", line_len, seq_line);
4001       seq_line += line_len;
4002    }
4003 
4004    spp = SeqPortFree(spp);
4005    MemFree(buffer);
4006    query_seq = MemFree(query_seq);
4007 }
4008 
4009 #define MIN_HSP_ARRAY_SIZE 20
4010 Int4 LIBCALL
MegaBlastSaveCurrentHitlist(BlastSearchBlkPtr search)4011 MegaBlastSaveCurrentHitlist(BlastSearchBlkPtr search)
4012 {
4013 	BLASTResultHitlistPtr result_hitlist, PNTR results, PNTR mb_results;
4014 	BLASTResultsStructPtr result_struct;
4015 	BLAST_HitListPtr current_hitlist;
4016 	BLAST_HSPPtr hsp;
4017 	BLAST_KarlinBlkPtr kbp;
4018 	BLASTResultHspPtr hsp_array;
4019 	Int4 hspcnt, index, index1, new_index, old_index, low_index, high_index;
4020 	Int4 hitlist_count, hitlist_max, hspmax, high_score=0, retval;
4021 	Nlm_FloatHi current_evalue=DBL_MAX;
4022 	Int4 query_length, hsp_index, new_size;
4023         Int4Ptr hsp_array_sizes;
4024 	Int2 num_queries, query_index;
4025         Boolean PNTR do_not_reallocate;
4026         Boolean use_dyn_prog = search->pbp->mb_params->use_dyn_prog;
4027 
4028 	if (search == NULL)
4029 		return 0;
4030 
4031 	if (search->current_hitlist == NULL || search->current_hitlist->hspcnt <= 0)	/* No hits to save. */
4032 	{
4033 		search->subject_info = BLASTSubjectInfoDestruct(search->subject_info);
4034 		return 0;
4035 	}
4036 
4037 	current_hitlist = search->current_hitlist;
4038 
4039 	retval = current_hitlist->hspcnt;
4040 
4041         num_queries = search->last_context / 2 + 1;
4042         mb_results = (BLASTResultHitlistPtr PNTR)
4043            MemNew(num_queries*sizeof(BLASTResultHitlistPtr));
4044         hsp_array_sizes = (Int4Ptr) MemNew(num_queries*sizeof(Int4));
4045         do_not_reallocate = (Boolean PNTR) MemNew(num_queries*sizeof(Boolean));
4046 
4047         /* The HSPs in the hsp_array are sorted previously by score, therefore
4048            for each individual query they already come in the increasing order
4049            of evalue, since linking of HSPs is not done in megablast */
4050         index1 = 0;
4051         hspcnt = current_hitlist->hspcnt;
4052         hspmax = current_hitlist->hspcnt_max;
4053 
4054         HeapSort(current_hitlist->hsp_array, hspcnt,
4055                  sizeof(BLAST_HSPPtr), score_compare_hsps);
4056 
4057         hsp = current_hitlist->hsp_array[0];
4058         for (index=0; index<hspcnt; index++) {
4059            while (hsp == NULL && index1 < hspmax) {
4060               index1++;
4061               hsp = current_hitlist->hsp_array[index1];
4062            }
4063            if (index1==hspmax) break;
4064            if (search->pbp->mb_params->perc_identity > 0) {
4065               if (MegaBlastGetHspPercentIdentity(search, hsp) <
4066                   search->pbp->mb_params->perc_identity) {
4067                  index1++;
4068                  if (index1 >= hspmax)
4069                     break;
4070                  hsp = current_hitlist->hsp_array[index1];
4071                  continue;
4072               }
4073            }
4074            query_index = hsp->context / 2;
4075 
4076            result_hitlist = mb_results[query_index];
4077            if (!result_hitlist) {
4078               hsp_array_sizes[query_index] = MIN_HSP_ARRAY_SIZE;
4079               if (search->pbp->hsp_num_max &&
4080                   search->pbp->hsp_num_max < MIN_HSP_ARRAY_SIZE)
4081                  hsp_array_sizes[query_index] = search->pbp->hsp_num_max;
4082               result_hitlist = mb_results[query_index] =
4083                  BLASTResultHitlistNew(hsp_array_sizes[query_index]);
4084               result_hitlist->hspcnt = 1;
4085               /* This is the first HSP for this query, so it is the
4086                  best one */
4087               result_hitlist->best_evalue = hsp->evalue;
4088               result_hitlist->high_score = hsp->score;
4089            } else if (result_hitlist->hspcnt == hsp_array_sizes[query_index]) {
4090               if (!do_not_reallocate[query_index]) {
4091                  new_size = 2*hsp_array_sizes[query_index];
4092                  if (search->pbp->hsp_num_max &&
4093                      search->pbp->hsp_num_max < new_size && !use_dyn_prog) {
4094                     /* Number of HSPs to be saved is restricted by user;
4095                        stop at this limit if gapped alignment has already
4096                        been completed */
4097                     new_size = search->pbp->hsp_num_max;
4098                     if (new_size <= hsp_array_sizes[query_index]) {
4099                        do_not_reallocate[query_index] = TRUE;
4100                        continue;
4101                     }
4102                  }
4103 
4104                  if (!(hsp_array = Realloc(result_hitlist->hsp_array,
4105                            new_size*sizeof(BLASTResultHsp)))) {
4106                     ErrPostEx(SEV_WARNING, 0, 0,
4107                               "Cannot reallocate hsp_array for a result hitlist, size %d", new_size);
4108                     do_not_reallocate[query_index] = TRUE;
4109                  } else {
4110                     result_hitlist->hsp_array = hsp_array;
4111                     result_hitlist->hspcnt++;
4112                     hsp_array_sizes[query_index] = new_size;
4113                  }
4114               } else {
4115 		/* hsp_array is already full and reallocation not allowed */
4116                  continue;
4117 	      }
4118            } else
4119               result_hitlist->hspcnt++;
4120 
4121            if (result_hitlist == NULL) /* Should never happen */
4122               continue;
4123 
4124            result_hitlist->subject_id = search->subject_id;
4125            result_hitlist->subject_info = search->subject_info;
4126 
4127            hsp_array = result_hitlist->hsp_array;
4128            hsp_index = result_hitlist->hspcnt - 1;
4129 
4130            hsp_array[hsp_index].ordering_method = hsp->ordering_method;
4131            hsp_array[hsp_index].number = hsp->num;
4132            hsp_array[hsp_index].score = hsp->score;
4133            hsp_array[hsp_index].e_value = hsp->evalue;
4134            hsp_array[hsp_index].num_ident = hsp->num_ident;
4135            kbp = search->sbp->kbp[hsp->context];
4136            hsp_array[hsp_index].bit_score = ((hsp->score*kbp->Lambda) -
4137                                              kbp->logK)/NCBIMATH_LN2;
4138 
4139            if (hsp->context & 1)
4140               hsp_array[hsp_index].query_frame = -1;
4141            else
4142               hsp_array[hsp_index].query_frame = 1;
4143            query_length =
4144               search->query_context_offsets[hsp->context+1] -
4145               search->query_context_offsets[hsp->context] - 1;
4146            if (!use_dyn_prog && !search->pbp->mb_params->no_traceback) {
4147               hsp->gap_info->start1 = hsp->query.offset;
4148               hsp->gap_info->start2 = hsp->subject.offset;
4149               hsp->gap_info->frame1 = hsp_array[hsp_index].query_frame;
4150               hsp->gap_info->length1 = query_length;
4151               hsp_array[hsp_index].query_gapped_start = hsp->query.offset + 1;
4152               hsp_array[hsp_index].subject_gapped_start =
4153                  hsp->subject.offset + 1;
4154            } else {
4155               hsp_array[hsp_index].query_gapped_start =
4156                  hsp->query.gapped_start;
4157               hsp_array[hsp_index].subject_gapped_start =
4158                  hsp->subject.gapped_start;
4159            }
4160            hsp_array[hsp_index].gap_info = hsp->gap_info;
4161            /* Edit block pointer has been copied; remove it from hsp to avoid
4162               double freeing */
4163            hsp->gap_info = NULL;
4164            hsp_array[hsp_index].context = hsp->context;
4165            hsp_array[hsp_index].query_offset = hsp->query.offset;
4166            hsp_array[hsp_index].query_length = hsp->query.length;
4167            hsp_array[hsp_index].subject_offset = hsp->subject.offset;
4168            hsp_array[hsp_index].subject_length = hsp->subject.length;
4169            hsp_array[hsp_index].subject_frame = hsp->subject.frame;;
4170            hsp_array[hsp_index].point_back = result_hitlist;
4171            hsp_array[hsp_index].back_left =
4172               hsp_array[hsp_index].back_right = NULL;
4173            hsp_array[hsp_index].hspset_cnt = -1;
4174 
4175            index1++;
4176            if (index1 >= hspmax)
4177               break;
4178            hsp = current_hitlist->hsp_array[index1];
4179         }
4180 
4181         search->subject_info = NULL;
4182 
4183 /* For MT BLAST we check that no other thread is attempting to insert results. */
4184 	if (search->thr_info->results_mutex)
4185             NlmMutexLock(search->thr_info->results_mutex);
4186 
4187         for (query_index=0; query_index<num_queries; query_index++) {
4188            result_hitlist = mb_results[query_index];
4189            if (!result_hitlist)
4190               continue;
4191 
4192            /* This is the structure that is identical on every thread. */
4193            result_struct = search->mb_result_struct[query_index];
4194 
4195            if (!result_struct)
4196               result_struct = search->mb_result_struct[query_index] =
4197                  BLASTResultsStructNew(search->result_size, 0, 0);
4198 
4199            hitlist_count = result_struct->hitlist_count;
4200            hitlist_max = result_struct->hitlist_max;
4201            results = result_struct->results;
4202 
4203            /* Record the worst evalue for ReevaluateWithAmbiguities. */
4204            if (hitlist_count == hitlist_max)
4205               search->worst_evalue = results[hitlist_count-1]->best_evalue;
4206 
4207            current_evalue = result_hitlist->best_evalue;
4208            high_score = result_hitlist->high_score;
4209            /* New hit is less significant than all the other hits. */
4210            if (hitlist_count > 0 &&
4211                (current_evalue > results[hitlist_count-1]->best_evalue ||
4212                 (current_evalue == results[hitlist_count-1]->best_evalue &&
4213                  high_score < results[hitlist_count-1]->high_score))) {
4214               if (hitlist_count == hitlist_max) {
4215                  /* Array is full, delete the entry. */
4216                  result_hitlist = BLASTResultHitlistFreeEx(search, result_hitlist);
4217                  continue;
4218               } else       /* Add to end of array. */
4219                  new_index = hitlist_count;
4220            } else if (hitlist_count > 0) {
4221               /* The array is all NULL's if hitlist_count==0 */
4222               high_index=0;
4223               low_index=hitlist_count-1;
4224               new_index = (high_index+low_index)/2;
4225               old_index = new_index;
4226               for (index=0; index<BLAST_SAVE_ITER_MAX; index++) {
4227                  if (results[new_index]->best_evalue > current_evalue)
4228                     low_index = new_index;
4229                  else if (results[new_index]->best_evalue < current_evalue)
4230                     high_index = new_index;
4231                  else { /* If e-values are the same, use high score. */
4232                     /* If scores are the same, use ordinal number. */
4233                     if (results[new_index]->high_score < high_score)
4234                        low_index = new_index;
4235                     else if (results[new_index]->high_score > high_score)
4236                        high_index = new_index;
4237                     else if (results[new_index]->subject_id < search->subject_id)
4238                        low_index = new_index;
4239                     else
4240                        high_index = new_index;
4241                  }
4242 
4243                  new_index = (high_index+low_index)/2;
4244                  if (old_index == new_index) {
4245                     if (results[new_index]->best_evalue < current_evalue)
4246                        /* Perform this check as new_index get rounded DOWN above.*/
4247                        new_index++;
4248                     else if (results[new_index]->best_evalue == current_evalue && results[new_index]->high_score > high_score)
4249                        new_index++;
4250                     break;
4251                  }
4252                  old_index = new_index;
4253               }
4254               if (hitlist_count == hitlist_max) {
4255                  /* The list is full, delete the last entry. */
4256                  if (results[hitlist_max-1]->seqalign)
4257                     SeqAlignSetFree(results[hitlist_max-1]->seqalign);
4258                  results[hitlist_max-1] =
4259                     BLASTResultHitlistFreeEx(search, results[hitlist_max-1]);
4260                      result_struct->hitlist_count--;
4261                      hitlist_count = result_struct->hitlist_count;
4262               }
4263               if (hitlist_max > 1)
4264                  Nlm_MemMove((results+new_index+1), (results+new_index),
4265                              (hitlist_count-new_index)*sizeof(results[0]));
4266            } else /* First hit to be stored. */
4267               new_index = 0;
4268 
4269            if (new_index < hitlist_max) {
4270               results[new_index] = result_hitlist;
4271               result_struct->hitlist_count++;
4272            }
4273         }
4274         MemFree(do_not_reallocate);
4275         MemFree(hsp_array_sizes);
4276         MemFree(mb_results);
4277         /* Free mutex. */
4278 	if (search->thr_info->results_mutex)
4279             NlmMutexUnlock(search->thr_info->results_mutex);
4280 	return retval;
4281 }
4282 
4283 FloatLo
MegaBlastGetHspPercentIdentity(BlastSearchBlkPtr search,BLAST_HSPPtr hsp)4284 MegaBlastGetHspPercentIdentity(BlastSearchBlkPtr search, BLAST_HSPPtr hsp)
4285 {
4286    FloatLo perc_ident;
4287    Int4 numseg, align_length, i, q_off, s_off;
4288    Int2 context;
4289    Int4Ptr length, start;
4290    Uint1Ptr strands;
4291    Uint1Ptr query_seq, subject_seq = NULL;
4292    GapXEditScriptPtr esp;
4293 
4294    if (!hsp->gap_info)
4295       /* Cannot do any meaningful check at this time, return 100 so nothing is
4296          discarded */
4297       return 100.0;
4298 
4299    context = hsp->context;
4300    query_seq = search->context[context].query->sequence;
4301    subject_seq = search->subject->sequence_start + 1;
4302 
4303    esp = hsp->gap_info->esp;
4304 
4305    for (numseg=0; esp; esp = esp->next, numseg++);
4306 
4307    /* GXECollectDataForSeqalign might change the query offset, so need an extra
4308       variable */
4309    q_off = hsp->query.offset;
4310    s_off = hsp->subject.offset;
4311    GXECollectDataForSeqalign(hsp->gap_info, hsp->gap_info->esp, numseg,
4312                              &start, &length, &strands,
4313                              &q_off, &s_off);
4314 
4315    perc_ident = 0;
4316    align_length = 0;
4317    for (i=0; i<numseg; i++) {
4318       align_length += length[i];
4319       if (start[2*i] != -1 && start[2*i+1] != -1) {
4320          perc_ident +=
4321             (FloatLo) MegaBlastGetNumIdentical(query_seq, subject_seq,
4322                                                start[2*i], start[2*i+1],
4323                                                length[i], FALSE);
4324       }
4325    }
4326    perc_ident = perc_ident / align_length * 100;
4327    MemFree(start);
4328    MemFree(length);
4329    MemFree(strands);
4330    return perc_ident;
4331 }
4332 
MegaBlastParameterBlkNew(BLAST_OptionsBlkPtr options)4333 MegaBlastParameterBlkPtr MegaBlastParameterBlkNew(BLAST_OptionsBlkPtr options)
4334 {
4335    MegaBlastParameterBlkPtr mb_params = (MegaBlastParameterBlkPtr)
4336       MemNew(sizeof(MegaBlastParameterBlk));
4337 
4338    mb_params->no_traceback = options->no_traceback;
4339    mb_params->is_neighboring = options->is_neighboring;
4340    mb_params->full_seqids = options->megablast_full_deflines;
4341    mb_params->perc_identity = options->perc_identity;
4342    mb_params->max_positions = options->block_width;
4343    if (options->mb_template_length > 0) {
4344       mb_params->template_length = options->mb_template_length;
4345       mb_params->disc_word = TRUE;
4346       mb_params->word_weight = options->wordsize;
4347       mb_params->template_type =
4348          GetMBTemplateType(mb_params->word_weight, mb_params->template_length,
4349                            options->mb_disc_type);
4350       if (options->mb_disc_type == MB_TWO_TEMPLATES)
4351          mb_params->use_two_templates = TRUE;
4352    } else {
4353       mb_params->template_length = MIN(options->wordsize, 12);
4354       mb_params->word_weight = options->wordsize;
4355    }
4356    mb_params->one_base_step = options->mb_one_base_step;
4357    mb_params->use_dyn_prog = options->mb_use_dyn_prog;
4358 
4359    return mb_params;
4360 }
4361 
GetMBTemplateType(Int2 weight,Int2 length,MBDiscWordType type)4362 MBTemplateType GetMBTemplateType(Int2 weight, Int2 length, MBDiscWordType type)
4363 {
4364    if (weight == 11) {
4365       if (length == 16) {
4366          if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES)
4367             return TEMPL_11_16;
4368          else if (type == MB_WORD_OPTIMAL)
4369             return TEMPL_11_16_OPT;
4370       } else if (length == 18) {
4371          if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES)
4372             return TEMPL_11_18;
4373          else if (type == MB_WORD_OPTIMAL)
4374             return TEMPL_11_18_OPT;
4375       } else if (length == 21) {
4376          if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES)
4377             return TEMPL_11_21;
4378          else if (type == MB_WORD_OPTIMAL)
4379             return TEMPL_11_21_OPT;
4380       }
4381    } else if (weight == 12) {
4382       if (length == 16) {
4383          if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES)
4384             return TEMPL_12_16;
4385          else if (type == MB_WORD_OPTIMAL)
4386             return TEMPL_12_16_OPT;
4387       } else if (length == 18) {
4388          if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES)
4389             return TEMPL_12_18;
4390          else if (type == MB_WORD_OPTIMAL)
4391             return TEMPL_12_18_OPT;
4392       } else if (length == 21) {
4393          if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES)
4394             return TEMPL_12_21;
4395          else if (type == MB_WORD_OPTIMAL)
4396             return TEMPL_12_21_OPT;
4397       }
4398    }
4399    return TEMPL_ERROR;
4400 }
4401