1 static char const rcsid[] = "$Id: blastutl.c,v 6.472 2010/12/10 18:47:12 madden Exp $"; /* Revision-identification string for this file (file name, revision, date, author). NOTE(review): presumably an RCS/CVS Id keyword expansion rewritten on check-in -- confirm before hand-editing. */
2 
3 /* ===========================================================================
4 *
5 *                            PUBLIC DOMAIN NOTICE
6 *               National Center for Biotechnology Information
7 *
8 *  This software/database is a "United States Government Work" under the
9 *  terms of the United States Copyright Act.  It was written as part of
10 *  the author's official duties as a United States Government employee and
11 *  thus cannot be copyrighted.  This software/database is freely available
12 *  to the public for use. The National Library of Medicine and the U.S.
13 *  Government have not placed any restriction on its use or reproduction.
14 *
15 *  Although all reasonable efforts have been taken to ensure the accuracy
16 *  and reliability of the software and data, the NLM and the U.S.
17 *  Government do not and cannot warrant the performance or results that
18 *  may be obtained by using this software or data. The NLM and the U.S.
19 *  Government disclaim all warranties, express or implied, including
20 *  warranties of performance, merchantability or fitness for any particular
21 *  purpose.
22 *
23 *  Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================*/
26 
27 /*****************************************************************************
28 
29 File name: blastutl.c
30 
31 Author: Tom Madden
32 
33 Contents: Utilities for BLAST
34 
35 $Revision: 6.472 $
36 
37 ******************************************************************************/
38 /*
39  *
40 * $Log: blastutl.c,v $
41 * Revision 6.472  2010/12/10 18:47:12  madden
42 * Do not change db_length on options
43 *
44 * Revision 6.471  2007/05/08 19:03:33  kans
45 * in FilterWithSeg added SeqDataPtr and ByteStorePtr casts for seq_data
46 *
47 * Revision 6.470  2007/03/13 20:40:24  madden
48 *   - In s_ComputeAverageLength, compute the floating point value retval
49 *     using floating point division.
50 *
51 *   - In BioseqBlastEngineCore, call blast_set_parameters for rounds > 1
52 *     of PSI-BLAST.
53 *
54 *   - In GetDbSubjRatio, use floating point operations to compute the
55 *     floating point value db_subj_ratio.
56 *   [from Mike Gertz]
57 *
58 * Revision 6.469  2007/03/05 14:51:24  camacho
59 * - Make s_ComputeAverageLength static.
60 *
61 * Revision 6.468  2007/01/23 15:25:44  madden
62 * Use SeqLocDustEx rather than SeqLocDust
63 *
64 * Revision 6.467  2007/01/17 15:46:00  madden
65 * remove FilterDNA
66 *
67 * Revision 6.466  2006/08/10 17:34:38  merezhuk
68 * Fix for reading -z advanced option by StringToInt8; RT # 15187990
69 *
70 * Revision 6.465  2006/02/15 18:23:47  madden
71 * Made changes so that CheckStartForGappedAlignment by default
72 * checks ungapped alignments of length 11, rather than length 10.
73 * Made changes to the rules used when the starting point is close to
74 * the edge of the preliminary gapped alignment.
75 * (from Mike Gertz)
76 *
77 * Revision 6.464  2005/12/01 15:10:23  madden
78 * Gave BLASTCheckHSPInclusion external linkage (i.e. removed the static specifier).
79 *
80 * Revision 6.463  2005/10/13 15:59:06  camacho
81 * Add code to fix cutoff scores in PSI-BLAST.
82 *
83 * Revision 6.462  2005/07/28 14:57:09  coulouri
84 * remove dead code
85 *
86 * Revision 6.461  2005/07/27 15:51:54  coulouri
87 * remove unused queue_callback
88 *
89 * Revision 6.460  2005/05/02 16:03:14  coulouri
90 * refactor code to set db_chunk_size
91 *
92 * Revision 6.459  2005/04/25 14:16:36  coulouri
93 * set db_chunk_size adaptively
94 *
95 * Revision 6.458  2005/04/04 20:44:27  camacho
96 * Do not overwrite the effective search space in Pssm2Sequences if specified in the options structure
97 *
98 * Revision 6.457  2005/02/07 15:30:08  dondosha
99 * Removed restriction on the value of longest intron option
100 *
101 * Revision 6.456  2005/01/24 20:37:37  camacho
102 * Added conditional compilation to structs needed for BLAST_CLUSTER_HITS
103 *
104 * Revision 6.455  2005/01/18 14:54:13  camacho
105 * Change in tie-breakers for score comparison, suggestion by Mike Gertz
106 *
107 * Revision 6.454  2004/12/20 15:22:16  camacho
108 * Calculate kbp_ideal values rather than loading them from pre-computed values
109 *
110 * Revision 6.453  2004/12/01 17:24:15  coulouri
111 * do not dereference null pointer
112 *
113 * Revision 6.452  2004/11/22 16:10:11  dondosha
114 * Minor fix to make sure that "evalue" score type is always used when hsp is not part of a linked set
115 *
116 * Revision 6.451  2004/11/04 15:51:55  bealer
117 * - bl2seq should use dblen as average length if database is not available.
118 *
119 * Revision 6.450  2004/11/01 14:07:56  madden
120 * From Mike Gertz:
121 *
122 *    - In query_offset_compare_hsp and query_end_compare_hsp, use the
123 *      subject query/offset as a tie-breaker.  Without this tie-breaker
124 *      CheckGappedAlignmentsForOverlap won't work properly.
125 *
126 *    - In CheckGappedAlignmentsForOverlap check that hsp_array, rather
127 *      than *hsp_array, is not nil.
128 *
129 *    - In BlastSaveCurrentHsp, rewrote the binary search to use
130 *      score_compare_hsps, so that the answers are consistent with the
131 *      heap code used in the algo/blast/core code.
132 *
133 *    - In BlastGappedScoreInternal delete gapped extensions that don't
134 *      reach the cutoff score (cutoff_s1).
135 *
136 * Revision 6.449  2004/10/25 18:36:17  papadopo
137 * From Michael Gertz: remove unneeded decrement of alignment offsets in BlastNtSaveCurrentHsp
138 *
139 * Revision 6.448  2004/10/19 19:42:17  dondosha
140 * Optimized algorithm in BlastPruneSeqAlignByGiList to make it up to 25 times faster; Added new function BlastPruneSeqAlignBySortedGiList
141 *
142 * Revision 6.447  2004/10/18 13:02:41  madden
143 * Changes from Mike Gertz:
144 *         - In score_compare_hsps, query_offset_compare_hsp and
145 *           query_end_compare_hsp, change the comparison tests so that
146 *           nil HSPs are less than any non-nil HSP.  Previously, these
147 *           comparison functions would return 0 if either HSP was nil,
148 *           which would result in sort routines terminating before the
149 *           non-nil HSPs in the list were fully sorted.
150 *
151 *         - In score_compare_hsps, copied the set of tie-breakers from
152 *           the corresponding routine in algo/blast/core/blast_hits.c.
153 *
154 *         - In RealBlastGetGappedAlignmentTraceback, the HSP list must
155 *           be sorted before BLASTCheckHSPInclusion is invoked.
156 *
157 * Revision 6.446  2004/09/28 16:05:32  papadopo
158 * From Michael Gertz: In BlastGappedScoreInternal, changed a
159 * reference to the sumscore field of an HSP to a reference to the
160 * xsum field of an HSP.
161 *
162 * Revision 6.445  2004/08/23 17:05:42  papadopo
163 * From Michael Gertz: make CopyResultHspToHSP public
164 *
165 * Revision 6.444  2004/08/16 19:37:26  dondosha
166 * Enabled uneven gap HSP linking for blastx
167 *
168 * Revision 6.443  2004/08/05 21:52:28  camacho
169 * Gracefully handle inability to calculate ungapped lambda for PSSM in psiblast2sequences
170 *
171 * Revision 6.442  2004/07/24 18:55:29  camacho
172 * Fix to GetSequenceWithDenseSeg when sequence cannot be found
173 *
174 * Revision 6.441  2004/07/19 17:05:36  papadopo
175 * specify (unused) 'output-to-scoremat' parameter
176 *
177 * Revision 6.440  2004/06/30 12:29:39  madden
178 * Moved some functions to blfmtutl.c
179 *
180 * Revision 6.439  2004/06/22 14:16:55  camacho
181 * Changed invocation of posFreqsToMatrix to conform with new signature
182 *
183 * Revision 6.438  2004/06/01 20:34:06  dondosha
184 * Fix in previous change; memory leak fix
185 *
186 * Revision 6.437  2004/05/27 17:36:24  dondosha
187 * Minor fix for previous 2 changes
188 *
189 * Revision 6.436  2004/05/25 21:42:47  dondosha
190 * Fix in previous change: in some cases edit block should not be freed when BLAST_HSP is freed
191 *
192 * Revision 6.435  2004/05/21 13:53:04  dondosha
193 * Use BLAST_HSPFree to free BLAST_HSP structures, hence no need to call GapXEditBlockDelete in multiple places
194 *
195 * Revision 6.434  2004/04/22 16:40:32  dondosha
196 * Set search->subject_id to correct ordinal id, needed for finding splice junctions in HSP links at traceback stage
197 *
198 * Revision 6.433  2004/03/22 22:10:38  dondosha
199 * Use kbp_gap instead of kbp pointers in megablast traceback
200 *
201 * Revision 6.432  2004/02/26 15:52:30  papadopo
202 * Mike Gertz' modifications to unify handling of gapped Karlin blocks between protein and nucleotide searches
203 *
204 * Revision 6.431  2004/02/04 15:35:03  camacho
205 * Rollback to fix problems in release 2.2.7
206 *
207 * Revision 6.429  2004/01/30 16:54:45  dondosha
208 * Check if HSP needs to be deleted after reevaluation with ambiguities, after greedy traceback
209 *
210 * Revision 6.428  2004/01/28 16:54:03  dondosha
211 * Restored the code that shifts subject coordinates for blastn traceback with long subject sequences
212 *
213 * Revision 6.427  2004/01/25 05:06:21  dondosha
214 * Translate only relevant parts of long subject sequences for tblastn traceback
215 *
216 * Revision 6.426  2004/01/16 23:43:44  dondosha
217 * No more need for special argument for partial search: it is set in options
218 *
219 * Revision 6.425  2004/01/14 17:01:06  dondosha
220 * Gapped alignment is position based only if posMatrix exists
221 *
222 * Revision 6.424  2004/01/09 18:13:24  dondosha
223 * In [Get,Check]StartForGappedAlignment: if posMatrix not available, use square matrix for calculations
224 *
225 * Revision 6.423  2004/01/06 22:37:40  dondosha
226 * Use BLAST_HSPfree function; in particular fixes a bug with wrong memory being freed
227 *
228 * Revision 6.422  2003/12/11 23:46:28  dondosha
229 * Correction in setting hit ranges after repeats filtering
230 *
231 * Revision 6.421  2003/12/10 17:05:28  dondosha
232 * Added function ReevaluateScoreWithAmbiguities to reevaluate score for one HSP; use it after greedy traceback
233 *
234 * Revision 6.420  2003/11/24 22:06:41  madden
235 * Tblastn optimization, only fetch part of sequence needed
236 *
237 * Revision 6.419  2003/10/30 18:37:19  dondosha
238 * Fix for megablast with non-greedy traceback
239 *
240 * Revision 6.418  2003/10/29 17:46:59  dondosha
241 * Allow 2-stage greedy extension in megablast
242 *
243 * Revision 6.417  2003/08/20 22:14:08  dondosha
244 * Little correction in call to OOFBlastHSPGetNumIdentical
245 *
246 * Revision 6.416  2003/08/04 16:19:16  dondosha
247 * Added effective HSP length (length adjustment) to other returns, so it can be reported in XML output
248 *
249 * Revision 6.415  2003/05/30 17:25:36  coulouri
250 * add rcsid
251 *
252 * Revision 6.414  2003/05/23 22:12:11  camacho
253 * Fix memory leak in PsiBlast2Sequences
254 *
255 * Revision 6.413  2003/04/22 21:52:13  dondosha
256 * Added function OOFBlastHSPGetNumIdentical
257 *
258 * Revision 6.412  2003/04/10 19:21:16  dondosha
259 * Memory leak fix for megablast with limited number of HSPs per hit
260 *
261 * Revision 6.411  2003/03/24 19:42:14  madden
262 * Changes to support query concatenation for blastn and tblastn
263 *
264 * Revision 6.410  2003/03/11 14:33:48  madden
265 * Sort HSPs after array is no longer reallocated
266 *
267 * Revision 6.409  2003/02/21 02:52:16  madden
268 * Ensure stable sorting in score_compare_hsp (change from Morgulis)
269 *
270 * Revision 6.408  2003/01/24 22:26:03  camacho
271 * RPSInit is deprecated, use RPSInitEx instead
272 *
273 * Revision 6.407  2002/12/09 17:22:16  dondosha
274 * When alignment jumps beyond a strand boundary, keep the part of it where initial word is
275 *
276 * Revision 6.406  2002/12/04 23:32:50  camacho
277 * Do not set use_this_gi with nucleotide dbs (redundant)
278 *
279 * Revision 6.405  2002/12/04 18:42:22  camacho
280 * Minor change to previous commit
281 *
282 * Revision 6.404  2002/12/04 18:38:58  camacho
283 * Use correct effective search space in B2SPssmMultipleQueries
284 *
285 * Revision 6.403  2002/12/04 17:08:33  camacho
286 * Minor change to B2SPssmCleanUpSearch
287 *
288 * Revision 6.402  2002/11/27 15:41:51  dondosha
289 * Added -t, -g and -n megablast options to parse_blast_options
290 *
291 * Revision 6.401  2002/11/26 23:02:07  madden
292 * Add w option to parse_blast_options (OOF for blastx)
293 *
294 * Revision 6.400  2002/11/25 19:57:30  dondosha
295 * Further fix to the HSP limit (-H) megablast option
296 *
297 * Revision 6.399  2002/11/22 23:31:43  dondosha
298 * 1. Use array of structures instead of array of pointers for initial offset pairs;
299 * 2. Sort the HSP array when maximal number of HSPs is reached for a sequence
300 *
301 * Revision 6.398  2002/11/13 23:23:53  dondosha
302 * Correction for getting number of identities in tblastn
303 *
304 * Revision 6.397  2002/11/07 22:25:34  dondosha
305 * Correction in calculating number of identities for very long database sequences
306 *
307 * Revision 6.396  2002/11/04 23:00:54  dondosha
308 * Calculate number of identities while computing the traceback, and save it in the seqalign
309 *
310 * Revision 6.395  2002/10/22 21:03:42  camacho
311 * Calculate the effective search space correctly for rpsblast in BlastOtherReturnsPrepare
312 *
313 * Revision 6.394  2002/10/22 17:57:48  camacho
314 * Changes to B2SPssmMultipleQueries
315 *
316 * Revision 6.393  2002/10/22 15:28:45  kans
317 * SeqAlignCompare takes LIBCALLBACK
318 *
319 * Revision 6.392  2002/10/21 23:13:36  camacho
320 * Added B2SPssmOnTheFly functions
321 *
322 * Revision 6.391  2002/10/18 15:08:28  dondosha
323 * Correction in SaveCurrentHsp functions when maximal number of HSPs is reached
324 *
325 * Revision 6.390  2002/10/17 14:33:12  dondosha
326 * Correction for the maximal number of HSPs option
327 *
328 * Revision 6.389  2002/09/19 22:22:18  camacho
329 * Sanity checks in BlastTwoSequencesByLocWithCallback
330 *
331 * Revision 6.388  2002/09/16 15:54:59  camacho
332 * Turn off RedoAlignmentCore from psi-bl2seq
333 *
334 * Revision 6.387  2002/09/13 20:05:43  camacho
335 * Set the dbseq_num to 1 in BlastTwoSequencesByLocWithCallback
336 *
337 * Revision 6.386  2002/09/11 20:46:25  camacho
338 * Removed deprecated BlastSeqIdListPtr code
339 *
340 * Revision 6.385  2002/09/03 14:22:45  camacho
341 * Changes to pacify mac compiler
342 *
343 * Revision 6.384  2002/09/02 21:54:41  camacho
344 * Correction to previous revision
345 *
346 * Revision 6.383  2002/09/02 20:44:56  camacho
347 * Allow pssm rescaling if scalingFactor is non-zero
348 *
349 * Revision 6.382  2002/08/30 15:42:49  dondosha
350 * In blastn, use ewp structure only for the first context
351 *
352 * Revision 6.381  2002/08/29 19:22:20  camacho
353 * Save karlinK parameter when rescaling pssm
354 *
355 * Revision 6.380  2002/08/29 16:23:42  camacho
356 * Removed debugging code
357 *
358 * Revision 6.379  2002/08/29 15:49:56  camacho
359 * Added matrix rescaling code for psi-blast2sequences
360 *
361 * Revision 6.378  2002/08/26 16:55:52  madden
362 * Fix for scaling with translated searches
363 *
364 * Revision 6.376  2002/08/05 20:07:37  dondosha
365 * Correction for bl2seq with megablast option: convert gap info to seqalign after search
366 *
367 * Revision 6.375  2002/08/02 21:49:56  vakatov
368 * + LIBCALL
369 *
370 * Revision 6.374  2002/08/01 21:33:12  madden
371 * Do not put p-value and small_gap into SeqAlign
372 *
373 * Revision 6.373  2002/08/01 20:45:34  dondosha
374 * Changed prototype of the BLASTPostSearchLogic function to make it
375 * more convenient
376 *
377 * Revision 6.372  2002/07/18 19:40:45  dondosha
378 * Added an option to restrict number of HSPs per database sequence
379 *
380 * Revision 6.371  2002/07/11 22:31:54  camacho
381 * Added sanity check to BlastTwoSequencesByLocWithCallback with PSSM
382 *
383 * Revision 6.370  2002/07/02 17:08:01  dondosha
384 * Reverse previous change - not needed
385 *
386 * Revision 6.369  2002/07/02 01:41:31  dondosha
387 * Typo fix
388 *
389 * Revision 6.368  2002/07/02 01:36:40  dondosha
390 * For megablast use larger window in CheckStartForGappedAlignment
391 *
392 * Revision 6.367  2002/06/21 21:43:01  camacho
393 * Removed obsolete BlastSeqIdList structure and functions
394 *
395 * Revision 6.366  2002/06/13 16:51:41  madden
396 * BlastTwoSequencesCore and BlastTwoSequencesCoreEx return status instead of SearchBlk
397 *
398 * Revision 6.365  2002/06/12 20:34:50  coulouri
399 * Don't dereference possibly NULL pointer
400 *
401 * Revision 6.364  2002/06/11 20:40:05  dondosha
402 * Correction to previous change
403 *
404 * Revision 6.363  2002/06/11 14:44:46  dondosha
405 * Return status from some functions instead of search block pointer
406 *
407 * Revision 6.362  2002/05/31 16:06:20  kans
408 * changed MemSet (..., NULL, ...) to MemSet (..., 0, ...) for Mac compiler
409 *
410 * Revision 6.361  2002/05/29 17:14:49  dondosha
411 * Check whether an id found by SeqIdFindBest is indeed a gi
412 *
413 * Revision 6.360  2002/05/28 22:00:12  camacho
414 * *** empty log message ***
415 *
416 * Revision 6.359  2002/05/13 13:51:32  dondosha
417 * Made two functions public
418 *
419 * Revision 6.358  2002/05/08 22:51:11  dondosha
420 * Do the starting positions check for final gapped alignment in Mega BLAST case as well
421 *
422 * Revision 6.357  2002/04/23 20:41:21  dondosha
423 * In case of non-affine extension in megablast, check percent identity cutoff after the traceback is obtained
424 *
425 * Revision 6.356  2002/04/19 17:26:07  madden
426 * Fix for last update
427 *
428 * Revision 6.355  2002/04/18 20:16:52  madden
429 * Fix problem with FUM for SeqLoc
430 *
431 * Revision 6.354  2002/04/17 20:42:23  madden
432 * Fix typo for mask1
433 *
434 * Revision 6.353  2002/04/04 21:19:15  dondosha
435 * Corrections for megablast with non-greedy extensions
436 *
437 * Revision 6.352  2002/03/28 18:51:39  madden
438 * All threads get access to (query) masking seqloc, merge overlapping segments for seg
439 *
440 * Revision 6.351  2002/03/26 23:18:00  dondosha
441 * Duplicate mb_endpoint_results structure on all threads
442 *
443 * Revision 6.350  2002/03/26 16:49:33  madden
444 * Use scaled up/down Lambda
445 *
446 * Revision 6.349  2002/03/14 16:11:40  camacho
447 * Extended BlastTwoSequences to allow comparison between sequence and PSSM
448 *
449 * Revision 6.348  2002/03/05 17:58:56  dondosha
450 * Set same offsets for the traceback as for preliminary extension for megablast with non-greedy extensions
451 *
452 * Revision 6.347  2002/02/15 23:36:22  dondosha
453 * Correction for megablast with non-greedy extensions
454 *
455 * Revision 6.346  2002/01/11 20:14:28  madden
456 * Put the use_this_gi into the SeqAlign
457 *
458 * Revision 6.345  2002/01/07 23:16:00  dondosha
459 * Fixed several memory leaks and allocation/freeing bugs in multithreaded megablast
460 *
461 * Revision 6.344  2001/12/28 20:38:40  dondosha
462 * Moved Mega BLAST related parameters into a separate structure
463 *
464 * Revision 6.343  2001/12/13 16:06:54  dondosha
465 * Use separate mb_endpoint_results list for each of multiple threads
466 *
467 * Revision 6.342  2001/11/26 20:19:25  madden
468 * Add call to BLASTOptionValidateEx to BlastTwoSequencesWithCallback
469 *
470 * Revision 6.341  2001/11/16 15:44:26  dondosha
471 * In BlastPruneSeqAlignByGiList: retrieve bioseq only if seqid in seqalign is not a gi
472 *
473 * Revision 6.340  2001/11/14 00:31:44  camacho
474 * Updated BlastGetAllowedGis and BlastGetFirstGiofSubset functions
475 * to return the correct seqid's when dealing with the new database
476 * format and mask (subset) databases.
477 *
478 * Revision 6.339  2001/11/13 18:20:33  dondosha
479 * Use GapxEditScript structure instead of edit_script_t in higher level function calls
480 *
481 * Revision 6.338  2001/10/12 16:10:07  dondosha
482 * 1. Made BLASTResultFreeHsp public
483 * 2. Added BioseqBlastEngineCoreEx with partial search option
484 *
485 * Revision 6.337  2001/10/05 18:10:29  madden
486 * Add threshold_second to parse_blast_options
487 *
488 * Revision 6.336  2001/09/19 17:24:17  kans
489 * removed extra parameter from BioseqMegaBlastEngineCore
490 *
491 * Revision 6.335  2001/09/07 14:46:43  dondosha
492 * Roll back removal of threshold_first from functions and structures
493 *
494 * Revision 6.334  2001/09/06 20:24:33  dondosha
495 * Removed threshold_first
496 *
497 * Revision 6.333  2001/07/27 20:04:09  dondosha
498 * Small correction in passing effective db length for two sequences engine
499 *
500 * Revision 6.332  2001/07/26 18:19:03  dondosha
501 * Added a few more letter options in parse_blast_options
502 *
503 * Revision 6.331  2001/07/20 18:55:58  dondosha
504 * 1. Use effective db length option in 2 sequences engine
505 * 2. Create diagonal array for megablast when needed
506 *
507 * Revision 6.330  2001/07/09 14:17:24  madden
508 * Fix PC-lint complaints from R. Williams
509 *
510 * Revision 6.329  2001/07/09 13:12:03  madden
511 * Removed unused variables
512 *
513 * Revision 6.328  2001/06/25 18:30:24  madden
514 * Add define for NLM_GENERATED_CODE_PROTO to get prototypes in fdlobj.h
515 *
516 * Revision 6.327  2001/06/25 16:03:31  madden
517 * Comment out CheckGappedAlignmentsForOverlap
518 *
519 * Revision 6.326  2001/06/12 19:48:55  madden
520 * Introduce total_hsp_limit, check before making SeqAlign
521 *
522 * Revision 6.325  2001/06/04 21:29:42  dondosha
523 * Add message about deleted hits with e-value below the low threshold
524 *
525 * Revision 6.324  2001/05/07 13:18:24  madden
526 * Fix to really remove deleted HSPs from (culling) heap
527 *
528 * Revision 6.323  2001/05/04 19:50:45  dondosha
529 * Improved error message when all queries are shorter than word size
530 *
531 * Revision 6.322  2001/05/03 21:48:28  dondosha
532 * Handle some cases when memory allocation fails
533 *
534 * Revision 6.321  2001/04/16 21:28:11  dondosha
535 * Added function BlastPruneSeqAlignByEvalueRange
536 *
537 * Revision 6.320  2001/04/12 21:34:50  dondosha
538 * Added function BlastPruneSeqAlignByGiList
539 *
540 * Revision 6.319  2001/04/12 17:17:15  madden
541 * Fixes core-dump for small query
542 *
543 * Revision 6.318  2001/04/12 15:01:25  madden
544 * change repeat filtering db
545 *
546 * Revision 6.317  2001/04/11 20:56:06  madden
547 * Added scalingFactor for rpsblast
548 *
549 * Revision 6.316  2001/04/11 18:22:13  dondosha
550 * Copy query_slp in BlastSearchBlkDuplicate for all programs
551 *
552 * Revision 6.315  2001/04/03 21:59:49  dondosha
553 * Implemented tabulated output for non-megablast bl2seq
554 *
555 * Revision 6.314  2001/03/28 21:05:23  dondosha
556 * Set dbinfo->is_protein in other returns
557 *
558 * Revision 6.313  2001/03/27 21:27:01  madden
559 * Minor efficiency in how lookup table is made
560 *
561 * Revision 6.312  2001/03/27 21:13:56  dondosha
562 * Do not print error if OID list exists without CommonIndex
563 *
564 * Revision 6.311  2001/03/27 20:35:10  dondosha
565 * Small bug fix
566 *
567 * Revision 6.310  2001/03/26 15:03:25  madden
568 * Fix number warnings and two bugs found by PC compiler
569 *
570 * Revision 6.309  2001/03/21 15:46:32  dondosha
571 * Added missing parentheses in previous change
572 *
573 * Revision 6.308  2001/03/20 20:06:13  dondosha
574 * Added protection from crossing strand boundary for blastn
575 *
576 * Revision 6.307  2001/03/19 18:51:39  madden
577 * HitRangeToSeqLoc returns values appropriate for subsequences
578 *
579 * Revision 6.306  2001/03/12 14:53:46  dondosha
580 * Uninitialized variable corrections
581 *
582 * Revision 6.305  2001/03/08 22:05:48  dondosha
583 * Split very long database sequences in all BLAST programs
584 *
585 * Revision 6.304  2001/02/16 18:45:39  dondosha
586 * Fixed minor purify errors
587 *
588 * Revision 6.303  2001/02/08 20:41:16  dondosha
589 * Implemented tabulated output for all translated programs
590 *
591 * Revision 6.302  2001/02/07 21:12:05  dondosha
592 * 1. Added Blast Engine functions with callback argument
593 * 2. Pass output stream from options block to search
594 *
595 * Revision 6.301  2001/01/29 22:23:00  madden
596 * Do not recreate hsp_array
597 *
598 * Revision 6.300  2001/01/26 17:43:09  madden
599 * Comment out unneeded memset
600 *
601 * Revision 6.299  2001/01/23 20:25:43  dondosha
602 * 1. Renamed BlastParceInputString to BlastParseInputString
603 * 2. Recognize a double quoted string as an option value in
604 *    BlastParseInputString
605 *
606 * Revision 6.298  2001/01/23 18:23:57  madden
607 * Fix memory leak
608 *
609 * Revision 6.297  2001/01/19 16:49:37  madden
610 * Added helper array to BlastNtGappedScoreInternal
611 *
612 * Revision 6.296  2001/01/16 23:16:51  dondosha
613 * Added 2 arguments and several options to parse_blast_options
614 *
615 * Revision 6.295  2001/01/16 20:32:46  kans
616 * included simutil.h to suppress Mac error
617 *
618 * Revision 6.294  2001/01/12 17:10:04  dondosha
619 * If subject SeqLoc is on a single strand and query on both, swap the strands
620 *
621 * Revision 6.293  2001/01/11 18:34:20  dondosha
622 * Changed error level for nonexistent database from ERROR to FATAL
623 *
624 * Revision 6.292  2001/01/09 20:16:27  dondosha
625 * Implemented from-to location options for both sequences in bl2seq
626 *
627 * Revision 6.291  2001/01/05 17:12:48  dondosha
628 * Correction in previous memory leak fix
629 *
630 * Revision 6.290  2001/01/04 15:01:25  dondosha
631 * Fix for tblastx in blast two sequences engine
632 *
633 * Revision 6.289  2001/01/03 21:45:30  dondosha
634 * Fixed a memory leak - some edit blocks not freed in megablast
635 *
636 * Revision 6.288  2000/12/28 18:23:05  madden
637 * Add -P and -A to parse_blast_options
638 *
639 * Revision 6.287  2000/12/19 15:52:47  dondosha
640 * Forbid reversing query and subject for two sequences megablast
641 *
642 * Revision 6.286  2000/12/19 14:52:59  dondosha
643 * Previous change wrong
644 *
645 * Revision 6.285  2000/12/15 15:38:38  dondosha
646 * Call AdjustOffSetsInSeqAlign with correct query and subject SeqLocs
647 *
648 * Revision 6.284  2000/12/15 14:25:41  madden
649 * Optimization to BlastTranslateUnambiguousSequence
650 *
651 * Revision 6.283  2000/12/15 14:23:34  madden
652 * Use readdb_get_sequence_ex to get sequence faster
653 *
654 * Revision 6.282  2000/12/13 22:26:44  dondosha
655 * Free the ncbi4na-encoded subject sequence after search in two sequences megablast engine
656 *
657 * Revision 6.281  2000/12/13 13:51:35  madden
658 * Free SeqLocPtr in BlastSequencesOnTheFly
659 *
660 * Revision 6.280  2000/12/07 17:46:56  dondosha
661 * Call AdjustOffSetsInSeqAlign for megablast too
662 *
663 * Revision 6.279  2000/12/04 18:51:24  madden
664 * Fix memory leaks
665 *
666 * Revision 6.278  2000/11/29 23:05:00  dondosha
667 * Keep ncbi4na-encoded subject sequence in search->subject for megablast
668 *
669 * Revision 6.277  2000/11/16 19:15:31  dondosha
670 * Pass back endpoint results in other_returns for Mega BLAST with no traceback
671 *
672 * Revision 6.276  2000/11/09 17:28:35  dondosha
673 * Set block_width to 0 for Mega BLAST in BlastTwoSequences engine
674 *
675 * Revision 6.275  2000/11/08 22:21:33  dondosha
676 * Enabled new tblastn by adding a longest_intron option
677 *
678 * Revision 6.274  2000/11/08 20:20:31  dondosha
679 * Do not free subject in BlastTwoSequencesCore for new tblastn - done elsewhere
680 *
681 * Revision 6.273  2000/11/07 16:30:27  madden
682 * Introduce intermediate score (before linking of HSPs) for blastx and tblastn
683 *
684 * Revision 6.272  2000/11/03 20:15:19  dondosha
685 * Pass the subject sequence to new_link_hsps from two sequences engine
686 *
687 * Revision 6.271  2000/11/02 20:15:38  dondosha
688 * Added functions BlastTwoSequencesByLocWithCallback and BlastTwoSequencesWithCallback
689 *
690 * Revision 6.270  2000/11/02 16:36:12  madden
691 * Fixed another minor problem from merge
692 *
693 * Revision 6.269  2000/11/02 16:12:37  madden
694 * fix Errors during merge of code
695 *
696 * Revision 6.268  2000/11/01 16:25:57  madden
697 * Changes from Futamura for psitblastn
698 *
699 * Revision 6.267  2000/10/31 17:51:44  dondosha
700 * Copy the necessary search block data for multi-threaded megablast
701 *
702 * Revision 6.266  2000/10/23 22:17:54  shavirin
703 * Added creation of "no database found" message in case the database is
704 * not found.
705 *
706 * Revision 6.265  2000/10/18 19:46:29  dondosha
707 * Fixed bug in BlastTwoSequencesCore for partial subject sequence search
708 *
709 * Revision 6.264  2000/10/16 19:34:16  shavirin
710 * Added possibility to run RPS Blast search from function BioseqBlastEngineByLocEx().
711 *
712 * Revision 6.263  2000/10/13 17:32:50  shavirin
713 * Adjusted calls to readdb_get_header for ASN.1 structured deflines.
714 *
715 * Revision 6.262  2000/10/13 16:05:44  shavirin
716 * Fixed minor bug with reporting database name.
717 *
718 * Revision 6.261  2000/10/12 14:45:34  madden
719 * Break out of loop if hsp is freed
720 *
721 * Revision 6.260  2000/10/11 17:14:02  dondosha
722 * For tblastn traceback convert subject sequence to ncbi4na encoding in BlastTwoSequencesCore
723 *
724 * Revision 6.259  2000/10/10 16:11:15  shavirin
725 * Added check for NULL in the function BLASTCheckHSPInclusion().
726 *
727 * Revision 6.258  2000/10/06 19:32:02  shavirin
728 * Added call to SeqMgrAddToBioseqIndex() for created fake Bioseq.
729 *
730 * Revision 6.257  2000/10/05 22:43:10  dondosha
731 * Use mb_result_struct for Mega BLAST results in two sequences functions
732 *
733 * Revision 6.256  2000/10/05 19:57:08  dondosha
734 * In Mega BLAST, results are saved in and freed from mb_result_struct, not result_struct
735 *
736 * Revision 6.255  2000/10/03 21:28:54  shavirin
737 * Added check for search->pbp for not NULL in BlastSearchBlkDestruct().
738 *
739 * Revision 6.254  2000/09/29 21:14:47  shavirin
740 * Added additional check for inclusion of HSPs after traceback for
741 * OOF gapped alignment case.
742 *
743 * Revision 6.253  2000/09/28 14:57:50  dondosha
744 * Initialize exact match array for megablast in BlastHitListNew
745 *
746 * Revision 6.252  2000/09/25 15:43:36  madden
747 * Fix for rpsblast, too high expect values getting through
748 *
749 * Revision 6.251  2000/09/14 15:05:46  dondosha
750 * For new tblastn, reset evalues to individual ones before relinking HSPs
751 *
752 * Revision 6.250  2000/09/07 13:41:42  madden
753 * Fix if first start is -1 in DenseSeg
754 *
755 * Revision 6.249  2000/09/01 18:29:12  dondosha
756 * Removed calls to ReadDBFreeSharedInfo and ReadDBCloseMHdrAndSeqFiles
757 *
758 * Revision 6.248  2000/08/31 18:37:21  shavirin
759 * Added check for NULL in BlastMakeCopyQueryDNAP().
760 *
761 * Revision 6.247  2000/08/31 16:55:17  shavirin
762 * Fixed problem with OOF alignment of negative strand HSPs.
763 *
764 * Revision 6.246  2000/08/28 21:53:12  shavirin
765 * Added function BlastOtherReturnsFree(). Cleaned memory in case of
766 * tweak_parameters = TRUE. (Freed SeqAlign calculated before RedoAlignmentCore.)
767 *
768 * Revision 6.245  2000/08/22 20:02:27  dondosha
769 * Previous change not quite right: use real subject length for all programs
770 *
771 * Revision 6.244  2000/08/22 19:42:25  dondosha
772 * Divide search->subject->length by 3 for tblastn in RealBlastGetGappedAlignmentTraceback
773 *
774 * Revision 6.243  2000/08/18 21:27:59  madden
775 * undo change 6.240 when smith_waterman is not set, the extra alignment is needed when only tweak_parameters is set
776 *
777 * Revision 6.242  2000/08/18 20:12:29  dondosha
778 * Do not use search->query_id in megablast, use only qid_array
779 *
780 * Revision 6.241  2000/08/08 21:43:35  shavirin
781 * Initialized GapAlignBlkPtr for the value of discontinuous parameters.
782 *
783 * Revision 6.240  2000/08/03 22:25:36  shavirin
784 * Removed redundant gapped Traceback in case when tweak_parameters or
785 * smith_waterman is set.
786 *
787 * Revision 6.239  2000/07/31 23:08:13  dondosha
788 * Do not go over the end of the HSP in subject sequence when computing start for gapped alignment
789 *
790 * Revision 6.238  2000/07/25 18:12:03  shavirin
791 * WARNING: This is a no-turning-back change related to S&W Blast from
792 * Alejandro Schaffer
793 *
794 * Revision 6.237  2000/07/25 16:54:26  shavirin
795 * Corrected functions initializing gap_align in case of OOF gapping.
796 *
797 * Revision 6.236  2000/07/18 22:33:02  shavirin
798 * Adjusted start for gapped alignment in OOF case.
799 *
800 * Revision 6.235  2000/07/17 14:26:08  shavirin
801 * Added support for Out of frame gapping.
802 *
803 * Revision 6.234  2000/07/13 18:33:28  madden
804 * Fix for exploded hits with pdb
805 *
806 * Revision 6.233  2000/07/11 18:38:02  madden
807 * decreased size of helper array, added prefetch to BlastGappedScoreInternal
808 *
809 * Revision 6.232  2000/07/10 15:23:30  dondosha
810 * Moved check query_invalid from BlastTwoSequencesCoreEx to BlastTwoSequencesCore
811 *
812 * Revision 6.231  2000/07/10 15:06:23  madden
813 * Use helper array in BlastGappedScoreInternal to reduce cache misses
814 *
815 * Revision 6.230  2000/06/30 17:52:44  madden
816 * Move AWAKE_THR_MIN_SIZE to blastdef.h
817 *
818 * Revision 6.229  2000/06/29 21:27:02  dondosha
819 * Fixed memory leaks in culling by similarity
820 *
821 * Revision 6.228  2000/06/29 19:19:39  madden
822 * Fix minus strand offset in BlastConvertDNASeqLoc
823 *
824 * Revision 6.227  2000/06/26 20:15:34  shavirin
825 * Fixed coordinates transfer in the function BlastConvertDNASeqLoc().
826 *
827 * Revision 6.226  2000/06/23 20:17:42  madden
828 * Optimization for CheckGappedAlignmentsForOverlap (remove n-squared hsp check)
829 *
830 * Revision 6.225  2000/06/23 15:22:43  madden
831 * Fix problem with removing translated hits with different frames
832 *
833 * Revision 6.224  2000/06/21 18:02:25  dondosha
834 * In BlastSaveCurrentHspGapped no need to allocate new memory for hsp_array
835 *
836 * Revision 6.223  2000/06/21 15:10:27  madden
837 * efficiency in BlastGappedScoreInternal
838 *
839 * Revision 6.222  2000/06/21 12:53:22  madden
840 * Do each frame separately in CheckGappedScoreInternal for efficiency
841 *
842 * Revision 6.221  2000/06/20 16:45:36  dondosha
843 * Fixed a minor bug in revision 6.219
844 *
845 * Revision 6.220  2000/06/19 20:07:19  madden
846 * Skip transferring sequence to blastna format
847 *
848 * Revision 6.219  2000/06/19 19:16:19  dondosha
849 * Optimized reallocation of hsp array when it is overflowing
850 *
851 * Revision 6.218  2000/06/15 15:31:26  dondosha
852 * Added two sequences BLAST functions returning SearchBlk instead of SeqAlign;added code to cluster hits and keep only one hit per cluster - disabled so far; enabled two sequences BLAST for tblastn
853 *
854 * Revision 6.217  2000/06/13 20:54:38  shavirin
855 * Added return of EFF_SEARCH_SPACE in the function BlastOtherReturnsPrepare
856 *
857 * Revision 6.216  2000/06/08 20:34:15  madden
858 * add explode_seqids option to show all ids in a defline
859 *
860 * Revision 6.215  2000/05/24 20:53:48  dondosha
861 * Fixed a bug in previous change
862 *
863 * Revision 6.214  2000/05/24 19:49:07  dondosha
864 * Create qid_array for the new search in BlastSearchDuplicate, if megablast
865 *
866 * Revision 6.213  2000/05/22 19:49:35  dondosha
867 * Initialize vnp to NULL in BlastSeqLocFilterEx
868 *
869 * Revision 6.212  2000/05/16 20:00:02  madden
870 * fix for formatting db names
871 *
872 * Revision 6.211  2000/05/12 19:41:54  dondosha
873 * Free qid_array in BlastSearchBlkDestruct
874 *
875 * Revision 6.210  2000/05/05 20:10:22  madden
876 * Add vecscreen filtering capability
877 *
878 * Revision 6.209  2000/04/29 18:55:53  wheelan
879 * temporary fix for BlastTwoSequences NULL return problem
880 *
881 * Revision 6.208  2000/04/28 16:52:31  madden
882 * Fix for ungapped search of subset databases
883 *
884 * Revision 6.207  2000/04/10 17:26:28  madden
885 * Add BLASTResultFreeHsp to free memory as it is no longer needed
886 *
887 * Revision 6.206  2000/04/10 15:24:49  dondosha
888 * Enabled use of MegaBlast for BlastTwoSequences
889 *
890 * Revision 6.205  2000/04/07 16:57:45  shavirin
891 * Transfered queue parameters in BlastSearchBlkDuplicate() function.
892 *
893 * Revision 6.204  2000/04/06 17:33:57  madden
894 * Check if pointer is NULL in BlastGetAllowedGis
895 *
896 * Revision 6.203  2000/04/03 21:23:18  dondosha
897 * Do not construct ewp_params and ewp for MegaBlast search
898 *
899 * Revision 6.202  2000/04/03 20:05:27  madden
900 * Free lh_helper on tmp_hitlist, fixes leak
901 *
902 * Revision 6.201  2000/03/31 19:11:06  dondosha
903 * Changed some names related to MegaBlast
904 *
905 * Revision 6.200  2000/03/31 16:45:43  dondosha
906 * Enabled blastx for BlastTwoSequences search
907 *
908 * Revision 6.199  2000/03/30 21:44:22  madden
909 * Add BLASTResultHitlistFreeEx that checks Heap integrity
910 *
911 * Revision 6.198  2000/03/29 22:18:02  dondosha
912 * Moved adjustment of offsets in blastn to BlastSaveCurrentHitlist, added gap info processing for MegaBlast
913 *
914 * Revision 6.197  2000/03/22 17:58:54  dondosha
915 * Duplicate entire list of query_ids in BlastSearchBlkDuplicate
916 *
917 * Revision 6.196  2000/03/08 20:34:30  madden
918 * Add BlastGetFirstGiofSubset, BlastGetAllowedGis returns primary SeqId
919 *
920 * Revision 6.195  2000/03/03 18:15:52  dondosha
921 * Fixed bugs and memory leaks in MegaBlast related code
922 *
923 * Revision 6.194  2000/03/03 17:58:23  shavirin
924 * Added new function BlastConvertDNASeqLoc()
925 *
926 * Revision 6.193  2000/03/01 14:37:45  dondosha
927 * Adjust query offsets after search for all 3 versions of blastn
928 *
929 * Revision 6.192  2000/02/29 18:06:07  dondosha
930 * In case of MegaBlast save correct query ids in seqaligns
931 *
932 * Revision 6.191  2000/02/24 23:21:27  dondosha
933 * Adjust context offsets before gapped alignment to avoid strand crossover
934 *
935 * Revision 6.190  2000/02/23 20:51:05  dondosha
936 * Modifications for blastn to concatenate strands - handling of query offsets
937 *
938 * Revision 6.189  2000/02/17 21:23:10  shavirin
939 * Added parameter is_rps_blast.
940 *
941 * Revision 6.188  2000/02/17 19:02:09  shavirin
942 * Removed all references to obsolete theCacheSize variable.
943 *
944 * Revision 6.187  2000/02/17 18:30:56  shavirin
945 * Added translated DNA filtering for RPS Blast
946 *
947 * Revision 6.186  2000/02/17 14:38:27  madden
948 * Duplicate filter_string for multiple threads
949 *
950 * Revision 6.185  2000/02/16 21:49:16  shavirin
951 * Fixed some memory leaks.
952 *
953 * Revision 6.184  2000/02/15 19:16:26  shavirin
954 * MemFree(pbp->filter_string) in BlastSearchBlkDestruct
955 *
956 * Revision 6.183  2000/02/14 16:15:50  madden
957 * Revert to 6.179
958 *
959 * Revision 6.182  2000/02/11 22:03:03  shavirin
960 * Returned back previous change.
961 *
962 * Revision 6.181  2000/02/11 21:25:58  shavirin
963 * Removed call to BlastLinkHsps() function for tblastn program.
964 *
965 * Revision 6.180  2000/02/11 20:45:54  dondosha
966 * Adjust the second strand offsets after blastn search
967 *
968 * Revision 6.179  2000/02/11 16:40:53  egorov
969 * The parse_blast_options is made public.
970 *
971 * Revision 6.178  2000/02/04 22:31:38  kans
972 * test subject_bsp for NULL before dereferencing in BlastTwoSequencesByLocEx
973 *
974 * Revision 6.177  2000/02/04 16:13:15  shavirin
975 * Returned changes done in Revision 6.172.
976 *
977 * Revision 6.176  2000/02/02 18:22:05  madden
978 * Free memory for LinkHelpStruct
979 *
980 * Revision 6.175  2000/02/01 22:13:26  dondosha
981 * Added code related to greedy basic gapped alignment
982 *
983 * Revision 6.174  2000/01/28 16:45:53  madden
984 * HitRangeToSeqLoc called with combine TRUE
985 *
986 * Revision 6.173  2000/01/26 22:01:56  madden
987 * Add function BlastGetProgramName
988 *
989 * Revision 6.172  2000/01/14 18:28:11  shavirin
990 * Some WordExtention* functions made external.
991 *
992 * Revision 6.171  2000/01/12 21:46:19  dondosha
993 * Minor memory leak clean-up (routine BlastSeqLocFilterEx)
994 *
995 * Revision 6.170  2000/01/12 18:54:44  madden
996 * Do not free bestid to fix problem
997 *
998 * Revision 6.169  2000/01/11 17:12:51  shavirin
999 * Added handling of the new parameter theCacheSize.
1000 *
1001 * Revision 6.168  2000/01/11 15:32:47  dondosha
1002 * Fixed memory leaks in opening shared header and sequence file memory maps
1003 *
1004 * Revision 6.167  2000/01/04 21:56:59  madden
1005 * Add NULLB to both ends of db sequence before gap extend, use dynamic buffer for blast options in repeat filtering
1006 *
1007 * Revision 6.166  2000/01/03 17:38:33  shavirin
1008 * Added check for rdfp in BlastGetAllowedGis() function.
1009 *
1010 * Revision 6.165  1999/12/31 14:23:20  egorov
1011 * Add support for using mixture of real and mask databases with gi-list files:
1012 * 1. Change logic of creating rdfp list.
1013 * 2. BlastGetDbChunk gets real databases first, then masks.
1014 * 3. Proper calculation of database sizes using alias files.
1015 * 4. Change to CommonIndex to support using of mask databases.
1016 * 5. Use correct gis in formatted output (BlastGetAllowedGis()).
1017 * 6. Other small changes
1018 *
1019 * Revision 6.164  1999/12/22 22:00:35  dondosha
1020 * Destruct the header and sequence memory maps separately before destructing the search structure
1021 *
1022 * Revision 6.163  1999/12/22 21:08:36  shavirin
1023 * Rewritten function BlastNewFindWords() added function BlastNewFindWordsEx()
1024 *
1025 * Revision 6.160  1999/12/21 20:02:45  egorov
1026 * Fix memory leak.
1027 *
1028 * Revision 6.159  1999/12/17 22:22:57  madden
1029 * New masking parameters from Wojtek
1030 *
1031 * Revision 6.158  1999/12/16 19:08:36  egorov
1032 * Check rdfp for NULL before using.  Bug reported by Patrick and Sergei Sh.
1033 *
1034 * Revision 6.157  1999/12/15 17:42:26  egorov
1035 * Change BlastGetAllowedGis() to handle gis belonging to a database alias.
1036 *
1037 * Revision 6.156  1999/12/13 21:53:02  madden
1038 * Some fixes for repeat masking
1039 *
1040 * Revision 6.155  1999/11/26 22:11:26  madden
1041 * Added BlastNT functions for nucl. extensions
1042 *
1043 * Revision 6.154  1999/11/24 15:21:38  egorov
1044 * Avoid GCC warning
1045 *
1046 * Revision 6.153  1999/11/09 14:14:12  madden
1047 * Start alive thread for masking only if query is above min size
1048 *
1049 * Revision 6.152  1999/11/02 15:32:36  madden
1050 * Allow setting of repeat filtering options and database
1051 *
1052 * Revision 6.151  1999/11/01 20:18:22  egorov
1053 * New format of filter_string
1054 *
1055 * Revision 6.150  1999/10/27 21:33:02  madden
1056 * Use housekeeping threads only for larger sequences
1057 *
1058 * Revision 6.149  1999/10/18 20:06:52  shavirin
1059 * evalue_compare_hits() : In case of equal scores and E-values order
1060 * will be determined by subject id
1061 *
1062 * Revision 6.148  1999/10/18 16:15:04  egorov
1063 * Bug fixed
1064 *
1065 * Revision 6.147  1999/10/15 20:52:10  shavirin
1066 * Fixed bug with seq_id_list initialization
1067 *
1068 * Revision 6.146  1999/10/12 21:50:47  shavirin
1069 * Added initialization of db_chunk_size in BlastThrInfoNew().
1070 *
1071 * Revision 6.145  1999/10/05 17:42:55  shavirin
1072 * Removed global variables from blast.c
1073 *
1074 * Revision 6.144  1999/10/01 18:26:56  madden
1075 * Check for search->rdfp before search->rdfp->oidlist
1076 *
1077 * Revision 6.143  1999/09/28 20:14:33  madden
1078 * Joerg changes to minimize cache misses
1079 *
1080 * Revision 6.142  1999/09/22 20:58:49  egorov
1081 * OID list change
1082 *
1083 * Revision 6.141  1999/09/16 16:55:12  madden
1084 * Changes for long words in blastn
1085 *
1086 * Revision 6.140  1999/09/03 17:23:25  madden
1087 * Fixed bug in CheckStartForGappedAlignment
1088 *
1089 * Revision 6.139  1999/09/01 19:21:06  shavirin
1090 * Added propagation of the score for discontinuous alignment in
1091 * functions: RealBlastGetGappedAlignmentTraceback() and BioseqBlastEngineCore()
1092 *
1093 * Revision 6.138  1999/08/27 18:07:34  shavirin
1094 * Passed parameter decline_align from top to the engine.
1095 *
1096 * Revision 6.137  1999/08/20 20:54:12  madden
1097 * place sentinel byte at beginning of nt sequence for ALIGN
1098 *
1099 * Revision 6.136  1999/08/20 19:48:13  madden
1100 * Changed call to BlastSearchBlkNew(Extra), removed use of version array
1101 *
1102 * Revision 6.135  1999/08/20 16:35:25  shavirin
1103 * Added protection against invalid program name in BlastGetTypes().
1104 *
1105 * Revision 6.134  1999/08/06 18:53:57  madden
1106 * Added calls to lookup_position_aux_destruct
1107 *
1108 * Revision 6.133  1999/08/05 19:01:29  madden
1109 * Add check for NULL search or invalid query in BlastTwoSequencesCore
1110 *
1111 * Revision 6.132  1999/07/01 13:03:24  sicotte
1112 * Updated for DenseDiag and Moved seqalign_reverse_strand from blastutl.c(blast.h) to SeqAlignListReverseStrand in salpedit.ch and fixed call in salutil.c
1113 *
1114 * Revision 6.131  1999/06/24 17:24:12  madden
1115 * Fix bug in GetSeqAlignCount when SeqAlignPtr is NULL
1116 *
1117 * Revision 6.130  1999/06/18 21:17:58  madden
1118 * Check that an exact match gives a positive value when making words for blast2seqs
1119 *
1120 * Revision 6.129  1999/06/14 15:20:26  madden
1121 * Produce temporary BLAST_HitList to fix blastx core-dump
1122 *
1123 * Revision 6.128  1999/05/27 17:33:05  madden
1124 * Fixed Int2 (should have been Int4) problem
1125 *
1126 * Revision 6.127  1999/05/25 13:37:49  madden
1127 * Make smallest float 1.0e-180
1128 *
1129 * Revision 6.126  1999/05/19 12:44:00  madden
1130 * Change in longest_db_seq for multiple db search
1131 *
1132 * Revision 6.125  1999/05/13 13:48:11  madden
1133 * Only filter out hits if on same strand
1134 *
1135 * Revision 6.124  1999/04/15 13:24:35  madden
1136 * Fix for sum stats problems
1137 *
1138 * Revision 6.123  1999/04/13 19:16:47  madden
1139 * Check that two HSPs are on same strand before deleting one
1140 *
1141 * Revision 6.122  1999/04/12 20:24:54  egorov
1142 * Fix MT problem
1143 *
1144 * Revision 6.121  1999/04/01 21:42:46  madden
1145 * Fix memory leaks when gi list is used
1146 *
1147 * Revision 6.120  1999/04/01 14:18:58  madden
1148 * Fixed memory leaks with gi_list
1149 *
1150 * Revision 6.119  1999/03/31 15:46:52  madden
1151 * Removed unused code and variables
1152 *
1153 * Revision 6.118  1999/03/17 13:21:06  madden
1154 * Fix comment in comment problem
1155 *
1156 * Revision 6.117  1999/03/16 19:27:36  egorov
1157 * More type castings
1158 *
1159 * Revision 6.116  1999/03/12 17:19:59  egorov
1160 * More type casting fixes
1161 *
1162 * Revision 6.115  1999/03/12 15:03:45  egorov
1163 * Add proper Int4-long type casting
1164 *
1165 * Revision 6.114  1999/03/04 14:18:09  egorov
1166 * Do correct filter masking when query is seqloc
1167 * The only BlastMaskTheResidues() function is changed:
1168 *
1169 * Revision 6.113  1999/02/22 21:59:05  madden
1170 * binary search in GetAllowedGis function
1171 *
1172 * Revision 6.112  1999/02/22 17:32:46  madden
1173 * Fix memory leak
1174 *
1175 * Revision 6.111  1999/02/18 21:18:23  madden
1176 * Optimization
1177 *
1178 * Revision 6.110  1999/02/17 13:23:01  madden
1179 * Added hsp_num_max
1180 *
1181 * Revision 6.109  1999/02/11 13:53:46  madden
1182 * Added combine Boolean to HitRangeToSeqLoc, fixed mem leak
1183 *
1184 * Revision 6.108  1999/01/28 17:20:57  madden
1185 * Check do_sum_stats for linking, Int2 to Int4, UMR
1186 *
1187 * Revision 6.107  1999/01/28 16:05:49  madden
1188 * HspArrayPurge change, HSPs saved more efficiently
1189 *
1190 * Revision 6.106  1999/01/26 18:27:23  madden
1191 * handle delta sequences correctly
1192 *
1193 * Revision 6.105  1999/01/26 17:59:26  madden
1194 * ContextToFrame no longer static
1195 *
1196 * Revision 6.104  1999/01/25 21:31:25  madden
1197 * Check for illegal chars when nucl. query is translated
1198 *
1199 * Revision 6.103  1999/01/25 19:04:37  madden
1200 * prevent core-dump when query is empty
1201 *
1202 * Revision 6.102  1999/01/20 21:05:33  madden
1203 * Look for repeats on both strands
1204 *
1205 * Revision 6.101  1999/01/19 13:29:24  madden
1206 * Change to HspArrayPurge
1207 *
1208  * Revision 6.100  1998/12/31 18:17:08  madden
1209  * Added strand option
1210  *
1211  * Revision 6.99  1998/12/31 15:36:07  victorov
1212  * filtering internals is now based on SeqLoc instead of Bioseq
1213  *
1214  * Revision 6.98  1998/12/18 16:20:18  madden
1215  * efficiencies
1216  *
1217  * Revision 6.97  1998/12/15 14:11:29  madden
1218  * Change to permit an arbitrary number of HSPs
1219  *
1220  * Revision 6.96  1998/11/30 15:58:20  madden
1221  * Added CheckStartForGappedAlignment
1222  *
1223  * Revision 6.95  1998/11/27 15:24:12  madden
1224  * Duplicated handle_results and query_id if SearchBlk duplicated
1225  *
1226  * Revision 6.94  1998/11/16 17:39:23  kans
1227  * added FALSE for new parameter to FilterCC
1228  *
1229  * Revision 6.93  1998/11/06 14:13:01  madden
1230  * Added call to AdjustOffSetsInSeqAlign in BioseqBlastEngineByLocEx
1231  *
1232  * Revision 6.92  1998/10/21 13:44:16  madden
1233  * Fixed UMR found by purify
1234  *
1235  * Revision 6.91  1998/10/20 19:57:21  madden
1236  * Run dust if filtering is selected for nt
1237  *
1238  * Revision 6.90  1998/10/13 20:37:53  madden
1239  * Use IS_residue after call to SeqPortGetResidue
1240  *
1241  * Revision 6.89  1998/09/24 15:26:38  egorov
1242  * Fix lint complaints
1243  *
1244  * Revision 6.88  1998/09/16 19:00:16  madden
1245  * Added subset Boolean
1246  *
1247  * Revision 6.87  1998/09/15 13:12:29  madden
1248  * Fixed memory leak
1249  *
1250  * Revision 6.86  1998/09/14 15:11:18  egorov
1251  * Add support for Int8 length databases; remove unused variables
1252  *
1253  * Revision 6.85  1998/09/04 20:48:48  madden
1254  * typo fix (= instead of ==)
1255  *
1256  * Revision 6.84  1998/09/03 20:23:42  madden
1257  * Copied seq_ext and seq_ext_type in MakeFakeBioseq
1258  *
1259  * Revision 6.83  1998/09/03 19:41:09  madden
1260  * do not switch sequences for Blast2Sequences if filtering is performed
1261  *
1262  * Revision 6.82  1998/08/24 14:59:59  madden
1263  * readdb_get_sequence_ex function
1264  *
1265  * Revision 6.81  1998/07/30 19:00:56  madden
1266  * Fix memory leak
1267  *
1268  * Revision 6.80  1998/07/29 21:29:45  madden
1269  * Fixed UMR with longest_db_seq that showed up in Blast 2 sequences
1270  *
1271  * Revision 6.79  1998/07/28 21:18:35  madden
1272  * Change to BLAST_ExtendWordParamsNew saves memory
1273  *
1274  * Revision 6.78  1998/07/24 14:58:53  madden
1275  * Jinqhuis call to SeqLocRevCmp put back
1276  *
1277  * Revision 6.77  1998/07/22 20:31:51  madden
1278  * Replaced cutvalue of 1000000 with INT4_MAX
1279  *
1280  * Revision 6.76  1998/07/22 12:17:03  madden
1281  * Added BioseqHitRange call for repeat filtering
1282  *
1283  * Revision 6.75  1998/07/21 20:58:10  madden
1284  * Changes to allow masking at hash only
1285  *
1286  * Revision 6.74  1998/07/20 15:51:28  zjing
1287  * add a check for plus-minus before SeqLocRevCmp
1288  *
1289  * Revision 6.73  1998/07/17 15:39:59  madden
1290  * Changes for Effective search space.
1291  *
1292  * Revision 6.72  1998/07/14 21:31:43  madden
1293  * Fix for incorrectly sorted HSP bug and speed-up of CheckHspOverlap
1294  *
1295  * Revision 6.71  1998/07/06 13:39:04  madden
1296  * Fixed improper use of Int4 in parse_seg_options
1297  *
1298  * Revision 6.70  1998/07/02 21:00:39  egorov
1299  * Remove memory leak in threaded version
1300  *
1301  * Revision 6.69  1998/06/12 22:09:14  madden
1302  * Added call to SegParamsFree
1303  *
1304  * Revision 6.68  1998/06/12 16:08:51  madden
1305  * BlastHitRange stuff
1306  *
1307  * Revision 6.67  1998/06/08 15:07:32  madden
1308  * Fixed bug in BlastConvertProteinSeqLoc
1309  *
1310  * Revision 6.66  1998/06/04 16:23:17  madden
1311  * Use new seg
1312  *
1313  * Revision 6.65  1998/05/28 19:59:58  madden
1314  * Zhengs new culling code
1315  *
1316  * Revision 6.64  1998/05/22 20:20:38  madden
1317  * Added BlastTwoSequencesByLocEx and BlastTwoSequencesEx
1318  *
1319  * Revision 6.63  1998/05/18 17:58:31  madden
1320  * fixed parsing of coil-coil options, added parsing of dust options
1321  *
1322  * Revision 6.62  1998/05/17 16:28:41  madden
1323  * Allow changes to filter options and cc filtering.
1324  *
1325  * Revision 6.61  1998/05/05 14:05:35  madden
1326  * Added functions BlastStartAwakeThread and BlastStopAwakeThread
1327  *
1328  * Revision 6.60  1998/04/28 21:04:19  madden
1329  * Reset number of HSPs to zero if relinking
1330  *
1331  * Revision 6.59  1998/04/24 21:52:09  madden
1332  * Protection against NULL pointers
1333  *
1334  * Revision 6.58  1998/04/24 19:10:59  egorov
1335  * Fix bug when if wordsize == 2 blastall produces extra alignments
1336  *
1337  * Revision 6.57  1998/04/23 21:15:09  egorov
1338  * Show exact matching even if score is below threshold (case of two sequences)
1339  *
1340  * Revision 6.56  1998/04/15 20:24:54  madden
1341  * BlastMaskTheResidues optimized
1342  *
1343  * Revision 6.55  1998/04/10 17:46:58  madden
1344  * Changed FALSE to NULL in BioseqSeg
1345  *
1346  * Revision 6.54  1998/04/02 21:12:55  madden
1347  * Properly set value for linking HSPs in blastx and tblastn
1348  *
1349  * Revision 6.53  1998/04/01 22:47:35  madden
1350  * Check for query_invalid flag
1351  *
1352  * Revision 6.52  1998/03/26 14:20:20  madden
1353  * Changed GetScoreSetFromBlastResultHsp1 from static to LIBCALL
1354  *
1355  * Revision 6.51  1998/03/25 22:28:16  madden
1356  * Changes to allow random access BLAST by gi
1357  *
1358  * Revision 6.50  1998/03/24 15:38:25  madden
1359  * Use BlastDoubleInt4Ptr to keep track of gis and ordinal_ids
1360  *
1361  * Revision 6.49  1998/03/19 22:16:24  madden
1362  * Changes to allow blasting by gi list
1363  *
1364  * Revision 6.48  1998/03/18 14:14:11  madden
1365  * Support random access by gi list
1366  *
1367  * Revision 6.47  1998/03/16 17:41:59  madden
1368  * Fixed leaks
1369  *
1370  * Revision 6.46  1998/03/14 18:28:10  madden
1371  * Added BioseqBlastEngineEx
1372  *
1373  * Revision 6.45  1998/03/09 16:35:10  madden
1374  * Fixed bug with tblastn and blastx gapped searches
1375  *
1376  * Revision 6.44  1998/02/27 14:32:33  madden
1377  * Functions moved to blastool.c
1378  *
1379  * Revision 6.43  1998/02/26 22:34:27  madden
1380  * Changes for 16 bit windows
1381  *
1382  * Revision 6.42  1998/02/26 19:12:39  madden
1383  *  Removed AdjustOffSetsInSeqAlign, added BlastNtFindWords BlastPopulateAllWordArrays BlastFindWords and BlastNewFindWords
1384  *
1385  * Revision 6.41  1998/02/24 22:47:06  madden
1386  * Fixed problem with Option validation
1387  *
1388  * Revision 6.40  1998/02/23 16:09:57  madden
1389  * Corrected from offset for subject in tblastx search
1390  *
1391  * Revision 6.39  1998/02/19 17:17:05  madden
1392  * Use of Int4 rather than Int2 when pruning SeqAlign
1393  *
1394  * Revision 6.38  1998/02/12 21:50:39  madden
1395  * protection against NULL hitlist in blastx and tblastn
1396  *
1397  * Revision 6.37  1998/02/11 17:18:19  madden
1398  * Made BlastGetGappedAlignmentTraceback functions to BlastGetGapAlgnTbck (shorter than 32 chars)
1399  *
1400  * Revision 6.36  1998/01/31 21:34:09  madden
1401  * Fix to SeqAlign pruning
1402  *
1403  * Revision 6.35  1998/01/06 18:26:22  madden
1404  * Use SeqLocLen rather than bsp->length, wordsize done properly for nucl
1405  *
1406  * Revision 6.34  1998/01/05 22:41:40  madden
1407  * Added seqalign_reverse_strand
1408  *
1409  * Revision 6.33  1998/01/05 20:53:16  madden
1410  * Added ability to align minus-minus or plus-minus in BlastTwoSeqsByLoc
1411  *
1412  * Revision 6.32  1998/01/05 16:46:55  madden
1413  * One or both strands can be searched, as opposed to only both, changes to number of contexts
1414  *
1415  * Revision 6.31  1997/12/31 17:52:09  madden
1416  * Change to BLAST_WordFinderNew
1417  *
1418  * Revision 6.30  1997/12/23 19:16:52  madden
1419  * Minor efficiency in ExtendWordExit
1420  *
1421  * Revision 6.29  1997/12/23 18:12:34  madden
1422  * Changes for range-dependent blast
1423  *
1424  * Revision 6.28  1997/12/12 20:38:55  madden
1425  * ContextToFrame lost last parameter, fix to sprintf
1426  *
1427  * Revision 6.27  1997/12/11 22:22:24  madden
1428  * Proper casting of variables
1429  *
1430  * Revision 6.26  1997/12/10 22:43:09  madden
1431  * proper casting
1432  *
1433  * Revision 6.25  1997/12/01 22:07:10  madden
1434  * Changed call to BLASTOptionValidateEx
1435  *
1436  * Revision 6.24  1997/11/28 18:19:33  madden
1437  * Changes to TxDfDbInfoNew
1438  *
1439  * Revision 6.23  1997/11/18 22:23:20  madden
1440  * Added BLASTOptionSetGapParams
1441  *
1442  * Revision 6.22  1997/11/14 17:15:29  madden
1443  * Realign matches when they contain ambiguities in blastx/tblastn
1444  *
1445  * Revision 6.21  1997/11/07 00:49:02  madden
1446  * Added call to BLAST_MatrixFill
1447  *
1448  * Revision 6.20  1997/10/29 22:11:13  madden
1449  * ABS value of frames
1450  *
1451  * Revision 6.19  1997/10/24 20:44:52  madden
1452  * Removed BlastSetReadDB and BlastGetReadDB_ID
1453  *
1454  * Revision 6.18  1997/10/22 21:46:34  madden
1455  * Changed default values
1456  *
1457  * Revision 6.17  1997/10/21 20:39:18  madden
1458  * Fix for more alignments than descriptions.
1459  *
1460  * Revision 6.16  1997/10/21 19:50:00  madden
1461  * Fix for no valid query sequence and hitlist_max of 1
1462  *
1463  * Revision 6.15  1997/10/03 21:27:28  madden
1464  * Added BlastGetTypes
1465  *
1466  * Revision 6.14  1997/10/02 17:29:29  madden
1467  * Added PrintDbInformationBasic
1468  *
1469  * Revision 6.13  1997/10/01 13:35:31  madden
1470  * Changed BLAST_VERSION to BLAST_ENGINE_VERSION
1471  *
1472  * Revision 6.12  1997/09/30 20:03:07  madden
1473  * Saved db filename in dbinfo
1474  *
1475  * Revision 6.11  1997/09/24 22:36:35  madden
1476  * Fixes for MT multidb searches
1477  *
1478  * Revision 6.10  1997/09/23 16:43:41  madden
1479  * removed unneeded DenseSegPtr
1480  *
1481  * Revision 6.9  1997/09/22 18:18:35  madden
1482  * Added umlaut to Schaffer in reference
1483  *
1484  * Revision 6.8  1997/09/18 22:22:03  madden
1485  * Added prune functions
1486  *
1487  * Revision 6.7  1997/09/16 16:54:09  kans
1488  * return FALSE instead of NULL for Boolean value
1489  *
1490  * Revision 6.6  1997/09/16 16:31:28  madden
1491  * More changes for multiple db runs
1492  *
1493  * Revision 6.5  1997/09/11 18:49:31  madden
1494  * Changes to enable searches against multiple databases.
1495  *
1496  * Revision 6.4  1997/09/10 21:28:00  madden
1497  * Changes to set CPU limits
1498  *
1499  * Revision 6.3  1997/09/08 16:25:32  madden
1500  * Fixed bug that did not mask low-complexity regions at the end of a query
1501  *
1502  * Revision 6.2  1997/08/27 14:46:51  madden
1503  * Changes to enable multiple DB searches
1504  *
1505  * Revision 6.1  1997/08/26 15:05:26  madden
1506  * Fix for negative effective search space
1507  *
1508  * Revision 6.0  1997/08/25 18:52:49  madden
1509  * Revision changed to 6.0
1510  *
1511  * Revision 1.105  1997/08/22 18:37:43  madden
1512  * Added function BlastOtherReturnsPrepare
1513  *
1514  * Revision 1.104  1997/08/20 21:43:34  madden
1515  * Added page numbers
1516  *
1517  * Revision 1.103  1997/08/14 21:07:08  madden
1518  * ignored gapped for tblastx
1519  *
1520  * Revision 1.102  1997/08/14 14:30:35  madden
1521  * BlastNewFindWords called with range set for ranged blast
1522  *
1523  * Revision 1.101  1997/07/31 21:18:11  madden
1524  * Removed left-over file from seg
1525  *
1526  * Revision 1.100  1997/07/30 16:39:30  madden
1527  * Print gap existence and extension parameters for blastn
1528  *
1529  * Revision 1.99  1997/07/30 16:31:37  madden
1530  * tblastx prepares StdSeg
1531  *
1532  * Revision 1.98  1997/07/29 17:07:27  madden
1533  * better tblastx error messages.
1534  *
1535  * Revision 1.97  1997/07/25 15:39:49  madden
1536  * Corrected citation
1537  *
1538  * Revision 1.96  1997/07/25 13:47:46  madden
1539  * Made buffer longer to avoid ABR
1540  *
1541  * Revision 1.95  1997/07/23 20:59:02  madden
1542  * Changed blastn defaults for gap opening and extension
1543  *
1544  * Revision 1.94  1997/07/22 17:22:41  madden
1545  * Added NULL arg (for index callback) to BLASTSetUpSearch funcs
1546  *
1547  * Revision 1.93  1997/07/21 17:36:42  madden
1548  * Added BlastGetReleaseDate
1549  *
1550  * Revision 1.92  1997/07/18 20:57:02  madden
1551  * Added functions BlastGetVersionNumber and BlastGetReference
1552  *
1553  * Revision 1.91  1997/07/18 14:26:20  madden
1554  * call to AcknowledgeBlastQuery changed, SeqId no longer deleted there.
1555  *
1556  * Revision 1.90  1997/07/16 20:34:35  madden
1557  * Added function BlastConvertProteinSeqLoc
1558  *
1559  * Revision 1.89  1997/07/15 20:36:14  madden
1560  * Added BioseqSeg and SeqLocSeg
1561  *
1562  * Revision 1.88  1997/07/14 20:11:10  madden
1563  * Removed unused variables
1564  *
1565  * Revision 1.87  1997/07/14 16:15:41  madden
1566  * call to BLASTOptionValidateEx in BlastBioseqEngine
1567  *
1568  * Revision 1.86  1997/07/14 15:31:49  madden
1569  * Added BlastErrorMessage functions
1570  *
1571  * Revision 1.85  1997/07/11 19:29:37  madden
1572  * Added function BioseqBlastEngineByLoc
1573  *
1574  * Revision 1.84  1997/07/10 20:35:43  madden
1575  * Changed parameter output
1576  *
1577  * Revision 1.83  1997/07/02 20:18:39  madden
1578  * Made continuous SeqAlign the default
1579  *
1580  * Revision 1.82  1997/07/02 18:31:39  madden
1581  * changed defaults
1582  *
1583  * Revision 1.81  1997/07/01 19:15:44  madden
1584  * More changes to FormatBlastParameters
1585  *
1586  * Revision 1.80  1997/07/01 17:51:36  madden
1587  * changed gap_decay rate, gap_prob
1588  *
1589  * Revision 1.79  1997/07/01 15:44:44  madden
1590  * Changes to FormatBlastParameters per S. Altschul
1591  *
1592  * Revision 1.78  1997/06/30 15:50:06  madden
1593  * Changes to FormatBlastParameters
1594  *
1595  * Revision 1.77  1997/06/27 22:18:51  madden
1596  * Updated default parameters
1597  *
1598  * Revision 1.76  1997/06/27 14:31:08  madden
1599  * Added functions BlastAddSeqIdToList and BlastSeqIdListDestruct
1600  *
1601  * Revision 1.75  1997/06/24 13:51:27  madden
1602  * Fixed SeqLoc leak
1603  *
1604  * Revision 1.74  1997/06/23 20:49:31  madden
1605  * BLASTOptionValidate checks for proper gapping parameters
1606  *
1607  * Revision 1.73  1997/06/20 13:11:33  madden
1608  * Made AdjustOffSetsInSeqAlign non-static, Fixed purify error
1609  *
1610  * Revision 1.72  1997/06/06 21:29:48  madden
1611  * Added Boolean html to AcknowledgeBlastQuery and PrintDbInformation
1612  *
1613  * Revision 1.71  1997/06/06 19:49:46  madden
1614  * Added BlastMakeFakeBioseq and BlastDeleteFakeBioseq
1615  *
1616  * Revision 1.70  1997/05/30 21:05:59  madden
1617  * corrected call to readdb_new
1618  *
1619  * Revision 1.69  1997/05/27 20:20:02  madden
1620  * Added function BlastMaskTheResidues
1621  *
1622  * Revision 1.68  1997/05/22 21:24:55  madden
1623  * Added support for final gapX dropoff value
1624  *
1625  * Revision 1.67  1997/05/20 17:52:58  madden
1626  * Added functions BlastTwoSequencesByLoc and BlastSequencesOnTheFlyByLoc
1627  *
1628  * Revision 1.66  1997/05/12 21:34:16  madden
1629  * readdb_new allows indeterminate database type
1630  *
1631  * Revision 1.65  1997/05/06 22:17:59  madden
1632  * Duplicate dblen_eff, dbseq_num, and length_adjustment
1633  *
1634  * Revision 1.64  1997/05/01  15:53:19  madden
1635  * Addition of extra KarlinBlk's for psi-blast
1636  *
1637  * Revision 1.63  1997/04/29  14:07:45  madden
1638  * Fixed problem with hits failing PreliminaryGapping; fixed UMR.
1639  *
1640  * Revision 1.62  1997/04/25  20:23:06  madden
1641  * Freed SeqPort to clear mem leak.
1642  *
1643  * Revision 1.61  1997/04/24  14:43:07  madden
1644  * Fix for minus strand (ungapped) tblastn runs.
1645  *
1646  * Revision 1.60  1997/04/23  21:56:07  madden
1647  * Changes in BlastGetGappedAlignmentTraceback for in-frame gapping tblastn.
1648  *
1649  * Revision 1.59  1997/04/22  14:00:14  madden
1650  * Removed unused variables.
1651  *
1652  * Revision 1.58  1997/04/22  13:04:19  madden
1653  * Changes for in-frame blastx gapping.
1654  *
1655  * Revision 1.57  1997/04/21  15:35:26  madden
1656  * Fixes for 'gapped' StdSegs.
1657  *
1658  * Revision 1.56  1997/04/18  17:08:35  madden
1659  * Corrected printing of threshold values.
1660  *
1661  * Revision 1.55  1997/04/17  22:12:43  madden
1662  * Fix for offset in GetStartForGappedAlignment.
1663  *
1664  * Revision 1.54  1997/04/17  22:07:48  madden
1665  * Changes to allow in-frame gapped tblastn.
1666  *
1667  * Revision 1.53  1997/04/15  22:02:59  madden
1668  * Set original_length1 for translating searches.
1669  *
1670  * Revision 1.52  1997/04/14  21:31:58  madden
1671  * Checking for NULL pointer.
1672  *
1673  * Revision 1.51  1997/04/14  15:59:47  madden
1674  * Changes for ungapped psi-blast.
1675  *
1676  * Revision 1.50  1997/04/11  21:18:45  madden
1677  * Added GetSequenceWithDenseSeg.
1678  *
1679  * Revision 1.49  1997/04/11  19:02:49  madden
1680  * Changes for in-frame blastx, tblastn gapping.
1681  *
1682  * Revision 1.48  1997/04/09  20:01:53  madden
1683  * Copied seqid_list from search structure to duplicate, for use on threads.
1684  *
1685  * Revision 1.47  1997/04/08  16:27:28  madden
1686  * Fixed leaks; fix for blastn formatting of parameters.
1687  *
1688  * Revision 1.46  1997/04/07  21:42:56  madden
1689  * Freed SeqLocPtr used for dust.
1690  *
1691  * Revision 1.45  1997/04/07  18:17:09  madden
1692  * Formatted parameters for Stephen.
1693  *
1694  * Revision 1.44  1997/04/04  20:44:09  madden
1695  * Added check for NULL return.
1696  *
1697  * Revision 1.43  1997/04/04  20:42:35  madden
1698  * Added function BioseqBlastEngineCore.
1699  *
1700  * Revision 1.42  1997/04/03  19:50:56  madden
1701  * Changes to use effective database length instead of the length of each
1702  * sequence in statistical calculations.
1703  *
1704  * Revision 1.41  1997/03/27  22:30:51  madden
1705  * Correctly checked for overlapping HSP's.
1706  *
1707  * Revision 1.40  1997/03/20  22:56:24  madden
1708  * Added gap_info to hsp.
1709  *
1710  * Revision 1.39  1997/03/20  21:52:10  madden
1711  * Fix for segmented query BioseqPtr when gapped alignment is performed.
1712  *
1713  * Revision 1.39  1997/03/20  21:52:10  madden
1714  * Fix for segmented query BioseqPtr when gapped alignment is performed.
1715  *
1716  * Revision 1.38  1997/03/14  22:06:11  madden
1717  * fixed MT bug in BlastReevaluateWithAmbiguities.
1718  *
1719  * Revision 1.37  1997/03/14  15:57:23  madden
1720  * Removed superfluous call to SeqAlignNew
1721  *
1722  * Revision 1.36  1997/03/14  15:22:11  madden
1723  * Fixed UMR of seqalign in BlastTwoSequencesCore.
1724  *
1725  * Revision 1.35  1997/03/11  14:38:40  madden
1726  * Added BlastSequencesOnTheFly and BlastTwoSequencesCore.
1727  *
1728  * Revision 1.34  1997/03/07  22:35:54  madden
1729  * Fix for BLASTOptionNew.
1730  *
1731  * Revision 1.33  1997/03/07  21:58:36  madden
1732  * Added Boolean gapped argument to BLASTOptionNew.
1733  *
1734  * Revision 1.32  1997/03/07  21:11:22  madden
1735  * Added in check for blastn on gapped calculations.
1736  *
1737  * Revision 1.31  1997/03/06  21:47:27  madden
1738  * Made FormatBlastParameters non-static.
1739  *
1740  * Revision 1.30  1997/03/05  18:16:16  madden
1741  * SeqIdFree replaced by SeqIdSetFree, fixed memory leak.
1742  *
1743  * Revision 1.29  1997/03/05  14:29:46  madden
1744  * Moved BlastSaveCurrentHsp from blast.c; Added function CheckHspOverlap.
1745  *
1746  * Revision 1.28  1997/03/04  21:34:59  madden
1747  * Added in HspArrayPurge.
1748  *
1749  * Revision 1.27  1997/03/04  20:08:19  madden
1750  * Moved gapped alignment code from blast.c to blastutl.c
1751  *
1752  * Revision 1.26  1997/03/03  22:39:45  madden
1753  * Moved code from blast.c to blastutl.c.
1754  *
1755  * Revision 1.25  1997/03/03  21:47:22  madden
1756  * Moved functions from blast.c to blastutl.c for 16-bit windows.
1757  *
1758  * Revision 1.24  1997/03/03  20:58:09  madden
1759  * Fixed offsets for minus strands.
1760  *
1761  * Revision 1.23  1997/03/03  17:30:21  madden
1762  * Set SeqAlignPtr to NULL in BlastTwoSequences and BlastBioseqEngine, possible UMR.
1763  *
1764  * Revision 1.22  1997/03/01  18:25:33  madden
1765  * reverse flag added to BlastGetGappedAlignmentTraceback functions.
1766  *
1767  * Revision 1.21  1997/02/27  22:47:07  madden
1768  * Replaced tblastx with tblastn in BioseqBlastEngine.
1769  *
1770  * Revision 1.20  1997/02/26  23:39:54  madden
1771  * Added Txdfline stuff.
1772  *
1773  * Revision 1.19  1997/02/26  20:37:31  madden
1774  * Added *error_returns to BioseqBlastEngine.
1775  *
1776  * Revision 1.18  1997/02/25  19:17:05  madden
1777  * Changes to BioseqBlastEngine.
1778  *
1779  * Revision 1.17  1997/02/20  23:00:34  madden
1780  * Checked for NULL return in BlastTwoSequences.
1781  *
1782  * Revision 1.16  1997/02/20  18:38:34  madden
1783  * Set Default db_length to zero in Options.
1784  *
1785  * Revision 1.15  1997/02/19  16:25:22  madden
1786  * Reset gapped_calculation for blastn; returned proper SeqAlign for blastx, tblastn
1787  * in BioseqBlastEngine.
1788  *
1789  * Revision 1.14  1997/02/19  13:45:13  madden
1790  * replaced zero in call to BlastGetGappedAlignmentTraceback with FALSE.
1791  *
1792  * Revision 1.13  1997/02/18  22:09:02  madden
1793  * Removed unused variable.
1794  *
1795  * Revision 1.12  1997/02/18  21:03:00  madden
1796  * Changes to BioseqBlastEngine for gapped calculations.
1797  *
1798  * Revision 1.11  1997/02/18  18:31:34  madden
1799  * Used SeqIdFindBest in BlastTwoSequences.
1800  *
1801  * Revision 1.10  1997/02/18  17:58:52  madden
1802  * Added BioseqBlastEngine.
1803  *
1804  * Revision 1.9  1997/02/14  17:17:59  madden
1805  * Changes to default options and BlastTwoSequences for nucl.
1806  * sequences with ambiguities.
1807  *
1808  * Revision 1.8  1997/02/13  18:23:56  madden
1809  * Fixed ID type from BlastTwoSequences.
1810  *
1811  * Revision 1.7  1997/02/11  19:30:54  madden
1812  * Changes to BlastTwoSequences for gapped alignments.
1813  *
1814  * Revision 1.6  1997/02/10  20:03:58  madden
1815  * BlastTwoSequences indexes only the subject.
1816  *
1817  * Revision 1.5  1997/02/10  15:24:26  madden
1818  * Removed unused variable.
1819  *
1820  * Revision 1.4  1997/02/07  22:43:03  madden
1821  * Moved BLAST_WordFinderNew and Destruct from blast.c to blastutl.c, made
1822  * non-static.
1823  *
1824  * Revision 1.3  1997/02/07  22:32:40  madden
1825  * Changed prototypes for BlastGetSubjectId and GetSeqAlignForResultHitList.
1826  *
1827  * Revision 1.2  1997/02/05  13:36:48  madden
1828  * Removed Unused variable.
1829  *
1830  * Revision 1.1  1997/02/04  18:23:58  madden
1831  * Initial revision
1832  *
1833 */
1834 
1835 #define NLM_GENERATED_CODE_PROTO
1836 #include <ncbi.h>
1837 #include <blast.h>
1838 #include <blastpri.h>
1839 #include <objcode.h>
1840 #include <objseq.h>
1841 #include <sequtil.h>
1842 #include <tofasta.h>
1843 #include <seqport.h>
1844 #include <readdb.h>
1845 #include <ncbithr.h>
1846 #include <blast_dust.h>
1847 #include <urkpcc.h>
1848 #include <txalign.h>
1849 #include <seg.h>
1850 #include <salpedit.h>
1851 #include <mbalign.h>
1852 #include <mblast.h>
1853 #include <vecscrn.h>
1854 #include <rpsutil.h>
1855 #include <simutil.h>
1856 #include <blfmtutl.h>
1857 
1858 typedef struct _pgp_blast_options {
1859     BLAST_OptionsBlkPtr options;
1860     CharPtr blast_database;
1861     BioseqPtr query_bsp, fake_bsp;
1862     Int4 number_of_descriptions, number_of_alignments;
1863     FILE *infp, *outfp;
1864     AsnIoPtr aip_out;
1865     Boolean html;
1866     Boolean believe_query;
1867     Uint4 align_options, print_options;
1868   /* PHI-PSI Blast variables */
1869     Uint1 featureOrder[FEATDEF_ANY];
1870     Uint1 groupOrder[FEATDEF_ANY];
1871     Int4 program_flag;
1872     CharPtr patfile;
1873     FILE *patfp;
1874     seedSearchItems *seedSearch;
1875 } PGPBlastOptions, PNTR PGPBlastOptionsPtr;
1876 
1877 /* Window size used to scan HSP for highest score region, where gapped
1878 extension starts. */
1879 #define HSP_MAX_WINDOW 11
1880 
1881 #define BLASTFILTER_DIR "/usr/ncbi/blast/filter"
1882 
1883 static SeqIdPtr
BlastGetFirstGiofSubset(ReadDBFILEPtr rdfp,Int4 ordinal_id,Int2 aliasfilebit)1884 BlastGetFirstGiofSubset(ReadDBFILEPtr rdfp, Int4 ordinal_id, Int2 aliasfilebit)
1885 {
1886     Boolean	not_done = TRUE;
1887     SeqIdPtr	bestid = NULL, tmp_seqid, seqid=NULL;
1888     Uint4	header_index = 0;
1889     Int4	gi = 0;
1890     Int4	alias_mask;
1891     BlastDefLinePtr bdfp;
1892 
1893     if (!rdfp->cih && rdfp->formatdb_ver < FORMATDB_VER) {
1894         /* FORMATDB_VER_TEXT version requires the common index
1895          * to determine the subset databases */
1896         ErrPostEx(SEV_ERROR, 0, 0, "Database mask cannot be used without CommonIndex");
1897         return NULL;
1898     }
1899 
1900     alias_mask = (0x1 << rdfp->aliasfilebit);
1901 
1902     bdfp = NULL;
1903     if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
1904         bdfp = FDReadDeflineAsn(rdfp, ordinal_id);
1905         if(bdfp == NULL) {
1906             ErrPostEx(SEV_ERROR, 0, 0, "Failure to read defline ASN for %d",
1907                       ordinal_id);
1908             return NULL;
1909         }
1910 
1911         bestid = SeqIdFindBest(bdfp->seqid, SEQID_GI);
1912         if (bestid->choice == SEQID_GI) {
1913             gi = bestid->data.intvalue;
1914             ValNodeAddInt(&seqid, SEQID_GI, gi);
1915         }
1916         bdfp = BlastDefLineSetFree(bdfp);
1917 
1918         return seqid;
1919     }
1920 
1921     while (not_done) {
1922         CommonIndexPtr	cigi;
1923 
1924         /* get seqid from database headers file */
1925         not_done = readdb_get_header (rdfp, ordinal_id, &header_index, &tmp_seqid, NULL);
1926 
1927         if (not_done == FALSE)
1928             break;
1929 
1930         if (not_done) {
1931             /* get gi number */
1932             bestid = SeqIdFindBest(tmp_seqid, SEQID_GI);
1933             if (bestid->choice != SEQID_GI) {
1934                 tmp_seqid = SeqIdSetFree(tmp_seqid);
1935                 break;
1936             }
1937             gi = bestid->data.intvalue;
1938 
1939             /* get database commonindex mask */
1940             cigi = rdfp->cih->ci + gi;
1941             if (alias_mask & SwapUint4(cigi->dbmask)) {
1942                 ValNodeAddInt(&seqid, SEQID_GI, gi);
1943                 break;
1944             }
1945             tmp_seqid = SeqIdSetFree(tmp_seqid);
1946         }
1947     }
1948     tmp_seqid = SeqIdSetFree(tmp_seqid);
1949 
1950     return seqid;
1951 }
1952 
1953 #define BLAST_ITER_MAX 30
1954 
1955 /*
1956   Goes through the list of gi's/ordinal id's looking for matches
1957   to the ordinal ID.  Returns those acceptable gi's as SeqIdPtr's.
1958 */
1959 SeqIdPtr
BlastGetAllowedGis(BlastSearchBlkPtr search,Int4 ordinal_id,SeqIdPtr PNTR seqid)1960 BlastGetAllowedGis (BlastSearchBlkPtr search, Int4 ordinal_id, SeqIdPtr PNTR seqid)
1961 {
1962     BlastGiListPtr blast_gi_list;
1963     Boolean found=FALSE;
1964     BlastDoubleInt4Ptr *gi_list_pointer;
1965     Int4 index, total, first, last, current;
1966     ValNodePtr gi_list=NULL;
1967 
1968     if (seqid)
1969         *seqid = NULL;
1970     gi_list = NULL;
1971     if (search->thr_info->blast_gi_list) {
1972         blast_gi_list = search->thr_info->blast_gi_list;
1973         total = blast_gi_list->total;
1974         found = FALSE;
1975         gi_list_pointer = blast_gi_list->gi_list_pointer;
1976         first = 0;
1977         last = total;
1978         for (index=0; index<BLAST_ITER_MAX; index++) {
1979             current = (first+last)/2;
1980             if (ordinal_id < gi_list_pointer[current]->ordinal_id)
1981                 last = current;
1982             else if (ordinal_id > gi_list_pointer[current]->ordinal_id)
1983                 first = current;
1984             else {	/* back up looking for all gi's associated with this oid. */
1985                 while (current > 0 &&
1986                        ordinal_id == gi_list_pointer[current-1]->ordinal_id)
1987                     current--;
1988                 found = TRUE;
1989                 break;
1990             }
1991         }
1992 
1993         if (found) {
1994             while (current < total) {
1995                 if (ordinal_id == gi_list_pointer[current]->ordinal_id) {
1996                     ValNodeAddInt(&gi_list, SEQID_GI, blast_gi_list->gi_list_pointer[current]->gi);
1997                 } else {
1998                     break;
1999                 }
2000                 current++;
2001             }
2002         }
2003 
2004         if (seqid && search->rdfp && search->rdfp->aliasfilebit != 0) {
2005             *seqid = BlastGetFirstGiofSubset(search->rdfp, ordinal_id, search->rdfp->aliasfilebit);
2006         }
2007         return (SeqIdPtr) gi_list;
2008     } else  if (search->rdfp != NULL && search->rdfp->oidlist != NULL) {
2009         /* if we have at least one mask, then we need print only those gis, which
2010            are in the database list (reals and masks) */
2011 
2012         Boolean	not_done = TRUE;
2013         SeqIdPtr	bestid = NULL, tmp_seqid = NULL;
2014         Uint4	header_index = 0;
2015         Int4	gi = 0;
2016         Int4	mask;
2017         Int2	firstpos, curfirstpos;
2018         ReadDBFILEPtr	rdfp = search->rdfp, tmprdfp;
2019         BlastDefLinePtr bdfp, bdfp_head;
2020 
2021         if (!rdfp->cih && rdfp->formatdb_ver < FORMATDB_VER) {
2022             /* FORMATDB_VER_TEXT version requires the common index
2023              * to determine the subset databases */
2024            /*ErrPostEx(SEV_ERROR, 0, 0, "Database mask cannot be used without CommonIndex");*/
2025             return NULL;
2026         }
2027 
2028         /* kludge: only protein databases are non-redundant */
2029         if (readdb_is_prot(search->rdfp) == FALSE)
2030             return NULL;
2031 
2032         bdfp = NULL; bdfp_head = NULL;
2033         if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
2034             /* just chain the seqid's returned, as they are filtered in
2035              * FDReadDeflineAsn according to the membership_bit in the
2036              * rdfp */
2037             bdfp = FDReadDeflineAsn(rdfp, ordinal_id);
2038             if(bdfp == NULL) {
2039                 ErrPostEx(SEV_ERROR, 0, 0, "Failure to read defline ASN for %d", ordinal_id);
2040                 return NULL;
2041             }
2042             for (bdfp_head = bdfp; bdfp; bdfp = bdfp->next) {
2043                 bestid = SeqIdFindBest(bdfp->seqid, SEQID_GI);
2044                 if (bestid->choice == SEQID_GI) {
2045                     gi = bestid->data.intvalue;
2046                     ValNodeAddInt(&gi_list, SEQID_GI, gi);
2047                 }
2048             }
2049 
2050             BlastDefLineSetFree(bdfp_head);
2051 
2052         } else {
2053 
2054             while (not_done) {
2055                 CommonIndexPtr	cigi;
2056 
2057                 /* get seqid from database headers file */
2058                 not_done = readdb_get_header (search->rdfp, ordinal_id, &header_index, &tmp_seqid, NULL);
2059 
2060                 if (not_done == FALSE)
2061                     break;
2062 
2063                 if (not_done) {
2064                     /* get gi number */
2065                     bestid = SeqIdFindBest(tmp_seqid, SEQID_GI);
2066                     if (bestid->choice != SEQID_GI) {
2067                         tmp_seqid = SeqIdSetFree(tmp_seqid);
2068                         break;
2069                     }
2070                     gi = bestid->data.intvalue;
2071 
2072                     /* get database commonindex mask */
2073                     cigi = search->rdfp->cih->ci + gi;
2074                     mask = SwapUint4(cigi->dbmask);
2075 
2076                     firstpos = 0;
2077                     while (((curfirstpos = bit_engine_firstbit(mask)) != -1)) {
2078                         CharPtr		dbname;
2079 
2080                         firstpos += curfirstpos;
2081 
2082                         dbname = DBName(search->rdfp->cih->num_of_DBs,
2083                                         search->rdfp->cih->dbids, firstpos);
2084 
2085                         /* search in rdfp list this database */
2086                         tmprdfp = search->rdfp;
2087                         while (tmprdfp) {
2088                             if (tmprdfp->aliasfilename) {
2089                                 /* use mask name, if exists */
2090                                 if (!StrCmp(dbname, tmprdfp->aliasfilename)) {
2091                                     ValNodeAddInt(&gi_list, SEQID_GI, gi);
2092                                 }
2093                             } else {
2094                                 /* use real file name */
2095                                 if (!StrCmp(dbname, tmprdfp->filename)) {
2096                                     ValNodeAddInt(&gi_list, SEQID_GI, gi);
2097                                 }
2098                             }
2099                             tmprdfp = tmprdfp->next;
2100                         }
2101                         mask >>= (curfirstpos + 1);
2102                         firstpos++;
2103                     }
2104                 }
2105 
2106                 if (tmp_seqid) {
2107                     tmp_seqid = SeqIdSetFree(tmp_seqid);
2108                 }
2109             }
2110         }
2111         if (seqid)
2112             *seqid = BlastGetFirstGiofSubset(search->rdfp, ordinal_id, search->rdfp->aliasfilebit);
2113 
2114         return (SeqIdPtr) gi_list;
2115     }
2116 
2117     return NULL;
2118 }
2119 
2120 /*
2121 	SOME FUNCTIONS TO PRODUCE A SeqAlign from the BLAST results.
2122 */
2123 
2124 /*****************************************************************************
2125 
2126 	Finds the best SeqId for the SeqAlign.  Looks for the GI, then takes
2127 	anything if that's not found and makes up a local ID if no ID is
2128 	found at all.
2129 *****************************************************************************/
2130 
2131 SeqIdPtr
GetTheSeqAlignID(SeqIdPtr seq_id)2132 GetTheSeqAlignID(SeqIdPtr seq_id)
2133 {
2134 	SeqIdPtr new_id, ret_id;
2135 	ObjectIdPtr obidp;
2136 
2137 	ret_id = NULL;
2138 	if (seq_id)
2139 	{
2140 		/* Get the gi from the chain, if it's there. */
2141 		new_id = SeqIdFindBest(seq_id, SEQID_GI);
2142 		if (new_id)
2143 		{
2144 			ret_id = SeqIdDup(new_id);
2145 		}
2146 		else
2147 		{	/* No Gi was found, use any ID. */
2148 			ret_id = SeqIdDup(seq_id);
2149 		}
2150 	}
2151 
2152 	if (ret_id == NULL)
2153 	{	/* make up an ID. */
2154 		obidp = ObjectIdNew();
2155 		obidp->str = StringSave("lcl|unknown");
2156 		ValNodeAddPointer(&ret_id, SEQID_LOCAL, obidp);
2157 	}
2158 
2159 	return ret_id;
2160 }
2161 static SeqAlignPtr
FillInSegsInfo(SeqAlignPtr sap_head,StdSegPtr ssp_head,DenseDiagPtr ddp_head)2162 FillInSegsInfo(SeqAlignPtr sap_head, StdSegPtr ssp_head, DenseDiagPtr ddp_head)
2163 
2164 {
2165 	SeqAlignPtr sap;
2166 
2167 	if (ddp_head || ssp_head)
2168 	{
2169 		if (sap_head)
2170 		{
2171 			sap = sap_head;
2172 			while (sap->next)
2173 				sap = sap->next;
2174 			sap->next = SeqAlignNew();
2175 			sap = sap->next;
2176 		}
2177 		else
2178 		{
2179 			sap_head = sap = SeqAlignNew();
2180 		}
2181 
2182 		if (ddp_head)
2183 		{
2184 			sap->type = 2;
2185 			sap->segs = ddp_head;
2186 			sap->segtype = 1;
2187 		}
2188 		else if (ssp_head)
2189 		{
2190 			sap->type = 2;
2191 			sap->segs = ssp_head;
2192 			sap->segtype = 3;
2193 		}
2194 	}
2195 	return sap_head;
2196 }
2197 
2198 
2199 /*************************************************************************
2200 *
2201 *	This function fills in the DenseDiag Information from the variable
2202 *	hsp.  On the first call to this function *old should be
2203 *	NULL, after that pass in the head of the DenseDiagPtr chain.
2204 *	The newest DenseDiagPtr is returned.
2205 *
2206 ************************************************************************/
2207 
2208 static DenseDiagPtr
FillInDenseDiagInfo(DenseDiagPtr PNTR old,BLASTResultHspPtr hsp,Boolean reverse,Int4 query_length,Int4 subject_length,SeqIdPtr gi_list)2209 FillInDenseDiagInfo(DenseDiagPtr PNTR old, BLASTResultHspPtr hsp, Boolean reverse, Int4 query_length, Int4 subject_length, SeqIdPtr gi_list)
2210 
2211 {
2212 	DenseDiagPtr		ddp, new;
2213 
2214 	new = DenseDiagNew();
2215 
2216 	new->dim = 2;	/* Only 2 is supported in spec. */
2217 	new->len = hsp->query_length;
2218 	new->starts = (Int4Ptr) MemNew(2 * sizeof(Int4));
2219 	new->strands = (Uint1Ptr) MemNew(2 * sizeof(Uint1));
2220 	if (reverse)
2221 	{
2222 		if (hsp->subject_frame >= 0)
2223 		{
2224 			new->strands[0] = Seq_strand_plus;
2225 			new->starts[0] = hsp->subject_offset;
2226 		}
2227 		else
2228 		{
2229 			new->strands[0] = Seq_strand_minus;
2230 			new->starts[0] = subject_length - hsp->subject_offset - hsp->subject_length;
2231 		}
2232 		if (hsp->query_frame >= 0)
2233 		{
2234 			new->strands[1] = Seq_strand_plus;
2235 			new->starts[1] = hsp->query_offset;
2236 		}
2237 		else
2238 		{
2239 			new->strands[1] = Seq_strand_minus;
2240 			new->starts[1] = query_length - hsp->query_offset - hsp->query_length;
2241 		}
2242 	}
2243 	else
2244 	{
2245 		if (hsp->query_frame >= 0)
2246 		{
2247 			new->strands[0] = Seq_strand_plus;
2248 			new->starts[0] = hsp->query_offset;
2249 		}
2250 		else
2251 		{
2252 			new->strands[0] = Seq_strand_minus;
2253 			new->starts[0] = query_length - hsp->query_offset - hsp->query_length;
2254 		}
2255 		if (hsp->subject_frame >= 0)
2256 		{
2257 			new->strands[1] = Seq_strand_plus;
2258 			new->starts[1] = hsp->subject_offset;
2259 		}
2260 		else
2261 		{
2262 			new->strands[1] = Seq_strand_minus;
2263 			new->starts[1] = subject_length - hsp->subject_offset - hsp->subject_length;
2264 		}
2265 	}
2266 	new->scores = GetScoreSetFromBlastResultHsp(hsp, gi_list);
2267 
2268 /* Go to the end of the chain, and then attach "new" */
2269 	if (*old)
2270 	{
2271 		ddp = *old;
2272 		while (ddp->next)
2273 			ddp = ddp->next;
2274 		ddp->next = new;
2275 	}
2276 	else
2277 	{
2278 		*old = new;
2279 	}
2280 
2281 	new->next = NULL;
2282 
2283 	return new;
2284 }
2285 
2286 /*************************************************************************
2287 *
2288 *	This function fills in the StdSeg Information from the variable
2289 *	hsp.  On the first call to this function *old should be
2290 *	NULL, after that pass in the head of the DenseDiagPtr chain.
2291 *	The newest StdSegPtr is returned.
2292 *
2293 ************************************************************************/
2294 static StdSegPtr
FillInStdSegInfo(BlastSearchBlkPtr search,Int4 subject_id,Int4 length,StdSegPtr PNTR old,BLASTResultHspPtr hsp,SeqIdPtr sip,Boolean reverse,SeqIdPtr gi_list)2295 FillInStdSegInfo(BlastSearchBlkPtr search, Int4 subject_id, Int4 length, StdSegPtr PNTR old, BLASTResultHspPtr hsp, SeqIdPtr sip, Boolean reverse, SeqIdPtr gi_list)
2296 
2297 {
2298 	Int4			subject_length;
2299 	StdSegPtr		ssp, new;
2300 	SeqIdPtr		query_sip, subject_sip;
2301 	SeqIntPtr		seq_int1, seq_int2;
2302 	SeqLocPtr		slp=NULL;
2303 
2304 	new = StdSegNew();
2305 /* Duplicate the id and split it up into query and subject parts */
2306 	query_sip = SeqIdDup(sip);
2307 	subject_sip = SeqIdDup(sip->next);
2308 
2309 	new->dim = 2;	/* Only 2 is supported in spec. */
2310 	seq_int1 = SeqIntNew();
2311 	if (hsp->query_frame == 0)
2312 	{
2313 		seq_int1->from = hsp->query_offset;
2314 		seq_int1->to = hsp->query_offset + hsp->query_length - 1;
2315 		seq_int1->strand = Seq_strand_unknown;
2316 	}
2317 	else if (hsp->query_frame < 0)
2318 	{
2319 		seq_int1->to = search->context[hsp->context].query->original_length - CODON_LENGTH*hsp->query_offset + hsp->query_frame;
2320 		seq_int1->from = search->context[hsp->context].query->original_length - CODON_LENGTH*(hsp->query_offset+hsp->query_length) + hsp->query_frame + 1;
2321 		seq_int1->strand = Seq_strand_minus;
2322 	}
2323 	else if (hsp->query_frame > 0)
2324 	{
2325 		seq_int1->from = CODON_LENGTH*(hsp->query_offset) + hsp->query_frame - 1;
2326 		seq_int1->to = CODON_LENGTH*(hsp->query_offset+hsp->query_length) + hsp->query_frame - 2;
2327 		seq_int1->strand = Seq_strand_plus;
2328 	}
2329 	seq_int1->id = query_sip;
2330 	seq_int2 = SeqIntNew();
2331 	if (hsp->subject_frame == 0)
2332 	{
2333 		seq_int2->from = hsp->subject_offset;
2334 		seq_int2->to = hsp->subject_offset + hsp->subject_length - 1;
2335 		seq_int2->strand = Seq_strand_unknown;
2336 	}
2337 	else if (hsp->subject_frame < 0)
2338 	{
2339 	    	if (search->rdfp)
2340 			subject_length = readdb_get_sequence_length(search->rdfp, subject_id);
2341                 else
2342                    subject_length = length;
2343 
2344 		seq_int2->from = subject_length - CODON_LENGTH*(hsp->subject_offset + hsp->subject_length) + hsp->subject_frame + 1;
2345 		seq_int2->to = subject_length - CODON_LENGTH*(hsp->subject_offset) + hsp->subject_frame;
2346 		seq_int2->strand = Seq_strand_minus;
2347 	}
2348 	else if (hsp->subject_frame > 0)
2349 	{
2350 		seq_int2->from = CODON_LENGTH*(hsp->subject_offset) + hsp->subject_frame - 1;
2351 		seq_int2->to = CODON_LENGTH*(hsp->subject_offset + hsp->subject_length) + hsp->subject_frame - 2;
2352 		seq_int2->strand = Seq_strand_plus;
2353 	}
2354 	seq_int2->id = subject_sip;
2355 
2356 	if (reverse)
2357 	{
2358 		ValNodeAddPointer(&slp, SEQLOC_INT, seq_int2);
2359 		ValNodeAddPointer(&slp, SEQLOC_INT, seq_int1);
2360 	}
2361 	else
2362 	{
2363 		ValNodeAddPointer(&slp, SEQLOC_INT, seq_int1);
2364 		ValNodeAddPointer(&slp, SEQLOC_INT, seq_int2);
2365 	}
2366 	new->loc = slp;
2367 
2368         search->subject->sequence = MemFree(search->subject->sequence);
2369 	new->scores = GetScoreSetFromBlastResultHsp(hsp, gi_list);
2370 
2371 /* Go to the end of the chain, and then attach "new" */
2372 	if (*old)
2373 	{
2374 		ssp = *old;
2375 		while (ssp->next)
2376 			ssp = ssp->next;
2377 		ssp->next = new;
2378 	}
2379 	else
2380 	{
2381 		*old = new;
2382 	}
2383 
2384 	new->next = NULL;
2385 
2386 	return new;
2387 }
2388 
2389 /************************************************************************
2390 *
2391 *	This function assembles all the components of the Seq-align from
2392 *	a "sparse" BLAST HitList.  "sparse" means that the hitlist
2393 *	may contain no sequence and not even a descriptor.  It is only
2394 *	required to contain the sequence_number that readdb refers to
2395 *	and scoring/alignment information.
2396 *
2397 *	If dbname is non-NULL, then only a general ("gnl") ID is
2398 *	issued, with the ordinal number of the subject sequence in
2399 *	the ObjectIdPtr.
2400 *
2401 *	Boolean reverse: reverse the query and db order in SeqAlign.
2402 *
2403 ************************************************************************/
2404 SeqAlignPtr LIBCALL
GetSeqAlignForResultHitList(BlastSearchBlkPtr search,Boolean getdensediag,Boolean ordinal_number,Boolean discontinuous,Boolean reverse,Boolean get_redundant_seqs)2405 GetSeqAlignForResultHitList(BlastSearchBlkPtr search, Boolean getdensediag, Boolean ordinal_number, Boolean discontinuous, Boolean reverse, Boolean get_redundant_seqs)
2406 
2407 {
2408 	BLASTResultHspPtr	hsp;
2409 	BLASTResultHitlistPtr	results;
2410 	BLASTResultsStructPtr	result_struct;
2411 	DenseDiagPtr		ddp_head=NULL, ddp;
2412 	SeqIdPtr		gi_list=NULL, sip, sip_subject,
2413 	   sip_subject_start, query_id, new_sip;
2414 	StdSegPtr		ssp_head=NULL, ssp;
2415 	SeqAlignPtr		last, seqalign_head, seqalign, sap_head;
2416 	Int4 			hsp_cnt, index, index2, hspset_cnt_old, i;
2417 	Int4			hitlist_count;
2418 	Int4			subject_length;
2419 	ValNodePtr		vnp, vnp_start;
2420 
2421 	ddp_head = NULL;
2422 	ssp_head = NULL;
2423 	sap_head = NULL;
2424 	seqalign_head = NULL;
2425 
2426         /* discontinuous = FALSE; */
2427 	result_struct = search->result_struct;
2428 	hitlist_count = result_struct->hitlist_count;
2429 
2430 	last = NULL;
2431 	sip = NULL;
2432 	sip_subject_start = NULL;
2433 	for (index=0; index<hitlist_count; index++)
2434 	{
2435 	    results = result_struct->results[index];
2436 	    sip_subject_start = NULL;
2437 	    if (get_redundant_seqs)
2438 	    {
2439 		vnp = NULL;
2440 	    	sip = BlastGetSubjectId(search, index, ordinal_number, &vnp);
2441 		vnp_start = vnp;
2442 		while (vnp)
2443 		{
2444 			sip = GetTheSeqAlignID(vnp->data.ptrvalue);
2445 			SeqIdFree(vnp->data.ptrvalue);
2446 			if (sip_subject_start == NULL)
2447 			{
2448 				sip_subject_start = sip;
2449 			}
2450 			else
2451 			{
2452 				sip_subject = sip_subject_start;
2453 				while (sip_subject->next)
2454 					sip_subject = sip_subject->next;
2455 				sip_subject->next = sip;
2456 			}
2457 			vnp = vnp->next;
2458 		}
2459 		vnp_start = vnp = ValNodeFree(vnp_start);
2460 	    }
2461 	    else
2462 	    {
2463 	    	sip = BlastGetSubjectId(search, index, ordinal_number, NULL);
2464 	    	sip_subject_start = sip_subject = GetTheSeqAlignID(sip);
2465 	    	sip = SeqIdSetFree(sip);
2466 	    }
2467 
2468 	    results = result_struct->results[index];
2469 	    if (search->rdfp)
2470 		subject_length = readdb_get_sequence_length(search->rdfp, results->subject_id);
2471 	    else if (results->subject_info)
2472 			subject_length = results->subject_info->length;
2473 	    else
2474 			subject_length = 0;
2475 
2476 	gi_list = BlastGetAllowedGis(search, results->subject_id, &new_sip);
2477 	/* right now sip_subject should only contain one ID.  At some
2478 	point it will contain multiple ID's for identical sequences. */
2479             if (new_sip != NULL)
2480                sip_subject = new_sip;
2481             else
2482                sip_subject = sip_subject_start;
2483 	    while (sip_subject)
2484 	    {
2485 	    	seqalign = SeqAlignNew();
2486 	    	seqalign->type = 2;		/* alignment is diags */
2487 	    	if (last == NULL)	/* First sequence. */
2488 			seqalign_head = seqalign;
2489 	    	else
2490 			last->next = seqalign;
2491 
2492 	    	last = seqalign;
2493 
2494 		hspset_cnt_old = -1;
2495 		hsp_cnt = results->hspcnt;
2496 		for (index2=0; index2<hsp_cnt; index2++)
2497 		{
2498 			hsp = &(results->hsp_array[index2]);
2499 			if (discontinuous && hspset_cnt_old != hsp->hspset_cnt)
2500 			{
2501 			    hspset_cnt_old = hsp->hspset_cnt;
2502 			    if (index2 != 0)
2503 			    { /* nothing to save on first pass. */
2504 				if (getdensediag)
2505 				{
2506 					sap_head = FillInSegsInfo(sap_head, NULL, ddp_head);
2507 					ddp_head = NULL;
2508 				}
2509 				else
2510 				{
2511 					sap_head = FillInSegsInfo(sap_head, ssp_head, NULL);
2512 					ssp_head = NULL;
2513 				}
2514 			    }
2515 			}
2516 
2517 			query_id = search->query_id;
2518 			if (search->prog_number==blast_type_blastn) {
2519 			   for (i=0; i<hsp->context/2; i++)
2520 			      query_id = query_id->next;
2521 			}
2522 			if (reverse)
2523 			{
2524 				sip = SeqIdDup(sip_subject);
2525 	    			sip->next = GetTheSeqAlignID(query_id);
2526 			}
2527 			else
2528 			{
2529 	    			sip = GetTheSeqAlignID(query_id);
2530 				sip->next = SeqIdDup(sip_subject);
2531 			}
2532 
2533 			if (getdensediag)
2534 			{
2535 		    		ddp = FillInDenseDiagInfo(&ddp_head, hsp, reverse, search->context[hsp->context].query->length, subject_length, gi_list);
2536 		    		ddp->id = sip;
2537 			}
2538 			else
2539 			{
2540                             Int4 length = 0;
2541 
2542                             if (results->subject_info)
2543                                 length = results->subject_info->length;
2544 
2545                             ssp = FillInStdSegInfo(search, results->subject_id, length, &ssp_head, hsp, sip, reverse, gi_list);
2546                             ssp->ids = sip;
2547 			}
2548 			sip = NULL; /* This SeqIdPtr is now on the SeqAlign. */
2549 		}
2550 
2551 		if (discontinuous)
2552 		{
2553 			if (getdensediag)
2554 			{
2555 				sap_head = FillInSegsInfo(sap_head, NULL, ddp_head);
2556 				ddp_head = NULL;
2557 			}
2558 			else
2559 			{
2560 				sap_head = FillInSegsInfo(sap_head, ssp_head, NULL);
2561 				ssp_head = NULL;
2562 			}
2563 	        	seqalign->segs = sap_head;
2564 	        	seqalign->segtype = 5;	/* Discontinuous */
2565 		}
2566 		else
2567 		{
2568 			if (getdensediag)
2569 			{
2570 				seqalign->segs = ddp_head;
2571                                 seqalign->segtype = 1;  /* DenseDiag */
2572 				ddp_head = NULL;
2573 			}
2574 			else
2575 			{
2576                                 seqalign->segs = ssp_head;
2577                                 seqalign->segtype = 3;  /* StdSeg */
2578 				ssp_head = NULL;
2579 			}
2580 		}
2581 
2582 		sap_head = NULL;
2583 
2584 		sip_subject = sip_subject->next;
2585 	     }
2586 	     if (sip_subject_start)
2587 			sip_subject_start = SeqIdFree(sip_subject_start);
2588 	     if (new_sip)
2589 			new_sip = SeqIdFree(new_sip);
2590 	     gi_list = SeqIdSetFree(gi_list);
2591 	}
2592 
2593 	return seqalign_head;
2594 }
2595 
2596 /*
2597 	"Core" function to compare two sequences, for use by
2598 	BlastTwoSequences and BlastSequencesOnTheFly.
2599 
2600 	The subject_bsp is redundant with the subject_seq_start and
2601 	subject_length (or visa-versa), but the subject must be
2602 	extracted from the subject_bsp for BlastTwoSequences anyway, while
2603 	the title and ID are needed from subject_bsp.
2604 */
2605 static Int2
BlastTwoSequencesCoreEx(BlastSearchBlkPtr search,BioseqPtr subject_bsp,Uint1Ptr subject_seq,Int4 subject_length)2606 BlastTwoSequencesCoreEx (BlastSearchBlkPtr search, BioseqPtr subject_bsp, Uint1Ptr subject_seq, Int4 subject_length)
2607 {
2608 	Int2 status=0;
2609 
2610 	search->subject_info = BLASTSubjectInfoDestruct(search->subject_info);
2611     if (!search->handle_results)
2612        search->subject_info = BLASTSubjectInfoNew(SeqIdDup(SeqIdFindBest(subject_bsp->id, SEQID_GI)), StringSave(BioseqGetTitle(subject_bsp)), subject_length);
2613     else
2614        search->subject_info = BLASTSubjectInfoNew(SeqIdSetDup(subject_bsp->id), StringSave(BioseqGetTitle(subject_bsp)), subject_length);
2615 
2616     /*CC: is search->sbp->posMatrix, we're comparing a pssm with a subject
2617      * sequence, thus we need to do some set up */
2618     if (search->sbp->posMatrix && search->prog_number == blast_type_blastp) {
2619         Int4 hitlist_max;
2620         BLAST_ScoreBlkPtr sbp = search->sbp;
2621         BLAST_ParameterBlkPtr pbp = search->pbp;
2622 
2623         search->positionBased = TRUE;
2624         sbp->kbp = sbp->kbp_psi;
2625         sbp->kbp_gap = sbp->kbp_gap_psi;
2626         hitlist_max = search->result_struct->hitlist_max;
2627         search->result_struct =
2628             BLASTResultsStructDelete(search->result_struct);
2629 		search->result_struct = BLASTResultsStructNew(hitlist_max,
2630             pbp->max_pieces, pbp->hsp_range_max);
2631 
2632         if (search->allocated & BLAST_SEARCH_ALLOC_WFP_FIRST) {
2633             search->wfp_first = BLAST_WordFinderDestruct(search->wfp_first);
2634 		    search->wfp_first = BLAST_WordFinderNew(sbp->alphabet_size,
2635                     search->all_words->wordsize, 1, FALSE);
2636 		}
2637 
2638 		if (search->allocated & BLAST_SEARCH_ALLOC_WFP_SECOND) {
2639 		    search->wfp_second = BLAST_WordFinderDestruct(search->wfp_second);
2640 		    search->wfp_second = BLAST_WordFinderNew(sbp->alphabet_size,
2641                     search->all_words->wordsize, 1, FALSE);
2642 		}
2643 
2644 		/* threshold_first is defunct ! */
2645         search->wfp = search->wfp_first;
2646 		if (search->whole_query == TRUE)
2647             BlastNewFindWords(search, 0, search->context[0].query->length,
2648                     pbp->threshold_second, (Uint1) 0);
2649 		else
2650            	BlastNewFindWords(search, search->required_start,
2651                     search->required_end, pbp->threshold_second, (Uint1) 0);
2652         lookup_position_aux_destruct(search->wfp->lookup);
2653         search->wfp_second = search->wfp_first;
2654     }
2655 	status = BLASTPerformSearch(search, subject_length, subject_seq);
2656 
2657         if (status) {
2658 		BlastConstructErrorMessage("BlastTwoSequencesCoreEx", "non-zero status", 2, &(search->error_return));
2659 		return status;
2660         }
2661 
2662         if (search->prog_number == blast_type_tblastn &&
2663             search->pbp->longest_intron > 0) {
2664            Uint1 rem;
2665            Uint1Ptr seq_4na, seq_2na, subject;
2666            Int4 i;
2667            /* Need to convert from ncbi2na to ncbi4na encoding */
2668            subject = (Uint1Ptr) MemNew(subject_length + 1);
2669            seq_4na = subject;
2670            seq_2na = subject_seq;
2671            rem = 3;
2672            for (i=0; i<subject_length; i++) {
2673               *seq_4na = (Uint1) (1 << READDB_UNPACK_BASE_N(*seq_2na, rem));
2674               seq_4na++;
2675               if (rem>0) rem--;
2676               else {
2677                  rem = 3;
2678                  seq_2na++;
2679               }
2680            }
2681            BlastSequenceAddSequence(search->subject, NULL, subject-1, subject_length, subject_length, 0);
2682            status = BlastLinkHsps(search);
2683         }
2684 
2685 	if (StringCmp(search->prog_name, "blastn") == 0 || search->pbp->gapped_calculation == FALSE)
2686 	{
2687             if (search->pbp->do_sum_stats == TRUE &&
2688 		!search->pbp->mb_params)
2689                 status = BlastLinkHsps(search);
2690             else
2691                 status = BlastGetNonSumStatsEvalue(search);
2692 	}
2693         if (search->pbp->mb_params) {
2694            search->subject->sequence = subject_seq;
2695            MegaBlastReevaluateWithAmbiguities(search);
2696         }
2697         status = BlastReapHitlistByEvalue(search);
2698 
2699         if (search->handle_results)
2700            search->handle_results((VoidPtr) search);
2701         else if (!search->pbp->mb_params)
2702            BlastSaveCurrentHitlist(search);
2703         else
2704            MegaBlastSaveCurrentHitlist(search);
2705         if (search->pbp->mb_params)
2706            /* Free the ncbi4na-encoded sequence */
2707            search->subject->sequence_start = (Uint1Ptr)
2708               MemFree(search->subject->sequence_start);
2709 
2710         search->subject->sequence = NULL;
2711         search->subject->sequence_start = NULL;
2712 	if (search->prog_number==blast_type_blastn) {
2713 	   /* Unconcatenate the strands by adjusting the query offsets in
2714 	      all hsps */
2715 	   search->context[search->first_context].query->length =
2716 	      search->query_context_offsets[search->first_context+1] - 1;
2717 	}
2718 
2719 	return status;
2720 }
2721 
RPS2SeqImpalaStatCorrections(BlastSearchBlkPtr search,Uint1Ptr subject_seq,Int4 subject_length)2722 static BLAST_ScorePtr *RPS2SeqImpalaStatCorrections
2723         (BlastSearchBlkPtr search, Uint1Ptr subject_seq, Int4 subject_length)
2724 {
2725     BLAST_ScorePtr *retval = NULL;
2726     Nlm_FloatHi *scoreArray; /*array of score probabilities*/
2727     Nlm_FloatHi *resProb; /*array of probabilities for each residue*/
2728     BLAST_ScoreFreqPtr this_sfp, return_sfp; /*score frequency pointers to compute lambda*/
2729     BLAST_ScorePtr *posMatrix; /* position-specific matrix. */
2730     Nlm_FloatHi initialUngappedLambda, scaledInitialUngappedLambda,
2731                   correctUngappedLambda, scalingFactor, lambdaRatio;
2732     Nlm_FloatHi temp1; /*intermediate variable for adjusting matrix*/
2733     Int4 temp2; /*intermediate variable for adjusting matrix*/
2734     Int4 seqlength; /* length of posMatrix (or target sequence). */
2735     Int4 i, j; /* loop indices */
2736 
2737     if (search == NULL)
2738  	   return retval;
2739 
2740     posMatrix = search->sbp->posMatrix;
2741     scalingFactor = search->pbp->scalingFactor;
2742 
2743     resProb = (Nlm_FloatHi *) MemNew (PRO_ALPHABET_SIZE * sizeof(Nlm_FloatHi));
2744     scoreArray = (Nlm_FloatHi *) MemNew(scoreRange * sizeof(Nlm_FloatHi));
2745     return_sfp = (BLAST_ScoreFreqPtr) MemNew(1 * sizeof(BLAST_ScoreFreq));
2746 
2747     seqlength = search->sbp->query_length;
2748 
2749     IMPALAfillResidueProbability(subject_seq, subject_length, resProb);
2750     this_sfp = IMPALAfillSfp(posMatrix, seqlength, resProb, scoreArray,
2751                      return_sfp, scoreRange);
2752     initialUngappedLambda = IMPALAfindUngappedLambda(search->sbp->name);
2753     scaledInitialUngappedLambda = initialUngappedLambda/scalingFactor;
2754     correctUngappedLambda = impalaKarlinLambdaNR(this_sfp, scaledInitialUngappedLambda);
2755     if(correctUngappedLambda == -1.0) {
2756         ErrPostEx(SEV_ERROR, 0, 0,
2757                   "RPS2SeqImpalaStatCorrections: Could not calculate ungapped "
2758                   "lambda for PSSM");
2759         MemFree(resProb);
2760         MemFree(scoreArray);
2761         MemFree(return_sfp);
2762         return retval;
2763     }
2764 
2765     lambdaRatio = correctUngappedLambda/scaledInitialUngappedLambda;
2766 
2767     retval = (BLAST_Score **) MemNew((seqlength+1) * sizeof(BLAST_Score *));
2768     for (i = 0; i < seqlength+1; i++)
2769         retval[i] = (BLAST_Score *)MemNew(PRO_ALPHABET_SIZE *
2770                 sizeof(BLAST_Score));
2771 
2772     for (i = 0; i < seqlength+1; i++) {
2773         for (j = 0; j < PRO_ALPHABET_SIZE; j++) {
2774             if ((posMatrix[i][j] == BLAST_SCORE_MIN) || (Xchar == j))
2775                 retval[i][j] = posMatrix[i][j];
2776             else {
2777                 temp1 = ((Nlm_FloatHi) (posMatrix[i][j]));
2778                 temp1 = temp1 * (lambdaRatio);
2779                 temp2 = Nlm_Nint(temp1);
2780                 retval[i][j] = temp2;
2781             }
2782         }
2783     }
2784 
2785     resProb = MemFree(resProb);
2786     scoreArray = MemFree(scoreArray);
2787     return_sfp = MemFree(return_sfp);
2788 
2789     return retval;
2790 }
2791 
2792 static SeqAlignPtr
BlastTwoSequencesCore(BlastSearchBlkPtr search,SeqLocPtr slp,Uint1Ptr subject_seq,Int4 subject_length,Boolean reverse)2793 BlastTwoSequencesCore (BlastSearchBlkPtr search, SeqLocPtr slp, Uint1Ptr subject_seq, Int4 subject_length, Boolean reverse)
2794 
2795 {
2796 	BLASTResultsStructPtr result_struct;
2797 	BioseqPtr subject_bsp;
2798 	Int2 status;
2799 	Int4 index, hitlist_count, rev_subject_length=0;
2800 	SeqAlignPtr seqalign=NULL;
2801 	SeqPortPtr spp;
2802 	Uint1 residue;
2803 	Uint1Ptr sequence, sequence_start, rev_subject=NULL;
2804 	SeqIdPtr sip;
2805     BLAST_ScorePtr *scaledMatrix = NULL, *copyMatrix = NULL;
2806 
2807 	if (search == NULL || search->query_invalid)
2808 		return NULL;
2809 
2810 	sip = SeqLocId(slp);
2811 	subject_bsp = BioseqLockById(sip);
2812 
2813     /* Save subject sequence location for tabulated output */
2814     if (search->handle_results && SeqLocLen(slp) < subject_bsp->length)
2815        search->query_slp->next = slp;
2816 
2817 	status = BlastTwoSequencesCoreEx(search, subject_bsp, subject_seq,
2818 					 subject_length);
2819 
2820 	if (status == 0) {
2821         /*CC: if we're emulating rpsblast, do the impala style matrix
2822          * rescaling */
2823         if (search->positionBased && search->pbp->scalingFactor != 0.0) {
2824             scaledMatrix = RPS2SeqImpalaStatCorrections(search, subject_seq,
2825                     subject_length);
2826             if ( !scaledMatrix ) {
2827                 BioseqUnlock(subject_bsp);
2828                 return NULL;
2829             }
2830             copyMatrix = search->sbp->posMatrix;
2831             search->sbp->posMatrix = scaledMatrix;
2832 
2833             if (search->sbp->karlinK != 0.0)
2834                 search->sbp->kbp_gap[0]->K =
2835                     PRO_K_MULTIPLIER*search->sbp->karlinK;
2836             search->sbp->kbp_gap[0]->logK = log(search->sbp->kbp_gap[0]->K);
2837             search->sbp->kbp_gap[0]->Lambda /= search->pbp->scalingFactor;
2838         }
2839 	  if (search->pbp->mb_params && !search->pbp->mb_params->no_traceback
2840               && !search->pbp->mb_params->use_dyn_prog) {
2841              seqalign = MegaBlastGapInfoToSeqAlign(search, 0, 0);
2842 	  } else if (StringCmp(search->prog_name, "blastn") == 0 &&
2843 		   search->pbp->gapped_calculation == TRUE) {
2844              result_struct = search->result_struct;
2845              hitlist_count = result_struct->hitlist_count;
2846              if (hitlist_count > 0)
2847 	     {
2848                 spp = SeqPortNewByLoc(slp, Seq_code_ncbi4na);
2849                 if (subject_bsp->repr == Seq_repr_delta)
2850                    SeqPortSet_do_virtual(spp, TRUE);
2851 
2852                 /* make one longer to "protect" ALIGN. */
2853                 sequence_start = MemNew((2+SeqLocLen(slp))*sizeof(Uint1));
2854                 sequence_start[0] = ncbi4na_to_blastna[0];
2855                 sequence = sequence_start+1;
2856                 index=0;
2857                 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
2858                 {
2859                    if (IS_residue(residue))
2860 		   {
2861                       sequence[index] = ncbi4na_to_blastna[residue];
2862                       index++;
2863                    }
2864                 }
2865                 /* Gap character in last space. */
2866                 sequence[index] = ncbi4na_to_blastna[0];
2867 
2868                 if (!search->pbp->mb_params) {
2869                    /* Traditional Blastn */
2870                    seqalign = SumBlastGetGappedAlignmentTraceback(
2871                                  search, 0, reverse, FALSE, sequence,
2872                                  SeqLocLen(slp));
2873                 } else if (!search->pbp->mb_params->no_traceback) {
2874                    /* Mega BLAST with non-greedy extension */
2875                    SumBlastGetGappedAlignmentEx(search, 0, FALSE, FALSE,
2876                       sequence, SeqLocLen(slp), TRUE, &seqalign, NULL, 0);
2877                 }
2878 
2879                 sequence_start = MemFree(sequence_start);
2880                 spp = SeqPortFree(spp);
2881              }
2882 	  }
2883 	  else if (search->pbp->gapped_calculation == TRUE)
2884 	  {
2885         result_struct = search->result_struct;
2886         hitlist_count = result_struct->hitlist_count;
2887 		if (hitlist_count > 0) {
2888 
2889                    if (!StringCmp(search->prog_name, "tblastn")
2890                        || !StringCmp(search->prog_name, "psitblastn")) {
2891                       Uint1Ptr subject = NULL;
2892                       SeqPortPtr rev_spp;
2893                       if (slp->choice == SEQLOC_WHOLE) {
2894                          spp = SeqPortNew(subject_bsp, 0, -1, Seq_strand_plus,
2895                                           Seq_code_ncbi4na);
2896                          rev_spp = SeqPortNew(subject_bsp, 0, -1, Seq_strand_minus,
2897                                               Seq_code_ncbi4na);
2898                       } else {
2899                          spp = SeqPortNew(subject_bsp, SeqLocStart(slp),
2900                                           SeqLocStop(slp), Seq_strand_plus,
2901                                           Seq_code_ncbi4na);
2902                          rev_spp = SeqPortNew(subject_bsp, SeqLocStart(slp),
2903                                               SeqLocStop(slp), Seq_strand_minus,
2904                                               Seq_code_ncbi4na);
2905                       }
2906 		      /* make one longer to "protect" ALIGN. */
2907 		      subject = (Uint1Ptr) MemNew((1+subject_length)*sizeof(Uint1));
2908                       rev_subject = (Uint1Ptr) MemNew((1+subject_length)*sizeof(Uint1));
2909 		      for (index=0; index<subject_length; index++) {
2910                          subject[index] = SeqPortGetResidue(spp);
2911 			 rev_subject[index] = SeqPortGetResidue(rev_spp);
2912                       }
2913 		      /* Gap character in last space. */
2914                       subject[subject_length] = NULLB;
2915 		      rev_subject[subject_length] = NULLB;
2916 		      rev_subject_length = subject_length;
2917 		      spp = SeqPortFree(spp);
2918 		      rev_spp = SeqPortFree(rev_spp);
2919 
2920 
2921                       seqalign = BlastGetGapAlgnTbck(search, 0, reverse,
2922                           FALSE, subject, subject_length,
2923                           rev_subject, rev_subject_length);
2924 
2925                       if (search->pbp->longest_intron <= 0)
2926                          MemFree(subject);
2927                       MemFree(rev_subject);
2928                    } else {
2929                       seqalign = BlastGetGapAlgnTbck(search, 0, reverse,
2930                           FALSE, subject_seq, subject_length,
2931                           rev_subject, rev_subject_length);
2932                       result_struct->results[0]->seqalign = seqalign;
2933                    }
2934                 }
2935 	  }
2936 	  else /* Ungapped case, any program */
2937 	  {
2938              if (search->prog_number == blast_type_blastn ||
2939                  search->prog_number == blast_type_blastp)
2940                 seqalign = GetSeqAlignForResultHitList(search, TRUE, FALSE,
2941                               search->pbp->discontinuous, reverse, FALSE);
2942              else
2943                 seqalign = GetSeqAlignForResultHitList(search, FALSE, FALSE,
2944                               search->pbp->discontinuous, reverse, FALSE);
2945 	  }
2946       /*CC: Revert changes done for psi-blast2sequences */
2947       if (search->positionBased && search->pbp->scalingFactor != 0.0) {
2948           if (scaledMatrix) {
2949               for (index = 0; index < search->sbp->query_length + 1; index++)
2950                   MemFree(scaledMatrix[index]);
2951               MemFree(scaledMatrix);
2952               search->sbp->posMatrix = copyMatrix;
2953           }
2954           if (search->sbp->karlinK != 0.0)
2955               search->sbp->kbp_gap[0]->K = search->sbp->karlinK;
2956           search->sbp->kbp_gap[0]->logK = log(search->sbp->kbp_gap[0]->K);
2957       }
2958         }
2959 	BioseqUnlock(subject_bsp);
2960 
2961 	return seqalign;
2962 }
2963 
2964 BlastSearchBlkPtr LIBCALL
BlastQuerySequenceSetUp(BioseqPtr bsp,CharPtr progname,BLAST_OptionsBlkPtr options)2965 BlastQuerySequenceSetUp(BioseqPtr bsp, CharPtr progname,
2966 			    BLAST_OptionsBlkPtr options)
2967 {
2968    BlastSearchBlkPtr search;
2969    SeqLocPtr slp=NULL;
2970 
2971    if (bsp == NULL)
2972       return NULL;
2973 
2974    ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
2975    if (progname == NULL && options == NULL)
2976       return NULL;
2977 
2978    if (progname == NULL)
2979       progname = options->program_name;
2980 
2981    if (!StringCmp(progname, "blastp") ||
2982        !StringCmp(progname, "blastx")) {
2983       if (options->gapped_calculation == TRUE) {
2984 	 options->two_pass_method  = FALSE;
2985 	 options->multiple_hits_only  = TRUE;
2986       }
2987    }
2988 
2989    search = BLASTSetUpSearchByLoc(slp, progname, bsp->length, 0, NULL, options, NULL);
2990 
2991    search->allocated += BLAST_SEARCH_ALLOC_QUERY_SLP;
2992 
2993    if (search == NULL)
2994       return NULL;
2995 
2996    return search;
2997 }
2998 
2999 /*
3000 	Runs blast between two sequences
3001 */
3002 SeqAlignPtr LIBCALL
BlastTwoSequencesByLocEx(SeqLocPtr slp1,SeqLocPtr slp2,CharPtr progname,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns)3003 BlastTwoSequencesByLocEx(SeqLocPtr slp1, SeqLocPtr slp2, CharPtr progname, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns)
3004 {
3005    return BlastTwoSequencesByLocWithCallback(slp1, slp2, progname, options,
3006            other_returns, error_returns, NULL, NULL);
3007 }
3008 
3009 /************************************************************************/
3010 /*        PSIBLAST2Sequences API                                        */
3011 /************************************************************************/
3012 
B2SAllocateScoreMatrix(Int4 rows,Int4 cols)3013 static BLAST_ScorePtr *B2SAllocateScoreMatrix(Int4 rows, Int4 cols)
3014 {
3015     BLAST_ScorePtr *matrix = NULL;
3016     Int4 i;
3017 
3018     if (!(matrix = (BLAST_ScorePtr *) MemNew(rows*sizeof(BLAST_ScorePtr)))) {
3019         return NULL;
3020     }
3021 
3022     for (i = 0; i < rows; i++) {
3023         matrix[i] = (BLAST_ScorePtr) MemNew(cols*sizeof(BLAST_Score));
3024         if (matrix[i] == NULL) {
3025             while (--i >= 0)
3026                 MemFree(matrix[i]);
3027             MemFree(matrix);
3028             return NULL;
3029         }
3030     }
3031     return matrix;
3032 }
3033 
3034 /* Convert a set of residue frequencies into a scaled PSSM (using
3035  * scalingFactor). */
B2SCalculateScaledPSSM(BlastSearchBlkPtr search,Nlm_FloatHiPtr * posFreqs,compactSearchItems * compactSearch,Nlm_FloatHiPtr karlinK)3036 static BLAST_ScorePtr *B2SCalculateScaledPSSM(BlastSearchBlkPtr search,
3037         Nlm_FloatHiPtr *posFreqs, compactSearchItems *compactSearch,
3038         Nlm_FloatHiPtr karlinK)
3039 {
3040     BLAST_ScorePtr *retval = NULL;
3041     posSearchItems *posSearch = NULL;
3042     Int4 qlen, alphabet_sz, rv;
3043     Nlm_FloatHi scalingFactor = search->pbp->scalingFactor;
3044     BLAST_ScoreBlkPtr sbp = NULL;
3045     ValNodePtr error_return;
3046     Int4 i, gap_open, gap_extend;
3047 
3048     if (!search || !compactSearch || !posFreqs)
3049         return NULL;
3050 
3051     if (!(posSearch = (posSearchItems *)MemNew(sizeof(posSearchItems)))) {
3052         ErrPostEx(SEV_ERROR, 0, 0, "B2SCalculateScaledPSSM: Out of memory");
3053         return NULL;
3054     }
3055 
3056     qlen = compactSearch->qlength;
3057     alphabet_sz = compactSearch->alphabetSize;
3058     gap_open = search->pbp->gap_open / scalingFactor;
3059     gap_extend = search->pbp->gap_extend / scalingFactor;
3060 
3061     if (!(sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa, 1))) {
3062         ErrPostEx(SEV_ERROR, 0, 0, "B2SCalculateScaledPSSM: Out of memory");
3063         MemFree(posSearch);
3064         return NULL;
3065     }
3066     sbp->read_in_matrix = TRUE;
3067     sbp->protein_alphabet = TRUE;
3068     sbp->posMatrix = NULL;
3069     sbp->number_of_contexts = 1;
3070     BlastScoreBlkMatFill(sbp, search->sbp->name);
3071     compactSearch->matrix = sbp->matrix;
3072     compactSearch->gapped_calculation = TRUE;
3073     compactSearch->pseudoCountConst = search->pbp->pseudoCountConst;
3074     compactSearch->ethresh = 0.001;
3075     BlastScoreBlkFill(sbp, (CharPtr) compactSearch->query, qlen, 0);
3076 
3077     sbp->kbp_gap_std[0] = BlastKarlinBlkCreate();
3078     rv = BlastKarlinBlkGappedCalc(sbp->kbp_gap_std[0], gap_open, gap_extend,
3079             sbp->name, &error_return);
3080     if (rv == 1) {
3081         BlastErrorPrint(error_return);
3082         BLAST_ScoreBlkDestruct(sbp);
3083         MemFree(posSearch);
3084         return NULL;
3085     }
3086     sbp->kbp_gap_psi[0] = BlastKarlinBlkCreate();
3087     rv = BlastKarlinBlkGappedCalc(sbp->kbp_gap_psi[0], gap_open, gap_extend,
3088             sbp->name, &error_return);
3089     if (rv == 1) {
3090         BlastErrorPrint(error_return);
3091         BLAST_ScoreBlkDestruct(sbp);
3092         MemFree(posSearch);
3093         return NULL;
3094     }
3095 
3096     if (sbp->kbp_ideal == NULL)
3097         sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(sbp);
3098     compactSearch->lambda =  sbp->kbp_gap_std[0]->Lambda;
3099     compactSearch->kbp_std = sbp->kbp_std;
3100     compactSearch->kbp_psi = sbp->kbp_psi;
3101     compactSearch->kbp_gap_psi = sbp->kbp_gap_psi;
3102     compactSearch->kbp_gap_std = sbp->kbp_gap_std;
3103     compactSearch->lambda_ideal = sbp->kbp_ideal->Lambda;
3104     compactSearch->K_ideal = sbp->kbp_ideal->K;
3105 
3106     /* Initialize the posSearch structure */
3107     posSearch->posFreqs = posFreqs;
3108     posSearch->posMatrix = B2SAllocateScoreMatrix(qlen+1, alphabet_sz);
3109     posSearch->posPrivateMatrix = B2SAllocateScoreMatrix(qlen+1, alphabet_sz);
3110     if (!posSearch->posMatrix || !posSearch->posPrivateMatrix) {
3111         ErrPostEx(SEV_ERROR, 0, 0, "B2SCalculateScaledPSSM: Out of memory");
3112         BLAST_ScoreBlkDestruct(sbp);
3113         MemFree(posSearch->posMatrix); MemFree(posSearch->posPrivateMatrix);
3114         MemFree(posSearch);
3115         return NULL;
3116     }
3117 
3118     posFreqsToMatrix(posSearch, compactSearch);
3119     impalaScaling(posSearch, compactSearch, scalingFactor, TRUE);
3120     if (karlinK)
3121         *karlinK = compactSearch->kbp_gap_psi[0]->K;
3122 
3123     for (i = 0; i <= qlen; i++)
3124         MemFree(posSearch->posMatrix[i]);
3125     MemFree(posSearch->posMatrix);
3126     BLAST_ScoreBlkDestruct(sbp);
3127     retval = posSearch->posPrivateMatrix;
3128     MemFree(posSearch);
3129 
3130     return retval;
3131 }
3132 
3133 /* Calculates the PSSM for a given SeqLocPtr */
B2SCalculatePSSM(SeqLocPtr slp,BlastSearchBlkPtr search,BLAST_MatrixPtr matrix,Nlm_FloatHiPtr karlinK)3134 static BLAST_ScorePtr *B2SCalculatePSSM(SeqLocPtr slp, BlastSearchBlkPtr search,
3135         BLAST_MatrixPtr matrix, Nlm_FloatHiPtr karlinK)
3136 {
3137     BLAST_ScorePtr *posMatrix = NULL;
3138     compactSearchItems *compactSearch = NULL;
3139     Boolean replaced_sequence = FALSE;
3140     Int4 query_length, full_query_length;
3141     SeqLocPtr filter_slp = NULL, full_slp = NULL;
3142     Uint1Ptr sequence = NULL;
3143     BlastSequenceBlk bseq;
3144     Nlm_FloatHi scalingFactor = search->pbp->scalingFactor;
3145 
3146     query_length = SeqLocLen(slp);
3147 
3148     /* if the slp is not the whole sequence, retrieve the whole sequence and
3149      * use it to compute the pssm */
3150     if (matrix->rows != (query_length+1)) {
3151         SeqPortPtr spp = NULL;
3152         SeqIdPtr sip = NULL;
3153         Uint1 residue;
3154         BioseqPtr bsp = NULL;
3155         Char tmp[256];
3156         Int4 index = 0;
3157 
3158         sip = SeqLocId(slp);
3159         if ((bsp = BioseqLockById(SeqIdFindBest(sip, SEQID_GI))) == NULL) {
3160             SeqIdWrite(SeqLocId(slp),tmp,PRINTID_FASTA_LONG,
3161                     sizeof(tmp));
3162 
3163             ErrPostEx(SEV_ERROR,0,0,"Could not retrieve full bioseq "
3164                     "for %s",tmp);
3165             BioseqUnlock(bsp);
3166             return NULL;
3167         }
3168 
3169         /* get full sequence to be used in WposComputation */
3170         spp = SeqPortNew(bsp, FIRST_RESIDUE, LAST_RESIDUE, Seq_strand_unknown,
3171                 Seq_code_ncbistdaa);
3172 
3173         full_query_length = bsp->length;
3174         sequence = (Uint1Ptr) MemNew(2*((bsp->length)+2)*sizeof(Char));
3175         BioseqUnlock(bsp);
3176 
3177         sequence[index++] = NULLB;
3178         while ((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF) {
3179             if (IS_residue(residue)) {
3180                 if (residue == 24) { /* change selenocysteine to X */
3181                     residue = 21;
3182                     ErrPostEx(SEV_WARNING,0,0, "Selenocysteine (U) at "
3183                         "position %ld replaced by X", (long) index+1);
3184                 }
3185                 sequence[index++] = residue;
3186             }
3187         }
3188         sequence[index] = NULLB;
3189         spp = SeqPortFree(spp);
3190 
3191         /* Filter the sequence if necessary */
3192         ValNodeAddPointer(&full_slp, SEQLOC_WHOLE, SeqIdDup(SeqLocId(slp)));
3193         filter_slp = BlastSeqLocFilter(full_slp, search->pbp->filter_string);
3194         if(search->pbp->query_lcase_mask != NULL)
3195             filter_slp = blastMergeFilterLocs(filter_slp,
3196                     search->pbp->query_lcase_mask, FALSE, 0, 0);
3197 
3198         BlastMaskTheResidues(sequence+1, full_query_length, 21, filter_slp,
3199                 FALSE, SeqLocStart(full_slp));
3200 
3201         /* Save the current query sequence */
3202         MemCpy(&bseq, search->context[0].query, sizeof(BlastSequenceBlk));
3203 
3204 		BlastSequenceAddSequence(search->context[0].query, NULL, sequence,
3205                                  full_query_length, full_query_length, 0);
3206 
3207         SeqLocSetFree(full_slp);
3208         SeqLocSetFree(filter_slp);
3209         replaced_sequence = TRUE;
3210     }
3211 
3212     compactSearch = compactSearchNew(compactSearch);
3213     copySearchItems(compactSearch, search, search->sbp->name);
3214     compactSearch->pseudoCountConst = search->pbp->pseudoCountConst;
3215     if (scalingFactor != 0.0 && scalingFactor != 1.0) {
3216         /* build pssm {make,copy}mat/rpsblast style */
3217         posMatrix = B2SCalculateScaledPSSM(search, search->sbp->posFreqs,
3218                 compactSearch, karlinK);
3219     } else {
3220         /* build pssm psiblast style */
3221         posMatrix = WposComputation(compactSearch, NULL, search->sbp->posFreqs);
3222     }
3223     compactSearchDestruct(compactSearch);
3224 
3225     if (replaced_sequence) {
3226         MemCpy(search->context[0].query, &bseq, sizeof(BlastSequenceBlk));
3227         MemFree(sequence);
3228     }
3229 
3230     return posMatrix;
3231 }
3232 
3233 /* Checks if the dimensions of the pssm attached to the search->sbp are
3234  * consistent with the length of the master query (slp), and trims the matrix
3235  * if necessary */
B2SVerifyPSSM(SeqLocPtr slp,BlastSearchBlkPtr search,BLAST_MatrixPtr matrix)3236 static Boolean B2SVerifyPSSM(SeqLocPtr slp, BlastSearchBlkPtr search,
3237         BLAST_MatrixPtr matrix)
3238 {
3239     Int4 i, query_length = SeqLocLen(slp);
3240 
3241     if ((query_length+1) > matrix->rows) {
3242         ErrPostEx(SEV_WARNING,0,0,"Ignoring PSSM because it seems not to "
3243             "correspond to query sequence (query length  = %ld, PSSM's "
3244             "number of rows = %ld)", query_length+1, matrix->rows);
3245         search->positionBased = FALSE;
3246 
3247         if (matrix->matrix == NULL) {
3248             BLAST_ScorePtr *posMatrix = search->sbp->posMatrix;
3249 
3250             for (i = 0; i < matrix->rows; i++)
3251                 posMatrix[i] = MemFree(posMatrix[i]);
3252             posMatrix = MemFree(posMatrix);
3253         }
3254         search->sbp->posMatrix = NULL;
3255         search->sbp->posFreqs = NULL;
3256         return FALSE;
3257     } else if ((query_length+1) < matrix->rows) {
3258         /* Assume BLAST_Matrix corresponds to the entire sequence, so trim
3259          * it */
3260         Int4 from, to, i, j, alphabet_sz;
3261         BLAST_ScorePtr *pssm = NULL;
3262 
3263         if (slp->choice != SEQLOC_INT) {
3264             ErrPostEx(SEV_ERROR,0,0,"B2SVerifyPSSM: SeqLocPtr is not a "
3265                     "SEQLOC_INT, cannot trim matrix");
3266             return FALSE;
3267         }
3268 
3269         from = SeqLocStart(slp);
3270         to = SeqLocStop(slp);
3271         alphabet_sz = matrix->columns;
3272 
3273         /* Adjust the pssm */
3274         pssm = (BLAST_ScorePtr *)MemNew(sizeof(BLAST_ScorePtr) *
3275                 (query_length+1));
3276         for (i = 0; i <= query_length; i++) {
3277             pssm[i] = (BLAST_ScorePtr)MemNew(sizeof(BLAST_Score) *
3278                     alphabet_sz);
3279         }
3280 
3281         for (i = from; i <= to; i++) {
3282             for (j = 0; j < alphabet_sz; j++)
3283                 pssm[(i-from)][j] = search->sbp->posMatrix[i][j];
3284         }
3285         for (j = 0; j < alphabet_sz; j++)
3286             pssm[query_length][j] = BLAST_SCORE_MIN;
3287 
3288         if (matrix->matrix == NULL) {
3289             /* Free the matrix we calculated originally */
3290             BLAST_ScorePtr *posMatrix = search->sbp->posMatrix;
3291 
3292             for (i = 0; i < matrix->rows; i++)
3293                 posMatrix[i] = MemFree(posMatrix[i]);
3294             posMatrix = MemFree(posMatrix);
3295         }
3296         search->sbp->posMatrix = pssm;
3297 
3298     }
3299     return TRUE;
3300 }
3301 
3302 /* psi-blast2sequences setup: matrix must contain at least the residue
3303  * frequencies to calculate the PSSM. Otherwise, if the PSSM is given, that
3304  * will be used. */
B2SPssmSetupSearch(BlastSearchBlkPtr search,SeqLocPtr pssm_slp,BLAST_MatrixPtr matrix)3305 Boolean LIBCALL B2SPssmSetupSearch(BlastSearchBlkPtr search,
3306         SeqLocPtr pssm_slp, BLAST_MatrixPtr matrix)
3307 {
3308     Nlm_FloatHi karlinK = 0.0;
3309     Int4 npos, alphabet_size;
3310 
3311     if (!search || !matrix)
3312         return FALSE;
3313 
3314     if (search->prog_number != blast_type_blastp) {
3315         ErrPostEx(SEV_ERROR, 0, 0, "B2SPssmSetupSearch: only blastp is "
3316                 "supported");
3317         return FALSE;
3318     }
3319 
3320     search->positionBased = TRUE;
3321     npos = SeqLocLen(pssm_slp);
3322     alphabet_size = search->sbp->alphabet_size;
3323 
3324     if (npos <= 0) {
3325         ErrPostEx(SEV_ERROR, 0, 0, "B2SPssmSetupSearch: length of pssm_slp "
3326                 "must be positive");
3327         return FALSE;
3328     }
3329 
3330     /* save the residue frequencies, we might need them later */
3331     if (matrix->posFreqs) {
3332         search->sbp->posFreqs = allocatePosFreqs(npos, alphabet_size);
3333         copyPosFreqs(matrix->posFreqs, search->sbp->posFreqs, npos,
3334                 alphabet_size);
3335     }
3336 
3337     if (matrix->posFreqs && !matrix->matrix) {
3338         search->sbp->posMatrix = B2SCalculatePSSM(pssm_slp, search, matrix,
3339                 &karlinK);
3340         /* if we calculated the pssm, and use did not provide one, save it*/
3341         if (matrix->karlinK == 0.0 && karlinK != 0.0)
3342             matrix->karlinK = karlinK;
3343     } else {
3344         search->sbp->posMatrix = matrix->matrix;
3345     }
3346 
3347     search->sbp->mat_dim1 = search->sbp->query_length + 1;
3348     search->sbp->mat_dim2 = search->sbp->alphabet_size;
3349 
3350     /* Sanity check */
3351     if (!search->sbp->posMatrix) {
3352         ErrPostEx(SEV_ERROR, 0, 0, "B2SPssmSetupSearch: "
3353                 "Could not create or obtain PSSM! Please verify "
3354                 "BLAST_Matrix parameter");
3355         search->positionBased = FALSE;
3356         return FALSE;
3357     }
3358 
3359     /* Make sure the BLAST_Matrix number of rows is consistent with
3360      * pssm_slp */
3361     B2SVerifyPSSM(pssm_slp, search, matrix);
3362 
3363     if (matrix->karlinK != 0.0) {
3364         search->sbp->karlinK = matrix->karlinK;
3365         search->sbp->kbp_gap_psi[0]->K = matrix->karlinK;
3366         search->sbp->kbp_gap_psi[0]->logK = log(matrix->karlinK);
3367     }
3368 
3369     return TRUE;
3370 }
3371 
3372 /* clean up psi-blast2sequences */
B2SPssmCleanUpSearch(BlastSearchBlkPtr search,BLAST_MatrixPtr matrix)3373 Boolean LIBCALL B2SPssmCleanUpSearch(BlastSearchBlkPtr search,
3374         BLAST_MatrixPtr matrix)
3375 {
3376     Int4 i, rows = search->sbp->query_length + 1;
3377     BLAST_ScorePtr *posMatrix = search->sbp->posMatrix;
3378     Nlm_FloatHiPtr *posFreqs = search->sbp->posFreqs;
3379 
3380     if (!matrix)
3381         return FALSE;
3382 
3383     if ((matrix->matrix == NULL) || /* B2SPssmSetupSearch created PSSM */
3384         (posMatrix != matrix->matrix)) { /* B2SVerifyPSSM trimmed PSSM */
3385         for (i = 0; i < rows; i++)
3386             posMatrix[i] = MemFree(posMatrix[i]);
3387         posMatrix = MemFree(posMatrix);
3388     }
3389     if (matrix->posFreqs) {
3390         for (i = 0; i < rows; i++)
3391             posFreqs[i] = MemFree(posFreqs[i]);
3392         posFreqs = MemFree(posFreqs);
3393     }
3394     search->sbp->posMatrix = NULL;
3395     search->sbp->posFreqs = NULL;
3396     search->positionBased = FALSE;
3397     return TRUE;
3398 }
3399 
B2SPssmOnTheFlyByLoc(BlastSearchBlkPtr search,SeqLocPtr subj_slp)3400 SeqAlignPtr LIBCALL B2SPssmOnTheFlyByLoc(BlastSearchBlkPtr search,
3401             SeqLocPtr subj_slp)
3402 {
3403     Int4 index, subject_length;
3404     SeqAlignPtr seqalign = NULL;
3405     Uint1Ptr subject_seq = NULL, subject_seq_start = NULL;
3406     SeqPortPtr spp;
3407     Uint1 residue;
3408 
3409     if (!search || search->query_invalid || !subj_slp)
3410         return NULL;
3411 
3412     if (search->result_struct)
3413         search->result_struct = BLASTResultsStructDelete(search->result_struct);
3414     search->result_struct = BLASTResultsStructNew(search->result_size,
3415                  search->pbp->max_pieces, search->pbp->hsp_range_max);
3416     BlastHitListPurge(search->current_hitlist);
3417 
3418     subject_length = SeqLocLen(subj_slp);
3419 
3420     if (search->prog_number == blast_type_blastp) {
3421         subject_seq_start = (Uint1Ptr) MemNew(
3422                 ((subject_length)+2)*sizeof(Uint1));
3423         /* The first residue is the sentinel. */
3424         subject_seq_start[0] = NULLB;
3425         subject_seq = subject_seq_start+1;
3426         index = 0;
3427         spp = SeqPortNewByLoc(subj_slp, Seq_code_ncbistdaa);
3428         while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
3429             if (IS_residue(residue))
3430                 subject_seq[index++] = residue;
3431         }
3432         subject_seq[index] = NULLB;
3433         spp = SeqPortFree(spp);
3434     } else {
3435         return NULL;
3436     }
3437 
3438     seqalign = BlastTwoSequencesCore(search, subj_slp, subject_seq,
3439             subject_length, FALSE);
3440 
3441     MemFree(subject_seq_start);
3442     AdjustOffSetsInSeqAlign(seqalign, search->query_slp, subj_slp);
3443 
3444     return seqalign;
3445 }
3446 
B2SPssmOnTheFly(BlastSearchBlkPtr search,BioseqPtr subj_bsp)3447 SeqAlignPtr LIBCALL B2SPssmOnTheFly(BlastSearchBlkPtr search,
3448         BioseqPtr subj_bsp)
3449 {
3450     SeqAlignPtr salp = NULL;
3451     SeqLocPtr slp = NULL;
3452 
3453     if (!search || search->query_invalid || !subj_bsp)
3454         return NULL;
3455 
3456     ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(subj_bsp->id,
3457                     SEQID_GI)));
3458     salp = B2SPssmOnTheFlyByLoc(search, slp);
3459     SeqLocFree(slp);
3460     return salp;
3461 }
3462 
B2SPssmMultipleQueries(SeqLocPtr pssm_slp,BLAST_MatrixPtr matrix,SeqLocPtr * target_seqs,Int4 ntargets,BLAST_OptionsBlkPtr options)3463 SeqAlignPtr * LIBCALL B2SPssmMultipleQueries(SeqLocPtr pssm_slp,
3464         BLAST_MatrixPtr matrix, SeqLocPtr *target_seqs, Int4 ntargets,
3465         BLAST_OptionsBlkPtr options)
3466 {
3467     SeqAlignPtr *sa_array = NULL;
3468     BlastSearchBlkPtr search = NULL;
3469     Int4 i;
3470 
3471     if (!matrix || !pssm_slp || !target_seqs || ntargets <= 0 || !options)
3472         return NULL;
3473 
3474     /* Set up search structure */
3475     search = BLASTSetUpSearchByLoc(pssm_slp, options->program_name,
3476             SeqLocLen(pssm_slp), 0, NULL, options, NULL);
3477     B2SPssmSetupSearch(search, pssm_slp, matrix);
3478 
3479     /* Allocate memory for return value */
3480     if (!(sa_array = (SeqAlignPtr*)MemNew(sizeof(SeqAlignPtr)*ntargets))) {
3481         ErrPostEx(SEV_ERROR, 0, 0, "B2SPssmMultipleQueries: Out of memory");
3482         BlastSearchBlkDestruct(search);
3483         return NULL;
3484     }
3485 
3486 
3487     /* Iterate over seqlocs in target_seqs, using effective search space in
3488      * rpsblast style */
3489     for (i = 0; i < ntargets; i++) {
3490         Int8 dblen = (options->db_length != 0) ?
3491                         options->db_length : SeqLocLen(pssm_slp);
3492         Int4 nseqs = (options->dbseq_num != 0) ?  options->dbseq_num : 1;
3493 
3494         /* If search space has been specified in the options structure, the it
3495          * must have been set in BLASTSetUpSearchEx, so don't overwrite it */
3496         if ( ! (options->searchsp_eff > 0) ) {
3497             search->searchsp_eff  = BLASTCalculateSearchSpace(options, nseqs,
3498                     dblen, SeqLocLen(target_seqs[i]));
3499         }
3500         sa_array[i] = B2SPssmOnTheFlyByLoc(search, target_seqs[i]);
3501     }
3502 
3503     /* Clean up */
3504     B2SPssmCleanUpSearch(search, matrix);
3505     BlastSearchBlkDestruct(search);
3506 
3507     return sa_array;
3508 }
3509 
3510 /************************************************************************/
3511 /* END    PSIBLAST2Sequences API                                        */
3512 /************************************************************************/
3513 
3514 /* Note that the matrix parameter should correspond to the full master
3515  * sequence */
3516 SeqAlignPtr LIBCALL
BlastTwoSequencesByLocWithCallback(SeqLocPtr slp1,SeqLocPtr slp2,CharPtr progname,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * handle_results)PROTO ((VoidPtr srch)),BLAST_MatrixPtr matrix)3517 BlastTwoSequencesByLocWithCallback(SeqLocPtr slp1, SeqLocPtr slp2, CharPtr
3518         progname, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns,
3519         ValNodePtr *error_returns, int (LIBCALLBACK
3520             *handle_results)PROTO((VoidPtr srch)), BLAST_MatrixPtr matrix)
3521 {
3522 	BlastAllWordPtr all_words;
3523 	BlastSearchBlkPtr search;
3524 	BioseqPtr subject_bsp;
3525 	Boolean complement=FALSE, reverse, reverse_forbidden, options_alloc;
3526 	Int2 status;
3527 	Int4 index, subject_length, num_of_cols;
3528 	SeqAlignPtr seqalign=NULL;
3529 	SeqLocPtr query_slp, subject_slp;
3530 	SeqPortPtr spp;
3531 	SPCompressPtr spc=NULL;
3532 	Uint1 residue;
3533 	Uint1Ptr subject_seq, subject_seq_start;
3534 	Uint1Ptr *array;
3535         Boolean db_length_changed = FALSE;
3536 
3537 	if (slp1 == NULL || slp2 == NULL)
3538 		return NULL;
3539 
3540 	if (error_returns)
3541 	{
3542 		*error_returns = NULL;
3543 	}
3544 
3545 	if (other_returns)
3546 	{
3547 		*other_returns = NULL;
3548 	}
3549 
3550 	if (progname == NULL && options == NULL)
3551 		return NULL;
3552 
3553 	/* If filtering is performed, do not reverse the sequence.
3554            In this case the wrong sequence would be filtered. */
3555 	reverse_forbidden = FALSE;
3556 	if ((options && ((options->filter_string &&
3557                         StringCmp(options->filter_string, "F")) ||
3558                         options->is_megablast_search)) ||
3559                         matrix != NULL)
3560 	{
3561 		reverse_forbidden = TRUE;
3562 	}
3563 
3564 	/* Select the shorter sequence as the query, provided they are
3565 	   of the same type. */
3566 	if ((StringCmp(progname, "blastn") && StringCmp(progname, "blastp")) ||
3567 	    (reverse_forbidden || SeqLocLen(slp1) < SeqLocLen(slp2)))
3568 	{
3569 		query_slp = slp1;
3570 		subject_slp = slp2;
3571 		reverse = FALSE;
3572 	}
3573 	else
3574 	{
3575 		query_slp = slp2;
3576 		subject_slp = slp1;
3577 		reverse = TRUE;
3578 	}
3579 
3580     /* Make sure strands are handled correctly */
3581     if (!StringCmp(progname, "blastn") &&
3582         SeqLocStrand(query_slp) != Seq_strand_both &&
3583         SeqLocStrand(subject_slp) == Seq_strand_both) {
3584        Change_Loc_Strand(subject_slp, SeqLocStrand(query_slp));
3585        Change_Loc_Strand(query_slp, Seq_strand_both);
3586     }
3587 
3588 	if (progname == NULL)
3589 	{
3590 		progname = options->program_name;
3591 	}
3592 
3593 	/* If the subject strand is minus, turn it into plus for blastn. */
3594 	/* Complement the other strand to keep things straight. */
3595 	if (StringCmp(progname, "blastn") == 0 && SeqLocStrand(subject_slp) == Seq_strand_minus)
3596 	{
3597 		complement = TRUE;
3598                 if(SeqLocStrand(query_slp) == Seq_strand_plus ||
3599 			SeqLocStrand(query_slp) == Seq_strand_minus)
3600 				SeqLocRevCmp(query_slp);
3601 		SeqLocRevCmp(subject_slp);
3602 	}
3603 
3604 	subject_seq_start = subject_seq = NULL;
3605 
3606     /* Allocate default options if none are allocated yet. */
3607     options_alloc = FALSE;
3608     if (options == NULL)
3609     {
3610             options = BLASTOptionNew(progname, FALSE);
3611             options_alloc = TRUE;
3612     }
3613 
3614     status = BLASTOptionValidateEx(options, progname, error_returns);
3615     if (status != 0)
3616     {       /* error messages in other_returns? */
3617             return NULL;
3618     }
3619 
3620 	all_words = NULL;
3621 
3622 	subject_length = SeqLocLen(subject_slp);
3623 
3624 	if (!StringCmp(progname, "blastp") ||
3625 	    !StringCmp(progname, "blastx"))
3626 	{
3627 		subject_seq_start = (Uint1Ptr) MemNew(((subject_length)+2)*sizeof(Uint1));
3628 		/* The first residue is the sentinel. */
3629 		subject_seq_start[0] = NULLB;
3630 		subject_seq = subject_seq_start+1;
3631 		index = 0;
3632 		spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbistdaa);
3633 		while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
3634 		{
3635 			if (IS_residue(residue))
3636 			{
3637 				subject_seq[index] = residue;
3638 				index++;
3639 			}
3640 		}
3641 		subject_seq[index] = NULLB;
3642 
3643 		num_of_cols = subject_length+1-options->wordsize;
3644 		all_words = BlastAllWordNew(num_of_cols, options->wordsize, FALSE, TRUE);
3645 		array = (Uint1Ptr *) MemNew(num_of_cols*sizeof(Uint1Ptr));
3646 		for (index=0; index<num_of_cols; index++)
3647 		{
3648 			array[index] = subject_seq+index;
3649 		}
3650 		all_words->array = array;
3651 		spp = SeqPortFree(spp);
3652 		if (options->gapped_calculation == TRUE)
3653 		{
3654 			options->two_pass_method  = FALSE;
3655 			options->multiple_hits_only  = TRUE;
3656 		}
3657 	}
3658 	else if (!StringCmp(progname, "blastn") ||
3659 		 !StringCmp(progname, "tblastn") ||
3660                  !StringCmp(progname, "psitblastn") ||
3661 		 !StringCmp(progname, "tblastx"))
3662 	{
3663 		spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbi4na);
3664                 subject_bsp = BioseqFindCore(SeqLocId(subject_slp));
3665                 if (subject_bsp != NULL && subject_bsp->repr == Seq_repr_delta)
3666                         SeqPortSet_do_virtual(spp, TRUE);
3667 		spc = SPCompressDNA(spp);
3668 		if (spc == NULL)
3669 			return NULL;
3670 		subject_seq_start = subject_seq = spc->buffer;
3671 		spp = SeqPortFree(spp);
3672 	}
3673 	else /* Impossible! */
3674 	{
3675 	        return NULL;
3676 	}
3677 
3678     if (options->is_megablast_search)
3679         /* This has a different meaning in Mega BLAST and must be 0 */
3680         options->block_width = 0;
3681 
3682     if (options->db_length == 0)
3683     {
3684         options->db_length = subject_length;
3685         db_length_changed = TRUE;
3686     }
3687 
3688     options->dbseq_num = 1;
3689 
3690     search = BLASTSetUpSearchByLoc(query_slp, progname, SeqLocLen(query_slp), subject_length, all_words, options, NULL);
3691 
3692     /* Change length back, change only happens if zero. */
3693     if(db_length_changed)
3694         options->db_length = 0;
3695 
3696 
3697     if (search == NULL)
3698         return NULL;
3699 
3700     if (search->query_invalid) {
3701         search = BlastSearchBlkDestruct(search);
3702 		return NULL;
3703     }
3704 
3705         if (!StringCmp(progname, "tblastn") ||
3706             !StringCmp(progname, "tblastx") ||
3707             !StringCmp(progname, "psitblastn")) {
3708 	   MemFree(search->translation_buffer);
3709 	   search->translation_buffer = MemNew((3+(subject_length/3))*sizeof(Uint1));
3710 	   search->translation_buffer_size = 1+(subject_length/3);
3711 	}
3712 
3713     B2SPssmSetupSearch(search, slp1, matrix);
3714 
3715     search->handle_results = handle_results;
3716     search->output = options->output;
3717 
3718 	seqalign = BlastTwoSequencesCore(search, subject_slp, subject_seq, subject_length, reverse);
3719 
3720 	if (complement)
3721 	{
3722 		seqalign = SeqAlignListReverseStrand(seqalign);
3723 		SeqLocRevCmp(query_slp);
3724 		SeqLocRevCmp(subject_slp);
3725 	}
3726 
3727 	if (spc)
3728 	{
3729 		SPCompressFree(spc);
3730 		spc = NULL;
3731 	}
3732 	else
3733 	{
3734 		subject_seq_start = MemFree(subject_seq_start);
3735 	}
3736 
3737 	if (search->error_return)
3738 	{
3739 		ValNodeLink(error_returns, search->error_return);
3740 		search->error_return = NULL;
3741 	}
3742 
3743 	if (other_returns)
3744 	{ /* format dbinfo etc.  */
3745 		*other_returns = BlastOtherReturnsPrepare(search);
3746 	}
3747 
3748     if (options_alloc)
3749         options = BLASTOptionDelete(options);
3750 
3751     AdjustOffSetsInSeqAlign(seqalign, slp1, slp2);
3752 
3753     B2SPssmCleanUpSearch(search, matrix);
3754 
3755 	search = BlastSearchBlkDestruct(search);
3756 
3757 	return seqalign;
3758 }
3759 
3760 SeqAlignPtr LIBCALL
BlastTwoSequencesByLoc(SeqLocPtr slp1,SeqLocPtr slp2,CharPtr progname,BLAST_OptionsBlkPtr options)3761 BlastTwoSequencesByLoc(SeqLocPtr slp1, SeqLocPtr slp2, CharPtr progname, BLAST_OptionsBlkPtr options)
3762 {
3763 	return BlastTwoSequencesByLocEx(slp1, slp2, progname, options, NULL, NULL);
3764 }
3765 
3766 SeqAlignPtr LIBCALL
BlastTwoSequencesEx(BioseqPtr bsp1,BioseqPtr bsp2,CharPtr progname,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns)3767 BlastTwoSequencesEx(BioseqPtr bsp1, BioseqPtr bsp2, CharPtr progname, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns)
3768 {
3769    return BlastTwoSequencesWithCallback(bsp1, bsp2, progname, options,
3770            other_returns, error_returns, NULL);
3771 }
3772 
3773 SeqAlignPtr LIBCALL
BlastTwoSequencesWithCallback(BioseqPtr bsp1,BioseqPtr bsp2,CharPtr progname,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * handle_results)PROTO ((VoidPtr search)))3774 BlastTwoSequencesWithCallback(BioseqPtr bsp1, BioseqPtr bsp2, CharPtr progname, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *handle_results)PROTO((VoidPtr search)))
3775 {
3776 	SeqAlignPtr seqalign;
3777 	SeqLocPtr slp1=NULL, slp2=NULL;
3778 
3779 	if (bsp1 == NULL || bsp2 == NULL)
3780 		return NULL;
3781 
3782 	slp1 = NULL;
3783 	slp2 = NULL;
3784     if (!handle_results) {
3785        ValNodeAddPointer(&slp1, SEQLOC_WHOLE,
3786                          SeqIdDup(SeqIdFindBest(bsp1->id, SEQID_GI)));
3787        ValNodeAddPointer(&slp2, SEQLOC_WHOLE,
3788                          SeqIdDup(SeqIdFindBest(bsp2->id, SEQID_GI)));
3789     } else {
3790        ValNodeAddPointer(&slp1, SEQLOC_WHOLE,
3791                          SeqIdDup(SeqIdFindBestAccession(bsp1->id)));
3792        ValNodeAddPointer(&slp2, SEQLOC_WHOLE,
3793                          SeqIdDup(SeqIdFindBestAccession(bsp2->id)));
3794     }
3795     seqalign = BlastTwoSequencesByLocWithCallback(slp1, slp2, progname,
3796             options, other_returns, error_returns, handle_results, NULL);
3797 
3798     slp1 = SeqLocFree(slp1);
3799     slp2 = SeqLocFree(slp2);
3800 
3801     return seqalign;
3802 }
3803 
3804 SeqAlignPtr LIBCALL
BlastTwoSequences(BioseqPtr bsp1,BioseqPtr bsp2,CharPtr progname,BLAST_OptionsBlkPtr options)3805 BlastTwoSequences(BioseqPtr bsp1, BioseqPtr bsp2, CharPtr progname, BLAST_OptionsBlkPtr options)
3806 {
3807 	return BlastTwoSequencesEx(bsp1, bsp2, progname, options, NULL, NULL);
3808 }
3809 
3810 /*
3811 	Runs blast on the fly between the query BioseqPtr (specified with a
3812 	call to BLASTSetUpSearch) and the subject BioseqPtr.
3813 */
3814 
3815 
3816 BlastSearchBlkPtr LIBCALL
BlastSequencesOnTheFlyEx(BlastSearchBlkPtr search,BioseqPtr subject_bsp)3817 BlastSequencesOnTheFlyEx(BlastSearchBlkPtr search, BioseqPtr subject_bsp)
3818 {
3819 	Int4 index, subject_length;
3820 	SeqPortPtr spp;
3821 	SPCompressPtr spc=NULL;
3822 	Uint1Ptr subject_seq, subject_seq_start;
3823 	Uint1 residue;
3824 
3825 	if (subject_bsp == NULL)
3826 		return NULL;
3827 
3828 	if (search == NULL || search->query_invalid)
3829 		return NULL;
3830 
3831         if (!search->pbp->mb_params) {
3832            if (search->result_struct)
3833               search->result_struct =
3834                  BLASTResultsStructDelete(search->result_struct);
3835            search->result_struct =
3836               BLASTResultsStructNew(search->result_size,
3837                  search->pbp->max_pieces, search->pbp->hsp_range_max);
3838         } else {
3839            if (search->mb_result_struct && search->mb_result_struct[0])
3840               search->mb_result_struct[0] =
3841                  BLASTResultsStructDelete(search->mb_result_struct[0]);
3842            if (!search->mb_result_struct)
3843               search->mb_result_struct = (BLASTResultsStructPtr PNTR)
3844                  MemNew(sizeof(BLASTResultsStructPtr));
3845         }
3846 
3847 	BlastHitListPurge(search->current_hitlist);
3848 
3849 	subject_seq_start = subject_seq = NULL;
3850 
3851 	subject_length = subject_bsp->length;
3852 
3853 	if (StringCmp(search->prog_name, "blastp") == 0)
3854 	{
3855 		subject_seq_start = (Uint1Ptr) MemNew(((subject_length)+2)*sizeof(Uint1));
3856 		/* The first residue is the sentinel. */
3857 		subject_seq_start[0] = NULLB;
3858 		subject_seq = subject_seq_start+1;
3859 		index = 0;
3860 		spp = SeqPortNew(subject_bsp, FIRST_RESIDUE, LAST_RESIDUE,
3861 				 0, Seq_code_ncbistdaa);
3862 		while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
3863 		{
3864 			if (IS_residue(residue))
3865 			{
3866 				subject_seq[index] = residue;
3867 				index++;
3868 			}
3869 		}
3870 		subject_seq[index] = NULLB;
3871 		spp = SeqPortFree(spp);
3872 	}
3873 	else if (StringCmp(search->prog_name, "blastn") == 0)
3874 	{
3875 		spp = SeqPortNew(subject_bsp, FIRST_RESIDUE, LAST_RESIDUE,
3876 				 0, Seq_code_ncbi4na);
3877 		spc = SPCompressDNA(spp);
3878 		subject_seq = spc->buffer;
3879 		spp = SeqPortFree(spp);
3880 	}
3881 	else
3882 	{
3883 		return NULL;
3884 	}
3885 
3886 	BlastTwoSequencesCoreEx(search, subject_bsp, subject_seq,
3887 					 subject_length);
3888 
3889 	if (spc)
3890 	{
3891 		SPCompressFree(spc);
3892 		spc = NULL;
3893 	}
3894 	else
3895 	{
3896 		subject_seq_start = MemFree(subject_seq_start);
3897 	}
3898 
3899    return search;
3900 }
3901 
3902 SeqAlignPtr LIBCALL
BlastSequencesOnTheFlyByLoc(BlastSearchBlkPtr search,SeqLocPtr subject_slp)3903 BlastSequencesOnTheFlyByLoc(BlastSearchBlkPtr search, SeqLocPtr subject_slp)
3904 {
3905 	Int4 index, subject_length;
3906 	SeqAlignPtr seqalign=NULL;
3907 	SeqPortPtr spp;
3908 	SPCompressPtr spc=NULL;
3909 	Uint1Ptr subject_seq, subject_seq_start;
3910 	Uint1 residue;
3911 
3912 	if (subject_slp == NULL)
3913 		return NULL;
3914 
3915 	if (search == NULL || search->query_invalid)
3916 		return NULL;
3917 
3918 
3919         if (!search->pbp->mb_params) {
3920            if (search->result_struct)
3921               search->result_struct = BLASTResultsStructDelete(search->result_struct);
3922            search->result_struct =
3923               BLASTResultsStructNew(search->result_size,
3924                  search->pbp->max_pieces, search->pbp->hsp_range_max);
3925         } else {
3926            if (search->mb_result_struct && search->mb_result_struct[0])
3927               search->mb_result_struct[0] =
3928                  BLASTResultsStructDelete(search->mb_result_struct[0]);
3929            if (!search->mb_result_struct)
3930               search->mb_result_struct = (BLASTResultsStructPtr PNTR)
3931                  MemNew(sizeof(BLASTResultsStructPtr));
3932         }
3933 	BlastHitListPurge(search->current_hitlist);
3934 
3935 	subject_seq_start = subject_seq = NULL;
3936 
3937 	subject_length = SeqLocLen(subject_slp);
3938 
3939 	if (StringCmp(search->prog_name, "blastp") == 0)
3940 	{
3941 		subject_seq_start = (Uint1Ptr) MemNew(((subject_length)+2)*sizeof(Uint1));
3942 		/* The first residue is the sentinel. */
3943 		subject_seq_start[0] = NULLB;
3944 		subject_seq = subject_seq_start+1;
3945 		index = 0;
3946 		spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbistdaa);
3947 		while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
3948 		{
3949 			if (IS_residue(residue))
3950 			{
3951 				subject_seq[index] = residue;
3952 				index++;
3953 			}
3954 		}
3955 		subject_seq[index] = NULLB;
3956 		spp = SeqPortFree(spp);
3957 	}
3958 	else if (StringCmp(search->prog_name, "blastn") == 0)
3959 	{
3960 		spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbi4na);
3961 		spc = SPCompressDNA(spp);
3962 		subject_seq = spc->buffer;
3963 		spp = SeqPortFree(spp);
3964 	}
3965 	else
3966 	{
3967 		return NULL;
3968 	}
3969 
3970 	seqalign = BlastTwoSequencesCore(search, subject_slp, subject_seq, subject_length, FALSE);
3971 
3972 	if (spc)
3973 	{
3974 		SPCompressFree(spc);
3975 		spc = NULL;
3976 	}
3977 	else
3978 	{
3979 		subject_seq_start = MemFree(subject_seq_start);
3980 	}
3981 
3982 	AdjustOffSetsInSeqAlign(seqalign, search->query_slp, subject_slp);
3983 
3984 	return seqalign;
3985 }
3986 
3987 SeqAlignPtr LIBCALL
BlastSequencesOnTheFly(BlastSearchBlkPtr search,BioseqPtr subject_bsp)3988 BlastSequencesOnTheFly(BlastSearchBlkPtr search, BioseqPtr subject_bsp)
3989 {
3990 	SeqAlignPtr seqalign;
3991 	SeqLocPtr slp;
3992 
3993 	slp = NULL;
3994 	ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(subject_bsp->id, SEQID_GI)));
3995 	seqalign = BlastSequencesOnTheFlyByLoc(search, slp);
3996 	SeqLocFree(slp);
3997 	return seqalign;
3998 }
3999 /*
4000 	Translate a nucleotide sequence without ambiguity codes.
4001 	This is used for the first-pass translation of the database.
4002 
4003 	BlastSearchBlkPtr search: overall BLAST structure.
4004 	Int4 length: length of the nucl. sequence
4005 	Uint1Ptr prot_seq: the (translated) protein sequence, with NULLB
4006 		sentinels on either end.  This array should be allocated
4007 		with sufficient memory before the function is called.
4008 	Uint1Ptr nt_seq: the original nucl. sequence.
4009 
4010 	The genetic code to be used is determined by the translation_table
4011 	on the BlastSearchBlkPtr.
4012 
4013 	This function translates a packed (ncbi2na) nucl. alphabet.  It
4014 	views a basepair as being in one of four sets of 2-bits:
4015 
4016 	|0|1|2|3||0|1|2|3||0|1|2|3||...
4017 
4018 	1st byte | 2 byte | 3rd byte...
4019 
4020 	A codon that starts at the beginning of the above sequence starts in
4021 	state "0" and includes basepairs 0, 1, and 2.  The next codon, in the
4022 	same frame, after that starts in state "3" and includes 3, 0, and 1.
4023 
4024 	** Optimization:
4025 	  changed the single main loop to
4026 	     - advance to state 0,
4027 	     - optimized inner loop does two (3 byte->4 codon) translation per iteration
4028 	           (loads are moved earlier so they can be done in advance.)
4029 	     - do remainder
4030 */
4031 
4032 Int4 LIBCALL
BlastTranslateUnambiguousSequence(BlastSearchBlkPtr search,Int4 length,Uint1Ptr prot_seq,Uint1Ptr nt_seq,Int2 frame)4033 BlastTranslateUnambiguousSequence(BlastSearchBlkPtr search, Int4 length, Uint1Ptr prot_seq, Uint1Ptr nt_seq, Int2 frame)
4034 
4035 {
4036 	register int state;
4037 	Int2 total_remainder;
4038 	Int4 prot_length;
4039 	register int byte_value, codon=0;
4040 	Uint1 last_remainder, last_byte, remainder;
4041 	register Uint1Ptr translation, nt_seq_end, nt_seq_start;
4042 	Uint1Ptr prot_seq_start;
4043 	int byte_value1,byte_value2,byte_value3,byte_value4,byte_value5;
4044 
4045 	prot_length=0;
4046 	if (nt_seq == NULL || prot_seq == NULL || (length-ABS(frame)+1) < CODON_LENGTH)
4047 	return prot_length;
4048 
4049 	*prot_seq = NULLB;
4050 	prot_seq++;
4051 
4052 /* record to determine protein length. */
4053 	prot_seq_start = prot_seq;
4054 
4055 	if (frame > 0)
4056 		translation = search->translation_table;
4057 	else
4058 		translation = search->translation_table_rc;
4059 
4060 	remainder = length%4;
4061 
4062 	if (frame > 0)
4063 	{
4064 		nt_seq_end = nt_seq + (length)/4 - 1;
4065 		last_remainder = (4*(length/4) - frame + 1)%CODON_LENGTH;
4066 		total_remainder = last_remainder+remainder;
4067 
4068 		state = frame-1;
4069 		byte_value = *nt_seq;
4070 
4071 		/* If there's lots to do, advance to state 0, then enter fast loop */
4072 		while (nt_seq < nt_seq_end)
4073 		{
4074 			switch (state)
4075 			{
4076 				case 0:
4077 					codon = (byte_value >> 2);
4078 					*prot_seq = translation[codon];
4079 					prot_seq++;
4080 				/* do state = 3 now, break is NOT missing. */
4081 				case 3:
4082 					codon = ((byte_value & 3) << 4);
4083 					nt_seq++;
4084 					byte_value = *nt_seq;
4085 					codon += (byte_value >> 4);
4086 					*prot_seq = translation[codon];
4087 					prot_seq++;
4088 					if (nt_seq >= nt_seq_end)
4089 					{
4090 						state = 2;
4091 						break;
4092 					}
4093 				/* Go on to state = 2 if not at end. */
4094 				case 2:
4095 					codon = ((byte_value & 15) << 2);
4096 					nt_seq++;
4097 					byte_value = *nt_seq;
4098 					codon += (byte_value >> 6);
4099 					*prot_seq = translation[codon];
4100 					prot_seq++;
4101 					if (nt_seq >= nt_seq_end)
4102 					{
4103 						state = 1;
4104 						break;
4105 					}
4106 				/* Go on to state = 1 if not at end. */
4107 				case 1:
4108 					codon = byte_value & 63;
4109 					*prot_seq = translation[codon];
4110 					prot_seq++;
4111 					nt_seq++;
4112 					byte_value = *nt_seq;
4113 					state = 0;
4114 					break;
4115 			} /* end switch */
4116 			/* switch ends at state 0, except when at end */
4117 
4118 
4119 			/********************************************/
4120 			/* optimized loop: start in state 0. continue til near end */
4121 			while (nt_seq < (nt_seq_end-10))
4122 			  {
4123 			    byte_value1 = *(++nt_seq);
4124 			    byte_value2 = *(++nt_seq);
4125 			    byte_value3 = *(++nt_seq);
4126 			    /* case 0: */
4127 			    codon = (byte_value >> 2);
4128 			    *prot_seq = translation[codon];
4129 			    prot_seq++;
4130 
4131 			    /* case 3: */
4132 			    codon = ((byte_value & 3) << 4);
4133 			    codon += (byte_value1 >> 4);
4134 			    *prot_seq = translation[codon];
4135 			    prot_seq++;
4136 
4137 			    byte_value4 = *(++nt_seq);
4138 			    /* case 2: */
4139 			    codon = ((byte_value1 & 15) << 2);
4140 
4141 			    codon += (byte_value2 >> 6);
4142 			    *prot_seq = translation[codon];
4143 			    prot_seq++;
4144 			    /* case 1: */
4145 			    codon = byte_value2 & 63;
4146 			    byte_value5 = *(++nt_seq);
4147 			    *prot_seq = translation[codon];
4148 			    prot_seq++;
4149 
4150 			    /* case 0: */
4151 			    codon = (byte_value3 >> 2);
4152 			    *prot_seq = translation[codon];
4153 			    prot_seq++;
4154 			    /* case 3: */
4155 			    byte_value = *(++nt_seq);
4156 			    codon = ((byte_value3 & 3) << 4);
4157 			    codon += (byte_value4 >> 4);
4158 			    *prot_seq = translation[codon];
4159 			    prot_seq++;
4160 			    /* case 2: */
4161 			    codon = ((byte_value4 & 15) << 2);
4162 			    codon += (byte_value5 >> 6);
4163 			    *prot_seq = translation[codon];
4164 			    prot_seq++;
4165 			    /* case 1: */
4166 			    codon = byte_value5 & 63;
4167 			    *prot_seq = translation[codon];
4168 			    prot_seq++;
4169 			    state=0;
4170 			  } /* end optimized while */
4171 		/********************************************/
4172 		} /* end while */
4173 
4174 
4175 		if (state == 1)
4176 		{
4177 		/* This doesn't get done above, DON't do the state = 0
4178 		   below if this is done. */
4179 			byte_value = *nt_seq;
4180 			codon = byte_value & 63;
4181 			state = 0;
4182 			*prot_seq = translation[codon];
4183 			prot_seq++;
4184 		}
4185 		else if (state == 0)
4186 		{ /* This one doesn't get done above. */
4187 			byte_value = *nt_seq;
4188 			codon = ((byte_value) >> 2);
4189 			state = 3;
4190 			*prot_seq = translation[codon];
4191 			prot_seq++;
4192 		}
4193 
4194 		if (total_remainder >= CODON_LENGTH)
4195 		{
4196 			byte_value = *(nt_seq_end);
4197 			last_byte = *(nt_seq_end+1);
4198 			if (state == 0)
4199 			{
4200 				codon = (last_byte >> 2);
4201 			}
4202 			else if (state == 2)
4203 			{
4204 				codon = ((byte_value & 15) << 2);
4205 				codon += (last_byte >> 6);
4206 			}
4207 			else if (state == 3)
4208 			{
4209 				codon = ((byte_value & 3) << 4);
4210 				codon += (last_byte >> 4);
4211 			}
4212 			*prot_seq = translation[codon];
4213 			prot_seq++;
4214 		}
4215 		*prot_seq = NULLB;
4216 	}
4217 	else
4218 	{
4219 		nt_seq_start = nt_seq;
4220 		nt_seq += length/4;
4221 		state = remainder+frame;
4222 	/* Do we start in the last byte?  This one has the lowest order
4223 	bits set to represent the remainder, hence the odd coding here. */
4224 		if (state >= 0)
4225 		{
4226 			last_byte = *nt_seq;
4227 			nt_seq--;
4228 			if (state == 0)
4229 			{
4230 				codon = (last_byte >> 6);
4231 				byte_value = *nt_seq;
4232 				codon += ((byte_value & 15) << 2);
4233 				state = 1;
4234 			}
4235 			else if (state == 1)
4236 			{
4237 				codon = (last_byte >> 4);
4238 				byte_value = *nt_seq;
4239 				codon += ((byte_value & 3) << 4);
4240 				state = 2;
4241 			}
4242 			else if (state == 2)
4243 			{
4244 				codon = (last_byte >> 2);
4245 				state = 3;
4246 			}
4247 			*prot_seq = translation[codon];
4248 			prot_seq++;
4249 
4250 		}
4251 		else
4252 		{
4253 			state = 3 + (remainder + frame + 1);
4254 			nt_seq--;
4255 		}
4256 
4257 		byte_value = *nt_seq;
4258 
4259 		/* If there's lots to do, advance to state 3, then enter fast loop */
4260 		while (nt_seq > nt_seq_start)
4261 		{
4262 			switch (state)
4263 			{
4264 				case 3:
4265 					codon = (byte_value & 63);
4266 					*prot_seq = translation[codon];
4267 					prot_seq++;
4268 				/* do state = 0 now, break is NOT missing. */
4269 				case 0:
4270 					codon = (byte_value >> 6);
4271 					nt_seq--;
4272 					byte_value = *nt_seq;
4273 					codon += ((byte_value & 15) << 2);
4274 					*prot_seq = translation[codon];
4275 					prot_seq++;
4276 					if (nt_seq <= nt_seq_start)
4277 					{
4278 						state = 1;
4279 						break;
4280 					}
				/* Go on to state = 1 if not at end. */
4282 				case 1:
4283 					codon = (byte_value >> 4);
4284 					nt_seq--;
4285 					byte_value = *nt_seq;
4286 					codon += ((byte_value & 3) << 4);
4287 					*prot_seq = translation[codon];
4288 					prot_seq++;
4289 					if (nt_seq <= nt_seq_start)
4290 					{
4291 						state = 2;
4292 						break;
4293 					}
4294 				/* Go on to state = 2 if not at end. */
4295 				case 2:
4296 					codon = (byte_value >> 2);
4297 					*prot_seq = translation[codon];
4298 					prot_seq++;
4299 					nt_seq--;
4300 					byte_value = *nt_seq;
4301 					state = 3;
4302 					break;
4303 			} /* end switch */
4304 			/* switch ends at state 3, except when at end */
4305 
4306 
4307 			/********************************************/
			/* optimized area: start in state 3. continue til near end */
4309 			while (nt_seq > (nt_seq_start+10))
4310 			  {
4311 			    byte_value1 = *(--nt_seq);
4312 			    byte_value2 = *(--nt_seq);
4313 			    byte_value3 = *(--nt_seq);
4314 
4315 			    codon = (byte_value & 63);
4316 			    *prot_seq = translation[codon];
4317 			    prot_seq++;
4318 			    codon = (byte_value >> 6);
4319 			    codon += ((byte_value1 & 15) << 2);
4320 			    *prot_seq = translation[codon];
4321 			    prot_seq++;
4322 			    byte_value4 = *(--nt_seq);
4323 			    codon = (byte_value1 >> 4);
4324 			    codon += ((byte_value2 & 3) << 4);
4325 			    *prot_seq = translation[codon];
4326 			    prot_seq++;
4327 			    codon = (byte_value2 >> 2);
4328 			    *prot_seq = translation[codon];
4329 			    prot_seq++;
4330 			    byte_value5 = *(--nt_seq);
4331 
4332 			    codon = (byte_value3 & 63);
4333 			    *prot_seq = translation[codon];
4334 			    prot_seq++;
4335 			    byte_value = *(--nt_seq);
4336 			    codon = (byte_value3 >> 6);
4337 			    codon += ((byte_value4 & 15) << 2);
4338 			    *prot_seq = translation[codon];
4339 			    prot_seq++;
4340 			    codon = (byte_value4 >> 4);
4341 			    codon += ((byte_value5 & 3) << 4);
4342 			    *prot_seq = translation[codon];
4343 			    prot_seq++;
4344 			    codon = (byte_value5 >> 2);
4345 			    *prot_seq = translation[codon];
4346 			    prot_seq++;
4347 			  } /* end optimized while */
4348 			/********************************************/
4349 
4350 		} /* end while */
4351 
4352 		byte_value = *nt_seq;
4353 		if (state == 3)
4354 		{
4355 			codon = (byte_value & 63);
4356 			*prot_seq = translation[codon];
4357 			prot_seq++;
4358 		}
4359 		else if (state == 2)
4360 		{
4361 			codon = (byte_value >> 2);
4362 			*prot_seq = translation[codon];
4363 			prot_seq++;
4364 		}
4365 	}
4366 
4367 	*prot_seq = NULLB;
4368 
4369 	return (prot_seq - prot_seq_start);
4370 }	/* BlastTranslateUnambiguousSequence */
4371 
4372 
4373 
4374 /*
4375 	Gets an appropriate ID for the database (subject) sequence.
4376 	Int4 hit_number is the index into the BLASTResultHitlistPtr,
4377 	Boolean ordinal_number specifies whether an ordinal number (the
4378 	db sequence number) or a real ID should be used.
4379 */
4380 SeqIdPtr LIBCALL
BlastGetSubjectIdEx(BlastSearchBlkPtr search,Int4 hit_number,Boolean ordinal_number,ValNodePtr * vnpp,Int2 query_number)4381 BlastGetSubjectIdEx(BlastSearchBlkPtr search, Int4 hit_number, Boolean ordinal_number, ValNodePtr *vnpp, Int2 query_number)
4382 {
4383     BLASTResultHitlistPtr   results;
4384     DbtagPtr dbtagptr;
4385     ObjectIdPtr obidp;
4386     SeqIdPtr subject_id=NULL, sip;
4387     Uint4	header;
4388     BLASTResultsStructPtr result_struct;
4389 
4390     if (search->pbp->mb_params)
4391        result_struct = search->mb_result_struct[query_number];
4392     else
4393        result_struct = search->result_struct;
4394 
4395     results = result_struct->results[hit_number];
4396     if (ordinal_number) {
4397 
4398         obidp = ObjectIdNew();
4399         obidp->str = NULL;
4400         obidp->id = results->subject_id;
4401         dbtagptr = DbtagNew();
4402         if (search->rdfp) {
4403             dbtagptr->db = StringSave(search->rdfp->filename);
4404         }
4405         dbtagptr->tag = obidp;
4406         ValNodeAddPointer(&subject_id, SEQID_GENERAL, dbtagptr);
4407     }  else if (search->rdfp) {
4408         if (vnpp == NULL) {
4409             readdb_get_descriptor(search->rdfp, results->subject_id, &subject_id, NULL);
4410         } else {
4411             header = 0;
4412             sip = NULL;
4413 
4414             if(search->rdfp->formatdb_ver == FORMATDB_VER_TEXT) {
4415                 while (readdb_get_header(search->rdfp, results->subject_id, &header, &sip, NULL) == TRUE)
4416                     ValNodeAddPointer(vnpp, 0, sip);
4417             } else {
4418                 BlastDefLinePtr bdfp, bdfp_head;
4419 
4420                 bdfp_head = FDReadDeflineAsn(search->rdfp, results->subject_id);
4421 
4422                 if(bdfp_head == NULL) {
4423                     ErrPostEx(SEV_ERROR, 0, 0, "Failure to read defline ASN for %d", results->subject_id);
4424                     return NULL;
4425                 }
4426 
4427                 for(bdfp = bdfp_head; bdfp != NULL; bdfp = bdfp->next) {
4428                     sip = SeqIdSetDup(bdfp->seqid);
4429                     ValNodeAddPointer(vnpp, 0, sip);
4430                 }
4431 
4432                 BlastDefLineSetFree(bdfp_head);
4433             }
4434         }
4435     } else {
4436         if (results->subject_info)
4437             subject_id = SeqIdDup(results->subject_info->sip);
4438     }
4439 
4440     return subject_id;
4441 }
4442 
4443 SeqIdPtr LIBCALL
BlastGetSubjectId(BlastSearchBlkPtr search,Int4 hit_number,Boolean ordinal_number,ValNodePtr * vnpp)4444 BlastGetSubjectId(BlastSearchBlkPtr search, Int4 hit_number, Boolean ordinal_number, ValNodePtr *vnpp)
4445 {
4446    return BlastGetSubjectIdEx(search, hit_number, ordinal_number, vnpp, 0);
4447 }
4448 
4449 /*
4450 	Use by HeapSort (in BioseqBlastEngine) to rank Hitlist's.
4451 */
4452 
4453 int LIBCALLBACK
evalue_compare_hits(VoidPtr v1,VoidPtr v2)4454 evalue_compare_hits(VoidPtr v1, VoidPtr v2)
4455 
4456 {
4457     BLASTResultHitlistPtr h1, h2;
4458     BLASTResultHitlistPtr *hp1, *hp2;
4459 
4460     hp1 = (BLASTResultHitlistPtr *) v1;
4461     hp2 = (BLASTResultHitlistPtr *) v2;
4462     h1 = *hp1;
4463     h2 = *hp2;
4464 
4465     /* Sort first by evalue, then by score in case all evalues are zero. */
4466 
4467     if (h1->best_evalue < h2->best_evalue)
4468         return -1;
4469     if (h1->best_evalue > h2->best_evalue)
4470         return 1;
4471     if (h1->high_score > h2->high_score)
4472         return -1;
4473     if (h1->high_score < h2->high_score)
4474         return 1;
4475 
4476     /* In case of equal scores and E-values order will be determined by
4477        subject id */
4478 
4479     if (h1->subject_id > h2->subject_id)
4480         return -1;
4481     if (h1->subject_id < h2->subject_id)
4482         return 1;
4483 
4484     return 0;
4485 }
4486 
4487 /* Code in BLAST_CLUSTER_HITS is not currently in use */
4488 
4489 #ifdef BLAST_CLUSTER_HITS
/* Ties an HSP to its position in the results structure, so that HSPs can
   be sorted by score for clustering and afterwards be put back in their
   original order (see the BLAST_CLUSTER_HITS code in BioseqBlastEngineCore). */
typedef struct _blast_result_with_subject_id {
   BLASTResultHspPtr hsp; /* the HSP; the clustering code sets this to NULL to mark it as removed */
   Int4 hitlist_index, hsp_index; /* index of the hitlist in the results array, and of the HSP in that hitlist's hsp_array */
} BlastResultHspWithId, PNTR BlastResultHspWithIdPtr;
4494 
BLASTResultHspScoreCmp(VoidPtr v1,VoidPtr v2)4495 static int LIBCALLBACK BLASTResultHspScoreCmp(VoidPtr v1, VoidPtr v2)
4496 {
4497    BLASTResultHspPtr h1, h2;
4498 
4499    h1 = (*(BlastResultHspWithIdPtr PNTR) v1)->hsp;
4500    h2 = (*(BlastResultHspWithIdPtr PNTR) v2)->hsp;
4501 
4502    if (h1->score < h2->score)
4503       return 1;
4504    else if (h1->score > h2->score)
4505       return -1;
4506    else return 0;
4507 }
4508 
ResultHspWithIdIndexCmp(VoidPtr v1,VoidPtr v2)4509 static int LIBCALLBACK ResultHspWithIdIndexCmp(VoidPtr v1, VoidPtr v2)
4510 {
4511    BlastResultHspWithIdPtr h1, h2;
4512 
4513    h1 = *(BlastResultHspWithIdPtr PNTR) v1;
4514    h2 = *(BlastResultHspWithIdPtr PNTR) v2;
4515 
4516    if (h1->hitlist_index < h2->hitlist_index)
4517       return -1;
4518    else if (h1->hitlist_index > h2->hitlist_index)
4519       return 1;
4520    else if (h1->hsp_index < h2->hsp_index)
4521       return -1;
4522    else if (h1->hsp_index > h2->hsp_index)
4523       return 1;
4524    else /* Should never happen */
4525       return 0;
4526 }
4527 #endif
4528 
/* Thresholds referenced only by the BLAST_CLUSTER_HITS code in
   BioseqBlastEngineCore. */
/* Two subjects are cluster candidates only if their length difference,
   divided by the shorter length, does not exceed this value. */
#define CLUSTER_LENGTH_THRESH 0.1
/* Two HSPs are cluster candidates only if their overlap on the query,
   divided by the longer of the two query lengths, is at least this value. */
#define CLUSTER_OVERLAP_THRESH 0.9
/* A candidate hit is removed when the bit score of the subject-against-subject
   search exceeds this value times the longer subject length. */
#define CLUSTER_SCORE_THRESH 1.6
4532 
4533 static Nlm_FloatHi
s_ComputeAverageLength(const BlastSearchBlk * search)4534 s_ComputeAverageLength(const BlastSearchBlk* search)
4535 {
4536     Nlm_FloatHi retval = 0.0;
4537 
4538 	if (StringCmp(search->prog_name, "blastn") != 0) {
4539 		retval = BLAST_AA_AVGLEN;
4540 	} else {
4541 		retval = BLAST_NT_AVGLEN;
4542 	}
4543 
4544     if (search->rdfp) {
4545         Int4 total_number = 0;
4546         Int8 total_length = 0;
4547 
4548         readdb_get_totals(search->rdfp, &total_length, &total_number);
4549         if (total_number > 0)
4550             retval = ((Nlm_FloatHi) total_length)/total_number;
4551     } else if (search->dblen > 0 && search->dbseq_num == 1) {
4552         retval = search->dblen;
4553     }
4554 
4555     return retval;
4556 }
4557 
/*
	Runs the search described by 'search' and returns the resulting
	alignments.

	search:     fully set-up search block.  NULL, or a block whose
	            query_invalid flag is set, makes the function return NULL
	            at once.
	options:    options block; read for wordsize, dropoff_1st_pass,
	            dropoff_2nd_pass, window_size and no_traceback.
	            NOTE(review): options is dereferenced without a NULL check.
	pos_matrix: if not NULL, a position-specific matrix.  The search block
	            is then switched over to position-based searching before
	            the run: the PSI Karlin blocks are selected, the results
	            structure and the allocated word finders are rebuilt, the
	            words are found again and the cutoffs are recalculated.

	Returns the SeqAlign filled in by BLASTPostSearchLogic, or NULL if
	options->no_traceback is set.
*/
SeqAlignPtr LIBCALL
BioseqBlastEngineCore(BlastSearchBlkPtr search, BLAST_OptionsBlkPtr options,
                        Int4Ptr *pos_matrix)
{
	Int4 hitlist_max;
	SeqAlignPtr head, seqalign;
#ifdef BLAST_CLUSTER_HITS
        BLASTResultHspPtr hsp, hsp1;
        BlastResultHspWithIdPtr PNTR hspp;
        BLASTResultsStructPtr result_struct;
        BLASTResultHitlistPtr   result_hitlist;
        Int4 hspcnt, index, index1, index2;
        Int4 q_overlap;
        BioseqPtr bsp1, bsp2, PNTR bspp;
        BlastSearchBlkPtr search1;
        BLAST_KarlinBlkPtr kbp;
        FloatHi bit_score;
#endif

	/* seqalign is only initialized here; it is not used further below. */
	head = seqalign = NULL;

	if (search == NULL || search->query_invalid)
		return NULL;

	/* If pos_matrix is not NULL, then psi-blast iterations are being
	performed.  The first psi-blast iteration should be with normal
	blast. */
	if (pos_matrix)
	{
		search->sbp->posMatrix = pos_matrix;
		search->positionBased = TRUE;
		/* Use the Karlin blocks kept for position-based searches. */
                search->sbp->kbp = search->sbp->kbp_psi;
                search->sbp->kbp_gap = search->sbp->kbp_gap_psi;
		/* Replace the results structure by an empty one of the same
		   capacity. */
		hitlist_max = search->result_struct->hitlist_max;
                search->result_struct = BLASTResultsStructDelete(search->result_struct);
		search->result_struct = BLASTResultsStructNew(hitlist_max, search->pbp->max_pieces, search->pbp->hsp_range_max);
		/* Rebuild only those word finders this search block allocated. */
                if (search->allocated & BLAST_SEARCH_ALLOC_WFP_FIRST)
		{
                       search->wfp_first = BLAST_WordFinderDestruct(search->wfp_first);
		       search->wfp_first = BLAST_WordFinderNew(search->sbp->alphabet_size,options->wordsize,1, FALSE);
		}

		if (search->allocated & BLAST_SEARCH_ALLOC_WFP_SECOND)
		{
		       search->wfp_second = BLAST_WordFinderDestruct(search->wfp_second);
		       search->wfp_second = BLAST_WordFinderNew(search->sbp->alphabet_size,options->wordsize,1, FALSE);
		}


        /* Only find words once if thresholds are the same. */
        search->wfp = search->wfp_first;
        if (search->whole_query == TRUE) {
            BlastNewFindWords(search, 0, search->context[search->first_context].query->length, search->pbp->threshold_second, (Uint1) 0);
        } else {
            BlastNewFindWords(search, search->required_start, search->required_end, search->pbp->threshold_second, (Uint1) 0);
        }
        lookup_position_aux_destruct(search->wfp->lookup);
        /* NOTE(review): this overwrites a wfp_second that may just have been
           allocated above - confirm that the word finder is not leaked. */
        search->wfp_second = search->wfp_first;

        /* Unless search->pbp->cutoff_s[2]_set is set, we wish to calculate
           cutoff_s[2] from cutoff_e[2], rather than the other way around.
           Setting cutoff_s[2] to zero, as was the case in the first call to
           blast_set_parameters, accomplishes this.
        */
        if (!search->pbp->cutoff_s_set) {
            search->pbp->cutoff_s = 0;
        }
        if (!search->pbp->cutoff_s2_set) {
            search->pbp->cutoff_s2 = 0;
        }
        /* recalculate the cutoff scores with the newly calculated
           Karlin-Altschul parameters. */
        blast_set_parameters(search,
                             options->dropoff_1st_pass,
                             options->dropoff_2nd_pass,
                             s_ComputeAverageLength(search),
                             search->searchsp_eff,
                             options->window_size);
	}

	/* Starting awake thread if multithreaded. */
	/* The thread is only started for search spaces above
	   AWAKE_THR_MIN_SIZE; it is stopped unconditionally on both exits
	   below. */
	if (search->searchsp_eff > AWAKE_THR_MIN_SIZE)
		BlastStartAwakeThread(search->thr_info);

        /* THE BLAST SEARCH IS HERE */
	do_the_blast_run(search);

#ifdef BLAST_CLUSTER_HITS
        if (!search->pbp->mb_params) {
	/* Cluster hits by region within the query */
	/* Assume that hits are already sorted in each hitlist by score */
           ValNodePtr mask;
           result_struct = search->result_struct;
           hspcnt = 0;
           /* Collect all HSPs in one array */

           /* bspp[i] is the Bioseq of the subject of hitlist i, fetched
              from the database. */
           bspp = (BioseqPtr PNTR) Malloc(result_struct->hitlist_count*
                                          sizeof(BioseqPtr));
           for (index=0; index<result_struct->hitlist_count; index++) {
              hspcnt += result_struct->results[index]->hspcnt;
              bspp[index] = readdb_get_bioseq(search->rdfp,
                                              result_struct->results[index]->subject_id);
           }

           /* One wrapper per HSP, remembering where the HSP came from. */
           hspp = (BlastResultHspWithIdPtr PNTR)
              Malloc(hspcnt*sizeof(BlastResultHspWithIdPtr));
           index2 = 0;
           for (index=0; index<result_struct->hitlist_count; index++) {
              result_hitlist = result_struct->results[index];
              for (index1=0; index1<result_hitlist->hspcnt; index1++) {
                 hspp[index2] = (BlastResultHspWithIdPtr)
                    Malloc(sizeof(BlastResultHspWithId));
                 hspp[index2]->hitlist_index = index;
                 hspp[index2]->hsp_index = index1;
                 hspp[index2++]->hsp = &(result_hitlist->hsp_array[index1]);
              }
           }
           /* Sort by score */
           /* NOTE(review): the element size names BLASTResultHspPtr although
              the elements are BlastResultHspWithIdPtr; both are pointers. */
           HeapSort((VoidPtr)hspp, hspcnt, sizeof(BLASTResultHspPtr),
                    BLASTResultHspScoreCmp);
           /* Each surviving HSP in turn becomes the head of a cluster;
              lower-scoring HSPs that belong to it are marked by setting
              their hsp pointer to NULL. */
           index = 0;
           while (index<hspcnt) {
              hsp = hspp[index]->hsp;
              index2 = 0;

              result_hitlist =
                 search->result_struct->results[hspp[index]->hitlist_index];
              bsp1 = bspp[hspp[index]->hitlist_index];

              /* The subject of the cluster head is the query of the
                 subject-against-subject searches below. */
              search1 =
                 BlastQuerySequenceSetUp(bsp1, search->prog_name,
                                         options);
              for (index1=index+1; index1<hspcnt; index1++) {
                 /* Check if the next hit passes a simple test to be a
                    candidate to belong to this cluster */
                 if (hspp[index1]->hsp==NULL)
                    continue;
                 hsp1 = hspp[index1]->hsp;
                 result_hitlist =
                    search->result_struct->results[hspp[index1]->hitlist_index];
                 bsp2 = bspp[hspp[index1]->hitlist_index];
                 /* Subject lengths must be similar ... */
                 if (((FloatHi)ABS(bsp1->length - bsp2->length)) /
                     MIN(bsp1->length, bsp2->length) > CLUSTER_LENGTH_THRESH)
                    continue;
                 /* ... and the two HSPs must cover nearly the same part
                    of the query. */
                 q_overlap =
                    MIN(hsp->query_offset+hsp->query_length,
                        hsp1->query_offset+hsp1->query_length) -
                    MAX(hsp->query_offset, hsp1->query_offset);
                 if (((FloatHi)q_overlap) /
                     MAX(hsp->query_length, hsp1->query_length) <
                     CLUSTER_OVERLAP_THRESH)
                    continue;

                 /* We have a candidate for attaching to the cluster */
                 if (hspp[index]->hitlist_index == hspp[index1]->hitlist_index) {
                    /* Almost identical hit from same subject in the same
                       area of the query - remove! */
                    result_hitlist =
                       search->result_struct->results[hspp[index1]->hitlist_index];
                    hspp[index1]->hsp = NULL;
                    /* NOTE(review): there is no 'continue' here, so the
                       two sequences search below still runs for a hit
                       that has just been removed. */
                 }

                 /* Do the two sequences search to determine whether this
                    candidate in fact belongs to this cluster */
                 search1 = BlastSequencesOnTheFlyEx(search1, bsp2);

                 if (search1 && search1->result_struct->results[0]) {
                    if (search1->pbp->gapped_calculation)
                       kbp = search1->sbp->kbp_gap[search1->first_context];
                    else
                       kbp = search1->sbp->kbp[search1->first_context];
                    /* Raw score of the best hit converted to a bit score. */
                    bit_score = ((search1->result_struct->results[0]->high_score *
                                  kbp->Lambda) - kbp->logK)/NCBIMATH_LN2;
                    if (bit_score > CLUSTER_SCORE_THRESH *
                        MAX(bsp1->length, bsp2->length)) {
                       /* remove the respective hit */
                       hspp[index1]->hsp = NULL;
                    }
                 }
              }
              /* Free the masks of the temporary search block, then the
                 block itself.
                 NOTE(review): search1 is dereferenced here although the
                 loop above tests it for NULL after
                 BlastSequencesOnTheFlyEx. */
              mask = search1->mask;
              while (mask) {
                 SeqLocSetFree(mask->data.ptrvalue);
                 mask = mask->next;
              }
              ValNodeFree(search1->mask);
              search1 = BlastSearchBlkDestruct(search1);
              /* Advance to the next HSP that has not been removed. */
              for (++index; index<hspcnt && hspp[index]->hsp==NULL; index++);
           }

           for (index=0; index<result_struct->hitlist_count; index++)
              BioseqFree(bspp[index]);
           MemFree(bspp);
           /* Remove all NULLs from hspp array */
           /* Survivors are moved to the front; wrappers of removed HSPs
              are freed. */
           for (index=0, index1=0; index<hspcnt; index++) {
              if (hspp[index]->hsp != NULL) {
                 if (index != index1)
                    hspp[index1] = hspp[index];
                 index1++;
              } else
                 hspp[index] = MemFree(hspp[index]);
           }
           hspcnt = index1;
           /* Sort according to original hitlist and hsp indices */
           HeapSort((VoidPtr)hspp, hspcnt, sizeof(BLASTResultHspPtr),
                    ResultHspWithIdIndexCmp);

           /* Rearrange the hsp_arrays for all hitlists */
           /* The surviving HSPs of every hitlist are copied to the start
              of its hsp_array and its hspcnt is lowered accordingly. */
           index = 0;
           for (index2=0; index2<result_struct->hitlist_count; index2++) {
              index1 = 0;
              while (index<hspcnt && hspp[index]->hitlist_index == index2) {
                 result_struct->results[index2]->hsp_array[index1] =
                    *(hspp[index]->hsp);
                 index++;
                 index1++;
              }
              result_struct->results[index2]->hspcnt = index1;
           }

           for (index=0; index<hspcnt; index++)
              hspp[index] = MemFree(hspp[index]);
           hspp = MemFree(hspp);
	}
#endif  /* Clustering hits */

    /* Without traceback no alignments are built; only the awake thread
       has to be stopped. */
    if (options->no_traceback) {
       BlastStopAwakeThread(search->thr_info);
       return NULL;
    }

    /* Fills in 'head'. */
    BLASTPostSearchLogic(search, options, &head, TRUE);

	/* Stop the awake thread. */
	BlastStopAwakeThread(search->thr_info);

	return head;
}
4796 
4797 /*
4798 	Deallocates all memory involved with the BlastHitRangePtr.
4799 */
4800 
4801 BlastHitRangePtr LIBCALL
BlastHitRangeDestruct(BlastHitRangePtr old)4802 BlastHitRangeDestruct(BlastHitRangePtr old)
4803 
4804 {
4805 	if (old == NULL)
4806 		return NULL;
4807 
4808 	MemFree(old->range_list);
4809 	MemFree(old->range_list_pointer);
4810 
4811 	return MemFree(old);
4812 }
4813 
4814 /*
4815 	Allocates a a BlastHitRangePtr, with two 'total'
4816 	BlastDoubleInt4Ptr's.
4817 */
4818 
4819 BlastHitRangePtr LIBCALL
BlastHitRangeNew(Int4 total)4820 BlastHitRangeNew(Int4 total)
4821 
4822 {
4823 	BlastHitRangePtr bhrp;
4824 	Int4 index;
4825 
4826 	bhrp = MemNew(sizeof(BlastHitRange));
4827 
4828 	bhrp->range_list = (BlastDoubleInt4Ptr) MemNew(total*sizeof(BlastDoubleInt4));
4829 	bhrp->range_list_pointer = (BlastDoubleInt4Ptr PNTR) MemNew(total*sizeof(BlastDoubleInt4Ptr));
4830 	for (index=0; index<total; index++)
4831 	{
4832 		bhrp->range_list_pointer[index] = &(bhrp->range_list[index]);
4833 	}
4834 
4835 	bhrp->current = 0;
4836 	bhrp->total = total;
4837 
4838 	return bhrp;
4839 }
4840 
4841 static int LIBCALLBACK
bhrp_compare(VoidPtr v1,VoidPtr v2)4842 bhrp_compare(VoidPtr v1, VoidPtr v2)
4843 
4844 {
4845 	BlastDoubleInt4Ptr h1, h2;
4846 	BlastDoubleInt4Ptr *hp1, *hp2;
4847 
4848 	hp1 = (BlastDoubleInt4Ptr PNTR) v1;
4849 	hp2 = (BlastDoubleInt4Ptr PNTR) v2;
4850 	h1 = *hp1;
4851 	h2 = *hp2;
4852 
4853 	if (h1->gi < h2->gi)
4854 		return -1;
4855 	if (h1->gi > h2->gi)
4856 		return 1;
4857 
4858 	return 0;
4859 }
4860 
4861 BlastHitRangePtr LIBCALL
BioseqHitRangeEngineCore(BlastSearchBlkPtr search,BLAST_OptionsBlkPtr options)4862 BioseqHitRangeEngineCore(BlastSearchBlkPtr search, BLAST_OptionsBlkPtr options)
4863 
4864 {
4865 	BlastHitRangePtr bhrp=NULL;
4866 	BLASTResultsStructPtr result_struct;
4867 	Int4 hitlist_count, index, total_hsps;
4868 	Int4 sequence_length, length;
4869 	Uint1Ptr sequence;
4870 
4871 	if (search == NULL || search->query_invalid)
4872 		return NULL;
4873 
4874 	/* Starting awake thread if multithreaded. */
4875 	if (search->searchsp_eff > AWAKE_THR_MIN_SIZE)
4876 		BlastStartAwakeThread(search->thr_info);
4877 
4878 	do_the_blast_run(search);
4879 
4880 	if (search->prog_number==blast_type_blastn) {
4881 	   /* Unconcatenate the strands by adjusting the query offsets in
4882 	      all hsps */
4883 	   search->context[search->first_context].query->length =
4884 	      search->query_context_offsets[search->first_context+1] - 1;
4885 	   /*BlastAdjustHitOffsets(search);*/
4886 	}
4887 
4888 	if (StringCmp(search->prog_name, "blastn") == 0 &&
4889 		search->pbp->gapped_calculation)
4890         {
4891 		search->pbp->gap_open = options->gap_open;
4892 		search->pbp->gap_extend = options->gap_extend;
4893 /*
4894 		search->pbp->gap_x_dropoff = (BLAST_Score) (options->gap_x_dropoff*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
4895 		search->pbp->gap_x_dropoff_final = (BLAST_Score) (options->gap_x_dropoff_final*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
4896 */
4897 
4898 
4899 		result_struct = search->result_struct;
4900                 hitlist_count = result_struct->hitlist_count;
4901 		total_hsps = 0;
4902 		for (index=0; index<hitlist_count; index++)
4903 		{
4904 			total_hsps += result_struct->results[index]->hspcnt;
4905 		}
4906 		bhrp = BlastHitRangeNew(total_hsps);
4907 		bhrp->query_id = search->query_id;
4908 
4909 		result_struct = search->result_struct;
4910        		hitlist_count = result_struct->hitlist_count;
4911 
4912 		sequence=NULL;
4913 		sequence_length=0;
4914 
4915 		for (index=0; index<hitlist_count; index++)
4916 		{
4917 			length = readdb_get_sequence_ex(search->rdfp, result_struct->results[index]->subject_id, &sequence, &sequence_length, TRUE);
4918 			SumBlastGetGappedAlignmentEx(search, index, FALSE, FALSE, sequence+1, length, FALSE, NULL, bhrp, 0);
4919 		}
4920 		sequence = MemFree(sequence);
4921 	}
4922 	else
4923 	{
4924 		return NULL;
4925 	}
4926 
4927 	HeapSort(bhrp->range_list_pointer, bhrp->current, sizeof(BlastHitRangePtr PNTR), bhrp_compare);
4928 
4929 	/* Stop the awake thread. */
4930 	BlastStopAwakeThread(search->thr_info);
4931 
4932 	return bhrp;
4933 }
4934 
4935 SeqAlignPtr LIBCALL
BioseqBlastEngineEx(BioseqPtr bsp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total)4936 BioseqBlastEngineEx(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total)
4937 
4938 {
4939 	SeqLocPtr slp;
4940 	SeqAlignPtr seqalign;
4941 
4942 	slp = NULL;
4943 	ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
4944 	seqalign = BioseqBlastEngineByLocEx(slp, progname, database, options, other_returns, error_returns, callback, seqid_list, gi_list, gi_list_total);
4945 	SeqLocFree(slp);
4946 
4947 	return seqalign;
4948 }
4949 
4950 SeqAlignPtr LIBCALL
BioseqBlastEngine(BioseqPtr bsp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)))4951 BioseqBlastEngine(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)))
4952 {
4953    /* --KM added NULL mult_queries param to call */
4954    return BioseqBlastEngineWithCallbackMult(bsp, progname, database, options, other_returns, error_returns, callback, NULL, NULL);
4955 }
4956 
4957 SeqAlignPtr LIBCALL
BioseqBlastEngineWithCallback(BioseqPtr bsp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),int (LIBCALLBACK * handle_results)PROTO ((VoidPtr srch)))4958 BioseqBlastEngineWithCallback(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), int (LIBCALLBACK *handle_results)PROTO((VoidPtr srch)))
4959 {
4960    return BioseqBlastEngineWithCallbackMult(bsp, progname, database, options, other_returns, error_returns, callback, NULL, NULL);
4961 }
4962 
4963 /* --KM added mult_queries parameter */
4964 SeqAlignPtr LIBCALL
BioseqBlastEngineWithCallbackMult(BioseqPtr bsp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),int (LIBCALLBACK * handle_results)PROTO ((VoidPtr srch)),QueriesPtr mult_queries)4965 BioseqBlastEngineWithCallbackMult(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), int (LIBCALLBACK *handle_results)PROTO((VoidPtr srch)), QueriesPtr mult_queries)
4966 {
4967 	SeqLocPtr slp;
4968 	SeqAlignPtr seqalign;
4969 
4970 	slp = NULL;
4971 	ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
4972 	seqalign = BioseqBlastEngineByLocWithCallbackMult(slp, progname, database, options, other_returns, error_returns, callback, NULL, NULL, 0, handle_results, mult_queries);/* --KM pass mult_queries */
4973 	SeqLocFree(slp);
4974 
4975 	return seqalign;
4976 }
4977 
4978 
4979 
4980 SeqAlignPtr LIBCALL
BioseqBlastEngineByLoc(SeqLocPtr slp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)))4981 BioseqBlastEngineByLoc(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)))
4982 
4983 {
4984 	return BioseqBlastEngineByLocEx(slp, progname, database, options, other_returns, error_returns, callback, NULL, NULL, 0);
4985 
4986 }
4987 
4988 SeqAlignPtr LIBCALL
BioseqBlastEngineByLocEx(SeqLocPtr slp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total)4989 BioseqBlastEngineByLocEx(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total)
4990 
4991 {
4992    return BioseqBlastEngineByLocWithCallback(slp, progname, database, options, other_returns, error_returns, callback, seqid_list, gi_list, gi_list_total, NULL); /* --KM pass NULL mult_queries */
4993 }
4994 
4995 SeqAlignPtr LIBCALL
BioseqBlastEngineByLocWithCallback(SeqLocPtr slp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,int (LIBCALLBACK * handle_results)PROTO ((VoidPtr srch)))4996 BioseqBlastEngineByLocWithCallback(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total, int (LIBCALLBACK *handle_results)PROTO((VoidPtr srch)))
4997 {
4998 	return BioseqBlastEngineByLocWithCallbackMult(slp, progname, database, options, other_returns, error_returns, callback, seqid_list, gi_list, gi_list_total, handle_results, NULL);
4999 }
5000 
5001 /* --KM added mult_queries param */
5002 SeqAlignPtr LIBCALL
BioseqBlastEngineByLocWithCallbackMult(SeqLocPtr slp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,int (LIBCALLBACK * handle_results)PROTO ((VoidPtr srch)),QueriesPtr mult_queries)5003 BioseqBlastEngineByLocWithCallbackMult(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total, int (LIBCALLBACK *handle_results)PROTO((VoidPtr srch)), QueriesPtr mult_queries)
5004 {
5005 	Boolean options_allocated=FALSE;
5006 	BlastSearchBlkPtr search;
5007 	Int2 status;
5008 	SeqAlignPtr head;
5009 	SeqLocPtr whole_slp=NULL;
5010 		/* Futamura */
5011         posSearchItems *posSearch;
5012         compactSearchItems *compactSearch = NULL;
5013         Boolean  checkReturn = FALSE;
5014 
5015 	head = NULL;
5016 
5017 	if (error_returns)
5018 	{
5019 		*error_returns = NULL;
5020 	}
5021 
5022 	if (other_returns)
5023 	{
5024 		*other_returns = NULL;
5025 	}
5026 
5027 	if (progname == NULL)
5028 		return NULL;
5029 
5030 	/* If no options, use default. */
5031 	if (options == NULL)
5032 	{
5033 		options = BLASTOptionNew(progname, FALSE);
5034 		options_allocated = TRUE;
5035 	}
5036 
5037 	status = BLASTOptionValidateEx(options, progname, error_returns);
5038 	if (status != 0)
5039 	{	/* error messages in other_returns? */
5040 		return NULL;
5041 	}
5042 
5043 	if (slp == NULL || database == NULL)
5044 		return NULL;
5045 
5046     if(options->is_rps_blast) {
5047         RPSInfoPtr rpsinfo;
5048         BioseqPtr bsp, fake_bsp;
5049         Boolean query_is_na;
5050 
5051         if((bsp = BioseqLockById(SeqLocId(slp))) == NULL)
5052             return NULL;
5053 
5054         /* RPS Blast discard program name and use specific RPS Blast
5055            logic for this */
5056 
5057         if(bsp->mol == Seq_mol_aa) {
5058             query_is_na = FALSE;
5059             progname = "blastp";
5060         } else {
5061             query_is_na = TRUE;
5062             progname = "tblastn";
5063         }
5064         if((rpsinfo = RPSInitEx(database, !query_is_na, options)) == NULL) {
5065 
5066             ErrPostEx(SEV_ERROR, 0, 0, "Failure to initialize RPS: %s %s",
5067                       progname, database);
5068             return NULL;
5069         }
5070         /* Update size of the database in accordance with RPS Database size */
5071         RPSUpdateDbSize(options, rpsinfo, bsp->length);
5072 
5073         if(!query_is_na)
5074             fake_bsp = bsp;
5075         else {
5076             options->db_genetic_code = options->genetic_code;
5077             fake_bsp = createFakeProtein();
5078         }
5079         search = BLASTSetUpSearch (fake_bsp, progname, fake_bsp->length, 0,
5080                                    NULL, options, NULL);
5081 
5082         if (search == NULL)
5083             return NULL;
5084 
5085         search->thr_info->tick_callback = NULL;
5086         search->thr_info->star_callback = NULL;
5087 
5088         head = RPSBlastSearch(search, bsp, rpsinfo);
5089 
5090         if(query_is_na)
5091             BioseqFree(fake_bsp);
5092         BioseqUnlock(bsp);
5093         RPSClose(rpsinfo);
5094     } else {
5095 
5096         search = BLASTSetUpSearchByLocWithReadDbEx(slp, progname, SeqLocLen(slp), database, options, NULL, seqid_list, gi_list, gi_list_total, mult_queries);
5097 	/* --KM pass mult_queries */
5098 
5099         if (search == NULL) {
5100            /* We need to veryfy if database name is wrong and to set error
5101                returns correctly */
5102             Boolean is_prot;
5103             BlastErrorMsgPtr error_msg;
5104             CharPtr chptr;
5105             ReadDBFILEPtr rdfp=NULL;
5106 
5107             if(!StringICmp(progname, "blastp") ||
5108                !StringICmp(progname, "blastx")) {
5109                 is_prot = TRUE;
5110             } else {
5111                 is_prot = FALSE;
5112             }
5113 
5114             rdfp = readdb_new(database, is_prot);
5115             if(rdfp == NULL) {
5116                 error_msg = MemNew(sizeof(BlastErrorMsg));
5117                 chptr = MemNew(StringLen(database) + 256);
5118                 sprintf(chptr, "Database %s was not found or does not exist",
5119                         database);
5120                 error_msg->msg = chptr;
5121                 error_msg->level = 3; /* FATAL */
5122                 ValNodeAddPointer(error_returns, 0, error_msg);
5123             }
5124 
5125             readdb_destruct(rdfp);
5126             return NULL;
5127         }
5128 
5129         search->thr_info->tick_callback = callback;
5130         search->thr_info->star_callback = callback;
5131         search->handle_results = handle_results;
5132         search->output = options->output;
5133 
5134         /* Futamura psitblastn */
5135         if (options->recoverCheckpoint)
5136           search->positionBased = TRUE;
5137         else
5138           search->positionBased = FALSE;
5139 
5140         if (options->recoverCheckpoint) {
5141           posSearch = (posSearchItems *) MemNew(1 * sizeof(posSearchItems));
5142           compactSearch = compactSearchNew(compactSearch);
5143           copySearchItems(compactSearch, search, options->matrix);
5144           posInitializeInformation(posSearch,search);
5145           /*AAS*/
5146 
5147           checkReturn = posReadCheckpoint(posSearch, compactSearch,
5148                                           options->CheckpointFileName,
5149                                           NO_SCOREMAT_IO,
5150                                           &(search->error_return));
5151           /* Reading the checkpoint changes the statistical parameters
5152              kbp_psi and kbp_gap_psi.  Recalculate the cutoffs by calling
5153              blast_set_parameters. */
5154 
5155           /* Unless search->pbp->cutoff_s[2]_set is set, we wish to calculate
5156              cutoff_s[2] from cutoff_e[2], rather than the other way around.
5157              Setting cutoff_s[2] to zero, as was the case in the first call to
5158              blast_set_parameters, accomplishes this.
5159           */
5160           if (!search->pbp->cutoff_s_set) {
5161               search->pbp->cutoff_s = 0;
5162           }
5163           if (!search->pbp->cutoff_s2_set) {
5164               search->pbp->cutoff_s2 = 0;
5165           }
5166           search->sbp->kbp = search->sbp->kbp_psi;
5167           search->sbp->kbp_gap = search->sbp->kbp_gap_psi;
5168           blast_set_parameters(search,
5169                                options->dropoff_1st_pass,
5170                                options->dropoff_2nd_pass,
5171                                s_ComputeAverageLength(search),
5172                                search->searchsp_eff,
5173                                options->window_size);
5174 
5175           search->sbp->posMatrix = posSearch->posMatrix;
5176           if (NULL == search->sbp->posFreqs)
5177             search->sbp->posFreqs =  allocatePosFreqs(compactSearch->qlength,
5178                                                       compactSearch->alphabetSize);
5179           copyPosFreqs(posSearch->posFreqs,search->sbp->posFreqs,
5180                        compactSearch->qlength, compactSearch->alphabetSize);
5181 
5182           if (!checkReturn) {
5183 		BlastConstructErrorMessage("BioseqBlastEngineByLocEx",
5184 			"Error recovering from checkpoint", 3, error_returns);
5185 		return NULL;
5186           }
5187         }
5188 
5189         /* ----- Here is real BLAST search done ------- */
5190         if (search->positionBased)
5191           head = BioseqBlastEngineCore(search, options, search->sbp->posMatrix);
5192         else if (options->is_megablast_search) {
5193            SeqAlignPtr PNTR seqalignp;
5194            seqalignp = BioseqMegaBlastEngineCore(search, options);
5195            head = *seqalignp;
5196         } else
5197 	  head = BioseqBlastEngineCore(search, options, NULL);
5198 	/* end Futamura */
5199 
5200     }
5201 
5202     if (search->error_return) {
5203         ValNodeLink(error_returns, search->error_return);
5204         search->error_return = NULL;
5205     }
5206 
5207     if (other_returns) { /* format dbinfo etc.  */
5208         *other_returns = BlastOtherReturnsPrepare(search);
5209     }
5210 
5211     if (options_allocated) {
5212         options = BLASTOptionDelete(options);
5213     }
5214 
5215     search = BlastSearchBlkDestruct(search);
5216 
5217     if(!options->is_rps_blast) {
5218 
5219         /* Adjsut the offset if the query does not cover the entire sequence. */
5220         if (slp->choice != SEQLOC_WHOLE) {
5221             ValNodeAddPointer(&whole_slp, SEQLOC_WHOLE, SeqIdFindBest(SeqLocId(slp), SEQID_GI));
5222             if (SeqLocAinB(whole_slp, slp) != 0) {
5223                 AdjustOffSetsInSeqAlign(head, slp, NULL);
5224             }
5225             ValNodeFree(whole_slp);
5226         }
5227     }
5228 
5229     return head;
5230 }
5231 
5232 SeqLocPtr LIBCALL
BioseqHitRangeEngine(BioseqPtr bsp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total)5233 BioseqHitRangeEngine(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total)
5234 
5235 {
5236 	SeqLocPtr slp;
5237 
5238 	slp = NULL;
5239 	ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
5240 	return BioseqHitRangeEngineByLoc(slp, progname, database, options, other_returns, error_returns, callback, seqid_list, gi_list, gi_list_total);
5241 }
5242 
5243 SeqLocPtr
HitRangeToSeqLoc(BlastHitRangePtr bhrp,Int4 link_value,Boolean combine)5244 HitRangeToSeqLoc(BlastHitRangePtr bhrp, Int4 link_value, Boolean combine)
5245 
5246 {
5247 	Boolean make_seqloc, start=TRUE;
5248 	Int4 index, total, start_pos=0, stop_pos, largest_stop_pos=0;
5249 	SeqIntPtr sint;
5250 	SeqLocPtr retval=NULL;
5251 
5252 	if (bhrp == NULL)
5253 		return NULL;
5254 
5255 	total = bhrp->current;
5256 	index=0;
5257 	while (index < total)
5258 	{
5259 	   if (combine)
5260 	   {
5261 		if (start == TRUE)
5262 		{
5263 			start_pos = bhrp->range_list_pointer[index]->gi + bhrp->base_offset;
5264 			start = FALSE;
5265 			largest_stop_pos = 0;
5266 		}
5267 		else
5268 		{
5269 			/* Keep track of largest stop position. */
5270 			largest_stop_pos = MAX(largest_stop_pos, bhrp->range_list_pointer[index]->ordinal_id + bhrp->base_offset);
5271 			make_seqloc = FALSE;
5272 			if (index == total-1)	/* Last one. */
5273 			{
5274 				stop_pos = bhrp->range_list_pointer[index]->ordinal_id + bhrp->base_offset;
5275 				start = TRUE;
5276 				make_seqloc = TRUE;
5277 			}
5278 			else if (largest_stop_pos+link_value < bhrp->range_list_pointer[index+1]->gi + bhrp->base_offset)
5279 			{ /* Check overlap with next one. */
5280 				stop_pos = bhrp->range_list_pointer[index]->ordinal_id + bhrp->base_offset;
5281 				start = TRUE;
5282 				make_seqloc = TRUE;
5283 			}
5284 
5285 			if (make_seqloc)
5286 			{
5287 				sint = SeqIntNew();
5288 				sint->from = start_pos;
5289 				sint->to = MAX(largest_stop_pos, stop_pos);
5290 				sint->strand = Seq_strand_plus;
5291 				sint->id = SeqIdDup(SeqIdFindBest(bhrp->query_id, SEQID_GI));
5292 				ValNodeAddPointer(&retval, SEQLOC_INT, sint);
5293 			}
5294 			index++;
5295 		}
5296 	   }
5297 	   else
5298 	   {
5299 		sint = SeqIntNew();
5300 		sint->from = bhrp->range_list_pointer[index]->gi + bhrp->base_offset;
5301 		sint->to = bhrp->range_list_pointer[index]->ordinal_id + bhrp->base_offset;
5302 		sint->strand = Seq_strand_plus;
5303 		sint->id = SeqIdDup(SeqIdFindBest(bhrp->query_id, SEQID_GI));
5304 		ValNodeAddPointer(&retval, SEQLOC_INT, sint);
5305 		index++;
5306 	   }
5307 	}
5308 
5309 	return retval;
5310 }
5311 
5312 #define HITRANGE_LINKVALUE 5
5313 
5314 SeqLocPtr LIBCALL
BioseqHitRangeEngineByLoc(SeqLocPtr slp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total)5315 BioseqHitRangeEngineByLoc(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total)
5316 
5317 {
5318 	Boolean options_allocated=FALSE;
5319 	BlastHitRangePtr bhrp;
5320 	BlastSearchBlkPtr search;
5321 	Int2 status;
5322 	SeqLocPtr seqloc, whole_slp=NULL;
5323 
5324 	if (error_returns)
5325 	{
5326 		*error_returns = NULL;
5327 	}
5328 
5329 	if (other_returns)
5330 	{
5331 		*other_returns = NULL;
5332 	}
5333 
5334 	if (progname == NULL)
5335 		return NULL;
5336 
5337 	/* If no options, use default. */
5338 	if (options == NULL)
5339 	{
5340 		options = BLASTOptionNew(progname, FALSE);
5341 		options_allocated = TRUE;
5342 	}
5343 
5344 	status = BLASTOptionValidateEx(options, progname, error_returns);
5345 	if (status != 0)
5346 	{	/* error messages in other_returns? */
5347 		return NULL;
5348 	}
5349 
5350 	if (slp == NULL || database == NULL)
5351 		return NULL;
5352 
5353 	search = BLASTSetUpSearchByLocWithReadDbEx(slp, progname, SeqLocLen(slp), database, options, NULL, seqid_list, gi_list, gi_list_total, NULL); /* --KM pass NULL mult_queries */
5354 
5355 	if (search == NULL)
5356 	{
5357 		return NULL;
5358 	}
5359 
5360 	search->thr_info->tick_callback = callback;
5361 	search->thr_info->star_callback = callback;
5362 
5363 	bhrp = BioseqHitRangeEngineCore(search, options);
5364 	if (bhrp == NULL) /* can happen for invalid queries. */
5365 		return NULL;
5366 
5367         if (slp->choice != SEQLOC_WHOLE) {
5368         	ValNodeAddPointer(&whole_slp, SEQLOC_WHOLE, SeqIdFindBest(SeqLocId(slp), SEQID_GI));
5369 		bhrp->base_offset = GetOffsetInLoc(slp, whole_slp, SEQLOC_START);
5370 		ValNodeFree(whole_slp);
5371 	}
5372 
5373 	seqloc = HitRangeToSeqLoc(bhrp, HITRANGE_LINKVALUE, TRUE);
5374 	bhrp = BlastHitRangeDestruct(bhrp);
5375 	if (search->error_return)
5376 	{
5377 		ValNodeLink(error_returns, search->error_return);
5378 		search->error_return = NULL;
5379 	}
5380 
5381 	if (other_returns)
5382 	{ /* format dbinfo etc.  */
5383 		*other_returns = BlastOtherReturnsPrepare(search);
5384 	}
5385 
5386 	if (options_allocated)
5387 	{
5388 		options = BLASTOptionDelete(options);
5389 	}
5390 	search = BlastSearchBlkDestruct(search);
5391 
5392 	return seqloc;
5393 }
5394 
BlastOtherReturnsFree(ValNodePtr other_returns)5395 void LIBCALL BlastOtherReturnsFree(ValNodePtr other_returns)
5396 {
5397     BLAST_KarlinBlkPtr ka_params;
5398     BLAST_MatrixPtr matrix;
5399     CharPtr params_buffer;
5400     TxDfDbInfoPtr dbinfo;
5401     ValNodePtr  mask_loc, mask_loc_start, vnp;
5402 
5403     mask_loc = NULL;
5404 
5405     for (vnp=other_returns; vnp; vnp = vnp->next) {
5406         switch (vnp->choice) {
5407         case TXDBINFO:
5408             dbinfo = vnp->data.ptrvalue;
5409             dbinfo = TxDfDbInfoDestruct(dbinfo);
5410             break;
5411         case TXKABLK_NOGAP:
5412             ka_params = vnp->data.ptrvalue;
5413             MemFree(ka_params);
5414             break;
5415         case TXKABLK_GAP:
5416             ka_params = vnp->data.ptrvalue;
5417             MemFree(ka_params);
5418             break;
5419         case TXPARAMETERS:
5420             params_buffer = vnp->data.ptrvalue;
5421             MemFree(params_buffer);
5422             break;
5423         case TXMATRIX:
5424             matrix = vnp->data.ptrvalue;
5425             matrix = BLAST_MatrixDestruct(matrix);
5426 
5427             break;
5428         case SEQLOC_MASKING_NOTSET:
5429         case SEQLOC_MASKING_PLUS1:
5430         case SEQLOC_MASKING_PLUS2:
5431         case SEQLOC_MASKING_PLUS3:
5432         case SEQLOC_MASKING_MINUS1:
5433         case SEQLOC_MASKING_MINUS2:
5434         case SEQLOC_MASKING_MINUS3:
5435             ValNodeAddPointer(&mask_loc, vnp->choice, vnp->data.ptrvalue);
5436             break;
5437         default:
5438             break;
5439         }
5440     }
5441 
5442     mask_loc_start = mask_loc;
5443     while (mask_loc) {
5444         SeqLocSetFree(mask_loc->data.ptrvalue);
5445         mask_loc = mask_loc->next;
5446     }
5447     ValNodeFree(mask_loc_start);
5448 
5449     other_returns = ValNodeFree(other_returns);
5450 
5451     return;
5452 }
5453 
5454 ValNodePtr LIBCALL
BlastOtherReturnsPrepare(BlastSearchBlkPtr search)5455 BlastOtherReturnsPrepare(BlastSearchBlkPtr search)
5456 
5457 {
5458     BLAST_KarlinBlkPtr ka_params;
5459     BLAST_MatrixPtr blast_matrix;
5460     CharPtr parameters, chptr;
5461     ReadDBFILEPtr rdfp_var;
5462     TxDfDbInfoPtr dbinfo, head, dbinfo_var=NULL;
5463     ValNodePtr other_returns=NULL;
5464 
5465     head = NULL;
5466     if (search->thr_info->blast_gi_list) {
5467         dbinfo = MemNew(sizeof(TxDfDbInfo));
5468         dbinfo->total_length = search->dblen;
5469         dbinfo->number_seqs = search->dbseq_num;
5470         dbinfo->subset = TRUE;
5471         head = dbinfo;
5472         dbinfo_var = dbinfo;
5473     }
5474 
5475     rdfp_var = search->rdfp;
5476     while (rdfp_var) {
5477         dbinfo = MemNew(sizeof(TxDfDbInfo));
5478         dbinfo->name = StringSave(readdb_get_filename(rdfp_var));
5479 
5480         if((chptr = readdb_get_title(rdfp_var)) == NULL)
5481             chptr = readdb_get_filename(rdfp_var);
5482         dbinfo->definition = StringSave(chptr);
5483 
5484         dbinfo->date = StringSave(readdb_get_date(rdfp_var));
5485 
5486         dbinfo->is_protein = readdb_is_prot(rdfp_var);
5487 
5488         if (rdfp_var->aliaslen)
5489             dbinfo->total_length = rdfp_var->aliaslen;
5490         else
5491             dbinfo->total_length = readdb_get_dblen(rdfp_var);
5492         if (rdfp_var->aliasnseq)
5493             dbinfo->number_seqs = rdfp_var->aliasnseq;
5494         else
5495             dbinfo->number_seqs = readdb_get_num_entries(rdfp_var);
5496         if (head == NULL) {
5497             head = dbinfo;
5498             dbinfo_var = dbinfo;
5499         } else {
5500             dbinfo_var->next = dbinfo;
5501             dbinfo_var = dbinfo_var->next;
5502         }
5503         rdfp_var = rdfp_var->next;
5504     }
5505     if (head)
5506         ValNodeAddPointer (&other_returns, TXDBINFO, head);
5507 
5508     if (search->sbp->kbp && search->sbp->kbp[search->first_context]) {
5509         ka_params = BlastKarlinBlkCreate();
5510         ka_params->Lambda = search->sbp->kbp[search->first_context]->Lambda;
5511         ka_params->K = search->sbp->kbp[search->first_context]->K;
5512         ka_params->H = search->sbp->kbp[search->first_context]->H;
5513         ValNodeAddPointer (&other_returns, TXKABLK_NOGAP, ka_params);
5514     }
5515 
5516     if (search->pbp->gapped_calculation == TRUE) {
5517         if (search->sbp->kbp_gap && search->sbp->kbp_gap[search->first_context]) {
5518                 ka_params = BlastKarlinBlkCreate();
5519                 ka_params->Lambda = search->sbp->kbp_gap[search->first_context]->Lambda;
5520                 ka_params->K = search->sbp->kbp_gap[search->first_context]->K;
5521                 ka_params->H = search->sbp->kbp_gap[search->first_context]->H;
5522                 ValNodeAddPointer (&other_returns, TXKABLK_GAP, ka_params);
5523         }
5524     }
5525 
5526     if (search->query_invalid == FALSE) {
5527         parameters = FormatBlastParameters(search);
5528         ValNodeAddPointer (&other_returns, TXPARAMETERS, parameters);
5529     }
5530 
5531     blast_matrix = BLAST_MatrixFill(search->sbp, search->positionBased);
5532     ValNodeAddPointer (&other_returns, TXMATRIX, blast_matrix);
5533 
5534     if (search->mask)
5535         ValNodeLink(&other_returns, search->mask);
5536 
5537     if (search->pbp->is_rps_blast) {
5538         ValNodeAddFloat(&other_returns, EFF_SEARCH_SPACE,
5539             ((Nlm_FloatHi) search->dblen_eff)*
5540             ((Nlm_FloatHi) (search->rps_qlen - search->length_adjustment)));
5541     } else {
5542         ValNodeAddFloat(&other_returns, EFF_SEARCH_SPACE,
5543             ((Nlm_FloatHi) search->dblen_eff)*
5544             ((Nlm_FloatHi) search->context[search->first_context].query->effective_length));
5545     }
5546     ValNodeAddInt(&other_returns, EFF_HSP_LENGTH, search->length_adjustment);
5547 
5548     /* If Mega BLAST endpoint results, save them here */
5549     if (search->mb_endpoint_results && search->pbp->mb_params &&
5550         search->pbp->mb_params->no_traceback)
5551        /* Here 21 = BlastResponse_mbalign (see file objblst3.h) */
5552        ValNodeAddPointer(&other_returns, 21,
5553                          search->mb_endpoint_results->data.ptrvalue);
5554 
5555     return other_returns;
5556 }
5557 
5558 
5559 /*
5560 	Deallocates memory for BLAST_ExtendWordParamsPtr
5561 
5562 */
5563 
5564 static BLAST_ExtendWordParamsPtr
BLAST_ExtendWordParamsDestruct(BLAST_ExtendWordParamsPtr ewp_params)5565 BLAST_ExtendWordParamsDestruct (BLAST_ExtendWordParamsPtr ewp_params)
5566 
5567 {
5568 	ewp_params = MemFree(ewp_params);
5569 
5570 	return ewp_params;
5571 }
5572 
5573 
5574 /*
5575 	Allocates memory for the BLAST_ExtendWordParamsPtr.
5576 
5577 	This function also sets many of the parametes such as min_diag_length etc.
5578 
5579 	Int4 qlen: length of the query.
5580 	Boolean multiple_hits: specifies whether multiple hits method is used.
5581 	Int4 window_size: the max. distance between two hits that are extended.
5582 */
5583 
5584 BLAST_ExtendWordParamsPtr
BLAST_ExtendWordParamsNew(Int4 qlen,Boolean multiple_hits,Int4 window_size)5585 BLAST_ExtendWordParamsNew (Int4 qlen, Boolean multiple_hits, Int4 window_size)
5586 
5587 {
5588 	BLAST_ExtendWordParamsPtr ewp_params;
5589 	Int4 min_diag_length, bits_to_shift;
5590 
5591 	ewp_params= MemNew(sizeof(BLAST_ExtendWordParams));
5592 
5593 	if (ewp_params)
5594 	{
5595 		min_diag_length = 1;
5596 		bits_to_shift = 0;
5597 		/* What power of 2 is just longer than the query? */
5598 		while (min_diag_length < (qlen+window_size))
5599 		{
5600 			min_diag_length = min_diag_length << 1;
5601 			bits_to_shift++;
5602 		}
5603 		/* These are used in the word finders to shift and mask
5604 		rather than dividing and taking the remainder. */
5605 		ewp_params->bits_to_shift = bits_to_shift;
5606 		ewp_params->min_diag_length = min_diag_length;
5607 		ewp_params->min_diag_mask = min_diag_length-1;
5608 		ewp_params->multiple_hits = multiple_hits;
5609 		ewp_params->offset = window_size;
5610                 ewp_params->window = window_size;
5611 	}
5612 	return ewp_params;
5613 }
5614 
5615 /*
5616 	Deallocates memory for the BLAST_ExtendWordPtr.
5617 
5618 */
5619 BLAST_ExtendWordPtr LIBCALL
BLAST_ExtendWordDestruct(BLAST_ExtendWordPtr ewp)5620 BLAST_ExtendWordDestruct (BLAST_ExtendWordPtr ewp)
5621 
5622 {
5623 	if (ewp)
5624 	{
5625 		if (ewp->_buffer)
5626 			ewp->_buffer = MemFree(ewp->_buffer);
5627 
5628 		ewp = MemFree(ewp);
5629 	}
5630 
5631 	return ewp;
5632 
5633 }
5634 
5635 /*
5636 	Allocates memory for the BLAST_ExtendWordPtr.
5637 
5638 	All of the memory for the arrays is allocated in one chunk
5639 	called "_buffer".  If multiple_hits is specified them room
5640 	for "diag_level", "last_hit", and "version" is allocated and
5641 	pointers into the array for these are set.  If multiple_hits
5642 	is not set, then only room for diag_level and version is allocated;
5643 	last_hit is not needed.
5644 
5645 	Int4 qlen, dblen: length of the query and the LONGEST subject sequence.
5646 	Boolean multiple_hits: specifies whether multiple hits method is used.
5647 
5648 	** CFJ
5649 	** - previously buffer contained diag_level array, last_hit array, and version array
5650 	**   change to contain array of struct {dl,lh,v}.
5651 	**
5652 	** - Now that version is no longer used, combining the remaining 2 is probably not a big win.
5653 
5654 */
5655 BLAST_ExtendWordPtr
BLAST_ExtendWordNew(BLAST_ExtendWordParamsPtr ewp_params)5656 BLAST_ExtendWordNew (BLAST_ExtendWordParamsPtr ewp_params)
5657 
5658 {
5659 	BLAST_ExtendWordPtr ewp;
5660 	int i;
5661 
5662 	ewp = MemNew(sizeof(BLAST_ExtendWord));
5663 
5664 	if (ewp)
5665 	{
5666 		/* Allocate the buffer to be used for Combo array. */
5667     	        ewp->_buffer = (Int4Ptr) MemNew(ewp_params->min_diag_length*sizeof(CfjModStruct));
5668 
5669 		if (ewp->_buffer == NULL)
5670 		{
5671 			ewp = BLAST_ExtendWordDestruct(ewp);
5672 			return NULL;
5673 		}
5674 
5675 		ewp->combo_array= (CfjModStruct *) ewp->_buffer;
5676 		ewp_params->offset=0;
5677 		for(i=0;i<ewp_params->min_diag_length;i++){
5678 		  ewp->combo_array[i].diag_level=0;
5679 		  ewp->combo_array[i].last_hit = -ewp_params->window;
5680 		}
5681 	}
5682 
5683 	return ewp;
5684 }
5685 
5686 /*****************************************************************************
5687 *
5688 *	Zeroe's out the memory in the array _buffer, if offset is greater than
5689 *	INT4_MAX/2.  The first "min_diag_length" spaces in the array are used
5690 *	by the array "diag_level", the second "min_diag_length" spaces are used
5691 *	by "last_hit".  All of these are zeroed out.  The last "min_diag_length"
5692 *	spaces are used by "version"; these are not zeroed out.
5693 *
5694 *	If offset is not greater than INT4_MAX/2, then the memory is not
5695 *	zeroed out.  Rather "offset" is used as a "zero-point" that is
5696 *	always greater than the next possible value when the word finder
5697 *	starts working on a new subject sequence.
5698 *
5699 ******************************************************************************/
5700 void LIBCALL
BlastExtendWordExit(BlastSearchBlkPtr search)5701 BlastExtendWordExit(BlastSearchBlkPtr search)
5702 
5703 {
5704 	BLAST_ExtendWordPtr ewp;
5705 	BLAST_ExtendWordParamsPtr ewp_params;
5706 	Int2 index;
5707 	Int4 i, min_diag_length;
5708 
5709 	ewp_params = search->ewp_params;
5710 
5711 	for (index=search->first_context; index<=search->last_context; index++)
5712 	{
5713 
5714 		if (ewp_params->offset >= INT4_MAX/2)
5715 		{
5716 			ewp = search->context[index].ewp;
5717                         if (ewp) {
5718                            min_diag_length = ewp_params->min_diag_length;
5719                            for(i=0;i<min_diag_length;i++)
5720 			   {
5721 			        ewp->combo_array[i].diag_level=0;
5722 			        ewp->combo_array[i].last_hit = -ewp_params->window;
5723                            }
5724                         }
5725                 }
5726 	}
5727 
5728 	if (ewp_params->offset < INT4_MAX/2)
5729 	{
5730 		ewp_params->offset += search->subject->length + ewp_params->window ;
5731 	}
5732 	else
5733 	{
5734 		ewp_params->offset = 0;
5735 	}
5736 }
5737 
5738 
5739 BlastSequenceBlkPtr LIBCALL
BlastSequenceBlkDestruct(BlastSequenceBlkPtr seq_blk)5740 BlastSequenceBlkDestruct(BlastSequenceBlkPtr seq_blk)
5741 
5742 {
5743 
5744 	if (seq_blk == NULL)
5745 		return NULL;
5746 
5747 	/* Free from the start of sequence if it's filled in. */
5748 	if (seq_blk->sequence_start != NULL)
5749 	{
5750 		seq_blk->sequence_start = MemFree(seq_blk->sequence_start);
5751 	}
5752 	else
5753 	{
5754 		seq_blk->sequence = MemFree(seq_blk->sequence);
5755 	}
5756 
5757 	seq_blk = MemFree(seq_blk);
5758 
5759 	return seq_blk;
5760 }
5761 
5762 
5763 
5764 static BLASTContextStructPtr
BLASTContextFree(BLASTContextStructPtr context,Int2 number)5765 BLASTContextFree(BLASTContextStructPtr context, Int2 number)
5766 
5767 {
5768 	Int2 index;
5769 
5770 	if (context == NULL)
5771 	  return NULL;
5772 
5773 	for (index=0; index<number; index++)
5774 	{
5775 		context[index].ewp = BLAST_ExtendWordDestruct(context[index].ewp);
5776 		if (context[index].query_allocated == TRUE)
5777 		{
5778 			context[index].query = BlastSequenceBlkDestruct(context[index].query);
5779 		}
5780 	}
5781 	context = MemFree(context);
5782 
5783 	return context;
5784 }
5785 
/*
	Tears down a BlastThrInfo structure.

	The index and awake threads, if present, are joined first.  Then the
	gi list is destroyed, the db, results and callback mutexes are
	destroyed, and the structure itself is freed.  NULL is accepted.
*/
void BlastThrInfoFree(BlastThrInfoPtr thr_info)
{
    VoidPtr join_status = NULL;

    if (thr_info == NULL)
        return;

    if (thr_info->index_thr)
    {
        NlmThreadJoin(thr_info->index_thr, &join_status);
        thr_info->index_thr = NULL;
    }

    if (thr_info->awake_thr)
    {
        NlmThreadJoin(thr_info->awake_thr, &join_status);
        thr_info->awake_thr = NULL;

        /* The callback mutex is destroyed and cleared here, so the
           unconditional destroy below then sees a NULL handle. */
        if (thr_info->callback_mutex)
        {
            NlmMutexDestroy(thr_info->callback_mutex);
            thr_info->callback_mutex = NULL;
        }
    }

    BlastGiListDestruct(thr_info->blast_gi_list, TRUE);

    NlmMutexDestroy(thr_info->db_mutex);
    NlmMutexDestroy(thr_info->results_mutex);
    NlmMutexDestroy(thr_info->callback_mutex);

    MemFree(thr_info);
}
5819 
BlastThrInfoNew(void)5820 BlastThrInfoPtr BlastThrInfoNew(void)
5821 {
5822     BlastThrInfoPtr thr_info;
5823 
5824     thr_info = MemNew(sizeof(BlastThrInfo));
5825 
5826     return thr_info;
5827 }
5828 
5829 
5830 /*
5831 	Allocates space for a copy of the BlastSearchBlk for use in
5832 	multi-processing BLAST.
5833 */
5834 
5835 BlastSearchBlkPtr LIBCALL
BlastSearchBlkDuplicate(BlastSearchBlkPtr search)5836 BlastSearchBlkDuplicate (BlastSearchBlkPtr search)
5837 
5838 {
5839 
5840 	BlastSearchBlkPtr new_search;
5841 	Int2 index;
5842 
5843 	if (search == NULL)
5844 		return NULL;
5845 
5846 	new_search = (BlastSearchBlkPtr) MemNew(sizeof(BlastSearchBlk));
5847 	if (new_search == NULL)
5848 		return NULL;
5849 
5850 	/* What's allocated here? */
5851 	new_search->allocated = 0;
5852 	new_search->allocated += BLAST_SEARCH_ALLOC_SUBJECT;
5853 	new_search->allocated += BLAST_SEARCH_ALLOC_PBP;
5854 	new_search->allocated += BLAST_SEARCH_ALLOC_CONTEXT;
5855 	new_search->allocated += BLAST_SEARCH_ALLOC_READDB;
5856 	new_search->allocated += BLAST_SEARCH_ALLOC_EWPPARAMS;
5857 
5858         /* AM: Support for query multiplexing. */
5859 	if( search->mult_queries )
5860 	  new_search->mult_queries = BlastDuplicateMultQueries( search->mult_queries );
5861 
5862 	/* Duplicate the rfdp struct, but not the contents. */
5863 	new_search->rdfp = readdb_attach(search->rdfp);
5864 	if (new_search->rdfp == NULL)
5865 	{
5866 		new_search = BlastSearchBlkDestruct(new_search);
5867 		return NULL;
5868 	}
5869 
5870 	new_search->positionBased = search->positionBased;
5871 
5872 	/* Changes, need to allocate. */
5873 	new_search->pbp = MemDup(search->pbp, sizeof(BLAST_ParameterBlk));
5874 	if (search->pbp->mb_params)
5875 	  new_search->pbp->mb_params =
5876 	    MemDup(search->pbp->mb_params, sizeof(MegaBlastParameterBlk));
5877 	new_search->pbp->filter_string = StringSave(search->pbp->filter_string);
5878 	new_search->sbp = search->sbp;
5879 	new_search->wfp_first = search->wfp_first;
5880 	if (search->prog_number==blast_type_blastn &&
5881 	    search->pbp->mb_params) {
5882 	   new_search->wfp_second =
5883 	      MemDup(search->wfp_second, sizeof(BLAST_WordFinder));
5884 	   new_search->wfp_second->lookup =
5885 	      MegaBlastLookupTableDup(search->wfp_second->lookup);
5886            new_search->wfp = new_search->wfp_second;
5887 	} else
5888 	   new_search->wfp_second = search->wfp_second;
5889 	new_search->prog_name = StringSave(search->prog_name);
5890 	new_search->prog_number = search->prog_number;
5891 	new_search->first_context = search->first_context;
5892 	new_search->last_context = search->last_context;
5893         new_search->query_slp = search->query_slp;
5894 	if (search->prog_number==blast_type_blastn) {
5895 	   new_search->query_context_offsets =
5896 	      MemDup(search->query_context_offsets,
5897 		     (search->last_context-search->first_context+2)*sizeof(Int4));
5898 	}
5899 	if (search->ewp_params)
5900 	   new_search->ewp_params = MemDup(search->ewp_params, sizeof(BLAST_ExtendWordParams));
5901 	new_search->dblen = search->dblen;
5902 	new_search->dblen_eff = search->dblen_eff;
5903 	new_search->dblen_eff_real = search->dblen_eff_real;
5904 	new_search->dbseq_num = search->dbseq_num;
5905 	new_search->length_adjustment = search->length_adjustment;
5906 	new_search->searchsp_eff = search->searchsp_eff;
5907 
5908 	/* Allocate last_context+1 elements, even if there are only last_context-first_context
5909 	being used. */
5910 	new_search->context = (BLASTContextStructPtr) MemNew((search->last_context+1)*sizeof(BLASTContextStruct));
5911 	for (index=new_search->first_context; index<=new_search->last_context; index++)
5912 	{
5913 	   if (new_search->ewp_params)
5914 	      new_search->context[index].ewp = BLAST_ExtendWordNew(new_search->ewp_params);
5915 		new_search->context[index].query = search->context[index].query;
5916 		new_search->context[index].query->frame = ContextToFrame(new_search, index);
5917 		new_search->context[index].query_allocated = FALSE;
5918 	}
5919 
5920 	new_search->context_factor = search->context_factor;
5921 
5922 	new_search->subject = (BlastSequenceBlkPtr) MemNew(sizeof(BlastSequenceBlk));
5923 	/* 100 is the size limit in the present BLAST for hsp's. */
5924 	new_search->hsp_array_size = search->hsp_array_size;
5925 	/* The results are held here. */
5926 	new_search->result_struct = search->result_struct;
5927         new_search->mb_result_struct = search->mb_result_struct;
5928         new_search->result_size = search->result_size;
5929 	new_search->worst_evalue = DBL_MAX;
5930 
5931 	new_search->translation_table = search->translation_table;
5932 	new_search->translation_table_rc = search->translation_table_rc;
5933 	new_search->genetic_code = search->genetic_code;
5934 	new_search->db_genetic_code = search->db_genetic_code;
5935 
5936 	if (search->translation_buffer_size > 0)
5937 	{	/* two extra for the NULLB's on end. */
5938 		new_search->translation_buffer = MemNew((2+search->translation_buffer_size)*sizeof(Uint1));
5939 		new_search->translation_buffer_size = search->translation_buffer_size;
5940 	}
5941 
5942 	new_search->gap_align = NULL;	/* Allocated automatically. */
5943 
5944 	new_search->whole_query = search->whole_query;
5945 	new_search->required_start = search->required_start;
5946 	new_search->required_end = search->required_end;
5947 
5948 	new_search->handle_results = search->handle_results;
5949 	if (!search->pbp->mb_params)
5950            new_search->query_id = SeqIdSetDup(search->query_id);
5951         else {
5952 	   new_search->qid_array = (SeqIdPtr PNTR)
5953 	      Malloc((search->last_context/2 + 1)*sizeof(SeqIdPtr));
5954 
5955 	   for (index=0; index<=search->last_context/2; index++)
5956 	      new_search->qid_array[index] = SeqIdSetDup(search->qid_array[index]);
5957 	}
5958 
5959         /* Duplicating DNAP sequence used in OOF search */
5960         if(search->pbp->is_ooframe)
5961             new_search->query_dnap = BlastMakeCopyQueryDNAP(search->query_dnap);
5962 
5963         new_search->thr_info = search->thr_info;
5964         new_search->semid = search->semid;
5965 
5966 #ifdef BLAST_COLLECT_STATS
5967 	new_search->first_pass_hits = 0;
5968 	new_search->second_pass_hits = 0;
5969 	new_search->second_pass_trys = 0;
5970 	new_search->first_pass_extends = 0;
5971 	new_search->second_pass_extends = 0;
5972 	new_search->first_pass_good_extends = 0;
5973 	new_search->second_pass_good_extends = 0;
5974 	new_search->number_of_seqs_better_E = 0;
5975 	new_search->prelim_gap_no_contest = 0;
5976 	new_search->prelim_gap_passed = 0;
5977 	new_search->prelim_gap_attempts = 0;
5978 	new_search->real_gap_number_of_hsps = 0;
5979 #endif
5980 	new_search->output = search->output;
5981 
5982 	if (search->abmp) {
5983 	   new_search = GreedyAlignMemAlloc(new_search);
5984            if (new_search->abmp == NULL) {
5985               new_search = BlastSearchBlkDestruct(new_search);
5986               return NULL;
5987            }
5988         }
5989         if (search->mb_endpoint_results) {
5990            new_search->mb_endpoint_results = ValNodeNew(NULL);
5991            new_search->mb_endpoint_results->data.ptrvalue =
5992               search->mb_endpoint_results->data.ptrvalue;
5993         }
5994 	new_search->mask1 = search->mask1;
5995 
5996 	return new_search;
5997 }
5998 /*
5999 	Allocates space for the new BlastSearchBlk and some sturctures
6000 	attached to it.
6001 */
6002 
6003 BlastSearchBlkPtr LIBCALL
BlastSearchBlkNew(Int2 wordsize,Int4 qlen,CharPtr dbname,Boolean multiple_hits,BLAST_Score threshold_first,BLAST_Score threshold_second,Int4 result_size,CharPtr prog_name,BlastAllWordPtr all_words,Int2 first_context,Int2 last_context,Int4 window_size)6004 BlastSearchBlkNew (Int2 wordsize, Int4 qlen, CharPtr dbname, Boolean multiple_hits, BLAST_Score threshold_first, BLAST_Score threshold_second, Int4 result_size, CharPtr prog_name, BlastAllWordPtr all_words, Int2 first_context, Int2 last_context, Int4 window_size)
6005 
6006 {
6007 	return BlastSearchBlkNewExtra(wordsize, qlen, dbname, multiple_hits, threshold_first, threshold_second, result_size, prog_name, all_words, first_context, last_context, NULL, window_size);
6008 
6009 }
6010 
6011 /*
6012 	Allocates space for the new BlastSearchBlk and some sturctures
6013 	attached to it.
6014 */
6015 
6016 BlastSearchBlkPtr LIBCALL
BlastSearchBlkNewExtra(Int2 wordsize,Int4 qlen,CharPtr dbname,Boolean multiple_hits,BLAST_Score threshold_first,BLAST_Score threshold_second,Int4 result_size,CharPtr prog_name,BlastAllWordPtr all_words,Int2 first_context,Int2 last_context,ReadDBFILEPtr rdfp,Int4 window_size)6017 BlastSearchBlkNewExtra (Int2 wordsize, Int4 qlen, CharPtr dbname, Boolean multiple_hits, BLAST_Score threshold_first, BLAST_Score threshold_second, Int4 result_size, CharPtr prog_name, BlastAllWordPtr all_words, Int2 first_context, Int2 last_context, ReadDBFILEPtr rdfp, Int4 window_size)
6018 
6019 {
6020 
6021 	BlastSearchBlkPtr search;
6022 	BLASTContextStructPtr context;
6023 	Uint1 is_prot;
6024 	Int2 index;
6025 	Uint1 alphabet;
6026 	Int4 longest_db_seq=INT4_MAX;
6027 	ReadDBFILEPtr rdfp_var;
6028         Int4 last_ewp_index;
6029 
6030 	search = (BlastSearchBlkPtr) MemNew(sizeof(BlastSearchBlk));
6031 
6032 	if (search != NULL)
6033 	{
6034 		search->allocated = 0;	/* everything's allocated here. */
6035 		search->allocated += BLAST_SEARCH_ALLOC_QUERY;
6036 		search->allocated += BLAST_SEARCH_ALLOC_SUBJECT;
6037 		search->allocated += BLAST_SEARCH_ALLOC_PBP;
6038 		search->allocated += BLAST_SEARCH_ALLOC_SBP;
6039 		search->allocated += BLAST_SEARCH_ALLOC_EWPPARAMS;
6040 		search->allocated += BLAST_SEARCH_ALLOC_CONTEXT;
6041 		search->allocated += BLAST_SEARCH_ALLOC_RESULTS;
6042 		search->allocated += BLAST_SEARCH_ALLOC_READDB;
6043 		search->allocated += BLAST_SEARCH_ALLOC_ALL_WORDS;
6044                 search->allocated += BLAST_SEARCH_ALLOC_THRINFO;
6045                 search->allocated += BLAST_SEARCH_ALLOC_MASK1;
6046 
6047 		search->positionBased = FALSE;
6048 
6049 		if (StringCmp(prog_name, "blastn") == 0)
6050 		{
6051 			alphabet = BLASTNA_SEQ_CODE;
6052 		}
6053 		else
6054 		{
6055 			alphabet = Seq_code_ncbistdaa;
6056 		}
6057 
6058 		if (dbname != NULL)
6059 		{
6060 
6061 			if (rdfp == NULL)
6062 			{
6063 				if (StringCmp(prog_name, "blastp") == 0 || StringCmp(prog_name, "blastx") == 0)
6064 				{ /* Protein DB for blastp and blastx. */
6065 					is_prot = READDB_DB_IS_PROT;
6066 				}
6067 				else
6068 				{
6069 					is_prot = READDB_DB_IS_NUC;
6070 				}
6071 
6072 				if ((search->rdfp=readdb_new(dbname, is_prot)) == NULL)
6073 				{
6074 					return NULL;
6075 				}
6076 			}
6077 			else
6078 			{	/* Attaches to the rdfp, rather than reallocating it. */
6079 				search->rdfp = readdb_attach(rdfp);
6080 			}
6081 
6082 			rdfp_var = search->rdfp;
6083 			longest_db_seq = 0;
6084 			while (rdfp_var)
6085 			{
6086 				longest_db_seq = MAX(longest_db_seq, readdb_get_maxlen(rdfp_var));
6087 				rdfp_var = rdfp_var->next;
6088 			}
6089 		}
6090 
6091 		search->first_context = first_context;
6092 		search->last_context = last_context;
6093 
6094 		search->pbp =
6095 		   (BLAST_ParameterBlkPtr) MemNew(sizeof(BLAST_ParameterBlk));
6096 
6097 		search->sbp = BLAST_ScoreBlkNew(alphabet, last_context+1);
6098 
6099 		/* Only allocate these if thresholds are above zero, i.e. they will be used. */
6100 		if (StringCmp(prog_name, "blastn") != 0)
6101 		{
6102 			if (threshold_second > 0)
6103 			{
6104 				search->wfp_first = BLAST_WordFinderNew(search->sbp->alphabet_size, wordsize, 1, FALSE);
6105 				search->allocated += BLAST_SEARCH_ALLOC_WFP_FIRST;
6106 		/* Only allocate a new WFP if 2nd th differs from 1st. */
6107                                 search->wfp_second = search->wfp_first;
6108 			}
6109 		}
6110 		else
6111 		{
6112 			if (multiple_hits)
6113 				search->wfp_second = BLAST_WordFinderNew(256, wordsize, READDB_COMPRESSION_RATIO, FALSE);
6114 			else
6115 				search->wfp_second = BLAST_WordFinderNew(256, wordsize, READDB_COMPRESSION_RATIO, TRUE);
6116 			search->allocated += BLAST_SEARCH_ALLOC_WFP_SECOND;
6117 		}
6118 
6119 		search->prog_name = StringSave(prog_name);
6120 		search->prog_number = BlastGetProgramNumber(prog_name);
6121 		if (qlen > 0)
6122 		   search->ewp_params = BLAST_ExtendWordParamsNew(qlen, multiple_hits, window_size);
6123 		else
6124 		   search->ewp_params = NULL;
6125 		context = search->context = (BLASTContextStructPtr)
6126                    MemNew((1+search->last_context)*sizeof(BLASTContextStruct));
6127                 if (search->prog_number != blast_type_blastn)
6128                    last_ewp_index = search->last_context;
6129                 else /* All queries (Mega BLAST) and strands are concatenated
6130                         in a single sequence */
6131                    last_ewp_index = search->first_context;
6132 
6133 		for (index=search->first_context; index<=search->last_context; index++)
6134 		{
6135 		   if (search->ewp_params && index <= last_ewp_index)
6136                       context[index].ewp = BLAST_ExtendWordNew(search->ewp_params);
6137                    context[index].query = (BlastSequenceBlkPtr) MemNew(sizeof(BlastSequenceBlk));
6138                    context[index].query->frame = ContextToFrame(search, index);
6139                    context[index].query_allocated = TRUE;
6140 		}
6141 
6142 		search->subject = (BlastSequenceBlkPtr) MemNew(sizeof(BlastSequenceBlk));
6143 		/* 100 is the size limit in the present BLAST for hsp's. */
6144 		search->hsp_array_size = 100;
6145 		/* The results are held here. */
6146 		search->result_size = result_size;
6147 /*
6148 		search->result_struct = BLASTResultsStructNew(result_size, search->pbp->max_pieces, search->pbp->hsp_range_max);
6149 */
6150 
6151 		search->worst_evalue = DBL_MAX;
6152 
6153 		search->whole_query = TRUE;
6154 		search->required_start = 0;
6155 		search->required_end = -1;
6156 
6157 		search->all_words = all_words;
6158 
6159                 search->thr_info = BlastThrInfoNew();
6160 #ifdef BLAST_COLLECT_STATS
6161 		search->first_pass_hits = 0;
6162 		search->second_pass_hits = 0;
6163 		search->second_pass_trys = 0;
6164 		search->first_pass_extends = 0;
6165 		search->second_pass_extends = 0;
6166 		search->first_pass_good_extends = 0;
6167 		search->second_pass_good_extends = 0;
6168 		search->number_of_seqs_better_E = 0;
6169 		search->prelim_gap_no_contest = 0;
6170 		search->prelim_gap_passed = 0;
6171 		search->prelim_gap_attempts = 0;
6172 		search->real_gap_number_of_hsps = 0;
6173 #endif
6174 	}
6175 
6176 	return search;
6177 }
6178 
6179 /*
6180 	Deallocates memory associated with the BlastSearchBlkPtr.
6181 */
6182 
6183 BlastSearchBlkPtr LIBCALL
BlastSearchBlkDestruct(BlastSearchBlkPtr search)6184 BlastSearchBlkDestruct (BlastSearchBlkPtr search)
6185 
6186 {
6187 
6188     if (search != NULL) {
6189         if (search->allocated & BLAST_SEARCH_ALLOC_QUERY)
6190             search->original_seq = MemFree(search->original_seq);
6191 
6192         if (search->allocated & BLAST_SEARCH_ALLOC_SUBJECT)
6193             search->subject = BlastSequenceBlkDestruct(search->subject);
6194 
6195         if (search->allocated & BLAST_SEARCH_ALLOC_SBP)
6196             search->sbp = BLAST_ScoreBlkDestruct(search->sbp);
6197 
6198         if (search->allocated & BLAST_SEARCH_ALLOC_WFP_FIRST)
6199             search->wfp_first = BLAST_WordFinderDestruct(search->wfp_first);
6200 
6201         if (search->allocated & BLAST_SEARCH_ALLOC_WFP_SECOND) {
6202             search->wfp_second = BLAST_WordFinderDestruct(search->wfp_second);
6203         } else if (search->prog_number==blast_type_blastn &&
6204                    search->pbp->mb_params) {
6205             search->wfp_second =
6206                 MegaBlastWordFinderDeallocate(search->wfp_second);
6207         }
6208 
6209         /* Freeing DNAP sequence used in OOF */
6210 
6211         if(search->pbp != NULL && search->pbp->is_ooframe) {
6212             BlastFreeQueryDNAP(search->query_dnap);
6213             search->query_dnap = NULL;
6214         }
6215 
6216         if (search->allocated & BLAST_SEARCH_ALLOC_EWPPARAMS) {
6217             search->ewp_params = BLAST_ExtendWordParamsDestruct(search->ewp_params);
6218         }
6219 
6220         if (search->allocated & BLAST_SEARCH_ALLOC_CONTEXT) {
6221             search->context = BLASTContextFree(search->context, 1+search->last_context);
6222         }
6223 
6224         if (search->allocated & BLAST_SEARCH_ALLOC_RESULTS) {
6225            if (!search->pbp->mb_params)
6226               search->result_struct =
6227                  BLASTResultsStructDelete(search->result_struct);
6228            else {
6229               Int2 index;
6230               for (index=0; index<=search->last_context/2; index++)
6231                  search->mb_result_struct[index] =
6232                     BLASTResultsStructDelete(search->mb_result_struct[index]);
6233               search->mb_result_struct = MemFree(search->mb_result_struct);
6234            }
6235         }
6236 
6237         if (search->allocated & BLAST_SEARCH_ALLOC_PBP) {
6238 	    search->pbp->mb_params = MemFree(search->pbp->mb_params);
6239             MemFree(search->pbp->filter_string);
6240             search->pbp = MemFree(search->pbp);
6241         }
6242 
6243         if (search->allocated & BLAST_SEARCH_ALLOC_READDB) {
6244             search->rdfp = readdb_destruct(search->rdfp);
6245         }
6246 
6247         if (search->current_hitlist) {
6248             search->current_hitlist = BlastHitListDestruct(search->current_hitlist);
6249         }
6250         search->subject_info = BLASTSubjectInfoDestruct(search->subject_info);
6251 
6252 
6253         if (search->prog_name) {
6254             search->prog_name = MemFree(search->prog_name);
6255         }
6256 
6257         if (search->query_id) {
6258             search->query_id = SeqIdSetFree(search->query_id);
6259         }
6260         if (search->qid_array) {
6261             Int4 index;
6262             for (index=0; index<=search->last_context/2; index++)
6263                 SeqIdSetFree(search->qid_array[index]);
6264             search->qid_array = MemFree(search->qid_array);
6265         }
6266         if (search->translation_buffer_size > 0) {
6267             search->translation_buffer = MemFree(search->translation_buffer);
6268         }
6269 
6270         if (search->allocated & BLAST_SEARCH_ALLOC_TRANS_INFO) {
6271 
6272             if (search->translation_table) {
6273                 search->translation_table = MemFree(search->translation_table);
6274             }
6275 
6276             if (search->translation_table_rc) {
6277                 search->translation_table_rc = MemFree(search->translation_table_rc);
6278             }
6279         }
6280 
6281         if (search->allocated & BLAST_SEARCH_ALLOC_ALL_WORDS) {
6282             search->all_words = BlastAllWordDestruct(search->all_words);
6283         }
6284 
6285         search->gap_align = GapAlignBlkDelete(search->gap_align);
6286 
6287         if (search->allocated & BLAST_SEARCH_ALLOC_QUERY_SLP) {
6288             if (search->query_slp)
6289                 search->query_slp = SeqLocFree(search->query_slp);
6290         }
6291 
6292 
6293         if(search->allocated & BLAST_SEARCH_ALLOC_THRINFO)
6294             BlastThrInfoFree(search->thr_info);
6295 
6296         if (search->abmp)
6297             search->abmp = GreedyAlignMemFree(search->abmp);
6298 
6299         search->query_context_offsets = MemFree(search->query_context_offsets);
6300 
6301         MemFree(search->mb_endpoint_results);
6302 
6303 	if (search->allocated & BLAST_SEARCH_ALLOC_MASK1)
6304 	{
6305 		if (search->mask1)
6306 		{
6307 			SeqLocSetFree(search->mask1->data.ptrvalue);
6308 			search->mask1 = ValNodeFree(search->mask1);
6309 		}
6310 	}
6311 
6312         search = MemFree(search);
6313     }
6314 
6315     return search;
6316 }
6317 
6318 
6319 /*
6320 	Deallocates all the memory associated with the BlastAllWordPtr.
6321 */
6322 
6323 BlastAllWordPtr LIBCALL
BlastAllWordDestruct(BlastAllWordPtr all_words)6324 BlastAllWordDestruct(BlastAllWordPtr all_words)
6325 
6326 {
6327 	if (all_words == NULL)
6328 		return NULL;
6329 
6330 	if (all_words->array)
6331 	{
6332 		all_words->array = MemFree(all_words->array);
6333 	}
6334 
6335 	if (all_words->rows_allocated && all_words->array_storage)
6336 	{
6337 		all_words->array_storage = MemFree(all_words->array_storage);
6338 	}
6339 
6340 	MemFree(all_words);
6341 
6342 	return NULL;
6343 }
6344 
6345 /*
6346 	Allocates the BlastAllWordPtr and sets some flags.
6347 */
6348 BlastAllWordPtr LIBCALL
BlastAllWordNew(Int4 num_of_cols,Int4 wordsize,Boolean rows_allocated,Boolean specific)6349 BlastAllWordNew(Int4 num_of_cols, Int4 wordsize, Boolean rows_allocated, Boolean specific)
6350 
6351 {
6352 	BlastAllWordPtr all_words;
6353 
6354 	all_words = MemNew(sizeof(BlastAllWord));
6355 	if (all_words)
6356 	{
6357 		all_words->rows_allocated = rows_allocated;
6358 		all_words->specific = specific;
6359 		all_words->num_of_cols = num_of_cols;
6360 		all_words->wordsize = wordsize;
6361 	}
6362 
6363 	return all_words;
6364 }
6365 
6366 BLAST_HitListPtr LIBCALL
BlastHitListDestruct(BLAST_HitListPtr hitlist)6367 BlastHitListDestruct(BLAST_HitListPtr hitlist)
6368 {
6369         BLAST_HSPPtr PNTR hsp_array;
6370         Int4 hspcnt_max, index;
6371 
6372         if (hitlist == NULL)
6373                 return NULL;
6374 
6375         hspcnt_max = hitlist->hspcnt_max;
6376         hsp_array = hitlist->hsp_array;
6377 
6378         for (index=0; index<hspcnt_max; index++)
6379         {
6380            hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
6381         }
6382 
6383         hitlist->hsp_array = MemFree(hsp_array);
6384 	hitlist->lh_helper = MemFree(hitlist->lh_helper);
6385 
6386         MemFree(hitlist->exact_match_array);
6387 
6388         hitlist = MemFree(hitlist);
6389 
6390         return hitlist;
6391 }
6392 
6393 /****************************************************************
6394 
6395         Functions to allocate and destroy the BLAST_HitList.
6396 
6397 ***************************************************************/
6398 BLAST_HitListPtr LIBCALL
BlastHitListNew(BlastSearchBlkPtr search)6399 BlastHitListNew(BlastSearchBlkPtr search)
6400 {
6401         BLAST_HitListPtr hitlist;
6402 
6403         hitlist = (BLAST_HitListPtr) MemNew(sizeof(BLAST_HitList));
6404 
6405         if (hitlist == NULL)
6406                 return hitlist;
6407 
6408         hitlist->hspmax = search->hsp_array_size;
6409         hitlist->hsp_array = (BLAST_HSPPtr PNTR) MemNew(hitlist->hspmax*sizeof
6410 (BLAST_HSPPtr));
6411 
6412         if (hitlist->hsp_array == NULL)
6413         {
6414                 hitlist = BlastHitListDestruct(hitlist);
6415                 return NULL;
6416         }
6417 
6418         if (search->pbp->mb_params) {
6419            hitlist->exact_match_array = (MegaBlastExactMatchPtr)
6420               MemNew(hitlist->hspmax*sizeof(MegaBlastExactMatch));
6421            hitlist->exact_match_max = hitlist->hspmax;
6422         }
6423 
6424         return hitlist;
6425 }
6426 
6427 
6428 /*
6429 	This function translates the context number of a context into
6430 	the frame of the sequence.
6431 
6432 	Arguments:
6433 
6434 	BlastSearchBlkPtr search: search structure,
6435 	Int2 context_number: context number used by BLASTContextStruct array
6436 	Boolean is_query: if TRUE, refers to query, otherwise the subject.
6437 */
6438 
6439 Int2
ContextToFrame(BlastSearchBlkPtr search,Int2 context_number)6440 ContextToFrame(BlastSearchBlkPtr search, Int2 context_number)
6441 
6442 {
6443 	Int2 frame=255;
6444 	Uint1 prog_number = search->prog_number;
6445 
6446 	if (prog_number == blast_type_blastn)
6447 	{
6448 		if (context_number % 2 == 0)
6449 			frame = 1;
6450 		else
6451 			frame = -1;
6452 	}
6453         else if (prog_number == blast_type_blastp ||
6454                  prog_number == blast_type_tblastn ||
6455                  prog_number == blast_type_psitblastn)
6456 	{	/* Query and subject are protein, no frame. */
6457 		frame = 0;
6458 	}
6459 	else if (prog_number == blast_type_blastx || prog_number == blast_type_tblastx)
6460 	{
6461 		frame = context_number < 3 ? context_number+1 : -context_number+2;
6462 	}
6463 
6464 	return frame;
6465 }
6466 
6467 /*
6468 	Allocates and fills in the BLASTSubjectInfo structure.
6469 */
6470 
6471 BLASTSubjectInfoPtr LIBCALL
BLASTSubjectInfoNew(SeqIdPtr sip,CharPtr defline,Int4 length)6472 BLASTSubjectInfoNew(SeqIdPtr sip, CharPtr defline, Int4 length)
6473 
6474 {
6475 	BLASTSubjectInfoPtr subject_info;
6476 
6477 	subject_info = (BLASTSubjectInfoPtr) MemNew(sizeof(BLASTSubjectInfo));
6478 
6479 	if (subject_info == NULL)
6480 		return NULL;
6481 
6482 	subject_info->sip = sip;
6483 	subject_info->defline = defline;
6484 	subject_info->length = length;
6485 
6486 	return subject_info;
6487 }
6488 
6489 /*
6490 	Deallocates the BLASTSubjectInfo structure and the
6491 	SeqIdPtr, as well as the defline.
6492 */
6493 
6494 BLASTSubjectInfoPtr LIBCALL
BLASTSubjectInfoDestruct(BLASTSubjectInfoPtr subject_info)6495 BLASTSubjectInfoDestruct(BLASTSubjectInfoPtr subject_info)
6496 
6497 {
6498 
6499 	if (subject_info == NULL)
6500 		return NULL;
6501 
6502 	SeqIdFree(subject_info->sip);
6503 	MemFree(subject_info->defline);
6504 	subject_info = MemFree(subject_info);
6505 
6506 	return subject_info;
6507 }
6508 
6509 
6510 
6511 /*
6512 	Destroys BLASTResultsStructure and associated memory.
6513 */
6514 
6515 BLASTResultsStructPtr LIBCALL
BLASTResultsStructDelete(BLASTResultsStructPtr result_struct)6516 BLASTResultsStructDelete(BLASTResultsStructPtr result_struct)
6517 
6518 {
6519 	Int4 index;
6520 	BLASTResultHitlistPtr PNTR results;
6521 	BLASTHeapPtr hp, hpt;
6522 
6523 	if (result_struct == NULL)
6524 		return NULL;
6525 
6526 	results = result_struct->results;
6527 	for (index=0; index<result_struct->hitlist_max; index++)
6528 	{
6529 		if (results[index])
6530 		{
6531 			results[index] = BLASTResultHitlistFree(results[index]);
6532 		}
6533 	}
6534 
6535 
6536 	for (hp = result_struct->heap_ptr; hp; )
6537 	{
6538 	  hpt = hp->next;
6539 	  hp->heap = MemFree(hp->heap);
6540 	  hp = MemFree(hp);
6541 	  hp = hpt;
6542 	}
6543 	result_struct->results = MemFree(result_struct->results);
6544 	result_struct = MemFree(result_struct);
6545 
6546 	return result_struct;
6547 }
6548 
6549 /*
6550 	returns BLASTResultsStruct.
6551 */
6552 
6553 BLASTResultsStructPtr
BLASTResultsStructNew(Int4 results_size,Int4 max_pieces,Int4 range_max)6554 BLASTResultsStructNew(Int4 results_size, Int4 max_pieces, Int4 range_max)
6555 
6556 {
6557 	BLASTResultsStructPtr new;
6558 	Int4 index;
6559 
6560 	new = MemNew(sizeof(BLASTResultsStruct));
6561 	new->results = (BLASTResultHitlistPtr PNTR) MemNew(results_size*sizeof(BLASTResultHitlistPtr));
6562 
6563 	for (index=0; index<results_size; index++)
6564 		new->results[index] = NULL;
6565 
6566 	new->hitlist_max = results_size;
6567 	new->hitlist_count = 0;
6568 	new->max_pieces = max_pieces;
6569         if (range_max > 0) {
6570            new->heap_ptr = (BLASTHeapPtr) MemNew(sizeof(BLASTHeapStruct));
6571            new->heap_ptr->cutvalue = INT4_MAX;
6572            new->heap_ptr->num_in_heap = new->heap_ptr->num_of_ref = 0;
6573            new->heap_ptr->prev = new->heap_ptr->next = NULL;
6574            new->heap_ptr->heap = (BLASTResultHspPtr PNTR) MemNew(sizeof(BLASTResultHspPtr)*range_max);
6575         }
6576 	return new;
6577 }
6578 
6579 
6580 Uint1 AAForCodon (Uint1Ptr codon, CharPtr codes);
6581 
6582 /*
6583 	GetTranslation to get the translation of the nucl. sequence in the
6584 	appropriate frame and with the appropriate GeneticCode.
6585 
6586 	The function return an allocated CharPtr, the caller must delete this.
6587 	The first and last spaces of this CharPtr contain NULLB's.
6588 */
6589 
6590 Uint1Ptr LIBCALL
GetTranslation(Uint1Ptr query_seq,Int4 nt_length,Int2 frame,Int4Ptr length,CharPtr genetic_code)6591 GetTranslation(Uint1Ptr query_seq, Int4 nt_length, Int2 frame, Int4Ptr length, CharPtr genetic_code)
6592 {
6593 	Uint1 codon[CODON_LENGTH];
6594 	Int4 index, index_prot;
6595 	SeqMapTablePtr smtp;
6596 	Uint1 residue, new_residue;
6597 	Uint1Ptr prot_seq;
6598 
6599 	smtp = SeqMapTableFind(Seq_code_ncbistdaa, Seq_code_ncbieaa);
6600 
6601 	/* Allocate two extra spaces for NULLB's at beginning and end of seq. */
6602 	prot_seq = (Uint1Ptr) MemNew((2+(nt_length+2)/CODON_LENGTH)*sizeof(Uint1));
6603 
6604 	/* The first character in the protein is the NULLB sentinel. */
6605 	prot_seq[0] = NULLB;
6606 	index_prot = 1;
6607 	for (index=ABS(frame)-1; index<nt_length-2; index += CODON_LENGTH)
6608 	{
6609 		codon[0] = query_seq[index];
6610 		codon[1] = query_seq[index+1];
6611 		codon[2] = query_seq[index+2];
6612 		residue = AAForCodon(codon, genetic_code);
6613 		new_residue = SeqMapTableConvert(smtp, residue);
6614 		if (IS_residue(new_residue))
6615 		{
6616 			prot_seq[index_prot] = new_residue;
6617 		}
6618 		index_prot++;
6619 	}
6620 	prot_seq[index_prot] = NULLB;
6621 	*length = index_prot-1;
6622 
6623 	return prot_seq;
6624 }
6625 
6626 
6627 /*************************************************************************
6628 *
6629 *	MaskTheResidues masks up to max_length residues in buffer.
6630 *	The residue to be used for masking (generally 'N' for nucleotides
6631 *	and 'X' for proteins) is mask_residue.  offset tells how far
6632 *	along the sequence the first residue in buffer is.  mask_slp
6633 *	specifies which parts of the sequence to mask.  'max_length is
6634 *	the total length of the sequence.
6635 *
6636 *************************************************************************/
6637 
6638 void
BlastMaskTheResidues(Uint1Ptr buffer,Int4 max_length,Uint1 mask_residue,SeqLocPtr mask_slp,Boolean reverse,Int4 offset)6639 BlastMaskTheResidues(Uint1Ptr buffer, Int4 max_length, Uint1 mask_residue, SeqLocPtr mask_slp, Boolean reverse, Int4 offset)
6640 
6641 {
6642 	SeqLocPtr slp=NULL;
6643         Int4 index, start, stop;
6644 
6645 	while (mask_slp)
6646 	{
6647 		slp=NULL;
6648         	while((slp = SeqLocFindNext(mask_slp, slp))!=NULL)
6649         	{
6650 			if (reverse)
6651 			{
6652 				start = max_length - 1 - SeqLocStop(slp);
6653 				stop = max_length - 1 - SeqLocStart(slp);
6654 			}
6655 			else
6656 			{
6657               			start = SeqLocStart(slp);
6658               			stop = SeqLocStop(slp);
6659 			}
6660 
6661 			start -= offset;
6662 			stop  -= offset;
6663 
6664 			for (index=start; index<=stop; index++)
6665 			{
6666 				buffer[index] = mask_residue;
6667 			}
6668         	}
6669 		mask_slp = mask_slp->next;
6670 	}
6671 
6672 }
6673 
6674 /*
6675 	COnverts a protein (translated) SeqLocPtr from the protein
6676 	coordinates to the nucl. coordinates.
6677 
6678 	Only works on a SeqLocPtr of type SeqIntPtr right now.
6679 */
6680 
6681 Boolean
BlastConvertProteinSeqLoc(SeqLocPtr slp,Int2 frame,Int4 full_length)6682 BlastConvertProteinSeqLoc(SeqLocPtr slp, Int2 frame, Int4 full_length)
6683 
6684 {
6685 	SeqIntPtr seq_int;
6686 	Int4 from, to;
6687 
6688 	if (slp == NULL)
6689 		return TRUE;
6690 
6691 	if (slp->choice == SEQLOC_PACKED_INT)
6692 		slp = slp->data.ptrvalue;
6693 
6694 	while (slp)
6695 	{
6696 		if (slp->choice != SEQLOC_INT)
6697 			return FALSE;
6698 
6699 		seq_int = slp->data.ptrvalue;
6700 		from = seq_int->from;
6701 		to = seq_int->to;
6702 
6703 		if (frame < 0)
6704 		{
6705 			seq_int->to = full_length - CODON_LENGTH*from + frame;
6706 			seq_int->from = full_length - CODON_LENGTH*to + frame + 1;
6707 			seq_int->strand = Seq_strand_minus;
6708 		}
6709 		else
6710 		{
6711 			seq_int->from = CODON_LENGTH*from + frame - 1;
6712 			seq_int->to = CODON_LENGTH*to + frame - 1;
6713 			seq_int->strand = Seq_strand_plus;
6714 		}
6715 		slp = slp->next;
6716 	}
6717 
6718 	return TRUE;
6719 }
6720 
6721 /*
6722   COnverts a DNA SeqLocPtr from the nucl. coordinates to
6723   the protein (translated) coordinates.
6724   Only works on a SeqLocPtr of type SEQLOC_INT or SEQLOC_PACKED_INT right now.
6725 */
6726 
6727 Boolean
BlastConvertDNASeqLoc(SeqLocPtr slp,Int2 frame,Int4 full_length)6728 BlastConvertDNASeqLoc(SeqLocPtr slp, Int2 frame, Int4 full_length)
6729 {
6730     SeqIntPtr seq_int;
6731     Int4 from, to;
6732 
6733     if (slp == NULL)
6734         return TRUE;
6735 
6736     if (slp->choice == SEQLOC_PACKED_INT)
6737         slp = slp->data.ptrvalue;
6738 
6739     while (slp) {
6740         if (slp->choice != SEQLOC_INT)
6741             return FALSE;
6742 
6743         seq_int = slp->data.ptrvalue;
6744         from = seq_int->from;
6745         to = seq_int->to;
6746 
6747         if (frame < 0) {
6748             seq_int->from = (full_length + frame - to)/CODON_LENGTH;
6749             seq_int->to = (full_length + frame - from)/CODON_LENGTH;
6750             seq_int->strand = Seq_strand_minus;
6751         } else {
6752             seq_int->from = (from - frame + 1)/CODON_LENGTH;
6753             seq_int->to = (to-frame + 1)/CODON_LENGTH;
6754             seq_int->strand = Seq_strand_plus;
6755         }
6756         slp = slp->next;
6757     }
6758 
6759     return TRUE;
6760 }
6761 
6762 SeqLocPtr
BioseqSegEx(BioseqPtr bsp_unfilter,CharPtr options)6763 BioseqSegEx(BioseqPtr bsp_unfilter, CharPtr options)
6764 
6765 {
6766 	BioseqPtr bsp_filter;
6767 	Boolean mask_state;
6768 	Char cmd_buf[2*PATH_MAX], temp_file[PATH_MAX];
6769 	CharPtr filter_dir;
6770 	Int4 index, mask_begin=0;
6771 	SeqEntryPtr sep;
6772 	SeqLocPtr slp_mask;
6773 	SeqPortPtr spp_filter, spp_unfilter;
6774 	Uint1 res_filter, res_unfilter;
6775 	FILE *fp;
6776 
6777 
6778 	if (bsp_unfilter == NULL)
6779 		return NULL;
6780 
6781 #ifdef OS_UNIX
6782 
6783 	TmpNam(temp_file);
6784 	fp = FileOpen(temp_file, "w");
6785 	if (BioseqToFasta(bsp_unfilter, fp, FALSE) == FALSE)
6786 	{
6787 		BioseqUnlock(bsp_unfilter);
6788 		FileClose(fp);
6789 		return NULL;
6790 	}
6791 	FileClose(fp);
6792 
6793 	filter_dir = getenv("BLASTFILTER");
6794 	if (filter_dir == NULL)
6795 		filter_dir = BLASTFILTER_DIR;
6796 
6797 	if (options != NULL)
6798 		sprintf(cmd_buf, "%s%s%s%s %s%s", filter_dir, DIRDELIMSTR, "seg ", temp_file, options, " -x");
6799 	else
6800 		sprintf(cmd_buf, "%s%s%s%s%s", filter_dir, DIRDELIMSTR, "seg ", temp_file, " -x");
6801 
6802 	fp = popen(cmd_buf, "r");
6803 	if (fp == NULL)
6804 	{
6805 		ErrPostEx(SEV_WARNING, 0, 0, "Call to seg failed.");
6806 		return NULL;
6807 	}
6808 
6809 	sep = FastaToSeqEntry(fp, FALSE);
6810 	FileClose(fp);
6811 	if (sep == NULL)
6812 	{
6813 		ErrPostEx(SEV_WARNING, 0, 0, "Call to seg failed.");
6814 		return NULL;
6815 	}
6816 	bsp_filter = sep->data.ptrvalue;
6817 
6818 	spp_filter = SeqPortNew(bsp_filter, 0, -1, Seq_strand_plus, Seq_code_ncbistdaa);
6819 	spp_unfilter = SeqPortNew(bsp_unfilter, 0, -1, Seq_strand_plus, Seq_code_ncbistdaa);
6820 
6821 	mask_state = FALSE;
6822 	index = 0;
6823 	slp_mask = NULL;
6824 	while ((res_filter=SeqPortGetResidue(spp_filter)) != SEQPORT_EOF)
6825 	{
6826 		res_unfilter=SeqPortGetResidue(spp_unfilter);
6827 		if (res_filter != res_unfilter)
6828 		{
6829 			if (mask_state == FALSE)
6830 			{
6831 				mask_begin = index;
6832 				mask_state = TRUE;
6833 			}
6834 		}
6835 		else if (mask_state == TRUE)
6836 		{
6837 			ValNodeLink(&slp_mask, SeqLocIntNew(mask_begin, index-1, Seq_strand_plus, bsp_filter->id));
6838 			mask_state = FALSE;
6839 		}
6840 		index++;
6841 	}
6842 
6843 	/* If the last portion of the sequence was masked. */
6844 	if (mask_state == TRUE)
6845 	{
6846 		ValNodeLink(&slp_mask, SeqLocIntNew(mask_begin, index-1, Seq_strand_plus, bsp_filter->id));
6847 	}
6848 
6849 	sep = SeqEntryFree(sep);
6850 	SeqPortFree(spp_filter);
6851 	SeqPortFree(spp_unfilter);
6852 
6853 	pclose(fp);
6854 	FileRemove(temp_file);
6855 
6856 	return slp_mask;
6857 #else
6858 	return NULL;
6859 #endif
6860 }
6861 
6862 /*
6863 	Runs seg and obtains a SeqLocPtr from it.
6864 */
6865 static SeqLocPtr
SeqLocSegEx(SeqLocPtr slp,CharPtr instructions)6866 SeqLocSegEx(SeqLocPtr slp, CharPtr instructions)
6867 
6868 {
6869 	BioseqPtr bsp_unfilter;
6870 	SeqLocPtr slp_mask;
6871 	SeqIdPtr sip;
6872 
6873 
6874 	if (slp == NULL)
6875 		return NULL;
6876 
6877 	sip = SeqIdFindBest(SeqLocId(slp), SEQID_GI);
6878 	bsp_unfilter = BioseqLockById(sip);
6879 	slp_mask = BioseqSegEx(bsp_unfilter, instructions);
6880 
6881 	BioseqUnlock(bsp_unfilter);
6882 
6883 	return slp_mask;
6884 }
6885 
6886 SeqLocPtr
SeqLocSeg(SeqLocPtr slp)6887 SeqLocSeg(SeqLocPtr slp)
6888 
6889 {
6890 	return SeqLocSegEx(slp, NULL);
6891 }
6892 
6893 SeqLocPtr
MyBioseqSeg(BioseqPtr bsp_unfilter)6894 MyBioseqSeg(BioseqPtr bsp_unfilter)
6895 
6896 {
6897 	return BioseqSegEx(bsp_unfilter, NULL);
6898 }
6899 
6900 #define BLASTSEQLOC_BUFFER_SIZE 128
6901 
6902 Boolean
parse_blast_options(BLAST_OptionsBlkPtr options,CharPtr string_options,CharPtr PNTR error_message,CharPtr PNTR database,Int4Ptr descriptions,Int4Ptr alignments)6903 parse_blast_options(BLAST_OptionsBlkPtr options, CharPtr string_options,
6904                     CharPtr PNTR error_message, CharPtr PNTR database,
6905                     Int4Ptr descriptions, Int4Ptr alignments)
6906 {
6907     	CharPtr opt_str = "GErqeWdyXZPAIvbYzcFsSpfwtgn", *values;
6908 	Int4 index;
6909 
6910 	if (options == NULL)
6911 		return FALSE;
6912 
6913 	if(!BlastParseInputString(string_options, opt_str, &values, error_message))
6914 	{
6915 	    return FALSE;
6916 	}
6917 
6918 	/* -G  gap open cost */
6919 
6920 	index = BlastGetLetterIndex(opt_str, 'G');
6921 	if(values[index] != NULL) {
6922 	    options->gap_open = atoi(values[index]);
6923 	}
6924 
6925 	/* -E gap extend cost */
6926 
6927 	index = BlastGetLetterIndex(opt_str, 'E');
6928 	if(values[index] != NULL) {
6929 	    options->gap_extend = atoi(values[index]);
6930 	}
6931 
6932 	/* -q penalty for nucleotide mismatch. */
6933 
6934 	index = BlastGetLetterIndex(opt_str, 'q');
6935 	if(values[index] != NULL) {
6936 	    options->penalty = atoi(values[index]);
6937 	}
6938 
6939 	/* -r reward for nucleotide match. */
6940 
6941 	index = BlastGetLetterIndex(opt_str, 'r');
6942 	if(values[index] != NULL) {
6943 	    options->reward = atoi(values[index]);
6944 	}
6945 
6946 	/* -e expect value. */
6947 
6948 	index = BlastGetLetterIndex(opt_str, 'e');
6949 	if(values[index] != NULL) {
6950 	    options->expect_value = atof(values[index]);
6951 	}
6952 
6953 	/* -W wordsize. */
6954 
6955 	index = BlastGetLetterIndex(opt_str, 'W');
6956 	if(values[index] != NULL) {
6957 	    options->wordsize = atoi(values[index]);
6958 	}
6959 
6960 	/* -d database. */
6961         if (database) {
6962            index = BlastGetLetterIndex(opt_str, 'd');
6963            if(values[index] != NULL) {
6964               *database = values[index];
6965               values[index] = NULL;
6966            }
6967         }
6968 
6969 	/* -y  Dropoff (X) for blast extensions in bits (default if zero) */
6970 
6971 	index = BlastGetLetterIndex(opt_str, 'y');
6972 	if(values[index] != NULL) {
6973 	    options->dropoff_2nd_pass = atof(values[index]);
6974 	}
6975 
6976 	/* -X  X dropoff value for gapped alignment (in bits) */
6977 
6978 	index = BlastGetLetterIndex(opt_str, 'X');
6979 	if(values[index] != NULL) {
6980 	    options->gap_x_dropoff = atof(values[index]);
6981 	}
6982 
6983 	/* -Z  final X dropoff value for gapped alignment (in bits) */
6984 
6985 	index = BlastGetLetterIndex(opt_str, 'Z');
6986 	if(values[index] != NULL) {
6987 	    options->gap_x_dropoff_final = atof(values[index]);
6988 	}
6989 
6990 	/* -P multiple hits/two-pass. */
6991 
6992         index = BlastGetLetterIndex(opt_str, 'P');
6993         if(values[index] != NULL) {
6994            if (atoi(values[index]) == 0)
6995            {
6996                  options->two_pass_method  = FALSE;
6997                  options->multiple_hits_only  = TRUE;
6998            }
6999            else if (atoi(values[index]) == 1)
7000            {
7001                  options->two_pass_method  = FALSE;
7002                  options->multiple_hits_only  = FALSE;
7003            }
7004            else
7005 	   {
7006                  options->two_pass_method  = TRUE;
7007                  options->multiple_hits_only  = FALSE;
7008            }
7009         }
7010 
7011 	/* -A window size. */
7012 
7013 	index = BlastGetLetterIndex(opt_str, 'A');
7014 	if(values[index] != NULL) {
7015 	    options->window_size = atoi(values[index]);
7016 	}
7017 
7018         /* -I Hitlist size */
7019         index = BlastGetLetterIndex(opt_str, 'I');
7020         if (values[index] != NULL)
7021            options->hitlist_size = atoi(values[index]);
7022 
7023         /* -v Number of descriptions */
7024         if (descriptions) {
7025            *descriptions = -1;
7026            index = BlastGetLetterIndex(opt_str, 'v');
7027            if (values[index] != NULL) {
7028               *descriptions = atoi(values[index]);
7029               options->hitlist_size =
7030                  MAX(options->hitlist_size, *descriptions);
7031            }
7032         }
7033 
7034         /* -b Number of alignments */
7035         if (alignments) {
7036            *alignments = -1;
7037            index = BlastGetLetterIndex(opt_str, 'b');
7038            if (values[index] != NULL) {
7039               *alignments = atoi(values[index]);
7040               options->hitlist_size =
7041                  MAX(options->hitlist_size, *alignments);
7042            }
7043         }
7044 
7045         /* -Y Effective search space */
7046         index = BlastGetLetterIndex(opt_str, 'Y');
7047         if (values[index] != NULL)
7048            options->searchsp_eff = atof(values[index]);
7049 
7050         /* -z Effective database length */
7051         index = BlastGetLetterIndex(opt_str, 'z');
7052         if (values[index] != NULL) {
7053            const char *dummy=NULL;
7054            options->db_length =  StringToInt8(values[index], &dummy);
7055         }
7056 
7057         /* -c Constant in pseudocounts for multipass version */
7058         index = BlastGetLetterIndex(opt_str, 'c');
7059         if (values[index] != NULL)
7060            options->pseudoCountConst = atoi(values[index]);
7061 
7062         /* -F Filter string */
7063         index = BlastGetLetterIndex(opt_str, 'F');
7064         if (values[index] != NULL)
7065            options->filter_string = values[index];
7066 
7067         /* -s Score cut off for megablast */
7068         index = BlastGetLetterIndex(opt_str, 's');
7069         if (values[index] != NULL)
7070            options->cutoff_s2 = atoi(values[index]);
7071 
7072         /* -S Strand option */
7073         index = BlastGetLetterIndex(opt_str, 'S');
7074         if (values[index] != NULL)
7075            options->strand_option = (Uint1) atoi(values[index]);
7076 
7077         /* -p Percentage of identity cut-off */
7078         index = BlastGetLetterIndex(opt_str, 'p');
7079         if (values[index] != NULL)
7080            options->perc_identity = (FloatLo) atof(values[index]);
7081 
7082 	/* -f  threshold for hits */
7083 
7084 	index = BlastGetLetterIndex(opt_str, 'f');
7085 	if(values[index] != NULL) {
7086 	    options->threshold_second = atoi(values[index]);
7087 	}
7088 
7089 	/* -w  Frame shift penalty (OOF algorithm for blastx) */
7090 
7091 	index = BlastGetLetterIndex(opt_str, 'w');
7092 	if(values[index] != NULL) {
7093 	    options->shift_pen = atoi(values[index]);
7094 	    options->is_ooframe = TRUE;
7095 	}
7096 
7097 	/* -t  Discontiguous word template length for megablast;
7098                Longest intron length for sum statistics in tblastn */
7099 
7100 	index = BlastGetLetterIndex(opt_str, 't');
7101 	if(values[index] != NULL) {
7102            if (options->is_megablast_search)
7103               options->mb_template_length = atoi(values[index]);
7104            else
7105               options->longest_intron = atoi(values[index]);
7106 	}
7107 
7108 	/* -g  Scan every base of the database for megablast */
7109 
7110 	index = BlastGetLetterIndex(opt_str, 'g');
7111 	if(values[index] != NULL) {
7112 	    options->mb_one_base_step = (TO_UPPER(*values[index]) == 'T');
7113 	}
7114 
7115 	/* -n  Use dynamic programming algorithm in megablast for gapped
7116                extensions instead of greedy algorithm */
7117 
7118 	index = BlastGetLetterIndex(opt_str, 'n');
7119 	if(values[index] != NULL) {
7120 	    options->mb_use_dyn_prog = (TO_UPPER(*values[index]) == 'T');
7121 	}
7122 
7123 	values = MemFree(values);
7124 
7125 	return TRUE;
7126 }
7127 
7128 static Boolean
parse_dust_options(CharPtr ptr,Int4Ptr level,Int4Ptr window,Int4Ptr cutoff,Int4Ptr linker)7129 parse_dust_options(CharPtr ptr, Int4Ptr level, Int4Ptr window, Int4Ptr cutoff, Int4Ptr linker)
7130 
7131 {
7132 	Char buffer[BLASTSEQLOC_BUFFER_SIZE];
7133 	Int4 arg, index, index1, window_pri=-1, linker_pri=-1, level_pri=-1, cutoff_pri=-1;
7134 	long	tmplong;
7135 
7136 	arg = 0;
7137 	index1 = 0;
7138 	for (index=0; index<BLASTSEQLOC_BUFFER_SIZE; index++)
7139 	{
7140 		if (*ptr == ' ' || *ptr == NULLB)
7141 		{
7142 			buffer[index1] = NULLB;
7143 			index1 = 0;
7144 			switch(arg) {
7145 				case 0:
7146 					sscanf(buffer, "%ld", &tmplong);
7147 					level_pri = tmplong;
7148 					break;
7149 				case 1:
7150 					sscanf(buffer, "%ld", &tmplong);
7151 					window_pri = tmplong;
7152 					break;
7153 				case 2:
7154 					sscanf(buffer, "%ld", &tmplong);
7155 					cutoff_pri = tmplong;
7156 					break;
7157 				case 3:
7158 					sscanf(buffer, "%ld", &tmplong);
7159 					linker_pri = tmplong;
7160 					break;
7161 				default:
7162 					break;
7163 			}
7164 
7165 			arg++;
7166 			while (*ptr == ' ')
7167 				ptr++;
7168 
7169 			/* end of the buffer. */
7170 			if (*ptr == NULLB)
7171 				break;
7172 		}
7173 		else
7174 		{
7175 			buffer[index1] = *ptr; ptr++;
7176 			index1++;
7177 		}
7178 	}
7179 
7180 	*level = level_pri;
7181 	*window = window_pri;
7182 	*cutoff = cutoff_pri;
7183 	*linker = linker_pri;
7184 
7185 	return TRUE;
7186 }
7187 
7188 
7189 static Boolean
parse_seg_options(CharPtr ptr,Int4Ptr window,FloatHiPtr locut,FloatHiPtr hicut)7190 parse_seg_options(CharPtr ptr, Int4Ptr window, FloatHiPtr locut, FloatHiPtr hicut)
7191 
7192 {
7193 	Char buffer[BLASTSEQLOC_BUFFER_SIZE];
7194 	Int4 arg, index, index1;
7195 	long	tmplong;
7196 	FloatHi	tmpdouble;
7197 
7198 	arg = 0;
7199 	index1 = 0;
7200 	for (index=0; index<BLASTSEQLOC_BUFFER_SIZE; index++)
7201 	{
7202 		if (*ptr == ' ' || *ptr == NULLB)
7203 		{
7204 			buffer[index1] = NULLB;
7205 			index1 = 0;
7206 			switch(arg) {
7207 				case 0:
7208 					sscanf(buffer, "%ld", &tmplong);
7209 					*window = tmplong;
7210 					break;
7211 				case 1:
7212 					sscanf(buffer, "%le", &tmpdouble);
7213 					*locut = tmpdouble;
7214 					break;
7215 				case 2:
7216 					sscanf(buffer, "%le", &tmpdouble);
7217 					*hicut = tmpdouble;
7218 					break;
7219 				default:
7220 					break;
7221 			}
7222 
7223 			arg++;
7224 			while (*ptr == ' ')
7225 				ptr++;
7226 
7227 			/* end of the buffer. */
7228 			if (*ptr == NULLB)
7229 				break;
7230 		}
7231 		else
7232 		{
7233 			buffer[index1] = *ptr; ptr++;
7234 			index1++;
7235 		}
7236 	}
7237 
7238 	return TRUE;
7239 }
7240 
7241 static Boolean
parse_cc_options(CharPtr ptr,Int4Ptr window,FloatHiPtr cutoff,Int4Ptr linker)7242 parse_cc_options(CharPtr ptr, Int4Ptr window, FloatHiPtr cutoff, Int4Ptr linker)
7243 
7244 {
7245 	Char buffer[BLASTSEQLOC_BUFFER_SIZE];
7246 	Int4 arg, index, index1;
7247 	long	tmplong;
7248 	FloatHi	tmpdouble;
7249 
7250 	arg = 0;
7251 	index1 = 0;
7252 	for (index=0; index<BLASTSEQLOC_BUFFER_SIZE; index++)
7253 	{
7254 		if (*ptr == ' ' || *ptr == NULLB)
7255 		{
7256 			buffer[index1] = NULLB;
7257 			index1 = 0;
7258 			switch(arg) {
7259 				case 0:
7260 					sscanf(buffer, "%ld", &tmplong);
7261 					*window = tmplong;
7262 					break;
7263 				case 1:
7264 					sscanf(buffer, "%le", &tmpdouble);
7265 					*cutoff = tmpdouble;
7266 					break;
7267 				case 2:
7268 					sscanf(buffer, "%ld", &tmplong);
7269 					*linker = tmplong;
7270 					break;
7271 				default:
7272 					break;
7273 			}
7274 
7275 			arg++;
7276 			while (*ptr == ' ')
7277 				ptr++;
7278 
7279 			/* end of the buffer. */
7280 			if (*ptr == NULLB)
7281 				break;
7282 		}
7283 		else
7284 		{
7285 			buffer[index1] = *ptr; ptr++;
7286 			index1++;
7287 		}
7288 	}
7289 
7290 	return TRUE;
7291 }
7292 
7293 CharPtr
load_options_to_buffer(CharPtr instructions,CharPtr buffer)7294 load_options_to_buffer(CharPtr instructions, CharPtr buffer)
7295 {
7296 	Boolean not_started=TRUE;
7297 	CharPtr buffer_ptr, ptr;
7298 	Int4 index;
7299 
7300 	ptr = instructions;
7301 	buffer_ptr = buffer;
7302 	for (index=0; index<BLASTSEQLOC_BUFFER_SIZE && *ptr != NULLB; index++)
7303 	{
7304 		if (*ptr == ';')
7305 		{
7306 			ptr++;
7307 			break;
7308 		}
7309 		/* Remove blanks at the beginning. */
7310 		if (not_started && *ptr == ' ')
7311 		{
7312 			ptr++;
7313 		}
7314 		else
7315 		{
7316 			not_started = FALSE;
7317 			*buffer_ptr = *ptr;
7318 			buffer_ptr++; ptr++;
7319 		}
7320 	}
7321 
7322 	*buffer_ptr = NULLB;
7323 
7324 	if (not_started == FALSE)
7325 	{	/* Remove trailing blanks. */
7326 		buffer_ptr--;
7327 		while (*buffer_ptr == ' ' && buffer_ptr > buffer)
7328 		{
7329 			*buffer_ptr = NULLB;
7330 			buffer_ptr--;
7331 		}
7332 	}
7333 
7334 	return ptr;
7335 }
7336 
7337 #define CC_WINDOW 22
7338 #define CC_CUTOFF 40.0
7339 #define CC_LINKER 32
7340 
7341 /*
7342 	This function parses the 'instructions' string and then calls the appopriate
7343 	filtering functions.
7344 */
7345 SeqLocPtr
BlastBioseqFilter(BioseqPtr bsp,CharPtr instructions)7346 BlastBioseqFilter(BioseqPtr bsp, CharPtr instructions)
7347 
7348 {
7349 	return BlastBioseqFilterEx(bsp, instructions, NULL);
7350 }
7351 
7352 SeqLocPtr
BlastBioseqFilterEx(BioseqPtr bsp,CharPtr instructions,BoolPtr mask_at_hash)7353 BlastBioseqFilterEx(BioseqPtr bsp, CharPtr instructions, BoolPtr mask_at_hash)
7354 
7355 {
7356 	SeqLocPtr slp = NULL;
7357 	SeqLocPtr slp_mask;
7358 
7359 	ValNodeAddPointer(&slp, SEQLOC_WHOLE,
7360 		SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
7361 	slp_mask = BlastSeqLocFilterEx(slp, instructions, mask_at_hash);
7362 	slp = SeqLocFree(slp);
7363 	return slp_mask;
7364 }
7365 
7366 SeqLocPtr
BlastSeqLocFilter(SeqLocPtr slp,CharPtr instructions)7367 BlastSeqLocFilter(SeqLocPtr slp, CharPtr instructions)
7368 
7369 {
7370 	return BlastSeqLocFilterEx(slp, instructions, NULL);
7371 }
7372 
7373 SeqLocPtr
BlastSeqLocFilterEx(SeqLocPtr slp,CharPtr instructions,BoolPtr mask_at_hash)7374 BlastSeqLocFilterEx(SeqLocPtr slp, CharPtr instructions, BoolPtr mask_at_hash)
7375 
7376 {
7377 	BioseqPtr bsp;
7378 	BLAST_OptionsBlkPtr repeat_options, vs_options;
7379 	Boolean do_all=FALSE, do_seg=FALSE, do_coil_coil=FALSE, do_dust=FALSE, do_repeats=FALSE, do_vecscreen=FALSE;
7380 	Boolean myslp_allocated;
7381 	CharPtr buffer=NULL;
7382 	CharPtr ptr, repeat_database=NULL, vs_database=NULL, error_msg;
7383 	Int2 seqloc_num;
7384 	Int4 window_cc, linker_cc, window_dust, level_dust, minwin_dust, linker_dust;
7385 	SeqLocPtr cc_slp=NULL, dust_slp=NULL, seg_slp=NULL, seqloc_head=NULL, repeat_slp=NULL, vs_slp=NULL;
7386 	PccDatPtr pccp;
7387 	Nlm_FloatHiPtr scores;
7388 	Nlm_FloatHi cutoff_cc;
7389 	SegParamsPtr sparamsp=NULL;
7390 	SeqAlignPtr seqalign;
7391 	SeqIdPtr sip;
7392 	SeqLocPtr myslp, seqloc_var, seqloc_tmp;
7393 	ValNodePtr vnp=NULL, vnp_var;
7394 
7395 	cutoff_cc = CC_CUTOFF;
7396 
7397 	if (instructions == NULL || StringICmp(instructions, "F") == 0)
7398 		return NULL;
7399 
7400 	/* FALSE is the default right now. */
7401 	if (mask_at_hash)
7402 		*mask_at_hash = FALSE;
7403 
7404 	/* parameters for dust. */
7405 	/* -1 indicates defaults. */
7406 	level_dust = -1;
7407 	window_dust = -1;
7408 	minwin_dust = -1;
7409 	linker_dust = -1;
7410 	if (StringICmp(instructions, "T") == 0)
7411 	{ /* do_all actually means seg for proteins and dust for nt. */
7412 		do_all = TRUE;
7413 	}
7414 	else
7415 	{
7416 		buffer = MemNew(StringLen(instructions)*sizeof(Char));
7417 		ptr = instructions;
7418 		/* allow old-style filters when m cannot be followed by the ';' */
7419 		if (*ptr == 'm' && ptr[1] == ' ')
7420 		{
7421 			if (mask_at_hash)
7422 				*mask_at_hash = TRUE;
7423 			ptr += 2;
7424 		}
7425 		while (*ptr != NULLB)
7426 		{
7427 			if (*ptr == 'S')
7428 			{
7429 				sparamsp = SegParamsNewAa();
7430 				sparamsp->overlaps = TRUE;	/* merge overlapping segments. */
7431 				ptr = load_options_to_buffer(ptr+1, buffer);
7432 				if (buffer[0] != NULLB)
7433 				{
7434 					parse_seg_options(buffer, &sparamsp->window, &sparamsp->locut, &sparamsp->hicut);
7435 				}
7436 				do_seg = TRUE;
7437 			}
7438 			else if (*ptr == 'C')
7439 			{
7440 				ptr = load_options_to_buffer(ptr+1, buffer);
7441 				window_cc = CC_WINDOW;
7442 				cutoff_cc = CC_CUTOFF;
7443 				linker_cc = CC_LINKER;
7444 				if (buffer[0] != NULLB)
7445 					parse_cc_options(buffer, &window_cc, &cutoff_cc, &linker_cc);
7446 				do_coil_coil = TRUE;
7447 			}
7448 			else if (*ptr == 'D')
7449 			{
7450 				ptr = load_options_to_buffer(ptr+1, buffer);
7451 				if (buffer[0] != NULLB)
7452 					parse_dust_options(buffer, &level_dust, &window_dust, &minwin_dust, &linker_dust);
7453 				do_dust = TRUE;
7454 			}
7455 			else if (*ptr == 'R')
7456 			{
7457 				repeat_options = BLASTOptionNew("blastn", TRUE);
7458 				repeat_options->expect_value = 0.1;
7459 				repeat_options->penalty = -1;
7460 				repeat_options->wordsize = 11;
7461 				repeat_options->gap_x_dropoff_final = 90;
7462 				repeat_options->dropoff_2nd_pass = 40;
7463 				repeat_options->gap_open = 2;
7464 				repeat_options->gap_extend = 1;
7465 				ptr = load_options_to_buffer(ptr+1, buffer);
7466 				if (buffer[0] != NULLB)
7467                                    parse_blast_options(repeat_options,
7468                                       buffer, &error_msg, &repeat_database,
7469                                       NULL, NULL);
7470 				if (repeat_database == NULL)
7471                                    repeat_database = StringSave("humlines.lib humsines.lib retrovir.lib");
7472 				do_repeats = TRUE;
7473 			}
7474 			else if (*ptr == 'V')
7475 			{
7476 				vs_options = VSBlastOptionNew();
7477 				ptr = load_options_to_buffer(ptr+1, buffer);
7478 				if (buffer[0] != NULLB)
7479                                    parse_blast_options(vs_options, buffer,
7480                                       &error_msg, &vs_database, NULL, NULL);
7481 				vs_options = BLASTOptionDelete(vs_options);
7482 				if (vs_database == NULL)
7483                                    vs_database = StringSave("UniVec_Core");
7484 				do_vecscreen = TRUE;
7485 			}
7486 			else if (*ptr == 'L')
7487 			{ /* do low-complexity filtering; dust for blastn, otherwise seg.*/
7488 				do_all = TRUE;
7489 				ptr++;
7490 			}
7491 			else if (*ptr == 'm')
7492 			{
7493 				if (mask_at_hash)
7494 					*mask_at_hash = TRUE;
7495 				ptr++;
7496 			}
7497 			else
7498 			{	/* Nothing applied. */
7499 				ptr++;
7500 			}
7501 		}
7502 		buffer = MemFree(buffer);
7503 	}
7504 
7505 	seqloc_num = 0;
7506 	seqloc_head = NULL;
7507 	sip = SeqLocId(slp);
7508 	bsp = BioseqLockById(SeqIdFindBest(sip, SEQID_GI));
7509 	if (ISA_aa(bsp->mol))
7510 	{
7511 		if (do_all || do_seg)
7512 		{
7513 			seg_slp = SeqlocSegAa(slp, sparamsp);
7514 			SegParamsFree(sparamsp);
7515 			sparamsp = NULL;
7516 			seqloc_num++;
7517 		}
7518 		if (do_coil_coil)
7519 		{
7520 			pccp = PccDatNew ();
7521 			pccp->window = window_cc;
7522 			ReadPccData (pccp);
7523 			/*scores = PredictCCBioseq(bsp, 0, bsp->length-1, pccp);*/
7524 			scores = PredictCCSeqLoc(slp, pccp);
7525 			cc_slp = FilterCC(scores, cutoff_cc, SeqLocLen(slp), linker_cc, SeqIdDup(sip), FALSE);
7526 			MemFree(scores);
7527 			PccDatFree (pccp);
7528 			seqloc_num++;
7529 		}
7530 	}
7531 	else
7532 	{
7533 		if (do_all || do_dust)
7534 		{
7535                         dust_slp = SeqLocDustEx(slp, level_dust, window_dust, linker_dust);
7536 			seqloc_num++;
7537 		}
7538 		if (do_repeats)
7539 		{
7540 		/* Either the SeqLocPtr is SEQLOC_WHOLE (both strands) or SEQLOC_INT (probably
7541 one strand).  In that case we make up a double-stranded one as we wish to look at both strands. */
7542 			myslp_allocated = FALSE;
7543 			if (slp->choice == SEQLOC_INT)
7544 			{
7545 				myslp = SeqLocIntNew(SeqLocStart(slp), SeqLocStop(slp), Seq_strand_both, SeqLocId(slp));
7546 				myslp_allocated = TRUE;
7547 			}
7548 			else
7549 			{
7550 				myslp = slp;
7551 			}
7552 start_timer;
7553 			repeat_slp = BioseqHitRangeEngineByLoc(myslp, "blastn", repeat_database, repeat_options, NULL, NULL, NULL, NULL, NULL, 0);
7554 stop_timer("after repeat filtering");
7555 			repeat_options = BLASTOptionDelete(repeat_options);
7556 			repeat_database = MemFree(repeat_database);
7557 			if (myslp_allocated)
7558 				SeqLocFree(myslp);
7559 			seqloc_num++;
7560 		}
7561 		if (do_vecscreen)
7562 		{
7563 		/* Either the SeqLocPtr is SEQLOC_WHOLE (both strands) or SEQLOC_INT (probably
7564 one strand).  In that case we make up a double-stranded one as we wish to look at both strands. */
7565 			myslp_allocated = FALSE;
7566 			if (slp->choice == SEQLOC_INT)
7567 			{
7568 				myslp = SeqLocIntNew(SeqLocStart(slp), SeqLocStop(slp), Seq_strand_both, SeqLocId(slp));
7569 				myslp_allocated = TRUE;
7570 			}
7571 			else
7572 			{
7573 				myslp = slp;
7574 			}
7575 			VSScreenSequenceByLoc(myslp, NULL, vs_database, &seqalign, &vnp, NULL, NULL);
7576 			vnp_var = vnp;
7577 			while (vnp_var)
7578 			{
7579 				seqloc_tmp = vnp_var->data.ptrvalue;
7580 				if (vs_slp == NULL)
7581 				{
7582 					vs_slp = seqloc_tmp;
7583 				}
7584 				else
7585 				{
7586 					seqloc_var = vs_slp;
7587 					while (seqloc_var->next)
7588 						seqloc_var = seqloc_var->next;
7589 					seqloc_var->next = seqloc_tmp;
7590 				}
7591 				vnp_var->data.ptrvalue = NULL;
7592 				vnp_var = vnp_var->next;
7593 			}
7594 			vnp = ValNodeFree(vnp);
7595 			seqalign = SeqAlignSetFree(seqalign);
7596 			vs_database = MemFree(vs_database);
7597 			if (myslp_allocated)
7598 				SeqLocFree(myslp);
7599 			seqloc_num++;
7600 		}
7601 	}
7602 
7603 	if (seqloc_num == 0)
7604 	{ /* nothing. */
7605 		;
7606 	}
7607 	else if (seqloc_num == 1)
7608 	{
7609 		if (seg_slp)
7610 			seqloc_head = seg_slp;
7611 		if (cc_slp)
7612 			seqloc_head = cc_slp;
7613 		if (dust_slp)
7614 			seqloc_head = dust_slp;
7615 		if (repeat_slp)
7616 			seqloc_head = repeat_slp;
7617 		if (vs_slp)
7618 			seqloc_head = vs_slp;
7619 	}
7620 	else
7621 	{
7622 		if (seg_slp)
7623 			ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, seg_slp);
7624 		if (cc_slp)
7625 			ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, cc_slp);
7626 		if (dust_slp)
7627 			ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, dust_slp);
7628 		if (repeat_slp)
7629 			ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, repeat_slp);
7630 		if (vs_slp)
7631 			ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, vs_slp);
7632 	}
7633 
7634 	BioseqUnlock(bsp);
7635 	return seqloc_head;
7636 }
7637 
7638 /*
7639 	Program to run seg on a sequence.  Note that this program only
7640 	really works in UNIX systems.
7641 */
7642 Boolean LIBCALL
FilterWithSeg(Uint1Ptr sequence,Int4 length,Uint1 alphabet)7643 FilterWithSeg (Uint1Ptr sequence, Int4 length, Uint1 alphabet)
7644 
7645 {
7646 
7647 #ifdef OS_UNIX
7648 
7649 	BioseqPtr bsp;
7650 	Char cmd_buf[2*PATH_MAX], temp_file[PATH_MAX];
7651 	CharPtr filter_dir;
7652 	FILE PNTR fp;
7653 	Int4 byte_store_length;
7654 	Nlm_ByteStorePtr byte_store;
7655 	SeqEntryPtr sep;
7656 
7657 	if (sequence == NULL || length == 0)
7658 		return FALSE;
7659 
7660 	byte_store = Nlm_BSNew(length);
7661 
7662 	byte_store_length = Nlm_BSWrite(byte_store, (VoidPtr) sequence, length);
7663 	if (length != byte_store_length)
7664 	{
7665 		Nlm_BSDelete(byte_store, length);
7666 		return FALSE;
7667 	}
7668 
7669 	bsp = BioseqNew();
7670 	bsp->seq_data = (SeqDataPtr) byte_store;
7671 	bsp->length = length;
7672 	bsp->seq_data_type = alphabet;
7673 	bsp->mol = Seq_mol_aa;
7674 	bsp->repr = Seq_repr_raw;
7675 
7676 	TmpNam(temp_file);
7677 	fp = FileOpen(temp_file, "w");
7678 	if (BioseqToFasta(bsp, fp, FALSE) == FALSE)
7679 	{
7680 		bsp = BioseqFree(bsp);
7681 		return FALSE;
7682 	}
7683 	FileClose(fp);
7684 
7685 	bsp = BioseqFree(bsp);
7686 
7687 	filter_dir = getenv("BLASTFILTER");
7688 	if (filter_dir != NULL)
7689 		sprintf(cmd_buf, "%s%s%s%s%s", filter_dir, DIRDELIMSTR, "seg ", temp_file, " -x");
7690 	else
7691 		sprintf(cmd_buf, "%s%s%s%s%s", BLASTFILTER_DIR, DIRDELIMSTR, "seg ", temp_file, " -x");
7692 
7693 	fp = popen(cmd_buf, "r");
7694 	if (fp == NULL)
7695 	{
7696 		ErrPostEx(SEV_WARNING, 0, 0, "Call to seg failed.");
7697 		return FALSE;
7698 	}
7699 
7700 	sep = FastaToSeqEntry(fp, FALSE);
7701 	if (sep == NULL)
7702 	{
7703 		ErrPostEx(SEV_WARNING, 0, 0, "Call to seg failed.");
7704 		return FALSE;
7705 	}
7706 
7707 	pclose(fp);
7708 
7709 	bsp = sep->data.ptrvalue;
7710 	BioseqRawConvert(bsp, Seq_code_ncbistdaa);
7711 
7712 	BSSeek((ByteStorePtr) bsp->seq_data, 0, SEEK_SET);
7713 	Nlm_BSRead((ByteStorePtr) bsp->seq_data, (VoidPtr) sequence, length);
7714 
7715 	SeqEntryFree(sep);
7716 
7717 	FileRemove(temp_file);
7718 
7719 	return TRUE;
7720 #else
7721 	return FALSE;
7722 #endif
7723 }
7724 
7725 
/*
	Frees a BLAST_HSP: deletes its gap_info edit block (when hsp is not
	NULL) and then releases the structure itself with MemFree.
	Returns the value MemFree returns.
*/
BLAST_HSPPtr BLAST_HSPFree(BLAST_HSPPtr hsp)
{
	BLAST_HSPPtr freed;

	if (hsp != NULL)
	{
		hsp->gap_info = GapXEditBlockDelete(hsp->gap_info);
	}

	freed = (BLAST_HSPPtr) MemFree(hsp);
	return freed;
}
7733 
7734 /*
7735 	Frees memory used for HSP's on the ResultHitlist.
7736 	Should be called as the SeqAlignPtr for a hitlist
7737 	is produced to save memory.
7738 */
7739 
7740 void
BLASTResultFreeHsp(BLASTResultHitlistPtr result)7741 BLASTResultFreeHsp(BLASTResultHitlistPtr result)
7742 
7743 {
7744   	BLASTResultHspPtr hsp;
7745 	Int4 index;
7746 
7747 	if (result == NULL || result->hsp_array == NULL)
7748 		return;
7749 
7750         for(index=0; index < result->hspcnt; index++) {
7751           hsp = &result->hsp_array[index];
7752 	  if (hsp)
7753 	     hsp->gap_info = GapXEditBlockDelete(hsp->gap_info);
7754         }
7755 
7756 	if (result->hspcnt != 0)
7757 		result->hsp_array = MemFree(result->hsp_array);
7758 
7759 	result->hspcnt = 0;
7760 
7761 	return;
7762 }
7763 
7764 /*
7765 	Free's the hitlist without performing a check
7766 	on the integrity of the heap (used for culling).
7767 */
7768 BLASTResultHitlistPtr LIBCALL
BLASTResultHitlistFree(BLASTResultHitlistPtr result)7769 BLASTResultHitlistFree(BLASTResultHitlistPtr result)
7770 
7771 {
7772 	return BLASTResultHitlistFreeEx(NULL, result);
7773 
7774 }
7775 
7776 
7777 BLASTResultHitlistPtr LIBCALL
BLASTResultHitlistFreeEx(BlastSearchBlkPtr search,BLASTResultHitlistPtr result)7778 BLASTResultHitlistFreeEx(BlastSearchBlkPtr search, BLASTResultHitlistPtr result)
7779 
7780 {
7781         BLASTHeapPtr hp;
7782         Int4 index;
7783   	register Int4 subject_id;
7784 
7785 	if (result == NULL)
7786 		return NULL;
7787 
7788 
7789 	/*
7790 	Check the integrity of the heap used for culling.  Occassionally
7791 	HSP's that have been saved (in the heap before the start of
7792 	the HSP) are missed.
7793 	Only do this if the BlastSearchBlkPtr was provided.
7794 	*/
7795 	if (search && search->pbp->perform_culling == TRUE && result->num_ref > 0)
7796 	{
7797 		subject_id = result->subject_id;
7798 
7799 		/* result->num_ref can change in the loop. */
7800 		for (hp = search->result_struct->heap_ptr; hp && result->num_ref>0; hp = hp->next)
7801 		{
7802                 	index=0; /* Note that hp->num_in_heap can change in the loop */
7803 			while (index < hp->num_in_heap)
7804 			{
7805 				if (hp->heap[index]->point_back->subject_id == subject_id)
7806 				{
7807 					BlastDeleteHeap(hp, index);
7808 				}
7809 				else
7810 					index++;
7811 			}
7812 		}
7813 	}
7814 
7815 	/* In case it was not freed before. */
7816 	BLASTResultFreeHsp(result);
7817 
7818 	BLASTSubjectInfoDestruct(result->subject_info);
7819 
7820 	result = MemFree(result);
7821 
7822 	return result;
7823 }
7824 
7825 /*
7826 	Creates a new BLASTResultHitlist, with the an hsp-array of length hspcnt.  If the
7827 	allocation fails, then NULL is returned.
7828 */
7829 
7830 BLASTResultHitlistPtr LIBCALL
BLASTResultHitlistNew(Int4 hspcnt)7831 BLASTResultHitlistNew(Int4 hspcnt)
7832 
7833 {
7834 
7835 	BLASTResultHitlistPtr new;
7836 
7837 	new = (BLASTResultHitlistPtr) MemNew(sizeof(BLASTResultHitlist));
7838 	if (new == NULL)
7839 		return NULL;
7840 
7841 	new->hsp_array = (BLASTResultHspPtr) MemNew(hspcnt*sizeof(BLASTResultHsp));
7842 	if (new->hsp_array == NULL)
7843 	{
7844 		new = BLASTResultHitlistFree(new);
7845 		return NULL;
7846 	}
7847 	new->hspcnt = hspcnt;
7848 
7849 	return new;
7850 }
7851 
7852 
7853 static Boolean
CopyHSPToResultHsp(BLAST_KarlinBlkPtr kbp,BLAST_HSPPtr hsp,BLASTResultHspPtr result_hsp)7854 CopyHSPToResultHsp(BLAST_KarlinBlkPtr kbp, BLAST_HSPPtr hsp, BLASTResultHspPtr result_hsp)
7855 {
7856 	if (result_hsp == NULL || hsp == NULL)
7857 		return FALSE;
7858 
7859 	result_hsp->ordering_method = hsp->ordering_method;
7860 	result_hsp->number = hsp->num;
7861 	result_hsp->score = hsp->score;
7862 	result_hsp->bit_score = ((hsp->score*kbp->Lambda) - kbp->logK)/NCBIMATH_LN2;
7863 	result_hsp->e_value = hsp->evalue;
7864 	result_hsp->num_ident = hsp->num_ident;
7865 	result_hsp->query_offset = hsp->query.offset;
7866 	result_hsp->query_length = hsp->query.length;
7867 	result_hsp->query_frame = hsp->query.frame;
7868 	result_hsp->query_gapped_start = hsp->query.gapped_start;
7869 	result_hsp->subject_offset = hsp->subject.offset;
7870 	result_hsp->subject_length = hsp->subject.length;
7871 	result_hsp->subject_frame = hsp->subject.frame;
7872 	result_hsp->subject_gapped_start = hsp->subject.gapped_start;
7873 	result_hsp->context = hsp->context;
7874 	result_hsp->gap_info = hsp->gap_info;
7875 	/* Not set in the other type of HSP? */
7876 	result_hsp->hspset_cnt = 0;
7877 
7878 	return TRUE;
7879 }
7880 
7881 Boolean LIBCALL
CopyResultHspToHSP(BLASTResultHspPtr result_hsp,BLAST_HSPPtr hsp)7882 CopyResultHspToHSP(BLASTResultHspPtr result_hsp, BLAST_HSPPtr hsp)
7883 {
7884 	if (result_hsp == NULL || hsp == NULL)
7885 		return FALSE;
7886 
7887 	hsp->ordering_method = result_hsp->ordering_method;
7888 	hsp->num = result_hsp->number;
7889 	hsp->score = result_hsp->score;
7890 	hsp->evalue = result_hsp->e_value;
7891 	hsp->num_ident = result_hsp->num_ident;
7892 	hsp->query.offset = result_hsp->query_offset;
7893 	hsp->query.length = result_hsp->query_length;
7894 	hsp->query.end = result_hsp->query_offset + result_hsp->query_length;
7895 	hsp->query.frame = result_hsp->query_frame;
7896 	hsp->query.gapped_start = result_hsp->query_gapped_start;
7897 	hsp->subject.offset = result_hsp->subject_offset;
7898 	hsp->subject.length = result_hsp->subject_length;
7899 	hsp->subject.end = result_hsp->subject_offset + result_hsp->subject_length;
7900 	hsp->subject.frame = result_hsp->subject_frame;
7901 	hsp->subject.gapped_start = result_hsp->subject_gapped_start;
7902 	hsp->context = result_hsp->context;
7903 
7904 	return TRUE;
7905 }
7906 
7907 /* Same as FillInStdSegInfo, only taking BLAST_HSPPtr argument instead of
7908    BlastResultHspPtr */
7909 StdSegPtr
BLASTHspToStdSeg(BlastSearchBlkPtr search,Int4 subject_length,BLAST_HSPPtr hsp,SeqIdPtr sip,Boolean reverse,SeqIdPtr gi_list)7910 BLASTHspToStdSeg(BlastSearchBlkPtr search, Int4 subject_length, BLAST_HSPPtr hsp, SeqIdPtr sip, Boolean reverse, SeqIdPtr gi_list)
7911 {
7912    StdSegPtr ssp = NULL;
7913    BLASTResultHspPtr result_hsp =
7914       (BLASTResultHspPtr) Malloc(sizeof(BLASTResultHsp));
7915 
7916    CopyHSPToResultHsp(search->sbp->kbp[search->first_context],
7917                       hsp, result_hsp);
7918    ssp = FillInStdSegInfo(search, search->subject_id, subject_length, &ssp,
7919                              result_hsp, sip, reverse, gi_list);
7920    MemFree(result_hsp);
7921    return ssp;
7922 }
7923 
7924 /*
7925 	Sort the HSP's by score.
7926 */
7927 
7928 int LIBCALLBACK
score_compare_hsps(VoidPtr v1,VoidPtr v2)7929 score_compare_hsps(VoidPtr v1, VoidPtr v2)
7930 
7931 {
7932     BLAST_HSPPtr hsp1, hsp2;    /* the HSPs to be compared */
7933     int result = 0;             /* the result of the comparison */
7934 
7935     hsp1 = *((BLAST_HSPPtr PNTR) v1);
7936     hsp2 = *((BLAST_HSPPtr PNTR) v2);
7937 
7938     /* Null HSPs are "greater" than any non-null ones, so they go to the end
7939        of a sorted list. */
7940     if (!hsp1 && !hsp2)
7941         return 0;
7942     else if (!hsp1)
7943         return 1;
7944     else if (!hsp2)
7945         return -1;
7946 
7947     if (0 == (result = BLAST_CMP(hsp2->score,          hsp1->score)) &&
7948         0 == (result = BLAST_CMP(hsp1->subject.offset, hsp2->subject.offset)) &&
7949         0 == (result = BLAST_CMP(hsp2->subject.end,    hsp1->subject.end)) &&
7950         0 == (result = BLAST_CMP(hsp1->query  .offset, hsp2->query  .offset))) {
7951         /* if all other test can't distinguish the HSPs, then the final
7952            test is the result */
7953         result = BLAST_CMP(hsp2->query.end, hsp1->query.end);
7954     }
7955     return result;
7956 }
7957 
7958 /*
7959 	Function to look for the highest scoring window (of size HSP_MAX_WINDOW)
7960 	in an HSP and return the middle of this.  Used by the gapped-alignment
7961 	functions to start the gapped alignments.
7962 */
7963 
GetStartForGappedAlignment(BlastSearchBlkPtr search,BLAST_HSPPtr hsp,Uint1Ptr query,Uint1Ptr subject,Int4Ptr PNTR matrix)7964 Int4 GetStartForGappedAlignment (BlastSearchBlkPtr search, BLAST_HSPPtr hsp, Uint1Ptr query, Uint1Ptr subject, Int4Ptr PNTR matrix)
7965 {
7966     Int4 index1, max_offset, score, max_score, hsp_end;
7967     Uint1Ptr query_var, subject_var;
7968     Boolean positionBased = (search->positionBased && search->sbp->posMatrix);
7969 
7970     if (hsp->query.length <= HSP_MAX_WINDOW) {
7971         max_offset = hsp->query.offset + hsp->query.length/2;
7972         return max_offset;
7973     }
7974 
7975     hsp_end = hsp->query.offset + HSP_MAX_WINDOW;
7976     query_var = query + hsp->query.offset;
7977     subject_var = subject + hsp->subject.offset;
7978     score=0;
7979     if (!positionBased) {
7980        for (index1=hsp->query.offset; index1<hsp_end; index1++) {
7981           score += matrix[*query_var][*subject_var];
7982           query_var++; subject_var++;
7983        }
7984     } else {
7985        for (index1=hsp->query.offset; index1<hsp_end; index1++) {
7986           score += search->sbp->posMatrix[index1][*subject_var];
7987           query_var++; subject_var++;
7988        }
7989     }
7990     max_score = score;
7991     max_offset = hsp_end - 1;
7992     hsp_end = hsp->query.end -
7993         MAX(0, hsp->query.length - hsp->subject.length);
7994     for (index1=hsp->query.offset + HSP_MAX_WINDOW; index1<hsp_end; index1++) {
7995         if (!positionBased) {
7996             score -= matrix[*(query_var-HSP_MAX_WINDOW)][*(subject_var-HSP_MAX_WINDOW)];
7997             score += matrix[*query_var][*subject_var];
7998         } else {
7999             score -= search->sbp->posMatrix[index1-HSP_MAX_WINDOW][*(subject_var-HSP_MAX_WINDOW)];
8000             score += search->sbp->posMatrix[index1][*subject_var];
8001         }
8002         if (score > max_score) {
8003             max_score = score;
8004             max_offset = index1;
8005         }
8006         query_var++; subject_var++;
8007     }
8008     if (max_score > 0)
8009        max_offset -= HSP_MAX_WINDOW/2;
8010     else
8011        max_offset = hsp->query.offset;
8012 
8013     return max_offset;
8014 }
8015 
8016 /*
8017    Check whether the starting point for gapped alignment lies in
8018    region that has positive score.  This routine is called after a
8019    preliminary gapped alignment has been computed, but before the
8020    traceback is computed.  The score of the region containing the
8021    starting point may have changed due to the introduction of
8022    ambiguity characters, further filtering of the sequences or the
8023    application of composition based statistics.
8024 
8025    Usually, we check an ungapped alignment of length 11 about the
8026    starting point: 5 characters to the left and 5 to the right.
8027    However, the actual region checked is occassionally shorter because
8028    we don't check characters before the start, or after the end, of
8029    the preliminarily aligned regions in the query or subject.
8030 */
8031 Boolean
CheckStartForGappedAlignment(BlastSearchBlkPtr search,BLAST_HSPPtr hsp,Uint1Ptr query,Uint1Ptr subject,Int4Ptr PNTR matrix)8032 CheckStartForGappedAlignment (BlastSearchBlkPtr search, BLAST_HSPPtr hsp,
8033                               Uint1Ptr query, Uint1Ptr subject,
8034                               Int4Ptr PNTR matrix)
8035 {
8036     Int4 left, right;       /* Number of aligned characters to the
8037                                left and right of the starting point */
8038     Int4 score;             /* Score of the word alignment */
8039     Uint1Ptr subject_var;   /* Current character in the subject sequence */
8040     Uint1Ptr subject_right; /* last character to be considered in the subject
8041                                sequence */
8042     Boolean positionBased =
8043         (search->positionBased && search->sbp->posMatrix);
8044 
8045     /* Compute the number of characters to the left of the start
8046        to include in the word */
8047     left = -HSP_MAX_WINDOW/2;
8048     if (left < hsp->query.offset - hsp->query.gapped_start) {
8049         left = hsp->query.offset - hsp->query.gapped_start;
8050     }
8051     if (left < hsp->subject.offset - hsp->subject.gapped_start) {
8052         left = hsp->subject.offset - hsp->subject.gapped_start;
8053     }
8054 
8055     /* Compute the number of characters to right to include in the word,
8056        including the starting point itself. */
8057     right = HSP_MAX_WINDOW/2 + 1;
8058     if (right > hsp->query.end - hsp->query.gapped_start) {
8059         right = hsp->query.end - hsp->query.gapped_start;
8060     }
8061     if (right > hsp->subject.end - hsp->subject.gapped_start) {
8062         right = hsp->subject.end - hsp->subject.gapped_start;
8063     }
8064 
8065     /* Calculate the score of the word */
8066     score = 0;
8067     subject_var   = subject + hsp->subject.gapped_start + left;
8068     subject_right = subject + hsp->subject.gapped_start + right;
8069     if ( !positionBased ) {
8070         Uint1Ptr query_var;     /* Current character in the query */
8071         query_var = query + hsp->query.gapped_start + left;
8072         for ( ; subject_var < subject_right; subject_var++, query_var++) {
8073            score += matrix[*query_var][*subject_var];
8074         }
8075     } else {
8076         Int4 query_index;       /* Current position in the query */
8077         query_index = hsp->query.gapped_start + left;
8078         for ( ;  subject_var < subject_right;  subject_var++, query_index++) {
8079             score += search->sbp->posMatrix[query_index][*subject_var];
8080         }
8081     }
8082     if (score <= 0) {
8083         return FALSE;
8084     } else {
8085         return TRUE;
8086     }
8087 }
8088 
8089 
8090 /*
8091 	Gets the ratio used to change an evalue calculated with the subject
8092 	sequence length to one with a db length.
8093 */
8094 
8095 Nlm_FloatHi LIBCALL
GetDbSubjRatio(BlastSearchBlkPtr search,Int4 subject_length)8096 GetDbSubjRatio(BlastSearchBlkPtr search, Int4 subject_length)
8097 {
8098 	Nlm_FloatHi db_subj_ratio;
8099 
8100         db_subj_ratio =
8101             ((Nlm_FloatHi) search->context_factor * search->dblen) /
8102             ((Nlm_FloatHi) subject_length);
8103         if (StringCmp(search->prog_name, "tblastn") == 0 ||
8104             StringCmp(search->prog_name, "tblastx") == 0 ||
8105             StringCmp(search->prog_name, "psitblastn") == 0)
8106 	{
8107 		db_subj_ratio *= 3;
8108 	}
8109 
8110 	return db_subj_ratio;
8111 }
8112 
8113 /* The following value should be divisible by 3, to make sure that frames stay
8114    the same when translations are restricted to partial sequence. */
8115 #define SUBJECT_ADJUSTMENT 2100
8116 SeqAlignPtr LIBCALL
BlastGetGapAlgnTbckWithReaddb(BlastSearchBlkPtr search,Int4 hit_number,Boolean ordinal_number)8117 BlastGetGapAlgnTbckWithReaddb (BlastSearchBlkPtr search, Int4 hit_number, Boolean ordinal_number)
8118 
8119 {
8120 	BLASTResultHitlistPtr   result_hitlist;
8121 	BioseqPtr subject_bsp;
8122 	Boolean subject_allocated = FALSE;
8123 	Int4 index1, subject_length, rev_subject_length;
8124 	Int4 subject_start, subject_end;
8125 	Int4 hsp_count;
8126 	BLASTResultHspPtr hsp_array;
8127 	SeqAlignPtr seqalign;
8128 	SeqPortPtr spp;
8129 	Uint1Ptr subject, rev_subject;
8130 
8131         result_hitlist = search->result_struct->results[hit_number];
8132 
8133         if (StringCmp(search->prog_name, "tblastn") == 0 ||
8134             StringCmp(search->prog_name, "psitblastn") == 0)
8135 	{
8136 		subject_bsp = readdb_get_bioseq(search->rdfp, result_hitlist->subject_id);
8137     		spp = SeqPortNew(subject_bsp, 0, -1, Seq_strand_plus, Seq_code_ncbi4na);
8138 		/* make one longer to "protect" ALIGN. */
8139 		subject = MemNew((1+subject_bsp->length)*sizeof(Uint1));
8140 		hsp_array = result_hitlist->hsp_array;
8141 		hsp_count = result_hitlist->hspcnt;
8142 		for (index1=0; index1<hsp_count; index1++)
8143 		{
8144 			if (hsp_array[index1].subject_frame > 0)
8145 			{ /* Get subsequence corresponding to this hsp. */
8146 				Int4 offset;
8147 
8148 				subject_start = 3*hsp_array[index1].subject_offset;
8149 				subject_end = subject_start + 3*hsp_array[index1].subject_length;
8150 
8151 				/* add SUBJECT_ADJUSTMENT bases to either end. */
8152 				subject_start = MAX(subject_start - SUBJECT_ADJUSTMENT, 0);
8153 				subject_end = MIN(subject_end + SUBJECT_ADJUSTMENT, subject_bsp->length);
8154 
8155 				SeqPortSeek(spp, subject_start, SEEK_SET);
8156 
8157 				for (offset=subject_start; offset<subject_end; offset++)
8158 					subject[offset] = SeqPortGetResidue(spp);
8159 
8160 				if (subject_start == 0 && subject_end == subject_bsp->length)
8161 					break;    /* entire sequence has been fetched. */
8162 			}
8163 		}
8164 		/* Gap character in last space. */
8165 		subject[subject_bsp->length] = NULLB;
8166 		subject_length = subject_bsp->length;
8167 		spp = SeqPortFree(spp);
8168 
8169     		spp = SeqPortNew(subject_bsp, 0, -1, Seq_strand_minus, Seq_code_ncbi4na);
8170 		/* make one longer to "protect" ALIGN. */
8171 		rev_subject = MemNew((1+subject_bsp->length)*sizeof(Uint1));
8172 		hsp_array = result_hitlist->hsp_array;
8173 		hsp_count = result_hitlist->hspcnt;
8174 		for (index1=0; index1<hsp_count; index1++)
8175 		{
8176 			if (hsp_array[index1].subject_frame < 0)
8177 			{ /* Get subsequence corresponding to this hsp. */
8178 				Int4 offset;
8179 
8180 				subject_start = 3*hsp_array[index1].subject_offset;
8181 				subject_end = subject_start + 3*hsp_array[index1].subject_length;
8182 
8183 				/* add SUBJECT_ADJUSTMENT bases to either end. */
8184 				subject_start = MAX(subject_start - SUBJECT_ADJUSTMENT, 0);
8185 				subject_end = MIN(subject_end + SUBJECT_ADJUSTMENT, subject_bsp->length);
8186 
8187 				SeqPortSeek(spp, subject_start, SEEK_SET);
8188 
8189 				for (offset=subject_start; offset<subject_end; offset++)
8190 					rev_subject[offset] = SeqPortGetResidue(spp);
8191 
8192 				if (subject_start == 0 && subject_end == subject_bsp->length)
8193 					break;    /* entire sequence has been fetched. */
8194 			}
8195 		}
8196 		/* Gap character in last space. */
8197 		rev_subject[subject_bsp->length] = NULLB;
8198 		rev_subject_length = subject_bsp->length;
8199 		spp = SeqPortFree(spp);
8200 		subject_bsp = BioseqFree(subject_bsp);
8201 		subject_allocated = TRUE;
8202 	}
8203 	else
8204 	{
8205 		subject_length = readdb_get_sequence(search->rdfp, result_hitlist->subject_id, (Uint1Ptr PNTR) &subject);
8206 		rev_subject = NULL;
8207 		rev_subject_length = 0;
8208 	}
8209 
8210 	seqalign = BlastGetGapAlgnTbck (search, hit_number,  FALSE, ordinal_number, subject, subject_length, rev_subject, rev_subject_length);
8211 
8212 	if (subject_allocated)
8213 	{
8214 		subject = MemFree(subject);
8215 		rev_subject = MemFree(rev_subject);
8216 	}
8217 
8218 	return seqalign;
8219 }
8220 
8221 int LIBCALLBACK
query_offset_compare_hsp(VoidPtr v1,VoidPtr v2)8222 query_offset_compare_hsp(VoidPtr v1, VoidPtr v2)
8223 
8224 {
8225 	BLAST_HSPPtr h1, h2;
8226 	BLAST_HSPPtr PNTR hp1, PNTR hp2;
8227 
8228 	hp1 = (BLAST_HSPPtr PNTR) v1;
8229 	hp2 = (BLAST_HSPPtr PNTR) v2;
8230 	h1 = *hp1;
8231 	h2 = *hp2;
8232 
8233     if (h1 == NULL) {
8234         return (h2 == NULL) ? 0 : 1;
8235     } else if (h2 == NULL) {
8236       return -1;
8237     }
8238 
8239 	if (h1->query.offset < h2->query.offset)
8240 		return -1;
8241 	if (h1->query.offset > h2->query.offset)
8242 		return 1;
8243 
8244 	if (h1->subject.offset < h2->subject.offset)
8245 		return -1;
8246 	if (h1->subject.offset > h2->subject.offset)
8247 		return 1;
8248 
8249 	return 0;
8250 }
8251 
8252 int LIBCALLBACK
query_end_compare_hsp(VoidPtr v1,VoidPtr v2)8253 query_end_compare_hsp(VoidPtr v1, VoidPtr v2)
8254 
8255 {
8256 	BLAST_HSPPtr h1, h2;
8257 	BLAST_HSPPtr PNTR hp1, PNTR hp2;
8258 
8259 	hp1 = (BLAST_HSPPtr PNTR) v1;
8260 	hp2 = (BLAST_HSPPtr PNTR) v2;
8261 	h1 = *hp1;
8262 	h2 = *hp2;
8263 
8264     if (h1 == NULL) {
8265         return (h2 == NULL) ? 0 : 1;
8266     } else if (h2 == NULL) {
8267       return -1;
8268     }
8269 
8270 	if (h1->query.end < h2->query.end)
8271 		return -1;
8272 	if (h1->query.end > h2->query.end)
8273 		return 1;
8274 
8275 	if (h1->subject.end < h2->subject.end)
8276 		return -1;
8277 	if (h1->subject.end > h2->subject.end)
8278 		return 1;
8279 
8280 	return 0;
8281 }
8282 /*
8283 	Check the gapped alignments for an overlap of two different alignments.
8284 	A sufficient overlap is when two alignments have the same start values
8285 	of have the same final values.
8286 
8287 	The number of valid alignments remaining is returned.
8288 */
8289 
8290 static Int4
CheckGappedAlignmentsForOverlap(BlastSearchBlkPtr search,BLAST_HSPPtr * hsp_array,Int4 hsp_count,Int2 frame)8291 CheckGappedAlignmentsForOverlap(BlastSearchBlkPtr search, BLAST_HSPPtr *hsp_array, Int4 hsp_count, Int2 frame)
8292 
8293 {
8294 	Int4 index1, index, increment;
8295 
8296 	if (search == NULL || hsp_array == NULL || hsp_count == 0)
8297 		return 0;
8298 
8299     	HeapSort(hsp_array, hsp_count, sizeof(BLAST_HSPPtr), query_offset_compare_hsp);
8300 	index=0;
8301 	increment=1;
8302 	while (index < hsp_count-increment)
8303 	{ /* Check if both HSP's start on or end on the same digonal. */
8304 		if (hsp_array[index+increment] == NULL)
8305 		{
8306 			increment++;
8307 			continue;
8308 		}
8309 
8310 		if (frame != 0 && hsp_array[index+increment]->subject.frame != frame)
8311                                 break;
8312 
8313 		if (hsp_array[index] && hsp_array[index]->query.offset == hsp_array[index+increment]->query.offset &&
8314 			  hsp_array[index]->subject.offset == hsp_array[index+increment]->subject.offset &&
8315 			    SIGN(hsp_array[index]->query.frame) == SIGN(hsp_array[index+increment]->query.frame))
8316 		{
8317 			if (hsp_array[index]->score > hsp_array[index+increment]->score)
8318 			{
8319 				hsp_array[index+increment] =
8320                                    BLAST_HSPFree(hsp_array[index+increment]);
8321 				increment++;
8322 			}
8323 			else
8324 			{
8325 				hsp_array[index] =
8326                                    BLAST_HSPFree(hsp_array[index]);
8327 				index++;
8328 				increment = 1;
8329 			}
8330 		}
8331 		else
8332 		{
8333 			index++;
8334 			increment = 1;
8335 		}
8336 	}
8337 
8338     	HeapSort(hsp_array, hsp_count, sizeof(BLAST_HSPPtr), query_end_compare_hsp);
8339 	index=0;
8340 	increment=1;
8341 	while (index < hsp_count-increment)
8342 	{ /* Check if both HSP's start on or end on the same digonal. */
8343 		if (hsp_array[index+increment] == NULL)
8344 		{
8345 			increment++;
8346 			continue;
8347 		}
8348 
8349 		if (frame != 0 && hsp_array[index+increment]->subject.frame != frame)
8350                                 break;
8351 
8352 		if (hsp_array[index] &&
8353 			hsp_array[index]->query.end == hsp_array[index+increment]->query.end &&
8354 			  hsp_array[index]->subject.end == hsp_array[index+increment]->subject.end &&
8355 			    SIGN(hsp_array[index]->query.frame) == SIGN(hsp_array[index+increment]->query.frame))
8356 		{
8357 			if (hsp_array[index]->score > hsp_array[index+increment]->score)
8358 			{
8359 				hsp_array[index+increment] =
8360                                    BLAST_HSPFree(hsp_array[index+increment]);
8361 				increment++;
8362 			}
8363 			else
8364 			{
8365 				hsp_array[index] =
8366                                    BLAST_HSPFree(hsp_array[index]);
8367 				index++;
8368 				increment = 1;
8369 			}
8370 		}
8371 		else
8372 		{
8373 			index++;
8374 			increment = 1;
8375 		}
8376 	}
8377 
8378     	HeapSort(hsp_array,hsp_count,sizeof(BLAST_HSPPtr), score_compare_hsps);
8379 
8380 	index1 = 0;
8381 	for (index=0; index<hsp_count; index++)
8382 	{
8383 		if (hsp_array[index] != NULL)
8384 			index1++;
8385 	}
8386 
8387 
8388 	return index1;
8389 
8390 }
8391 
8392 /*
8393 	Sort the HSP's by frame.
8394 */
8395 
8396 int LIBCALLBACK
frame_compare_hsp_m3(VoidPtr v1,VoidPtr v2)8397 frame_compare_hsp_m3(VoidPtr v1, VoidPtr v2)
8398 
8399 {
8400 	BLAST_HSPPtr h1, h2;
8401 	BLAST_HSPPtr PNTR hp1, PNTR hp2;
8402 
8403 	hp1 = (BLAST_HSPPtr PNTR) v1;
8404 	hp2 = (BLAST_HSPPtr PNTR) v2;
8405 	h1 = *hp1;
8406 	h2 = *hp2;
8407 
8408 	if (h1->subject.frame == -3 && h2->subject.frame != -3)
8409 		return -1;
8410 	if (h2->subject.frame == -3 && h1->subject.frame != -3)
8411 		return 1;
8412 
8413 	return 0;
8414 }
8415 int LIBCALLBACK
frame_compare_hsp_m2(VoidPtr v1,VoidPtr v2)8416 frame_compare_hsp_m2(VoidPtr v1, VoidPtr v2)
8417 
8418 {
8419 	BLAST_HSPPtr h1, h2;
8420 	BLAST_HSPPtr PNTR hp1, PNTR hp2;
8421 
8422 	hp1 = (BLAST_HSPPtr PNTR) v1;
8423 	hp2 = (BLAST_HSPPtr PNTR) v2;
8424 	h1 = *hp1;
8425 	h2 = *hp2;
8426 
8427 	if (h1->subject.frame == -2 && h2->subject.frame != -2)
8428 		return -1;
8429 	if (h2->subject.frame == -2 && h1->subject.frame != -2)
8430 		return 1;
8431 
8432 	return 0;
8433 }
8434 
8435 int LIBCALLBACK
frame_compare_hsp_m1(VoidPtr v1,VoidPtr v2)8436 frame_compare_hsp_m1(VoidPtr v1, VoidPtr v2)
8437 
8438 {
8439 	BLAST_HSPPtr h1, h2;
8440 	BLAST_HSPPtr PNTR hp1, PNTR hp2;
8441 
8442 	hp1 = (BLAST_HSPPtr PNTR) v1;
8443 	hp2 = (BLAST_HSPPtr PNTR) v2;
8444 	h1 = *hp1;
8445 	h2 = *hp2;
8446 
8447 	if (h1->subject.frame == -1 && h2->subject.frame != -1)
8448 		return -1;
8449 	if (h2->subject.frame == -1 && h1->subject.frame != -1)
8450 		return 1;
8451 
8452 	return 0;
8453 }
8454 int LIBCALLBACK
frame_compare_hsp_p1(VoidPtr v1,VoidPtr v2)8455 frame_compare_hsp_p1(VoidPtr v1, VoidPtr v2)
8456 
8457 {
8458 	BLAST_HSPPtr h1, h2;
8459 	BLAST_HSPPtr PNTR hp1, PNTR hp2;
8460 
8461 	hp1 = (BLAST_HSPPtr PNTR) v1;
8462 	hp2 = (BLAST_HSPPtr PNTR) v2;
8463 	h1 = *hp1;
8464 	h2 = *hp2;
8465 
8466 	if (h1->subject.frame == 1 && h2->subject.frame != 1)
8467 		return -1;
8468 	if (h2->subject.frame == 1 && h1->subject.frame != 1)
8469 		return 1;
8470 
8471 	return 0;
8472 }
8473 int LIBCALLBACK
frame_compare_hsp_p2(VoidPtr v1,VoidPtr v2)8474 frame_compare_hsp_p2(VoidPtr v1, VoidPtr v2)
8475 
8476 {
8477 	BLAST_HSPPtr h1, h2;
8478 	BLAST_HSPPtr PNTR hp1, PNTR hp2;
8479 
8480 	hp1 = (BLAST_HSPPtr PNTR) v1;
8481 	hp2 = (BLAST_HSPPtr PNTR) v2;
8482 	h1 = *hp1;
8483 	h2 = *hp2;
8484 
8485 	if (h1->subject.frame == 2 && h2->subject.frame != 2)
8486 		return -1;
8487 	if (h2->subject.frame == 2 && h1->subject.frame != 2)
8488 		return 1;
8489 
8490 	return 0;
8491 }
8492 int LIBCALLBACK
frame_compare_hsp_p3(VoidPtr v1,VoidPtr v2)8493 frame_compare_hsp_p3(VoidPtr v1, VoidPtr v2)
8494 
8495 {
8496 	BLAST_HSPPtr h1, h2;
8497 	BLAST_HSPPtr PNTR hp1, PNTR hp2;
8498 
8499 	hp1 = (BLAST_HSPPtr PNTR) v1;
8500 	hp2 = (BLAST_HSPPtr PNTR) v2;
8501 	h1 = *hp1;
8502 	h2 = *hp2;
8503 
8504 	if (h1->subject.frame == 3 && h2->subject.frame != 3)
8505 		return -1;
8506 	if (h2->subject.frame == 3 && h1->subject.frame != 3)
8507 		return 1;
8508 
8509 	return 0;
8510 }
8511 /*
8512 	Engine to get the gapped scores from an array of HSP's.
8513 */
8514 static BLAST_HSPPtr PNTR
BlastGappedScoreInternal(BlastSearchBlkPtr search,Uint1Ptr subject,Int4 subject_length,GapAlignBlkPtr gap_align,BLAST_HSPPtr * hsp_array,Int4Ptr hspcnt,Int4Ptr hspcnt_max,Int4 hspmax,Int2 frame)8515 BlastGappedScoreInternal(BlastSearchBlkPtr search, Uint1Ptr subject, Int4 subject_length, GapAlignBlkPtr gap_align, BLAST_HSPPtr *hsp_array, Int4Ptr hspcnt, Int4Ptr hspcnt_max, Int4 hspmax, Int2 frame)
8516 
8517 {
8518 	BLAST_HSPPtr hsp, hsp1=NULL;
8519 	BLAST_HSPPtr PNTR hsp_array_new;
8520 	BLAST_HSP_helperPtr helper;
8521 	Boolean hsp_start_is_contained, hsp_end_is_contained;
8522 	Int4 hsp_cnt=0, index, index1;
8523 	Int4 max_offset = 0, next_offset;
8524 	Int4 query_num; /* AM: Added to support query concatenation */
8525 
8526 	/* helper contains most frequently used information to speed up access. */
8527 	helper = Malloc((*hspcnt)*sizeof(BLAST_HSP_helper));
8528 	for (index=0; index<(*hspcnt); index++)
8529 	{
8530 		hsp_start_is_contained = FALSE;
8531 		hsp_end_is_contained = FALSE;
8532 		hsp = hsp_array[index];
8533 	/* This prefetches this value for the test below. */
8534 		next_offset = hsp->query.offset;
8535 
8536 		if (frame != 0 && hsp->subject.frame != frame)
8537 			break;
8538 
8539 		for (index1=0; index1<index; index1++)
8540 		{
8541 			hsp_start_is_contained = FALSE;
8542 			hsp_end_is_contained = FALSE;
8543 
8544 			hsp1 = hsp_array[index1];
8545 			if (hsp1 == NULL)
8546 				continue;
8547 
8548 			/* Check with the helper array whether further
8549 				tests are warranted.  Having only two ints
8550 				in the helper array speeds up access. */
8551 			if (helper[index1].qoffset <= next_offset &&
8552 				helper[index1].qend >= next_offset)
8553 			{
8554 			   if (CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.offset, hsp1->subject.offset, hsp1->subject.end, hsp->subject.offset) == TRUE)
8555 
8556 			   {	/* Check that it's on diff. strands. */
8557 				if (SIGN(hsp1->query.frame) == SIGN(hsp->query.frame) &&
8558 					SIGN(hsp1->subject.frame) == SIGN(hsp->subject.frame))
8559 					hsp_start_is_contained = TRUE;
8560 			   }
8561 			   if (hsp_start_is_contained && CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.end, hsp1->subject.offset, hsp1->subject.end, hsp->subject.end) == TRUE)
8562 
8563 			   {	/* Check that it's on diff. strands. */
8564 				if (SIGN(hsp1->query.frame) == SIGN(hsp->query.frame) &&
8565 					SIGN(hsp1->subject.frame) == SIGN(hsp->subject.frame))
8566 					hsp_end_is_contained = TRUE;
8567 				if (hsp_start_is_contained && hsp_end_is_contained && hsp->score <= hsp1->score)
8568 				{
8569 					break;
8570 				}
8571 			   }
8572 			}
8573 		}
8574 
8575 		if (hsp_start_is_contained == FALSE ||
8576                     hsp_end_is_contained   == FALSE ||
8577                     (hsp1 == NULL) || (hsp->score > hsp1->score))
8578 		{
8579 			gap_align->include_query = 0;
8580 
8581                         if(!search->pbp->is_ooframe) {
8582                             max_offset = GetStartForGappedAlignment(search, hsp, search->context[hsp->context].query->sequence, subject, search->sbp->matrix);
8583                         }
8584 
8585 #ifdef BLAST_COLLECT_STATS
8586 			search->real_gap_number_of_hsps++;
8587 #endif
8588 			Nlm_MemSet((VoidPtr) &(hsp_array[index]->hsp_link), 0, sizeof(BLAST_HSP_LINK));
8589 			hsp_array[index]->linked_set = FALSE;
8590 			hsp_array[index]->start_of_chain = FALSE;
8591 			hsp_array[index]->num = 0;
8592 			hsp_array[index]->xsum = 0.0;
8593 
8594                         if(search->pbp->is_ooframe) {
8595                             gap_align->is_ooframe = TRUE;
8596                             gap_align->query = subject;
8597                             if(hsp->query.frame > 0) {
8598                                 gap_align->subject = search->query_dnap[0]->sequence;
8599                                 gap_align->subject_length = search->query_dnap[0]->length;
8600                             } else {
8601                                 gap_align->subject = search->query_dnap[1]->sequence;
8602                                 gap_align->subject_length = search->query_dnap[1]->length;
8603                             }
8604 
8605                             gap_align->query_length = subject_length;
8606 
8607                             gap_align->q_start = hsp->subject.offset;
8608                             gap_align->s_start = hsp->query.offset;
8609 
8610                             hsp->query.gapped_start = gap_align->s_start;
8611                             hsp->subject.gapped_start = gap_align->q_start;
8612 
8613                         } else {
8614                             gap_align->query = search->context[hsp->context].query->sequence;
8615                             gap_align->query_length = search->context[hsp->context].query->length;
8616                             gap_align->q_start = max_offset;
8617                             gap_align->s_start =
8618                                (hsp->subject.offset - hsp->query.offset) + max_offset;
8619                             hsp->query.gapped_start = gap_align->q_start;
8620                             hsp->subject.gapped_start = gap_align->s_start;
8621 
8622                                gap_align->subject = subject;
8623                                gap_align->subject_length = subject_length;
8624                         }
8625 
8626                         /* For out-of frame gapping - query is protein
8627                            and subject is DNA translated into 3 frames */
8628 
8629 			PerformGappedAlignment(gap_align);
8630 
8631                         if(search->pbp->is_ooframe) {
8632                             hsp->query.offset = gap_align->subject_start;
8633                             hsp->subject.offset = gap_align->query_start;
8634                             /* The end is one further for BLAST than for the gapped align. */
8635                             hsp->query.end = gap_align->subject_stop + 1;
8636                             hsp->subject.end = gap_align->query_stop + 1;
8637                         } else {
8638                             hsp->query.offset = gap_align->query_start;
8639                             hsp->query.end = gap_align->query_stop + 1;
8640                             hsp->subject.offset = gap_align->subject_start;
8641                             hsp->subject.end = gap_align->subject_stop + 1;
8642                             /* The end is one further for BLAST than for the gapped align. */
8643                         }
8644 
8645                         hsp->query.length = hsp->query.end - hsp->query.offset;
8646                         hsp->subject.length = hsp->subject.end - hsp->subject.offset;
8647 			hsp->score = gap_align->score;
8648             if( hsp->score >= search->pbp->cutoff_s1 ) {
8649                 /* AM: Changed to support query concatenation */
8650                 if( !search->mult_queries )
8651                     hsp->evalue =
8652                         BlastKarlinStoE_simple(hsp->score,
8653                                                search->sbp->
8654                                                kbp_gap[search->first_context],
8655                                                search->searchsp_eff);
8656                 else {
8657                     query_num = GetQueryNum( search->mult_queries,
8658                                              hsp->query.offset,
8659                                              hsp->query.end,
8660                                              hsp->query.frame );
8661                     hsp->evalue =
8662                         BlastKarlinStoE_simple(hsp->score,
8663                                                search->sbp->
8664                                                kbp_gap[search->first_context],
8665                                                search->mult_queries->
8666                                                SearchSpEff[query_num]);
8667                 }
8668 
8669                 hsp_cnt++;
8670                 /* Fill in the helper structure. */
8671                 helper[index].qoffset = hsp->query.offset;
8672                 helper[index].qend = hsp->query.end;
8673             } else {
8674                 /* Score of the gapped extension is below the required
8675                    cutoff, delete this hsp */
8676                 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
8677             }
8678         }
8679 		else
8680 		{ /* Contained within another HSP, delete. */
8681 			hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
8682 		}
8683 	}
8684 	helper = MemFree(helper);
8685 
8686 	hsp_cnt = CheckGappedAlignmentsForOverlap(search, hsp_array, *hspcnt, frame);
8687 
8688 	if (hsp_cnt < (*hspcnt))
8689 	{
8690 /* Save HSP's again, discarding those that have been NULLed out. */
8691 		hsp_array_new = MemNew(hspmax*sizeof(BLAST_HSPPtr));
8692 		index1 = 0;
8693 		for (index=0; index<(*hspcnt_max); index++)
8694 		{
8695 			if (hsp_array[index] != NULL)
8696 			{
8697 				hsp_array_new[index1] = hsp_array[index];
8698 				index1++;
8699 			}
8700 		}
8701 
8702 		hsp_array = MemFree(hsp_array);
8703 
8704 		*hspcnt = index1;
8705 		*hspcnt_max = index1;
8706 		hsp_array = hsp_array_new;
8707 	}
8708 	*hspcnt = hsp_cnt;
8709 
8710 	return hsp_array;
8711 }
8712 
8713 /*
8714 	Engine to get the gapped scores from an array of HSP's.
8715 */
8716 static Boolean
BlastNtGappedScoreInternal(BlastSearchBlkPtr search,Uint1Ptr subject,Int4 subject_length,GapAlignBlkPtr gap_align,BLAST_HSPPtr * hsp_array,Int4Ptr hspcnt,Int4Ptr hspcnt_max,Int4 hspmax)8717 BlastNtGappedScoreInternal(BlastSearchBlkPtr search, Uint1Ptr subject, Int4 subject_length, GapAlignBlkPtr gap_align, BLAST_HSPPtr *hsp_array, Int4Ptr hspcnt, Int4Ptr hspcnt_max, Int4 hspmax)
8718 
8719 {
8720 	BLAST_HSPPtr hsp, hsp1=NULL;
8721 	BLAST_HSP_helperPtr helper;
8722 	Boolean hsp_start_is_contained, hsp_end_is_contained;
8723 	Int4 hsp_cnt=0, index, index1, next_offset, query_length;
8724 	/* AM: Added to support query concatenation. */
8725 	Int4 query_num;
8726 
8727 	/* helper contains most frequently used information to speed up access. */
8728         helper = Malloc((*hspcnt)*sizeof(BLAST_HSP_helper));
8729 
8730 	for (index=0; index<(*hspcnt); index++)
8731 	{
8732 		hsp_start_is_contained = FALSE;
8733 		hsp_end_is_contained = FALSE;
8734 		hsp = hsp_array[index];
8735         /* This prefetches this value for the test below. */
8736                 next_offset = hsp->query.offset;
8737 
8738 		for (index1=0; index1<index; index1++)
8739 		{
8740 			hsp_start_is_contained = FALSE;
8741 			hsp_end_is_contained = FALSE;
8742 
8743 			hsp1 = hsp_array[index1];
8744 			if (hsp1 == NULL)
8745 				continue;
8746 
8747 
8748                         /* Check with the helper array whether further
8749                                 tests are warranted.  Having only two ints
8750                                 in the helper array speeds up access. */
8751                         if (helper[index1].qoffset <= next_offset &&
8752                                 helper[index1].qend >= next_offset)
8753                         {
8754 
8755 			if (CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.offset, hsp1->subject.offset, hsp1->subject.end, hsp->subject.offset) == TRUE)
8756 			{	/* Check that it's on diff. strands. */
8757 					hsp_start_is_contained = TRUE;
8758 			}
8759 			if (hsp_start_is_contained && CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.end, hsp1->subject.offset, hsp1->subject.end, hsp->subject.end) == TRUE)
8760 			{	/* Check that it's on diff. strands. */
8761 					hsp_end_is_contained = TRUE;
8762 			}
8763 			if (hsp_start_is_contained && hsp_end_is_contained && hsp->score <= hsp1->score)
8764 			{
8765 				break;
8766 			}
8767 			}
8768 		}
8769 
8770 		if (hsp_start_is_contained == FALSE ||
8771 			 hsp_end_is_contained == FALSE ||
8772 				hsp->score > hsp1->score)
8773 		{
8774 			gap_align->include_query = 0;
8775 #ifdef BLAST_COLLECT_STATS
8776 			search->real_gap_number_of_hsps++;
8777 #endif
8778 /*
8779 			Nlm_MemSet((VoidPtr) &(hsp_array[index]->hsp_link), 0, sizeof(BLAST_HSP_LINK));
8780 			hsp_array[index]->linked_set = FALSE;
8781 			hsp_array[index]->start_of_chain = FALSE;
8782 			hsp_array[index]->num = 0;
8783 			hsp_array[index]->sumscore = 0;
8784 */
8785 
8786 			gap_align->query = search->context[hsp->context].query->sequence;
8787 			gap_align->query_length = search->context[hsp->context].query->length;
8788 			gap_align->q_start = hsp->query.gapped_start;
8789 			gap_align->s_start = hsp->subject.gapped_start;
8790 
8791                         gap_align->subject = subject;
8792                         gap_align->subject_length = subject_length;
8793 
8794 			if (hsp->subject.gapped_start >= 0) {
8795                            if (!PerformNtGappedAlignment(gap_align))
8796                               return FALSE;
8797                         }
8798 
8799                         query_length =
8800                            search->query_context_offsets[search->first_context+1] - 1;
8801                         if (gap_align->query_start / query_length !=
8802                             (gap_align->query_stop - 1) / query_length) {
8803                            if (gap_align->q_start < query_length) {
8804                               gap_align->subject_stop -=
8805                                  (gap_align->query_stop - query_length + 1);
8806                               gap_align->query_stop = query_length - 1;
8807                            } else {
8808                               gap_align->subject_start +=
8809                                  (query_length + 1 - gap_align->query_start);
8810                               gap_align->query_start = query_length + 1;
8811                            }
8812                         }
8813 			hsp->query.offset = gap_align->query_start;
8814 			hsp->subject.offset = gap_align->subject_start;
8815 	/* The end is one further for BLAST than for the gapped align. */
8816 			hsp->query.end = gap_align->query_stop + 1;
8817 			hsp->subject.end = gap_align->subject_stop + 1;
8818 			hsp->query.length = hsp->query.end - hsp->query.offset;
8819 			hsp->subject.length = hsp->subject.end - hsp->subject.offset;
8820 			hsp->score = gap_align->score;
8821 /* TLM */
8822                         /* AM: Changed to support query concatenation. */
8823 			if( !search->mult_queries )
8824 			  hsp->evalue = BlastKarlinStoE_simple(hsp->score, search->sbp->kbp[search->first_context], search->searchsp_eff);
8825                         else
8826 			{
8827 			  query_num = GetQueryNum( search->mult_queries,
8828 			                           hsp->query.offset,
8829 						   hsp->query.end,
8830 						   hsp->query.frame );
8831 	       	          hsp->evalue = BlastKarlinStoE_simple( hsp->score,
8832 		                                                search->sbp->kbp[search->first_context],
8833 						                search->mult_queries->SearchSpEff[query_num] );
8834 			}
8835 
8836 			hsp_cnt++;
8837                         /* Fill in the helper structure. */
8838                         helper[index].qoffset = hsp->query.offset;
8839                         helper[index].qend = hsp->query.end;
8840 		}
8841 		else
8842 		{ /* Contained within another HSP, delete. */
8843 			hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
8844 		}
8845 	}
8846 	helper = MemFree(helper);
8847 
8848 /*
8849 	hsp_cnt = CheckGappedAlignmentsForOverlap(search, hsp_array, *hspcnt, 0);
8850 */
8851 
8852 	if (hsp_cnt < (*hspcnt))
8853 	{
8854 /* Save HSP's again, discarding those that have been NULLed out. */
8855 		index1 = 0;
8856 		for (index=0; index<(*hspcnt); index++)
8857 		{
8858 			if (hsp_array[index] != NULL)
8859 			{
8860 				hsp_array[index1] = hsp_array[index];
8861 				index1++;
8862 			}
8863 		}
8864 
8865 	}
8866 	*hspcnt = hsp_cnt;
8867 
8868 	return TRUE;
8869 }
8870 
8871 /*
8872 	Loads the HSP's into the BlastHitRangePtr.
8873 */
8874 static Boolean
BlastHitRangeLoad(BlastSearchBlkPtr search,BLAST_HSPPtr * hsp_array,Int4 hspcnt,BlastHitRangePtr bhrp)8875 BlastHitRangeLoad (BlastSearchBlkPtr search, BLAST_HSPPtr *hsp_array, Int4 hspcnt, BlastHitRangePtr bhrp)
8876 
8877 {
8878 	BlastDoubleInt4Ptr tmp;
8879 	BLAST_HSPPtr hsp;
8880 	Int4 index, query_length;
8881 
8882 	if (bhrp->current+hspcnt > bhrp->total)
8883 		return FALSE;
8884 
8885 	if (hspcnt <= 0)
8886 		return TRUE;
8887 
8888 	tmp = bhrp->range_list;
8889 
8890 	tmp += bhrp->current;
8891 
8892 	for (index=0; index<hspcnt; index++)
8893 	{
8894 		hsp = hsp_array[index];
8895 		query_length = search->context[hsp->context].query->length;
8896 		if (hsp->query.frame >= 0)
8897 		{
8898 			tmp->gi = hsp->query.offset;
8899 			tmp->ordinal_id = hsp->query.end - 1;
8900 		}
8901 		else
8902 		{
8903 			tmp->gi = query_length - hsp->query.end;
8904 			tmp->ordinal_id = query_length - hsp->query.offset - 1;
8905 		}
8906 		tmp++;
8907 	}
8908 
8909 	bhrp->current += hspcnt;
8910 
8911 	return TRUE;
8912 }
8913 
/*
    Masks "sequence" with the filter locations stored for "frame".

    The mask list is searched for the first node whose choice equals
    FrameToDefine(frame).  If that node carries a location, it is applied
    through BlastMaskTheResidues to sequence+1 (the first byte is skipped,
    presumably a sentinel - confirm against the caller), using 21 as the
    residue value passed to that function.

    Nothing happens if mask is NULL, if no node matches, or if the
    matching node has no location attached.  dna_length is not used.
*/
static void rpsFilterSequenceByMask(ValNodePtr mask, Uint1Ptr sequence, Int4 length, Int4 frame, Int4 dna_length)
{
    ValNodePtr node = mask;

    /* Advance to the first node describing this frame. */
    while (node != NULL && node->choice != FrameToDefine(frame))
        node = node->next;

    if (node == NULL)
        return;

    if (node->data.ptrvalue != NULL)
        BlastMaskTheResidues(sequence+1, length, 21, (SeqLocPtr) node->data.ptrvalue, FALSE, 0);
}
8937 
/*
    Removes HSP's that lie inside an HSP found earlier in the array.

    An HSP is freed, and its slot in hsp_array set to NULL, as soon as an
    earlier non-NULL HSP is found for which all of the following hold:
      - out-of-frame mode: the query frames have the same sign;
        otherwise: both HSP's have the same context;
      - the start point and the end point of the later HSP both satisfy
        CONTAINED_IN_HSP with respect to the earlier HSP;
      - query frames and subject frames have the same sign.
    Scores are not compared.  The array is not compacted.
*/
void BLASTCheckHSPInclusion(BLAST_HSPPtr *hsp_array, Int4 hspcnt,
                            Boolean is_ooframe)
{
    Int4 cur, prev;
    BLAST_HSPPtr inner, outer;

    for (cur = 0; cur < hspcnt; cur++) {

        inner = hsp_array[cur];
        if (inner == NULL)
            continue;

        for (prev = 0; prev < cur; prev++) {

            outer = hsp_array[prev];
            if (outer == NULL)
                continue;

            /* The two HSP's must be comparable at all. */
            if (is_ooframe) {
                if (SIGN(outer->query.frame) != SIGN(inner->query.frame))
                    continue;
            } else if (inner->context != outer->context) {
                continue;
            }

            /* Start point of the later HSP. */
            if (CONTAINED_IN_HSP(outer->query.offset, outer->query.end, inner->query.offset, outer->subject.offset, outer->subject.end, inner->subject.offset) != TRUE)
                continue;

            /* End point of the later HSP. */
            if (CONTAINED_IN_HSP(outer->query.offset, outer->query.end, inner->query.end, outer->subject.offset, outer->subject.end, inner->subject.end) != TRUE)
                continue;

            /* Both must be on the same strands. */
            if (SIGN(outer->query.frame) != SIGN(inner->query.frame) ||
                SIGN(outer->subject.frame) != SIGN(inner->subject.frame))
                continue;

            /* Every test passed: the later HSP is redundant. */
            hsp_array[cur] = BLAST_HSPFree(hsp_array[cur]);
            break;
        }
    }
    return;
}
8988 
8989 
8990 /*
8991 	Take a BLAST_HSPPtr (array of HSP's) and get a traceback for them.
8992 */
8993 
8994 Int4
RealBlastGetGappedAlignmentTraceback(BlastSearchBlkPtr search,Uint1Ptr subject,Int4 subject_length,Uint1Ptr rev_subject,Int4 rev_subject_length,SeqIdPtr subject_id,BLAST_HSPPtr * hsp_array,Int4 hspcnt,SeqAlignPtr * head,BlastHitRangePtr bhrp,Int4 min_score_to_keep,Boolean reverse,Int4 ordinal_id,Boolean do_traceback)8995 RealBlastGetGappedAlignmentTraceback(BlastSearchBlkPtr search, Uint1Ptr subject, Int4 subject_length, Uint1Ptr rev_subject, Int4 rev_subject_length, SeqIdPtr subject_id, BLAST_HSPPtr *hsp_array, Int4 hspcnt, SeqAlignPtr *head, BlastHitRangePtr bhrp, Int4 min_score_to_keep, Boolean reverse, Int4 ordinal_id, Boolean do_traceback)
8996 
8997 {
8998     BLAST_HSPPtr hsp, hsp1, hsp2;
8999     BLAST_ParameterBlkPtr pbp;
9000     BLASTResultHsp       	result_hsp;
9001     Boolean hsp_start_is_contained, hsp_end_is_contained, keep;
9002     Boolean do_not_do;
9003     GapAlignBlkPtr gap_align;
9004     Int4 new_hspcnt=0;
9005     Int4 index, index1, index2, query_length, max_offset;
9006     Int4Ptr translated_subject_length=NULL;
9007     Int4Ptr translated_subject_length_orig=NULL;
9008     SeqAlignPtr seqalign, seqalign_var, *seqalign_array;
9009     Uint1Ptr query, PNTR translated_subject=NULL, PNTR translated_subject_orig=NULL;
9010     ValNodePtr gi_list=NULL;
9011     BLAST_HitListPtr tmp_hitlist;
9012     BLAST_HitListPtr real_hitlist;
9013     SeqIdPtr query_id, new_subject_seqid = NULL, seqid_tmp;
9014     Int4 max_start = MAX_DBSEQ_LEN / 2, start_shift;
9015     Int4 align_length;
9016     Int4 query_num; /* AM: Added to support query concatenation. */
9017     Boolean partial_translation;
9018     Int4 translation_length;
9019 
9020     pbp = search->pbp;
9021     MemSet(&result_hsp, 0, sizeof(BLASTResultHsp));
9022 
9023     seqalign=NULL;
9024     if (do_traceback)
9025         seqalign_array = MemNew(hspcnt*sizeof(SeqAlignPtr));
9026 
9027     if (search->gap_align == NULL) {
9028         search->gap_align = GapAlignBlkNew(1, 1);
9029     }
9030 
9031     gap_align = search->gap_align;
9032 
9033     gi_list = BlastGetAllowedGis(search, ordinal_id, &new_subject_seqid);
9034 
9035 #if 1
9036     if (gi_list) {
9037         /* change subject's gi with this 'use_this_gi' gi */
9038         subject_id->data.intvalue = gi_list->data.intvalue;
9039     }
9040 #endif
9041 
9042     gap_align->is_ooframe = pbp->is_ooframe; /* For OOF: blastx and tblastn */
9043     gap_align->shift_pen = pbp->shift_pen;
9044 
9045     gap_align->discontinuous = pbp->discontinuous;
9046     gap_align->positionBased =
9047        (search->positionBased && search->sbp->posMatrix);
9048     gap_align->gap_open = pbp->gap_open;
9049     gap_align->gap_extend = pbp->gap_extend;
9050     gap_align->decline_align = pbp->decline_align;
9051     gap_align->x_parameter = pbp->gap_x_dropoff_final;
9052     gap_align->matrix = search->sbp->matrix;
9053     gap_align->posMatrix = search->sbp->posMatrix;
9054     partial_translation = (subject_length > SUBJECT_ADJUSTMENT);
9055 
9056     for (index=0; index<hspcnt; index++) {
9057         hsp_start_is_contained = FALSE;
9058         hsp_end_is_contained = FALSE;
9059         hsp = hsp_array[index];
9060 
9061         for (index1=0; index1<index; index1++) {
9062             hsp_start_is_contained = FALSE;
9063             hsp_end_is_contained = FALSE;
9064 
9065             hsp1 = hsp_array[index1];
9066             if (hsp1 == NULL)
9067                 continue;
9068 
9069             if(pbp->is_ooframe) {
9070                 if (SIGN(hsp1->query.frame) != SIGN(hsp->query.frame))
9071                     continue;
9072             } else {
9073                 if (hsp->context != hsp1->context)
9074                     continue;
9075             }
9076 
9077             if (CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.offset, hsp1->subject.offset, hsp1->subject.end, hsp->subject.offset) == TRUE) {
9078 		if (SIGN(hsp1->query.frame) == SIGN(hsp->query.frame) &&
9079                     SIGN(hsp1->subject.frame) == SIGN(hsp->subject.frame))
9080                     hsp_start_is_contained = TRUE;
9081             }
9082             if (CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.end, hsp1->subject.offset, hsp1->subject.end, hsp->subject.end) == TRUE) {
9083 		if (SIGN(hsp1->query.frame) == SIGN(hsp->query.frame) &&
9084                     SIGN(hsp1->subject.frame) == SIGN(hsp->subject.frame))
9085                     hsp_end_is_contained = TRUE;
9086             }
9087             if (hsp_start_is_contained && hsp_end_is_contained && hsp->score <= hsp1->score) {
9088                 break;
9089             }
9090         }
9091 
9092         do_not_do = FALSE;
9093         /* Check whether this part of query has already been covered. */
9094         /* Commented out by TLM as this seems buggy.
9095            if (bhrp) {
9096            total = bhrp->current;
9097            for (index1=0; index1<total; index1++) {
9098            if (hsp->query.offset >= bhrp->range_list_pointer[index1]->gi &&
9099            hsp->query.end <= bhrp->range_list_pointer[index1]->ordinal_id) {
9100            do_not_do = TRUE;
9101            break;
9102            }
9103            }
9104            }
9105         */
9106         if (do_not_do == FALSE && (hsp_start_is_contained == FALSE || hsp_end_is_contained == FALSE ||
9107                                    hsp->score > hsp1->score)) {
9108             query = (Uint1Ptr) search->context[hsp->context].query->sequence;
9109             query_length = search->context[hsp->context].query->length;
9110 
9111             gap_align->include_query = 0;
9112 
9113 
9114             if(search->pbp->is_ooframe) {
9115                 gap_align->is_ooframe = TRUE;
9116                 gap_align->query = subject;
9117 
9118                 if(hsp->query.frame > 0) {
9119                     gap_align->subject = search->query_dnap[0]->sequence;
9120                     gap_align->subject_length = search->query_dnap[0]->length;
9121                 } else {
9122                     gap_align->subject = search->query_dnap[1]->sequence;
9123                     gap_align->subject_length = search->query_dnap[1]->length;
9124                 }
9125 
9126                 gap_align->query_frame = hsp->subject.frame;
9127                 gap_align->subject_frame = ContextToFrame(search, hsp->context);
9128                 gap_align->query_length = subject_length;
9129             } else {
9130                 gap_align->query_frame = ContextToFrame(search, hsp->context);
9131                 gap_align->query = query;
9132 
9133                 gap_align->subject_frame = hsp->subject.frame;
9134                 gap_align->subject = subject;
9135 
9136                 gap_align->query_length = query_length;
9137                 gap_align->subject_length = subject_length;
9138             }
9139 
9140             gap_align->translate1 = FALSE;
9141             gap_align->translate2 = FALSE;
9142             if (StringCmp(search->prog_name, "blastx") == 0) {
9143                 gap_align->translate1 = TRUE;
9144                 gap_align->translate2 = FALSE;
9145             }
9146 
9147             start_shift = 0;
9148 
9149             if (StringCmp(search->prog_name, "tblastn") == 0 ||
9150                 StringCmp(search->prog_name, "psitblastn") == 0) {
9151                 gap_align->translate1 = FALSE;
9152                 gap_align->translate2 = TRUE;
9153                 if (translated_subject == NULL) {
9154                     translated_subject_orig = MemNew(8*sizeof(Uint1Ptr));
9155                     translated_subject = translated_subject_orig + 3;
9156                     translated_subject_length_orig = MemNew(8*sizeof(Int4));
9157                     translated_subject_length = translated_subject_length_orig + 3;
9158                 }
9159                 if (partial_translation) {
9160                    translated_subject[hsp->subject.frame] =
9161                       MemFree(translated_subject[hsp->subject.frame]);
9162                    /* NB: since SUBJECT_ADJUSTMENT is divisible by 3, the frame
9163                       will remain the same.
9164                    */
9165                    start_shift =
9166                       MAX(0, 3*hsp->subject.offset - SUBJECT_ADJUSTMENT);
9167                    translation_length =
9168                       MIN(3*hsp->subject.end + SUBJECT_ADJUSTMENT, subject_length)
9169                       - start_shift;
9170                    if (hsp->subject.frame > 0) {
9171                       translated_subject[hsp->subject.frame] =
9172                          GetTranslation(subject+start_shift, translation_length, hsp->subject.frame, &translated_subject_length[hsp->subject.frame], search->db_genetic_code);
9173                    } else {
9174                       translated_subject[hsp->subject.frame] =
9175                          GetTranslation(rev_subject+start_shift, translation_length, hsp->subject.frame, &translated_subject_length[hsp->subject.frame], search->db_genetic_code);
9176                    }
9177                    /* Below, the start_shift will be used for the protein
9178                       coordinates, so need to divide it by 3 */
9179                    start_shift /= CODON_LENGTH;
9180                    hsp->subject.offset -= start_shift;
9181                    hsp->subject.gapped_start -= start_shift;
9182 
9183                 } else if (translated_subject[hsp->subject.frame] == NULL) {
9184                     if (hsp->subject.frame > 0) {
9185                         translated_subject[hsp->subject.frame] =
9186                             GetTranslation(subject, subject_length, hsp->subject.frame, &translated_subject_length[hsp->subject.frame], search->db_genetic_code);
9187                     } else {
9188                         translated_subject[hsp->subject.frame] =
9189                             GetTranslation(rev_subject, rev_subject_length, hsp->subject.frame, &translated_subject_length[hsp->subject.frame], search->db_genetic_code);
9190                     }
9191                     /* For RPS Blast filtering if needed */
9192                     if(search->pbp->is_rps_blast && search->pbp->filter_string != NULL && StringICmp(search->pbp->filter_string, "F")) {
9193                         rpsFilterSequenceByMask(search->mask, translated_subject[hsp->subject.frame], translated_subject_length[hsp->subject.frame], hsp->subject.frame, (hsp->subject.frame > 0) ? subject_length : rev_subject_length);
9194                     }
9195                 }
9196 
9197                 gap_align->subject = translated_subject[hsp->subject.frame] + 1;
9198                 gap_align->subject_length = translated_subject_length[hsp->subject.frame];
9199             }
9200 
9201             /* these should both only be zero for blastn. */
9202             if (!search->pbp->is_ooframe &&
9203                 (((hsp->query.gapped_start == 0 && hsp->subject.gapped_start == 0) ||
9204                 CheckStartForGappedAlignment(search, hsp, gap_align->query, gap_align->subject, search->sbp->matrix) == FALSE))) {
9205                 max_offset = GetStartForGappedAlignment(search, hsp, gap_align->query, gap_align->subject, search->sbp->matrix);
9206                 gap_align->q_start = max_offset;
9207                 gap_align->s_start = (hsp->subject.offset - hsp->query.offset) + max_offset;
9208                 hsp->query.gapped_start = gap_align->q_start;
9209                 hsp->subject.gapped_start = gap_align->s_start;
9210             } else {
9211                 if(search->pbp->is_ooframe) {
9212                     /* Code above should be investigated for possible
9213                        optimization for OOF */
9214                     gap_align->q_start = hsp->subject.gapped_start;
9215                     gap_align->s_start = hsp->query.gapped_start;
9216                     gap_align->subject_start = 0;
9217                     gap_align->query_start = 0;
9218                 } else {
9219                     gap_align->q_start = hsp->query.gapped_start;
9220                     gap_align->s_start = hsp->subject.gapped_start;
9221                 }
9222             }
9223 
9224             if (search->prog_number == blast_type_blastn) {
9225                /* For blastn, use only part of a long subject sequence,
9226                   because the placeholders for the gapped alignment
9227                   information have only been allocated for at most a
9228                   certain length */
9229                if (gap_align->s_start > max_start) {
9230                   start_shift = (gap_align->s_start / max_start) * max_start;
9231                   gap_align->subject = gap_align->subject + start_shift;
9232 
9233                   gap_align->s_start %= max_start;
9234                } else
9235                   start_shift = 0;
9236 
9237                gap_align->subject_length =
9238                   MIN(gap_align->subject_length - start_shift,
9239                       gap_align->s_start + hsp->subject.length + max_start);
9240             }
9241 
9242             if (do_traceback) {
9243                if (!search->pbp->mb_params ||
9244                    search->pbp->mb_params->use_dyn_prog) {
9245                   PerformGappedAlignmentWithTraceback(gap_align);
9246                } else {
9247                   PerformGreedyAlignmentWithTraceback(gap_align, search->abmp,
9248                                                       search->sbp);
9249                }
9250             } else {
9251                 PerformGappedAlignment(gap_align);
9252             }
9253 
9254             if (gap_align->score >= min_score_to_keep) {
9255 
9256                 if(search->pbp->is_ooframe) {
9257                     hsp->query.offset = gap_align->subject_start + start_shift;
9258                     hsp->subject.offset = gap_align->query_start;
9259                     /* The end is one further for BLAST than for the gapped align. */
9260                     hsp->query.end = gap_align->subject_stop + 1 + start_shift;
9261                     hsp->subject.end = gap_align->query_stop + 1;
9262                 } else {
9263                     hsp->query.offset = gap_align->query_start;
9264                     hsp->subject.offset = gap_align->subject_start + start_shift;
9265                     /* The end is one further for BLAST than for the gapped align. */
9266                     hsp->query.end = gap_align->query_stop + 1;
9267                     hsp->subject.end = gap_align->subject_stop + 1 + start_shift;
9268                 }
9269 
9270                 if (gap_align->edit_block && start_shift > 0) {
9271                    gap_align->edit_block->start2 += start_shift;
9272                    gap_align->edit_block->length2 += start_shift;
9273                 }
9274                 hsp->query.length = hsp->query.end - hsp->query.offset;
9275                 hsp->subject.length = hsp->subject.end - hsp->subject.offset;
9276                 hsp->score = gap_align->score;
9277 
9278                 if (do_traceback) {
9279                     hsp->gap_info = gap_align->edit_block;
9280                 }
9281 
9282 		keep = TRUE;
9283                 /* If greedy alignment was used for traceback, we still need
9284                    to reevaluate the score with ambiguity information */
9285                 if (search->pbp->mb_params &&
9286                     !search->pbp->mb_params->use_dyn_prog &&
9287                     ReevaluateScoreWithAmbiguities(search, subject, hsp)) {
9288                    /* HSP became below the cutoff after reevaluation */
9289                    keep = FALSE;
9290                 }
9291 
9292                 if (keep && (search->prog_number == blast_type_blastp ||
9293                     search->prog_number == blast_type_blastn)) {
9294                    if (search->pbp->mb_params) {
9295                       FloatHi searchsp_eff = (FloatHi) search->dblen_eff *
9296                          (FloatHi) search->context[hsp->context].query->effective_length;
9297 
9298                       hsp->evalue = BlastKarlinStoE_simple(hsp->score,
9299                                                            search->sbp->kbp_gap[hsp->context],
9300                                                            searchsp_eff);
9301                    } else {
9302 		      /* AM: Changed to support query concatenation. */
9303 		      if( !search->mult_queries )
9304                         hsp->evalue = BlastKarlinStoE_simple(hsp->score,
9305                                                            search->sbp->kbp_gap[search->first_context], search->searchsp_eff);
9306                       else
9307 		      {
9308 		        /* AM: First determine which query to use, then use the
9309 			       corresponding SearchSpEff element in the call to
9310 			       BlastKarlinStoE_simple() */
9311 			query_num = GetQueryNum( search->mult_queries,
9312 			                         hsp->query.offset,
9313 						 hsp->query.end,
9314 						 hsp->query.frame );
9315 			hsp->evalue = BlastKarlinStoE_simple( hsp->score,
9316 			                                      search->sbp->kbp_gap[search->first_context],
9317 							      search->mult_queries->SearchSpEff[query_num] );
9318 		      }
9319                    }
9320                    /*hsp->pvalue = BlastKarlinEtoP(hsp->evalue);*/
9321 		    if (hsp->evalue > search->pbp->cutoff_e) /* put in for comp. based stats. */
9322 			keep = FALSE;
9323                 }
9324 
9325                 if (keep) {
9326                    if (search->pbp->is_ooframe) {
9327                       OOFBlastHSPGetNumIdentical(gap_align->query,
9328                                                  gap_align->subject-start_shift, hsp, NULL,
9329                                                  &hsp->num_ident, &align_length);
9330                    } else {
9331                       search->subject->sequence_start =
9332                          gap_align->subject - start_shift - 1;
9333                       BlastHSPGetNumIdentical(search, hsp, NULL, &hsp->num_ident,
9334                                               &align_length);
9335                    }
9336                    if (search->pbp->mb_params &&
9337                        search->pbp->mb_params->use_dyn_prog) {
9338                       if (hsp->num_ident * 100 <
9339                           align_length * search->pbp->mb_params->perc_identity) {
9340                          keep = FALSE;
9341                       }
9342                    }
9343                    search->subject->sequence_start = NULL;
9344 
9345                    if (search->pbp->scalingFactor != 0.0 && search->pbp->scalingFactor != 1.0)
9346                       /* Scale down score for blastp and tblastn. */
9347                       hsp->score = (hsp->score+(0.5*search->pbp->scalingFactor))/search->pbp->scalingFactor;
9348 
9349 				/* only one alignment considered for blast[np]. */
9350 				/* This may be changed by LinkHsps for blastx or tblastn. */
9351                    hsp->num = 1;
9352                    if ((search->prog_number == blast_type_tblastn ||
9353                         search->prog_number == blast_type_psitblastn) &&
9354                        search->pbp->longest_intron > 0)
9355                       hsp->evalue = BlastKarlinStoE_simple(hsp->score,
9356                                        search->sbp->kbp_gap[search->first_context], search->searchsp_eff);
9357                 }
9358 
9359                 for (index2=0; index2<index && keep == TRUE; index2++) {
9360                     hsp2 = hsp_array[index2];
9361                     if (hsp2 == NULL)
9362                         continue;
9363 
9364                     /* Check if both HSP's start or end on the same diagonal (and are on same strands). */
9365                     if (((hsp->query.offset == hsp2->query.offset &&
9366                           hsp->subject.offset == hsp2->subject.offset) ||
9367                          (hsp->query.end == hsp2->query.end &&
9368                           hsp->subject.end == hsp2->subject.end))  &&
9369                         hsp->context == hsp2->context &&
9370 			hsp->subject.frame == hsp2->subject.frame) {
9371                         if (hsp2->score > hsp->score) {
9372                             keep = FALSE;
9373                             break;
9374                         } else {
9375                             new_hspcnt--;
9376                             if (do_traceback) {
9377                                 seqalign_array[index2] =
9378                                    SeqAlignFree(seqalign_array[index2]);
9379                             }
9380                             hsp_array[index2] =
9381                                BLAST_HSPFree(hsp_array[index2]);
9382                         }
9383                     }
9384                 }
9385 
9386                 if (keep) {
9387                     new_hspcnt++;
9388                 } else {
9389                    hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
9390                 }
9391             } else {	/* Should be kept? */
9392                 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
9393             }
9394         } else { /* Contained within another HSP, delete. */
9395             hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
9396         }
9397     }
9398     if (search->pbp->scalingFactor != 0.0 && search->pbp->scalingFactor != 1.0)
9399     {	/* Rescale Lambda. */
9400 		search->sbp->kbp_gap[0]->Lambda *= search->pbp->scalingFactor;
9401     }
9402 
9403     /* Now for OOF alignment we try to detect simular alignments */
9404 
9405     HeapSort(hsp_array,hspcnt,sizeof(BLAST_HSPPtr), score_compare_hsps);
9406     BLASTCheckHSPInclusion(hsp_array, hspcnt, pbp->is_ooframe);
9407 
9408     /* Make up fake hitlist, relink and rereap. */
9409 
9410     if (StringCmp(search->prog_name, "blastx") == 0 ||
9411         StringCmp(search->prog_name, "tblastn") == 0 ||
9412         StringCmp(search->prog_name, "psitblastn") == 0) {
9413         hspcnt = HspArrayPurge(hsp_array, hspcnt, FALSE);
9414         tmp_hitlist = (BLAST_HitListPtr) MemNew(sizeof(BLAST_HitList));
9415         real_hitlist = search->current_hitlist;
9416 
9417         search->current_hitlist = tmp_hitlist;
9418         tmp_hitlist->hsp_array = hsp_array;
9419         tmp_hitlist->hspcnt = hspcnt;
9420         tmp_hitlist->hspmax = hspcnt;
9421 
9422         /* Use real subject length for all programs - it will be adjusted inside
9423            the functions that need it */
9424         search->subject->length = subject_length;
9425 
9426         if (search->prog_number == blast_type_tblastn &&
9427             search->pbp->longest_intron > 0) {
9428            BlastSequenceAddSequence(search->subject, NULL, subject-1,
9429                                     subject_length, subject_length, 0);
9430            search->subject_id = ordinal_id;
9431         }
9432 
9433         if (!search->pbp->do_sum_stats || search->pbp->longest_intron > 0)
9434            BlastGetNonSumStatsEvalue(search);
9435 
9436 	/* AM: Changed to support query concatenation. */
9437         if (search->pbp->do_sum_stats == TRUE)
9438 	{
9439 	    if( search->mult_queries ) search->mult_queries->use_mq = FALSE;
9440 
9441             BlastLinkHsps(search);
9442         }
9443 
9444         if (search->prog_number == blast_type_tblastn &&
9445             search->pbp->longest_intron > 0)
9446            search->subject->sequence_start = search->subject->sequence = NULL;
9447 
9448         BlastReapHitlistByEvalue(search);
9449 
9450         hspcnt = search->current_hitlist->hspcnt;
9451         search->current_hitlist = real_hitlist;
9452 	tmp_hitlist->lh_helper = MemFree(tmp_hitlist->lh_helper);
9453         MemFree(tmp_hitlist);
9454     }
9455 
9456     new_hspcnt = HspArrayPurge(hsp_array, hspcnt, FALSE);
9457 
9458     HeapSort(hsp_array,new_hspcnt,sizeof(BLAST_HSPPtr), score_compare_hsps);
9459 
9460     /* Remove extra HSPs if there is a user proveded limit on the number
9461        of HSPs per database sequence */
9462     if (search->pbp->hsp_num_max > new_hspcnt) {
9463        for (index=new_hspcnt; index<search->pbp->hsp_num_max; ++index) {
9464           hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
9465        }
9466        new_hspcnt = MIN(new_hspcnt, search->pbp->hsp_num_max);
9467     }
9468 
9469     if (do_traceback) {
9470         for (index=0; index<new_hspcnt; index++) {
9471             hsp = hsp_array[index];
9472             hsp->gap_info->reverse = reverse;
9473             hsp->gap_info->original_length1 = search->context[hsp->context].query->original_length;
9474             hsp->gap_info->original_length2 = subject_length;
9475 	    if (search->pbp->mb_params) {
9476                query_id = search->qid_array[hsp->context/2];
9477 	    } else {
9478                query_id = search->query_id;
9479             }
9480             CopyHSPToResultHsp(search->sbp->kbp_gap[search->first_context],
9481                                hsp, &result_hsp);
9482 
9483 	    if (new_subject_seqid) {
9484                 if (search->pbp->explode_seqids)
9485                     seqid_tmp = gi_list;
9486                 else
9487                     seqid_tmp = new_subject_seqid;
9488 	    } else {
9489 		seqid_tmp = subject_id;
9490             }
9491 
9492 	    while (seqid_tmp) {
9493                 if(search->pbp->is_ooframe) {
9494                     seqalign = OOFGapXEditBlockToSeqAlign(hsp->gap_info, seqid_tmp, query_id, hsp->query.frame > 0 ? search->query_dnap[0]->length : search->query_dnap[1]->length);
9495                 } else {
9496                     seqalign = GapXEditBlockToSeqAlign(hsp->gap_info, seqid_tmp, query_id);
9497                 }
9498 
9499             	seqalign->score = GetScoreSetFromBlastResultHsp(&result_hsp, gi_list);
9500 
9501 		if (seqalign_array[index] == NULL)
9502                     seqalign_array[index] = seqalign;
9503 		else {
9504                     seqalign_var = seqalign_array[index];
9505                     while (seqalign_var->next)
9506                         seqalign_var = seqalign_var->next;
9507                     seqalign_var->next = seqalign;
9508 		}
9509 		seqid_tmp = seqid_tmp->next;
9510 	    }
9511 	}
9512 
9513         *head = NULL;
9514         for (index=0; index<new_hspcnt; index++) {
9515             if (seqalign_array[index] != NULL) {
9516                 if (*head == NULL) {
9517                     *head = seqalign_array[index];
9518                 } else {
9519                     for (seqalign_var=*head; seqalign_var->next != NULL;) {
9520                         seqalign_var = seqalign_var->next;
9521                     }
9522                     seqalign_var->next = seqalign_array[index];
9523                 }
9524             }
9525         }
9526 
9527         seqalign_array = MemFree(seqalign_array);
9528     } else {
9529         if (bhrp)
9530             BlastHitRangeLoad(search, hsp_array, new_hspcnt, bhrp);
9531     }
9532 
9533     gi_list = SeqIdSetFree(gi_list);
9534     if (new_subject_seqid)
9535         new_subject_seqid = SeqIdSetFree(new_subject_seqid);
9536 
9537     if ((StringCmp(search->prog_name, "tblastn") == 0 ||
9538          StringCmp(search->prog_name, "psitblastn") == 0)&&
9539         translated_subject_orig) {
9540         for (index=0; index<8; index++) {
9541             MemFree(translated_subject_orig[index]);
9542         }
9543         MemFree(translated_subject_orig);
9544         MemFree(translated_subject_length_orig);
9545     }
9546 
9547     return new_hspcnt;
9548 }
9549 
9550 
9551 /*
9552 	find the traceback for a gapped alignment.  Do this by
9553 	organizing the list of HSP's by sum group, then order
9554 	these groups by score.  Then attempt to perform the alignment
9555 	by using the highest scoring HSP of every sum group, then the
9556 	2nd highest scoring HSP, etc. until all the HSP's of a sum
9557 	group have been examined.  Then move onto the next sum group.
9558 */
9559 SeqAlignPtr LIBCALL
SumBlastGetGappedAlignmentTraceback(BlastSearchBlkPtr search,Int4 hit_number,Boolean reverse,Boolean ordinal_number,Uint1Ptr subject,Int4 subject_length)9560 SumBlastGetGappedAlignmentTraceback (BlastSearchBlkPtr search, Int4 hit_number, Boolean reverse, Boolean ordinal_number, Uint1Ptr subject, Int4 subject_length)
9561 
9562 {
9563 	SeqAlignPtr seqalign;
9564 
9565 	SumBlastGetGappedAlignmentEx(search, hit_number, reverse, ordinal_number, subject, subject_length, TRUE, &seqalign, NULL, 0);
9566 
9567 	return seqalign;
9568 }
9569 
9570 Boolean LIBCALL
SumBlastGetGappedAlignmentEx(BlastSearchBlkPtr search,Int4 hit_number,Boolean reverse,Boolean ordinal_number,Uint1Ptr subject,Int4 subject_length,Boolean do_traceback,SeqAlignPtr PNTR seqalignP,BlastHitRangePtr bhrp,Int2 query_number)9571 SumBlastGetGappedAlignmentEx (BlastSearchBlkPtr search, Int4 hit_number, Boolean reverse, Boolean ordinal_number, Uint1Ptr subject, Int4 subject_length, Boolean do_traceback, SeqAlignPtr PNTR seqalignP, BlastHitRangePtr bhrp, Int2 query_number)
9572 
9573 {
9574 	BLAST_HSPPtr PNTR hsp_array;
9575 	BLASTResultHitlistPtr   result_hitlist;
9576         BLASTResultHspPtr       result_hsp_array=NULL, hsp;
9577 	Boolean not_done;
9578 	Int4 hspcnt=0, new_hspcnt=0, hspset_cnt_old;
9579 	Int4 index, index1, high_score=0, ordinal_id, next_start, start, stop;
9580 	SeqAlignPtr seqalign=NULL;
9581 	SeqIdPtr subject_id=NULL, sip, subject_id_var;
9582 	Nlm_FloatHi current_evalue=DBL_MAX;
9583 	ValNodePtr vnp, vnp_start;
9584    BLASTResultsStructPtr result_struct;
9585    Boolean is_megablast = (search->pbp->mb_params != NULL);
9586 
9587 	if (search == NULL)
9588 		return FALSE;
9589 
9590         if (is_megablast) {
9591            result_struct = search->mb_result_struct[query_number];
9592         } else {
9593            result_struct = search->result_struct;
9594         }
9595         result_hitlist = result_struct->results[hit_number];
9596         hspcnt = result_hitlist->hspcnt;
9597 
9598 	if (search->pbp->explode_seqids)
9599 	{ /* Obtain and connect all SeqId's if explode demanded. */
9600 		vnp = NULL;
9601                 if (is_megablast)
9602                    BlastGetSubjectIdEx(search, hit_number,
9603                                        ordinal_number, &vnp, query_number);
9604                 else
9605                    BlastGetSubjectId(search, hit_number, ordinal_number, &vnp);
9606 		vnp_start = vnp;
9607 		while (vnp)
9608 		{
9609 			sip = GetTheSeqAlignID(vnp->data.ptrvalue);
9610 			SeqIdFree(vnp->data.ptrvalue);
9611 			if (subject_id == NULL)
9612 			{
9613 				subject_id = sip;
9614 			}
9615 			else
9616 			{
9617 				subject_id_var = subject_id;
9618 				while (subject_id_var->next)
9619 					subject_id_var = subject_id_var->next;
9620 				subject_id_var->next = sip;
9621 			}
9622 			vnp = vnp->next;
9623 		}
9624 		vnp_start = vnp = ValNodeFree(vnp_start);
9625 	}
9626 	else
9627 	{
9628 		sip = BlastGetSubjectIdEx(search, hit_number, ordinal_number,
9629                                           NULL, query_number);
9630 		subject_id = GetTheSeqAlignID(sip);
9631 	    	sip = SeqIdSetFree(sip);
9632 	}
9633 	ordinal_id = result_hitlist->subject_id;
9634 
9635 	hsp_array = MemNew(hspcnt*sizeof(BLAST_HSPPtr));
9636 	not_done = TRUE;
9637 	start=0;
9638 	next_start=0;
9639 	while (not_done)
9640 	{
9641 		hsp = &(result_hitlist->hsp_array[start]);
9642 		hspset_cnt_old = hsp->hspset_cnt;
9643 		for (index=start; index<hspcnt; index++)
9644 		{
9645 			hsp = &(result_hitlist->hsp_array[index]);
9646 			if(hspset_cnt_old != hsp->hspset_cnt)
9647 			{
9648 				hspset_cnt_old = hsp->hspset_cnt;
9649 				stop = index;
9650 				next_start = stop;
9651 				break;
9652 			}
9653 		}
9654 
9655 		if (index == hspcnt)
9656 		{
9657 			stop = hspcnt;
9658 			not_done = FALSE;
9659 		}
9660 
9661 		index1=0;
9662 		for (index=start; index<stop; index++)
9663 		{
9664 			hsp_array[index] = MemNew(sizeof(BLAST_HSP));
9665 			CopyResultHspToHSP(&(result_hitlist->hsp_array[index]), hsp_array[index]);
9666 			index1++;
9667 		}
9668 
9669 		/* heap sort the last sum group */
9670 		HeapSort(hsp_array+start,(stop-start),sizeof(BLAST_HSPPtr), score_compare_hsps);
9671 		start = next_start;
9672 	}
9673 
9674 	new_hspcnt = RealBlastGetGappedAlignmentTraceback(search, subject, subject_length, NULL, 0, subject_id, hsp_array, hspcnt, &seqalign, bhrp, search->pbp->cutoff_s, reverse, ordinal_id, do_traceback);
9675 
9676 /* Save HSP's again, discarding those that have been NULLed out. */
9677 /* If no HSP's were valid, best_evalue is set to DBL_MAX. */
9678 	index1 = 0;
9679 	if (new_hspcnt > 0)
9680 	{
9681 		result_hsp_array = MemNew((new_hspcnt)*sizeof(BLASTResultHsp));
9682 		index1 = 0;
9683 		for (index=0; index<hspcnt; index++)
9684 		{
9685 			if (hsp_array[index] != NULL)
9686 			{
9687 				if (current_evalue > hsp_array[index]->evalue)
9688 					current_evalue = hsp_array[index]->evalue;
9689 				if (high_score < hsp_array[index]->score)
9690        	                        	high_score = hsp_array[index]->score;
9691 				CopyHSPToResultHsp(search->sbp->kbp_gap[search->first_context], hsp_array[index], &(result_hsp_array[index1]));
9692 				index1++;
9693                                 /* Do not free edit block, just the
9694                                    BLAST_HSP structure. */
9695 				hsp_array[index] = MemFree(hsp_array[index]);
9696 			}
9697 		}
9698 	}
9699 	hsp_array = MemFree(hsp_array);
9700 
9701 	result_hitlist->hspcnt = index1;
9702 	if (result_hitlist->hsp_array)
9703 		MemFree(result_hitlist->hsp_array);
9704 	result_hitlist->hsp_array = result_hsp_array;
9705 	result_hitlist->best_evalue = current_evalue;
9706 	result_hitlist->high_score = high_score;
9707 
9708 	subject_id = SeqIdSetFree(subject_id);
9709 
9710 	if (seqalignP)
9711 	*	seqalignP = seqalign;
9712 
9713 	return TRUE;
9714 }
9715 
9716 /*
9717 	Performs a gapped alignment on the HSP's in a hitlist.
9718 	Discards those that do not meet the standard.
9719 */
9720 
9721 SeqAlignPtr LIBCALL
BlastGetGapAlgnTbck(BlastSearchBlkPtr search,Int4 hit_number,Boolean reverse,Boolean ordinal_number,Uint1Ptr subject,Int4 subject_length,Uint1Ptr rev_subject,Int4 rev_subject_length)9722 BlastGetGapAlgnTbck (BlastSearchBlkPtr search, Int4 hit_number, Boolean reverse, Boolean ordinal_number, Uint1Ptr subject, Int4 subject_length, Uint1Ptr rev_subject, Int4 rev_subject_length)
9723 
9724 {
9725 	BLAST_HSPPtr PNTR hsp_array;
9726 	BLASTResultHitlistPtr   result_hitlist;
9727         BLASTResultHspPtr       result_hsp_array=NULL;
9728 	Int4 hspcnt=0, new_hspcnt=0;
9729 	Int4 index, index1, high_score=0, ordinal_id;
9730 	SeqAlignPtr seqalign, head, seqalign_var;
9731 	SeqIdPtr subject_id=NULL, sip, subject_id_var;
9732 	Nlm_FloatHi current_evalue=DBL_MAX;
9733 	ValNodePtr vnp, vnp_start;
9734 
9735 	if (search == NULL)
9736 		return NULL;
9737 
9738         result_hitlist = search->result_struct->results[hit_number];
9739         hspcnt = result_hitlist->hspcnt;
9740 	ordinal_id = result_hitlist->subject_id;
9741 
9742 	if (search->pbp->explode_seqids)
9743 	{ /* Obtain and connect all SeqId's if explode demanded. */
9744 		vnp = NULL;
9745 		BlastGetSubjectId(search, hit_number, ordinal_number, &vnp);
9746 		vnp_start = vnp;
9747 		while (vnp)
9748 		{
9749 			sip = GetTheSeqAlignID(vnp->data.ptrvalue);
9750 			SeqIdFree(vnp->data.ptrvalue);
9751 			if (subject_id == NULL)
9752 			{
9753 				subject_id = sip;
9754 			}
9755 			else
9756 			{
9757 				subject_id_var = subject_id;
9758 				while (subject_id_var->next)
9759 					subject_id_var = subject_id_var->next;
9760 				subject_id_var->next = sip;
9761 			}
9762 			vnp = vnp->next;
9763 		}
9764 		vnp_start = vnp = ValNodeFree(vnp_start);
9765 	}
9766 	else
9767 	{
9768 		sip = BlastGetSubjectId(search, hit_number, ordinal_number, NULL);
9769 		subject_id = GetTheSeqAlignID(sip);
9770 	    	sip = SeqIdSetFree(sip);
9771 	}
9772 
9773 	head = NULL;
9774 
9775 	hsp_array = MemNew(hspcnt*sizeof(BLAST_HSPPtr));
9776 	for (index=0; index<hspcnt; index++)
9777 	{
9778 		hsp_array[index] = MemNew(sizeof(BLAST_HSP));
9779 		CopyResultHspToHSP(&(result_hitlist->hsp_array[index]),
9780                                    hsp_array[index]);
9781 	}
9782 	HeapSort(hsp_array,hspcnt,sizeof(BLAST_HSPPtr), score_compare_hsps);
9783 
9784 	new_hspcnt = RealBlastGetGappedAlignmentTraceback(search, subject, subject_length, rev_subject, rev_subject_length, subject_id, hsp_array, hspcnt, &seqalign, NULL, 0, reverse, ordinal_id, TRUE);
9785 	if (seqalign != NULL)
9786 	{
9787 		if (head == NULL)
9788 		{
9789 			head = seqalign;
9790 		}
9791 		else
9792 		{
9793 			for (seqalign_var=head; seqalign_var->next != NULL;)
9794 			{
9795 				seqalign_var = seqalign_var->next;
9796 			}
9797 			seqalign_var->next = seqalign;
9798 		}
9799 	}
9800 
9801 /* Save HSP's again, discarding those that have been NULLed out. */
9802 	result_hsp_array = MemNew((new_hspcnt)*sizeof(BLASTResultHsp));
9803 	index1 = 0;
9804 	for (index=0; index<hspcnt; index++)
9805 	{
9806 		if (hsp_array[index] != NULL)
9807 		{
9808 			if (current_evalue > hsp_array[index]->evalue)
9809 				current_evalue = hsp_array[index]->evalue;
9810 			if (high_score < hsp_array[index]->score)
9811                                	high_score = hsp_array[index]->score;
9812 
9813 			CopyHSPToResultHsp(search->sbp->kbp_gap[search->first_context], hsp_array[index], &(result_hsp_array[index1]));
9814 			index1++;
9815                         /* Do not free edit block, just the BLAST_HSP
9816                            structure */
9817 			hsp_array[index] = MemFree(hsp_array[index]);
9818 		}
9819 	}
9820 	hsp_array = MemFree(hsp_array);
9821 
9822 	if (result_hitlist->hsp_array) {
9823            /* Delete any edit blocks from a previous traceback. */
9824            for (index=0; index< result_hitlist->hspcnt; ++index)
9825               GapXEditBlockDelete(result_hitlist->hsp_array[index].gap_info);
9826 
9827            MemFree(result_hitlist->hsp_array);
9828         }
9829 	result_hitlist->hspcnt = index1;
9830 	result_hitlist->hsp_array = result_hsp_array;
9831 	result_hitlist->best_evalue = current_evalue;
9832 	result_hitlist->high_score = high_score;
9833 
9834 	subject_id = SeqIdSetFree(subject_id);
9835 
9836 	return head;
9837 }
9838 
9839 /*
9840 	Performs a gapped alignment on the HSP's in a hitlist.
9841 	Discards those that do not meet the standard.
9842 */
9843 
9844 Int2 LIBCALL
BlastPreliminaryGappedScore(BlastSearchBlkPtr search,Uint1Ptr subject,Int4 subject_length,Int2 frame)9845 BlastPreliminaryGappedScore (BlastSearchBlkPtr search, Uint1Ptr subject, Int4 subject_length, Int2 frame)
9846 
9847 {
9848 	BLAST_HitListPtr hitlist;
9849 	BLAST_HSPPtr hsp;
9850 	BLAST_HSPPtr PNTR hsp_array;
9851 	GapAlignBlkPtr gap_align;
9852 	Int2 status;
9853 	Int4 index, max_offset = 0, query_length, min_score;
9854 	BLAST_ParameterBlkPtr pbp;
9855 
9856 	if (search == NULL)
9857 		return 1;
9858 
9859 	pbp = search->pbp;
9860 
9861 	if (search->gap_align == NULL)
9862 	{
9863 		search->gap_align = GapAlignBlkNew(1, 1);
9864 	}
9865 	gap_align = search->gap_align;
9866 
9867 	min_score = search->pbp->cutoff_s1;
9868 
9869 	status = 0;
9870 	hitlist = search->current_hitlist;
9871 	if (hitlist && hitlist->hspcnt > 0)
9872 	{
9873 		query_length = search->context[search->first_context].query->length;
9874 
9875 		hitlist->hspcnt_max = hitlist->hspcnt;
9876 		hsp_array = hitlist->hsp_array;
9877 		if (frame != 0)
9878 		{
9879 			for (index=0; index<hitlist->hspcnt; index++)
9880 			{
9881 				hsp = hsp_array[index];
9882 				if (frame == hsp->subject.frame)
9883 					break;
9884 			}
9885 			if (frame != hsp->subject.frame)
9886 				return 0;
9887 		}
9888 		else
9889 		{ /* The first HSP has the highest score. */
9890 			hsp = hsp_array[0];
9891 		}
9892 
9893 		/* The first HSP has the highest score. */
9894 /*
9895 		e_value = BlastKarlinStoE_simple(hsp->score, search->sbp->kbp_gap[search->first_context], search->searchsp_eff);
9896 */
9897 		if (hsp->score >= min_score)
9898 		{
9899 #ifdef BLAST_COLLECT_STATS
9900 			search->prelim_gap_no_contest++;
9901 #endif
9902 			hitlist->further_process = TRUE;
9903 			return 1;
9904 		}
9905                 gap_align->is_ooframe = pbp->is_ooframe;
9906                 gap_align->shift_pen = pbp->shift_pen;
9907                 gap_align->discontinuous = pbp->discontinuous;
9908 		gap_align->positionBased =
9909                    (search->positionBased && search->sbp->posMatrix);
9910 		gap_align->include_query = 0;
9911 		gap_align->gap_open = pbp->gap_open;
9912 		gap_align->gap_extend = pbp->gap_extend;
9913                 gap_align->decline_align = pbp->decline_align;
9914 		gap_align->x_parameter = pbp->gap_x_dropoff;
9915 		gap_align->matrix = search->sbp->matrix;
9916 		gap_align->posMatrix = search->sbp->posMatrix;
9917 		for (index=0; index<hitlist->hspcnt; index++)
9918 		{
9919 			hsp = hsp_array[index];
9920 			if (frame != 0)
9921 			{
9922 				if (frame != hsp->subject.frame)
9923 					continue;
9924 			}
9925 
9926 			if (hsp->score < search->pbp->gap_trigger)
9927 			{	/* Stop looking, we're below the cutoff. */
9928 				status = 0;
9929 				break;
9930 			}
9931 
9932 #ifdef BLAST_COLLECT_STATS
9933 				search->prelim_gap_attempts++;
9934 #endif
9935 			gap_align->score = 0;
9936 
9937                         if(!search->pbp->is_ooframe) {
9938                             max_offset = GetStartForGappedAlignment(search, hsp, search->context[hsp->context].query->sequence, subject, search->sbp->matrix);
9939                         }
9940 
9941                         if(search->pbp->is_ooframe) {
9942                             gap_align->is_ooframe = TRUE;
9943                             gap_align->query = subject;
9944 
9945                             if(hsp->query.frame > 0) {
9946                                 gap_align->subject = search->query_dnap[0]->sequence;
9947                                 gap_align->subject_length = search->query_dnap[0]->length;
9948                             } else {
9949                                 gap_align->subject = search->query_dnap[1]->sequence;
9950                                 gap_align->subject_length = search->query_dnap[1]->length;
9951                             }
9952 
9953                             gap_align->query_frame = hsp->subject.frame;
9954                             gap_align->subject_frame = ContextToFrame(search, hsp->context);
9955 
9956                             gap_align->query_length = subject_length;
9957                             gap_align->q_start = hsp->subject.offset;
9958                             gap_align->s_start = hsp->query.offset;
9959                         } else {
9960                             gap_align->query = search->context[hsp->context].query->sequence;
9961                             gap_align->subject = subject;
9962                             gap_align->query_length = search->context[hsp->context].query->length;
9963                             gap_align->subject_length = subject_length;
9964                             gap_align->q_start = max_offset;
9965                             gap_align->s_start = (hsp->subject.offset - hsp->query.offset) + max_offset;
9966                         }
9967 
9968 			gap_align->include_query = 0;
9969 
9970 	/* Perform only if the query's required start corresponds to a point after the start of the subject. */
9971 			if (gap_align->s_start >= 0)
9972 				PerformGappedAlignment(gap_align);
9973 /*
9974 			e_value = BlastKarlinStoE_simple(gap_align->score, search->sbp->kbp_gap[search->first_context], search->searchsp_eff);
9975 */
9976 			if (gap_align->score >= min_score)
9977 			{	/* Found one, stop looking. */
9978 				hitlist->further_process = TRUE;
9979 				status = 1;
9980 #ifdef BLAST_COLLECT_STATS
9981 				search->prelim_gap_passed++;
9982 #endif
9983 				break;
9984 			}
9985 		}
9986 	}
9987 
9988 	return status;
9989 }
9990 
9991 /*
9992 	Performs a gapped alignment on the HSP's in a hitlist.
9993 	This is to be used with blastn assuming the database sequence
9994 	will be unpacked on the fly.
9995 	Discards those that do not meet the standard.
9996 */
9997 
9998 Int2 LIBCALL
BlastNTPreliminaryGappedScore(BlastSearchBlkPtr search,Uint1Ptr subject,Int4 subject_length)9999 BlastNTPreliminaryGappedScore (BlastSearchBlkPtr search, Uint1Ptr subject, Int4 subject_length)
10000 
10001 {
10002 	BLAST_HitListPtr hitlist;
10003 	BLAST_HSPPtr hsp;
10004 	BLAST_HSPPtr PNTR hsp_array;
10005 	GapAlignBlkPtr gap_align;
10006 	Int2 status;
10007 	Int4 index;
10008 	Nlm_FloatHi e_value;
10009 	BLAST_ParameterBlkPtr pbp;
10010 	/* AM: To support query concatenation. */
10011 	Int4 query_num;
10012 
10013 	if (search == NULL)
10014 		return -1;
10015 
10016 	pbp = search->pbp;
10017 
10018 	if (search->gap_align == NULL)
10019 	{
10020 		search->gap_align = GapAlignBlkNew(1, 1);
10021 	}
10022 	gap_align = search->gap_align;
10023 
10024 	status = 0;
10025 	hitlist = search->current_hitlist;
10026 	if (hitlist && hitlist->hspcnt > 0)
10027 	{
10028 
10029 		hitlist->hspcnt_max = hitlist->hspcnt;
10030 		hsp_array = hitlist->hsp_array;
10031 
10032 		/* The first HSP has the highest score. */
10033 		hsp = hsp_array[0];
10034 
10035                 /* AM: Changed to support query concatenation. */
10036 		/* The first HSP has the highest score. */
10037 		if( !search->mult_queries )
10038 		  e_value = BlastKarlinStoE_simple(hsp->score, search->sbp->kbp[search->first_context], search->searchsp_eff);
10039                 else
10040 		{
10041 		  /* AM: First determine which query to use, then use the
10042 		         corresponding SearchSpEff element in the call to
10043 		         BlastKarlinStoE_simple() */
10044 		  query_num = GetQueryNum( search->mult_queries,
10045 		                           hsp->query.offset,
10046 					   hsp->query.end,
10047 					   hsp->query.frame );
10048 	       	  e_value = BlastKarlinStoE_simple( hsp->score,
10049 		                                    search->sbp->kbp[search->first_context],
10050 						    search->mult_queries->SearchSpEff[query_num] );
10051 		}
10052 
10053 		if (e_value <= pbp->cutoff_e)
10054 		{
10055 #ifdef BLAST_COLLECT_STATS
10056 			search->prelim_gap_no_contest++;
10057 #endif
10058 			hitlist->further_process = TRUE;
10059 			return 1;
10060 		}
10061 
10062                 gap_align->is_ooframe = pbp->is_ooframe;
10063                 gap_align->shift_pen = pbp->shift_pen;
10064 		gap_align->positionBased = search->positionBased;
10065                 gap_align->discontinuous = pbp->discontinuous;
10066 		gap_align->include_query = 0;
10067 		gap_align->gap_open = pbp->gap_open;
10068 		gap_align->gap_extend = pbp->gap_extend;
10069                 gap_align->decline_align = pbp->decline_align;
10070 		gap_align->x_parameter = pbp->gap_x_dropoff;
10071 		gap_align->matrix = search->sbp->matrix;
10072 		gap_align->posMatrix = search->sbp->posMatrix;
10073 		for (index=0; index<hitlist->hspcnt; index++)
10074 		{
10075 			hsp = hsp_array[index];
10076 
10077 			if (hsp->score < search->pbp->gap_trigger)
10078 			{	/* Stop looking, we're below the cutoff. */
10079 				status = 0;
10080 				break;
10081 			}
10082 
10083 #ifdef BLAST_COLLECT_STATS
10084 				search->prelim_gap_attempts++;
10085 #endif
10086 			gap_align->score = 0;
10087 			gap_align->query = search->context[hsp->context].query->sequence;
10088 			gap_align->subject = subject;
10089 			gap_align->query_length = search->context[hsp->context].query->length;
10090 			gap_align->subject_length = subject_length;
10091 			gap_align->include_query = 0;
10092 			gap_align->q_start = hsp->query.gapped_start;
10093 			gap_align->s_start = hsp->subject.gapped_start;
10094 	/* Perform only if the query's required start corresponds to a point after the start of the subject. */
10095 			if (gap_align->s_start >= 0) {
10096                            if (!PerformNtGappedAlignment(gap_align))
10097                               return -1;
10098                         }
10099 
10100 			/* AM: Change to support query concatenation */
10101 			if( !search->mult_queries )
10102 			  e_value = BlastKarlinStoE_simple(gap_align->score, search->sbp->kbp[search->first_context], search->searchsp_eff);
10103                         else
10104 			{
10105 			  query_num = GetQueryNum( search->mult_queries,
10106 			                           hsp->query.offset,
10107 						   hsp->query.end,
10108 						   hsp->query.frame );
10109 			  e_value = BlastKarlinStoE_simple(gap_align->score,
10110 			                                   search->sbp->kbp[search->first_context],
10111 							   search->mult_queries->SearchSpEff[query_num]);
10112 			}
10113 
10114 			if (e_value <= pbp->cutoff_e)
10115 			{	/* Found one, stop looking. */
10116 				hitlist->further_process = TRUE;
10117 				status = 1;
10118 #ifdef BLAST_COLLECT_STATS
10119 				search->prelim_gap_passed++;
10120 #endif
10121 				break;
10122 			}
10123 		}
10124 	}
10125 
10126 	return status;
10127 }
10128 
10129 /*
10130 	Performs a gapped alignment on the HSP's in a hitlist.
10131 	Discards those that do not meet the standard.
10132 	Do this by obtaining the sequence from readdb and calling
10133 	BlastGetGappedScore.
10134 */
10135 
10136 Int2 LIBCALL
BlastGetGappedScoreWithReaddb(BlastSearchBlkPtr search,Int4 sequence_number)10137 BlastGetGappedScoreWithReaddb (BlastSearchBlkPtr search, Int4 sequence_number)
10138 
10139 {
10140 	BLAST_HitListPtr hitlist;
10141 	Int2 retval;
10142 	Int4 subject_length;
10143 	Uint1Ptr subject;
10144 
10145 	if (search == NULL)
10146 		return 1;
10147 
10148 	retval=0;
10149 	hitlist = search->current_hitlist;
10150 	if (hitlist && hitlist->hspcnt > 0)
10151 	{
10152 		if (hitlist->further_process == FALSE)
10153 		{
10154 			BlastHitListPurge(hitlist);
10155 			return 0;
10156 		}
10157 		subject_length = readdb_get_sequence(search->rdfp, sequence_number, &subject);
10158 		retval = BlastGetGappedScore(search, subject_length, subject, 0);
10159 	}
10160 
10161 	return retval;
10162 }
10163 
10164 
10165 /*
10166 	Performs a gapped alignment on the HSP's in a hitlist.
10167 	Discards those that do not meet the standard.
10168 */
10169 
10170 Int2 LIBCALL
BlastGetGappedScore(BlastSearchBlkPtr search,Int4 subject_length,Uint1Ptr subject,Int2 frame)10171 BlastGetGappedScore (BlastSearchBlkPtr search, Int4 subject_length, Uint1Ptr subject, Int2 frame)
10172 
10173 {
10174 	BLAST_HitListPtr hitlist;
10175 	BLAST_HSPPtr PNTR hsp_array, PNTR hsp_array_new;
10176 	BLAST_ParameterBlkPtr pbp;
10177 	GapAlignBlkPtr gap_align;
10178 	Int2 status=0;
10179 	Int4 hsp_cnt=0, hspcnt_max;
10180 	Int4 index, index1;
10181 
10182 	if (search == NULL)
10183 		return 1;
10184 
10185 	pbp = search->pbp;
10186 
10187 
10188 	if (search->gap_align == NULL)
10189 	{
10190 		search->gap_align = GapAlignBlkNew(1, 1);
10191 	}
10192 	gap_align = search->gap_align;
10193 
10194 	hitlist = search->current_hitlist;
10195 	if (hitlist && hitlist->hspcnt > 0)
10196 	{
10197 		if (hitlist->further_process == FALSE)
10198 		{
10199 			BlastHitListPurge(hitlist);
10200 			return 0;
10201 		}
10202 
10203 
10204 		hsp_array = hitlist->hsp_array;
10205 		if (hitlist->hspcnt != hitlist->hspcnt_max)
10206 		{
10207 /* Save HSP's again, discarding those that have been NULLed out. */
10208 			hsp_array_new = MemNew((hitlist->hspmax)*sizeof(BLAST_HSPPtr));
10209 			index1 = 0;
10210 			for (index=0; index<hitlist->hspcnt_max; index++)
10211 			{
10212 				if (hsp_array[index] != NULL)
10213 				{
10214 					hsp_array_new[index1] = hsp_array[index];
10215 					index1++;
10216 				}
10217 			}
10218 			hsp_array = MemFree(hsp_array);
10219 			hsp_array = hsp_array_new;
10220 			hitlist->hsp_array = hsp_array_new;
10221 			hitlist->hspcnt = index1;
10222 			hitlist->hspcnt_max = index1;
10223 		}
10224 
10225                 gap_align->is_ooframe = pbp->is_ooframe;
10226                 gap_align->shift_pen = pbp->shift_pen;
10227                 gap_align->discontinuous = pbp->discontinuous;
10228 		gap_align->positionBased =
10229                    (search->positionBased && search->sbp->posMatrix);
10230 		gap_align->include_query = 0;
10231 		gap_align->gap_open = pbp->gap_open;
10232 		gap_align->gap_extend = pbp->gap_extend;
10233                 gap_align->decline_align = pbp->decline_align;
10234 		gap_align->x_parameter = pbp->gap_x_dropoff;
10235 		gap_align->matrix = search->sbp->matrix;
10236 		gap_align->posMatrix = search->sbp->posMatrix;
10237 
10238 		if (frame != 0)
10239 		{
10240 			hsp_array = hitlist->hsp_array;
10241 			switch (frame) {
10242 			case -3:
10243 				HeapSort(hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr), frame_compare_hsp_m3);
10244 				break;
10245 			case -2:
10246 				HeapSort(hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr), frame_compare_hsp_m2);
10247 				break;
10248 			case -1:
10249 				HeapSort(hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr), frame_compare_hsp_m1);
10250 				break;
10251 			case 1:
10252 				HeapSort(hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr), frame_compare_hsp_p1);
10253 				break;
10254 			case 2:
10255 				HeapSort(hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr), frame_compare_hsp_p2);
10256 				break;
10257 			case 3:
10258 				HeapSort(hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr), frame_compare_hsp_p3);
10259 				break;
10260 			default:
10261 				break;
10262 			}
10263 
10264 			for (index=0; index<hitlist->hspcnt; index++)
10265 			{
10266 				if (hsp_array[index]->subject.frame != frame)
10267 					break;
10268 			}
10269 			HeapSort(hsp_array,index,sizeof(BLAST_HSPPtr), score_compare_hsps);
10270 		}
10271 		else
10272 		{
10273 			HeapSort(hsp_array,hitlist->hspcnt,sizeof(BLAST_HSPPtr), score_compare_hsps);
10274 		}
10275 		hitlist->hspcnt_max = hitlist->hspcnt;
10276 		hsp_array = hitlist->hsp_array;
10277 		hspcnt_max = hitlist->hspcnt;
10278 		hsp_cnt = hitlist->hspcnt;
10279 
10280 start_timer;
10281 		hsp_array = BlastGappedScoreInternal(search, subject, subject_length, gap_align, hsp_array, &hsp_cnt, &hspcnt_max, hitlist->hspmax, frame);
10282 stop_timer("after BlastGappedScoreInternal");
10283 		hitlist->hspcnt = hsp_cnt;
10284 		hitlist->hspcnt_max = hspcnt_max;
10285 		hitlist->hsp_array = hsp_array;
10286 	}
10287 
10288 	return status;
10289 }
10290 
10291 /*
10292 	Performs a gapped alignment on the HSP's in a hitlist.
10293 	Discards those that do not meet the standard.
10294 */
10295 
10296 Int2 LIBCALL
BlastNTGetGappedScore(BlastSearchBlkPtr search,Int4 subject_length,Uint1Ptr subject)10297 BlastNTGetGappedScore (BlastSearchBlkPtr search, Int4 subject_length, Uint1Ptr subject)
10298 
10299 {
10300 	BLAST_HitListPtr hitlist;
10301 	BLAST_HSPPtr PNTR hsp_array, PNTR hsp_array_new;
10302 	BLAST_ParameterBlkPtr pbp;
10303 	GapAlignBlkPtr gap_align;
10304 	Int2 status=0;
10305 	Int4 hsp_cnt=0, hspcnt_max;
10306 	Int4 index, index1;
10307 
10308 	if (search == NULL)
10309 		return -1;
10310 
10311 	pbp = search->pbp;
10312 
10313 
10314 	if (search->gap_align == NULL)
10315 	{
10316 		search->gap_align = GapAlignBlkNew(1, 1);
10317 	}
10318 	gap_align = search->gap_align;
10319 
10320 	hitlist = search->current_hitlist;
10321 	if (hitlist && hitlist->hspcnt > 0)
10322  	{
10323                 if (hitlist->further_process == FALSE)
10324                 {
10325                         BlastHitListPurge(hitlist);
10326                         return 0;
10327                 }
10328 
10329 		hsp_array = hitlist->hsp_array;
10330 		if (hitlist->hspcnt != hitlist->hspcnt_max)
10331 		{
10332 /* Save HSP's again, discarding those that have been NULLed out. */
10333 			hsp_array_new = MemNew((hitlist->hspmax)*sizeof(BLAST_HSPPtr));
10334 			index1 = 0;
10335 			for (index=0; index<hitlist->hspcnt_max; index++)
10336 			{
10337 				if (hsp_array[index] != NULL)
10338 				{
10339 					hsp_array_new[index1] = hsp_array[index];
10340 					index1++;
10341 				}
10342 			}
10343 			hsp_array = MemFree(hsp_array);
10344 			hsp_array = hsp_array_new;
10345 			hitlist->hsp_array = hsp_array_new;
10346 			hitlist->hspcnt = index1;
10347 			hitlist->hspcnt_max = index1;
10348 		}
10349 
10350                 gap_align->is_ooframe = pbp->is_ooframe;
10351                 gap_align->shift_pen = pbp->shift_pen;
10352                 gap_align->discontinuous = pbp->discontinuous;
10353 		gap_align->positionBased = search->positionBased;
10354 		gap_align->include_query = 0;
10355 		gap_align->gap_open = pbp->gap_open;
10356 		gap_align->gap_extend = pbp->gap_extend;
10357                 gap_align->decline_align = pbp->decline_align;
10358 		gap_align->x_parameter = pbp->gap_x_dropoff;
10359 		gap_align->matrix = search->sbp->matrix;
10360 		gap_align->posMatrix = search->sbp->posMatrix;
10361 
10362 		HeapSort(hsp_array,hitlist->hspcnt,sizeof(BLAST_HSPPtr), score_compare_hsps);
10363 		hitlist->hspcnt_max = hitlist->hspcnt;
10364 		hsp_array = hitlist->hsp_array;
10365 		hspcnt_max = hitlist->hspcnt;
10366 		hsp_cnt = hitlist->hspcnt;
10367 
10368 		if (!BlastNtGappedScoreInternal(search, subject, subject_length, gap_align, hsp_array, &hsp_cnt, &hspcnt_max, hitlist->hspmax))
10369                    /* Gapped extension failed */
10370                    return -1;
10371 		hitlist->hspcnt = hsp_cnt;
10372 		hitlist->hspcnt_max = hspcnt_max;
10373 		hitlist->hsp_array = hsp_array;
10374 	}
10375 
10376 	return status;
10377 }
10378 
10379 
10380 /******************************************************************
10381 
10382 	Purges (i.e., cleans) the HitList for reuse.
10383 
10384 *******************************************************************/
10385 
10386 Int2 LIBCALL
BlastHitListPurge(BLAST_HitListPtr hitlist)10387 BlastHitListPurge(BLAST_HitListPtr hitlist)
10388 
10389 {
10390 	BLAST_HSPPtr PNTR hsp_array;
10391 	Int4 hspcnt_max, index;
10392 
10393 	if (hitlist == NULL)
10394 		return 1;
10395 
10396 	hsp_array = hitlist->hsp_array;
10397 
10398 	if (hitlist->hspcnt > hitlist->hspcnt_max)
10399 		hspcnt_max = hitlist->hspcnt;
10400 	else
10401 		hspcnt_max = hitlist->hspcnt_max;
10402 
10403 	for (index=0; index<hspcnt_max; index++) {
10404            hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
10405         }
10406 
10407 	hitlist->hspcnt = 0;
10408 	hitlist->hspcnt_max = 0;
10409 	hitlist->further_process = FALSE;
10410 
10411 	return 0;
10412 }
10413 
10414 /*
10415 	Cleans out the NULLed out HSP's from the HSP array,
10416 	moving the BLAST_HSPPtr's up to fill in the gaps.
10417 
10418 	returns the number of valid HSP's.
10419 */
10420 
10421 Int4 LIBCALL
HspArrayPurge(BLAST_HSPPtr PNTR hsp_array,Int4 hspcnt,Boolean clear_num)10422 HspArrayPurge (BLAST_HSPPtr PNTR hsp_array, Int4 hspcnt, Boolean clear_num)
10423 
10424 {
10425 	Int4 index, index1;
10426 
10427 	if (hspcnt == 0 || hsp_array == NULL)
10428 		return 0;
10429 
10430 	index1 = 0;
10431 	for (index=0; index<hspcnt; index++)
10432 	{
10433 		if (hsp_array[index] != NULL)
10434 		{
10435 			hsp_array[index1] = hsp_array[index];
10436 			if (clear_num)
10437 				hsp_array[index1]->num = 0;
10438 			index1++;
10439 		}
10440 	}
10441 
10442 	for (index=index1; index<hspcnt; index++)
10443 	{
10444 		hsp_array[index] = NULL;
10445 	}
10446 
10447 	hspcnt = index1;
10448 
10449 	return index1;
10450 }
10451 
10452 
/*
	Rescales the query coordinates of an HSP by CODON_LENGTH and shifts
	them by (|query.frame| - 1).  query.offset, query.end and
	query.gapped_start are converted; query.length is then recomputed
	from the new offset and end.  Used by the out-of-frame path of
	BlastSaveCurrentHsp.

	The "length" argument is not used by this routine.
*/
static void OOF_TranslateHspToDNAP(BLAST_HSPPtr hspp, Int4 length)
{
    Int4 prot_from = hspp->query.offset;
    Int4 prot_to = hspp->query.end;
    Int4 frame = abs(hspp->query.frame);

    hspp->query.offset = CODON_LENGTH*prot_from + frame - 1;
    hspp->query.end = CODON_LENGTH*prot_to + frame - 1;

    /* Must follow the two assignments above: uses the converted values. */
    hspp->query.length = hspp->query.end - hspp->query.offset + 1;

    hspp->query.gapped_start = CODON_LENGTH*hspp->query.gapped_start + frame - 1;
}
10468 /**************************************************************************
10469 *
10470 *	Save the current HSP in the appropriate ranking.
10471 *
10472 **************************************************************************/
10473 
10474 #define BLAST_HSP_ADD 100
10475 
10476 void
BlastSaveCurrentHsp(BlastSearchBlkPtr search,BLAST_Score score,Int4 q_offset,Int4 s_offset,Int4 length,Int2 context)10477 BlastSaveCurrentHsp(BlastSearchBlkPtr search, BLAST_Score score, Int4 q_offset, Int4 s_offset, Int4 length, Int2 context)
10478 
10479 {
10480 	BLAST_HitListPtr current_hitlist;
10481 	BLAST_HSPPtr PNTR hsp_array, new_hsp;
10482 	Int4 hspcnt, hspmax, high_index, low_index;
10483 
10484 	current_hitlist = search->current_hitlist;
10485 	hsp_array = current_hitlist->hsp_array;
10486 	hspcnt = current_hitlist->hspcnt;
10487 	hspmax = current_hitlist->hspmax;
10488 
10489         /* Check if need to create a new list */
10490         if (hspmax == 0 && current_hitlist->do_not_reallocate == FALSE) {
10491            hsp_array = (BLAST_HSPPtr PNTR) Malloc(BLAST_HSP_ADD*sizeof(BLAST_HSPPtr));
10492            hspmax = current_hitlist->hspmax = BLAST_HSP_ADD;
10493         }
10494 
10495 	/* Check if list is already full, then reallocate. */
10496 	if (hspcnt >= hspmax && current_hitlist->do_not_reallocate == FALSE)
10497 	{
10498 		hsp_array = (BLAST_HSPPtr PNTR) Realloc(hsp_array, current_hitlist->hspmax*2*sizeof(BLAST_HSPPtr));
10499 		if (hsp_array == NULL)
10500 		{
10501 			ErrPostEx(SEV_WARNING, 0, 0, "UNABLE to reallocate in BlastSaveCurrentHsp for ordinal id %ld, continuing with fixed array of %ld HSP's", (long) search->subject_id, (long) hspmax);
10502 			current_hitlist->do_not_reallocate = TRUE;
10503 		}
10504 		else
10505 		{
10506 			current_hitlist->hsp_array = hsp_array;
10507 			current_hitlist->hspmax *= 2;
10508 			hspmax = current_hitlist->hspmax;
10509 			/* Prohibit future allocations. */
10510 			if (search->pbp->hsp_num_max != 0 && current_hitlist->hspmax >= search->pbp->hsp_num_max)
10511 			{
10512 				ErrPostEx(SEV_WARNING, 0, 0, "Reached max %ld HSPs in BlastSaveCurrentHsp, continuing with this limit",
10513 					(long) hspmax);
10514 				current_hitlist->do_not_reallocate = TRUE;
10515               			/* HSPs must be now sorted */
10516               			HeapSort(hsp_array, hspcnt, sizeof(BLAST_HSPPtr), score_compare_hsps);
10517 			}
10518 		}
10519 	}
10520 
10521 	new_hsp = (BLAST_HSPPtr) MemNew(sizeof(BLAST_HSP));
10522 	new_hsp->score = score;
10523 	new_hsp->query.offset = q_offset;
10524 	new_hsp->subject.offset = s_offset;
10525 	new_hsp->query.length = length;
10526 	new_hsp->subject.length = length;
10527 	new_hsp->query.end = q_offset + length;
10528 	new_hsp->subject.end = s_offset + length;
10529 	new_hsp->context = context;
10530 	new_hsp->query.frame = ContextToFrame(search, context);
10531 	new_hsp->subject.frame = search->subject->frame;
10532 
10533         /* HACK */
10534         new_hsp->query.gapped_start = q_offset;
10535         new_hsp->subject.gapped_start = s_offset;
10536 
10537         /* For out-of frame gapping - subject is protein
10538            and query is DNA translated into 3 frames
10539            so we have to adjust DNA sequence and
10540            coordinates */
10541 
10542         if(search->pbp->is_ooframe) {
10543             OOF_TranslateHspToDNAP(new_hsp, new_hsp->query.frame > 0 ? search->query_dnap[0]->length : search->query_dnap[1]->length);
10544         }
10545 
10546         if (!search->pbp->gapped_calculation &&
10547             search->prog_number != blast_type_blastn) {
10548            Int4 align_length;
10549            BlastHSPGetNumIdentical(search, new_hsp, NULL, &new_hsp->num_ident,
10550               &align_length);
10551         }
10552 
10553 /* If we are saving ALL HSP's, simply save and sort later. */
10554 	if (current_hitlist->do_not_reallocate == FALSE)
10555 	{
10556 		hsp_array[current_hitlist->hspcnt] = new_hsp;
10557 		(current_hitlist->hspcnt)++;
10558 		return;
10559 	}
10560 
10561 /* Use a binary search to insert the HSP. */
10562         low_index  = 0;
10563         high_index = hspcnt;
10564         while(low_index < high_index) {
10565             Int4 next_index = (low_index + high_index)/2;
10566             if( score_compare_hsps(&new_hsp, &hsp_array[next_index]) > 0 ) {
10567                 low_index = next_index  + 1;
10568             } else {
10569                 high_index = next_index;
10570             }
10571         }
10572 
10573 	if (hspcnt >= hspmax)
10574 	{
10575             if (low_index >= hspcnt) {
10576                 /* this HSP is less significant than others on a full list.*/
10577                 new_hsp = BLAST_HSPFree(new_hsp);
10578                 return;
10579             } else {
10580                 /* Delete the last HPS on the list. */
10581                 hspcnt = --current_hitlist->hspcnt;
10582                 hsp_array[hspcnt] = BLAST_HSPFree(hsp_array[hspcnt]);
10583             }
10584 	}
10585         /* Move existing elements out of the way */
10586         Nlm_MemMove(&hsp_array[low_index] + 1, &hsp_array[low_index],
10587                     (hspcnt-low_index)*sizeof(hsp_array[0]));
10588         hspcnt = ++current_hitlist->hspcnt;
10589 
10590         /* Insert the new HSP */
10591         hsp_array[low_index] = new_hsp;
10592 	return;
10593 }
10594 
10595 void
BlastSaveCurrentHspGapped(BlastSearchBlkPtr search,BLAST_Score score,Int4 q_offset,Int4 s_offset,Int4 q_length,Int4 s_length,Int2 context,GapXEditScriptPtr esp)10596 BlastSaveCurrentHspGapped(BlastSearchBlkPtr search, BLAST_Score score,
10597 			  Int4 q_offset, Int4 s_offset, Int4 q_length,
10598 			  Int4 s_length, Int2 context, GapXEditScriptPtr esp)
10599 {
10600    BlastNtSaveCurrentHspGapped(search, score, q_offset, s_offset, q_length,
10601                                s_length, q_offset, s_offset, context, esp);
10602 
10603 }
10604 
10605 void
BlastNtSaveCurrentHspGapped(BlastSearchBlkPtr search,BLAST_Score score,Int4 q_offset,Int4 s_offset,Int4 q_length,Int4 s_length,Int4 q_gapped_start,Int4 s_gapped_start,Int2 context,GapXEditScriptPtr esp)10606 BlastNtSaveCurrentHspGapped(BlastSearchBlkPtr search, BLAST_Score score,
10607                             Int4 q_offset, Int4 s_offset, Int4 q_length,
10608                             Int4 s_length, Int4 q_gapped_start,
10609                             Int4 s_gapped_start, Int2 context,
10610                             GapXEditScriptPtr esp)
10611 {
10612 	BLAST_HitListPtr current_hitlist;
10613 	BLAST_HSPPtr PNTR hsp_array, new_hsp;
10614 	BLAST_Score highscore, lowscore;
10615 	Int4 hspcnt, hspmax, index, new_index, high_index, old_index, low_index;
10616         Int4 new_hspmax;
10617 
10618 	current_hitlist = search->current_hitlist;
10619 	hsp_array = current_hitlist->hsp_array;
10620 	hspcnt = current_hitlist->hspcnt;
10621 	hspmax = current_hitlist->hspmax;
10622 
10623 	/* Check if list is already full, then reallocate. */
10624 	if (hspcnt >= hspmax && current_hitlist->do_not_reallocate == FALSE)
10625 	{
10626            new_hspmax = 2*current_hitlist->hspmax;
10627            if (search->pbp->hsp_num_max && search->last_context <= 1)
10628               /* The HSP limit can only be applied here in case of a
10629                  single query sequence; even then, save twice as many HSPs
10630                  so far, to accommodate for possible inclusion check
10631                  failures and score changes because of ambiguities */
10632               new_hspmax = MIN(new_hspmax, 2*search->pbp->hsp_num_max);
10633            if (new_hspmax > current_hitlist->hspmax) {
10634 		hsp_array = (BLAST_HSPPtr PNTR) Realloc(hsp_array, current_hitlist->hspmax*2*sizeof(BLAST_HSPPtr));
10635 		if (hsp_array == NULL)
10636 		{
10637 			ErrPostEx(SEV_WARNING, 0, 0, "UNABLE to reallocate in BlastSaveCurrentHsp for ordinal id %ld, continuing with fixed array of %ld HSP's", (long) search->subject_id, (long) hspmax);
10638 			current_hitlist->do_not_reallocate = TRUE;
10639 		}
10640 		else
10641 		{
10642 			current_hitlist->hsp_array = hsp_array;
10643 			current_hitlist->hspmax = new_hspmax;
10644 			hspmax = new_hspmax;
10645                 }
10646            } else {
10647               /*ErrPostEx(SEV_WARNING, 0, 0,
10648                         "Sequence %ld: reached max %ld HSPs",
10649                         search->subject_id, (long) hspmax);*/
10650               current_hitlist->do_not_reallocate = TRUE;
10651            }
10652            if (current_hitlist->do_not_reallocate) {
10653               /* HSPs must be now sorted */
10654               HeapSort(hsp_array, hspcnt, sizeof(BLAST_HSPPtr),
10655                        score_compare_hsps);
10656            }
10657 	}
10658 
10659 	new_hsp = (BLAST_HSPPtr) MemNew(sizeof(BLAST_HSP));
10660 	new_hsp->score = score;
10661 	new_hsp->query.offset = q_offset;
10662 	new_hsp->subject.offset = s_offset;
10663 	new_hsp->query.length = q_length;
10664 	new_hsp->subject.length = s_length;
10665 	new_hsp->query.end = q_offset + q_length;
10666 	new_hsp->subject.end = s_offset + s_length;
10667 	new_hsp->context = context;
10668 	new_hsp->query.frame = ContextToFrame(search, context);
10669 	new_hsp->subject.frame = search->subject->frame;
10670 
10671 	new_hsp->query.gapped_start = q_gapped_start;
10672 	new_hsp->subject.gapped_start = s_gapped_start;
10673 
10674 	if (esp)
10675 	   MegaBlastFillHspGapInfo(new_hsp, esp);
10676 
10677 /* If we are saving ALL HSP's, simply save and sort later. */
10678 	if (current_hitlist->do_not_reallocate == FALSE)
10679 	{
10680 		hsp_array[current_hitlist->hspcnt] = new_hsp;
10681 		(current_hitlist->hspcnt)++;
10682 		return;
10683 	}
10684 
10685 /* Use a binary search to insert the HSP. */
10686 
10687 	if (hspcnt != 0)
10688 	{
10689 		highscore = hsp_array[0]->score;
10690 		lowscore = hsp_array[hspcnt-1]->score;
10691 	}
10692 	else
10693 	{
10694 		highscore = 0;
10695 		lowscore = 0;
10696 	}
10697 
10698 	if (score >= highscore)
10699 	{
10700 		new_index = 0;
10701 	}
10702 	else if (score <= lowscore)
10703 	{
10704 		new_index = hspcnt;
10705 	}
10706 	else
10707 	{
10708 		low_index = 0;
10709 		high_index = hspcnt-1;
10710 		new_index = (low_index+high_index)/2;
10711 		old_index = new_index;
10712 
10713 		for (index=0; index<BLAST_SAVE_ITER_MAX; index++)
10714 		{
10715 			if (score > hsp_array[new_index]->score)
10716 			{
10717 				high_index = new_index;
10718 			}
10719 			else
10720 			{
10721 				low_index = new_index;
10722 			}
10723 			new_index = (low_index+high_index)/2;
10724                         if (new_index == old_index)
10725                         { /* Perform this check as new_index get rounded DOWN a
10726 bove.*/
10727                                 if (score < hsp_array[new_index]->score)
10728                                 {
10729                                         new_index++;
10730                                 }
10731                                 break;
10732                         }
10733                         old_index = new_index;
10734 		}
10735 	}
10736 
10737 	if (hspcnt >= hspmax)
10738 	{
10739 		if (new_index >= hspcnt)
10740 		{ /* this HSP is less significant than others on a full list.*/
10741                    new_hsp = BLAST_HSPFree(new_hsp);
10742                    return;
10743 		}
10744 		else
10745 		{ /* Delete the last HSP on the list. */
10746                    hspcnt = --current_hitlist->hspcnt;
10747                    hsp_array[hspcnt] = BLAST_HSPFree(hsp_array[hspcnt]);
10748 		}
10749 	}
10750 	current_hitlist->hspcnt++;
10751 	Nlm_MemMove((hsp_array+new_index+1), (hsp_array+new_index), (hspcnt-new_index)*sizeof(hsp_array[0]));
10752 	hsp_array[new_index] = new_hsp;
10753 
10754 	return;
10755 }
10756 
10757 void
BlastNtSaveCurrentHsp(BlastSearchBlkPtr search,BLAST_Score score,Int4 q_offset,Int4 s_offset,Int4 length,Int2 context,Int4 query_gap_start,Int4 subject_gap_start)10758 BlastNtSaveCurrentHsp(BlastSearchBlkPtr search, BLAST_Score score, Int4 q_offset, Int4 s_offset, Int4 length, Int2 context, Int4 query_gap_start, Int4 subject_gap_start)
10759 
10760 {
10761 	BLAST_HitListPtr current_hitlist;
10762 	BLAST_HSPPtr PNTR hsp_array, PNTR hsp_array_new, new_hsp;
10763 	BLAST_Score highscore, lowscore;
10764 	Int4 hspcnt, hspmax, index, new_index, high_index, old_index, low_index;
10765 
10766 	current_hitlist = search->current_hitlist;
10767 	hsp_array = current_hitlist->hsp_array;
10768 	hspcnt = current_hitlist->hspcnt;
10769 	hspmax = current_hitlist->hspmax;
10770 
10771 
10772 	/* Check if list is already full, then reallocate. */
10773 	if (hspcnt >= hspmax && current_hitlist->do_not_reallocate == FALSE)
10774 	{
10775 		hsp_array_new = (BLAST_HSPPtr PNTR) MemNew((current_hitlist->hspmax+BLAST_HSP_ADD)*sizeof(BLAST_HSPPtr));
10776 		if (hsp_array_new == NULL)
10777 		{
10778 			ErrPostEx(SEV_WARNING, 0, 0, "UNABLE to reallocate in BlastSaveCurrentHsp for ordinal id %ld, continuing with fixed array of %ld HSP's", (long) search->subject_id, (long) hspmax);
10779 			current_hitlist->do_not_reallocate = TRUE;
10780 		}
10781 		else
10782 		{
10783 			Nlm_MemCopy(hsp_array_new, hsp_array, current_hitlist->hspmax*sizeof(BLAST_HSPPtr));
10784 			current_hitlist->hsp_array = MemFree(current_hitlist->hsp_array);
10785 			current_hitlist->hsp_array = hsp_array_new;
10786 			current_hitlist->hspmax += BLAST_HSP_ADD;
10787 			hspmax = current_hitlist->hspmax;
10788 			hsp_array = hsp_array_new;
10789 			/* Prohibit future allocations. */
10790 			if (search->pbp->hsp_num_max != 0 && current_hitlist->hspmax >= 2*search->pbp->hsp_num_max)
10791 			{
10792 				ErrPostEx(SEV_WARNING, 0, 0, "Reached max %ld HSPs in BlastSaveCurrentHsp, continuing with this limit",
10793 					(long) hspmax);
10794 				current_hitlist->do_not_reallocate = TRUE;
10795 			}
10796 		}
10797                 if (current_hitlist->do_not_reallocate) {
10798                    HeapSort(hsp_array, hspcnt, sizeof(BLAST_HSPPtr),
10799                             score_compare_hsps);
10800                 }
10801 	}
10802 
10803 	new_hsp = (BLAST_HSPPtr) MemNew(sizeof(BLAST_HSP));
10804 	new_hsp->score = score;
10805 	new_hsp->query.offset = q_offset;
10806 	new_hsp->subject.offset = s_offset;
10807 	new_hsp->query.length = length;
10808 	new_hsp->subject.length = length;
10809 	new_hsp->query.end = q_offset + length;
10810 	new_hsp->subject.end = s_offset + length;
10811 	new_hsp->context = context;
10812 	new_hsp->query.frame = ContextToFrame(search, context);
10813 	new_hsp->subject.frame = search->subject->frame;
10814 
10815     new_hsp->query.gapped_start   = query_gap_start;
10816     new_hsp->subject.gapped_start = subject_gap_start;
10817 
10818 /* If we are saving ALL HSP's, simply save and sort later. */
10819 	if (current_hitlist->do_not_reallocate == FALSE)
10820 	{
10821 		hsp_array[current_hitlist->hspcnt] = new_hsp;
10822 		(current_hitlist->hspcnt)++;
10823 		return;
10824 	}
10825 
10826 /* Use a binary search to insert the HSP. */
10827 
10828 	if (hspcnt != 0)
10829 	{
10830 		highscore = hsp_array[0]->score;
10831 		lowscore = hsp_array[hspcnt-1]->score;
10832 	}
10833 	else
10834 	{
10835 		highscore = 0;
10836 		lowscore = 0;
10837 	}
10838 
10839 	if (score >= highscore)
10840 	{
10841 		new_index = 0;
10842 	}
10843 	else if (score <= lowscore)
10844 	{
10845 		new_index = hspcnt;
10846 	}
10847 	else
10848 	{
10849 		low_index = 0;
10850 		high_index = hspcnt-1;
10851 		new_index = (low_index+high_index)/2;
10852 		old_index = new_index;
10853 
10854 		for (index=0; index<BLAST_SAVE_ITER_MAX; index++)
10855 		{
10856 			if (score > hsp_array[new_index]->score)
10857 			{
10858 				high_index = new_index;
10859 			}
10860 			else
10861 			{
10862 				low_index = new_index;
10863 			}
10864 			new_index = (low_index+high_index)/2;
10865                         if (new_index == old_index)
10866                         { /* Perform this check as new_index get rounded DOWN a
10867 bove.*/
10868                                 if (score < hsp_array[new_index]->score)
10869                                 {
10870                                         new_index++;
10871                                 }
10872                                 break;
10873                         }
10874                         old_index = new_index;
10875 		}
10876 	}
10877 
10878 	if (hspcnt >= hspmax)
10879 	{
10880 		if (new_index >= hspcnt)
10881 		{ /* this HSP is less significant than others on a full list.*/
10882 			new_hsp = MemFree(new_hsp);
10883 			return;
10884 		}
10885 		else
10886 		{ /* Delete the last HPS on the list. */
10887 			hspcnt = --current_hitlist->hspcnt;
10888 			hsp_array[hspcnt] = BLAST_HSPFree(hsp_array[hspcnt]);
10889 		}
10890 	}
10891 	current_hitlist->hspcnt++;
10892 	Nlm_MemMove((hsp_array+new_index+1), (hsp_array+new_index), (hspcnt-new_index)*sizeof(hsp_array[0]));
10893 	hsp_array[new_index] = new_hsp;
10894 
10895 	return;
10896 }
10897 
10898 Uint1Ptr
GetSequenceWithDenseSeg(DenseSegPtr dsp,Boolean query,Int4Ptr start,Int4Ptr length)10899 GetSequenceWithDenseSeg(DenseSegPtr dsp, Boolean query, Int4Ptr start, Int4Ptr length)
10900 
10901 {
10902 	BioseqPtr bsp;
10903 	Int4 index, offset;
10904 	SeqIdPtr id;
10905 	SeqPortPtr spp;
10906 	Uint1Ptr buffer;
10907 	Boolean startSet = FALSE;
10908 
10909 	if (dsp == NULL)
10910 		return NULL;
10911 
10912 	if (query == TRUE)
10913 	{
10914 		offset = 0;
10915 		id = dsp->ids;
10916 	}
10917 	else
10918 	{
10919 		offset = 1;
10920 		id = dsp->ids->next;
10921 	}
10922 
10923 	*length = 0;
10924 	for (index=0; index<dsp->numseg; index++)
10925 	{
10926 	  if (dsp->starts[offset+2*index] != -1) {
10927 	    *length += dsp->lens[index];
10928 	    if (!startSet) {
10929 	      *start = dsp->starts[offset + 2*index];
10930 	      startSet = TRUE;
10931 	    }
10932 	  }
10933 	}
10934 
10935 	bsp = BioseqLockById(id);
10936     if (bsp == NULL) {
10937         Char buf[1024];
10938         StringCpy(buf, "Failed to retrieve sequence ");
10939         SeqIdWrite(id, &buf[StringLen(buf)], PRINTID_FASTA_LONG,
10940                    sizeof(buf)-StringLen(buf)-1);
10941         ErrPostEx(SEV_WARNING, 0, 0, buf);
10942         return NULL;
10943     }
10944 
10945 	spp = SeqPortNew(bsp, *start, (*start)+(*length)-1, Seq_strand_unknown, Seq_code_ncbistdaa);
10946 
10947 	buffer = MemNew((*length)*sizeof(Uint1));
10948 
10949 	for (index=0; index<*length; index++)
10950 		buffer[index] = SeqPortGetResidue(spp);
10951 
10952 	spp = SeqPortFree(spp);
10953 	BioseqUnlock(bsp);
10954 
10955 	return buffer;
10956 }
10957 
10958 /*
10959 Produces a 'fake' BioseqPtr, for use with BLAST when the
10960 ID of the original BioseqPtr cannot be trusted.  Note that
10961 the ID of the original BioseqPtr is removed.
10962 */
10963 
10964 BioseqPtr LIBCALL
BlastMakeFakeBioseq(BioseqPtr bsp,CharPtr name)10965 BlastMakeFakeBioseq(BioseqPtr bsp, CharPtr name)
10966 
10967 {
10968 	BioseqPtr fake_bsp;
10969 	ObjectIdPtr obidp;
10970 
10971 	if (bsp == NULL)
10972 		return NULL;
10973 
10974         fake_bsp = BioseqNew();
10975         fake_bsp->descr = bsp->descr;
10976         fake_bsp->repr = bsp->repr;
10977         fake_bsp->mol = bsp->mol;
10978         fake_bsp->length = bsp->length;
10979         fake_bsp->strand = bsp->strand;
10980         fake_bsp->seq_data_type = bsp->seq_data_type;
10981         fake_bsp->seq_ext_type = bsp->seq_ext_type;
10982         fake_bsp->seq_data = bsp->seq_data;
10983         fake_bsp->seq_ext = bsp->seq_ext;
10984 
10985         obidp = ObjectIdNew();
10986 	if (name)
10987    	     obidp->str = StringSave(name);
10988 	else
10989    	     obidp->str = StringSave("QUERY");
10990         ValNodeAddPointer(&(fake_bsp->id), SEQID_LOCAL, obidp);
10991 
10992         SeqMgrAddToBioseqIndex (fake_bsp);
10993 
10994 	return fake_bsp;
10995 }
10996 
10997 BioseqPtr LIBCALL
BlastDeleteFakeBioseq(BioseqPtr fake_bsp)10998 BlastDeleteFakeBioseq(BioseqPtr fake_bsp)
10999 
11000 {
11001 	if (fake_bsp == NULL)
11002 		return NULL;
11003 
11004          fake_bsp->descr = NULL;
11005          fake_bsp->length = 0;
11006          fake_bsp->seq_data = NULL;
11007          fake_bsp->seq_ext = NULL;
11008 
11009 	return BioseqFree(fake_bsp);
11010 }
11011 
11012 /* Comparison function for sorting gi list */
Int4Compare(const void * i,const void * j)11013 static int Int4Compare(const void* i, const void* j)
11014 {
11015   if (*(Int4Ptr)i > *(Int4Ptr)j)
11016     return (1);
11017   if (*(Int4Ptr)i < *(Int4Ptr)j)
11018     return (-1);
11019   return (0);
11020 }
11021 
11022 /* Remove hits from a SeqAlignPtr that are not from a gi list. The function
11023  * is optimized with an assumption that the incoming gi list is not sorted.
11024  * Since sorting of the gi list may be expensive, the hit gis are found
11025  * and sorted. Then for each gi in the (presumably large) incoming gi list,
11026  * a binary search is performed to check if it is present in the list of hit
11027  * gis. This procedure is linear in the gi list size.
11028  */
11029 SeqAlignPtr
BlastPruneSeqAlignByGiList(SeqAlignPtr seqalign,Int4Ptr gi_list,Int4 gi_list_total,Int4 hitlist_size)11030 BlastPruneSeqAlignByGiList(SeqAlignPtr seqalign, Int4Ptr gi_list,
11031                            Int4 gi_list_total, Int4 hitlist_size)
11032 {
11033    SeqAlignPtr head = NULL, last_sap = NULL, next_sap, sap;
11034    SeqIdPtr sip;
11035    BioseqPtr bsp;
11036    Int4 gi = 0, index;
11037    Int4* hit_gis;
11038    Int4 num_hit_gis, gi_index;
11039    Boolean* good_gis;
11040    Boolean good_gi = FALSE;
11041 
11042    if (!gi_list || gi_list_total <= 0)
11043       return NULL;
11044 
11045    /* If the size of the gi list is small, sort it and use a different
11046       routine, which takes a sorted list argument. */
11047 
11048    if (hitlist_size >= gi_list_total) {
11049       qsort((void*)gi_list, gi_list_total, sizeof(Int4), Int4Compare);
11050       return BlastPruneSeqAlignBySortedGiList(seqalign, gi_list,
11051                                               gi_list_total);
11052    }
11053 
11054    hit_gis = (Int4*) MemNew(hitlist_size*sizeof(Int4));
11055 
11056    gi = 0;
11057    index = 0;
11058    /* Find all subject gis in the Seq-align chain */
11059    for (sap = seqalign; sap; sap = sap->next) {
11060       sip = SeqAlignId(sap, 1);
11061       if (sip->choice != SEQID_GI) {
11062          bsp = BioseqLockById(sip);
11063          if (bsp) {
11064             sip = SeqIdFindBest(bsp->id, SEQID_GI);
11065             BioseqUnlock(bsp);
11066          }
11067       }
11068       if (sip->choice == SEQID_GI) {
11069          /* Save this gi if the previous value of gi
11070             is different from the current value. */
11071          if (gi != sip->data.intvalue) {
11072            gi = sip->data.intvalue;
11073            hit_gis[index] = gi;
11074            ++index;
11075          }
11076       }
11077    }
11078    num_hit_gis = index;
11079    qsort((void*)hit_gis, num_hit_gis, sizeof(Int4),
11080          Int4Compare);
11081    good_gis = (Boolean*) MemNew(num_hit_gis*sizeof(Boolean));
11082 
11083    for (index = 0; index < gi_list_total; ++index) {
11084       gi_index = BinarySearchInt4(gi_list[index], hit_gis, num_hit_gis);
11085       if (hit_gis[gi_index] == gi_list[index])
11086          good_gis[gi_index] = TRUE;
11087    }
11088 
11089    for (sap = seqalign; sap; sap = next_sap) {
11090       next_sap = sap->next;
11091       sip = SeqAlignId(sap, 1);
11092       if (sip->choice != SEQID_GI) {
11093          bsp = BioseqLockById(sip);
11094          if (bsp) {
11095             sip = SeqIdFindBest(bsp->id, SEQID_GI);
11096             BioseqUnlock(bsp);
11097          }
11098       }
11099       if (sip->choice == SEQID_GI) {
11100          /* Do the following check only if the previous value of gi
11101             is different from the current value. */
11102          if (gi != sip->data.intvalue) {
11103             gi = sip->data.intvalue;
11104             index = BinarySearchInt4(gi, hit_gis, num_hit_gis);
11105             good_gi = good_gis[index];
11106          }
11107       } else {
11108          good_gi = FALSE;
11109       }
11110       if (good_gi) {
11111          /* Advance the pointer to the last link in the pruned chain to
11112             the current Seq-align. */
11113          if (head == NULL)
11114             head = last_sap = sap;
11115          else {
11116             last_sap = sap;
11117          }
11118       } else {
11119          /* Link last Seq-align in the pruned chain to the next Seq-align
11120             in the original chain. */
11121          if (last_sap)
11122             last_sap->next = sap->next;
11123          sap->next = NULL;
11124          /* Free this Seq-align, since it's no longer needed. */
11125          sap = SeqAlignFree(sap);
11126       }
11127    }
11128 
11129 
11130    return head;
11131 }
11132 
11133 /* Remove hits from a SeqAlignPtr that are not from a sorted gi list.
11134  * No check is made that incoming gi list is sorted. User must make
11135  * sure that it is that way. The pruning is done by a single pass over the
11136  * list of Seq-aligns, in which a binary search is performed for any new
11137  * subject gi to check if it is present in the gi list.
11138  */
11139 SeqAlignPtr
BlastPruneSeqAlignBySortedGiList(SeqAlignPtr seqalign,Int4Ptr gi_list,Int4 gi_list_total)11140 BlastPruneSeqAlignBySortedGiList(SeqAlignPtr seqalign, Int4Ptr gi_list,
11141                                  Int4 gi_list_total)
11142 {
11143    SeqAlignPtr head = NULL, last_sap = NULL, next_sap, sap;
11144    SeqIdPtr sip;
11145    BioseqPtr bsp;
11146    Int4 gi = 0;
11147    Boolean good_gi = FALSE;
11148 
11149    if (!gi_list || gi_list_total <= 0)
11150       return NULL;
11151 
11152    /* Find all subject gis in the Seq-align chain */
11153    for (sap = seqalign; sap; sap = next_sap) {
11154       next_sap = sap->next;
11155       sip = SeqAlignId(sap, 1);
11156       if (sip->choice != SEQID_GI) {
11157          bsp = BioseqLockById(sip);
11158          if (bsp) {
11159             sip = SeqIdFindBest(bsp->id, SEQID_GI);
11160             BioseqUnlock(bsp);
11161          }
11162       }
11163       if (sip->choice == SEQID_GI) {
11164          /* Do the following check only if the previous value of gi is
11165             different from the current value. Otherwise the "good_gi"
11166             variable is left with its previous value. */
11167          if (gi != sip->data.intvalue) {
11168             Int4 index;
11169             gi = sip->data.intvalue;
11170             index = BinarySearchInt4(gi, gi_list, gi_list_total);
11171             good_gi = (gi_list[index] == gi);
11172          }
11173       } else {
11174          good_gi = FALSE;
11175       }
11176 
11177       if (good_gi) {
11178          /* Advance the pointer to the last link in the pruned chain to
11179             the current Seq-align. */
11180          if (head == NULL)
11181             head = last_sap = sap;
11182          else {
11183             last_sap = sap;
11184          }
11185       } else {
11186          /* Link last Seq-align in the pruned chain to the next Seq-align
11187             in the original chain. */
11188          if (last_sap)
11189             last_sap->next = sap->next;
11190          sap->next = NULL;
11191          /* Free this Seq-align, since it's no longer needed. */
11192          sap = SeqAlignFree(sap);
11193       }
11194    }
11195 
11196    return head;
11197 }
11198 
11199 /*
11200    Remove hits from a SeqAlignPtr that are not within an expect
11201    value range
11202 */
11203 
11204 SeqAlignPtr
BlastPruneSeqAlignByEvalueRange(SeqAlignPtr seqalign,FloatHi expect_low,FloatHi expect_high)11205 BlastPruneSeqAlignByEvalueRange(SeqAlignPtr seqalign, FloatHi expect_low,
11206                            FloatHi expect_high)
11207 {
11208    SeqAlignPtr head = NULL, last_sap = NULL, sap, next_sap;
11209    Int4 score, number;
11210    FloatHi evalue, bit_score;
11211    SeqIdPtr sip = NULL;
11212 
11213    for (sap = seqalign; sap; sap = next_sap) {
11214       next_sap = sap->next;
11215       GetScoreAndEvalue(sap, &score, &bit_score, &evalue, &number);
11216       if (evalue >= expect_low && evalue <= expect_high) {
11217          /* Leave this Seq-align */
11218          if (head == NULL)
11219             head = last_sap = sap;
11220          else {
11221             last_sap = sap;
11222          }
11223 
11224          if (sip && SeqIdComp(TxGetSubjectIdFromSeqAlign(sap), sip)
11225              == SIC_YES) {
11226             /* Add message about deleted high scoring hits */
11227             MakeBlastScore(&sap->score, "warning", expect_low, 0);
11228          }
11229          sip = NULL;
11230 
11231       } else {
11232          if (evalue < expect_low && sip == NULL) {
11233             sip = TxGetSubjectIdFromSeqAlign(sap);
11234          }
11235          /* Remove this Seq-align: link last Seq-align in the pruned
11236             chain to the next Seq-align in the original chain. */
11237          if (last_sap)
11238             last_sap->next = sap->next;
11239          sap->next = NULL;
11240          /* Free this Seq-align, since it's no longer needed. */
11241          sap = SeqAlignFree(sap);
11242       }
11243 
11244    }
11245    return head;
11246 }
11247 
11248 /*
11249 	Returns the program name for a given program number.
11250 	The caller must delete the returned string.
11251 */
11252 
11253 CharPtr LIBCALL
BlastGetProgramName(Uint1 number)11254 BlastGetProgramName(Uint1 number)
11255 
11256 {
11257 	CharPtr string=NULL;
11258 
11259 	switch (number) {
11260 
11261 		case blast_type_blastn:
11262 			string = StringSave("blastn");
11263 		break;
11264 		case blast_type_blastp:
11265 			string = StringSave("blastp");
11266 		break;
11267 		case blast_type_blastx:
11268 			string = StringSave("blastx");
11269 		break;
11270 		case blast_type_tblastn:
11271 			string = StringSave("tblastn");
11272 		break;
11273 		case blast_type_tblastx:
11274 			string = StringSave("tblastx");
11275 		break;
11276 	        case blast_type_psitblastn:
11277                         string = StringSave("psitblastn");
11278 		break;
11279 		default:
11280 			string = NULL;
11281 		break;
11282 	}
11283 
11284 	return string;
11285 }
11286 
11287 /*
11288 	Returns the program number for a string containing the
11289 	program name.
11290 */
11291 
11292 Uint1 LIBCALL
BlastGetProgramNumber(CharPtr blast_program)11293 BlastGetProgramNumber(CharPtr blast_program)
11294 
11295 {
11296 	if (blast_program == NULL)
11297 		return blast_type_undefined;
11298 
11299 	if (StringICmp("blastn", blast_program) == 0)
11300 	{
11301 		return blast_type_blastn;
11302 	}
11303 	else if (StringICmp("blastp", blast_program) == 0)
11304 	{
11305 		return blast_type_blastp;
11306 	}
11307 	else if (StringICmp("blastx", blast_program) == 0)
11308 	{
11309 		return blast_type_blastx;
11310 	}
11311 	else if (StringICmp("tblastn", blast_program) == 0)
11312 	{
11313 		return blast_type_tblastn;
11314 	}
11315         else if (StringICmp("psitblastn", blast_program) == 0)
11316 	  {
11317                 return blast_type_psitblastn;
11318 	  }
11319 	else if (StringICmp("tblastx", blast_program) == 0)
11320 	{
11321 		return blast_type_tblastx;
11322 	}
11323 
11324 	return blast_type_undefined;
11325 }
11326 
11327 /*
11328 	returns information aobut the db and query types (protein or dna)
11329 	as well as the 'align_type' that should be attached to the SeqAnnot
11330 	for formatting.
11331 
11332 	If an invalid program is entered, then 0 is returned.
11333 */
11334 
11335 Uint1 LIBCALL
BlastGetTypes(CharPtr blast_program,Boolean PNTR query_is_na,Boolean PNTR db_is_na)11336 BlastGetTypes(CharPtr blast_program, Boolean PNTR query_is_na, Boolean PNTR db_is_na)
11337 
11338 {
11339 	Uint1 align_type=0;
11340 
11341 	align_type = BlastGetProgramNumber(blast_program);
11342         if(align_type == blast_type_undefined) {
11343             ErrPostEx(SEV_ERROR, 0,0, "Program name undefined: \"%s\"",
11344                       blast_program);
11345             return blast_type_undefined;
11346         }
11347 
11348 	if (align_type == blast_type_blastn)
11349 	{
11350 		*query_is_na = TRUE;
11351 		*db_is_na = TRUE;
11352 	}
11353 	else if (align_type == blast_type_blastp)
11354 	{
11355 		*query_is_na = FALSE;
11356 		*db_is_na = FALSE;
11357 	}
11358 	else if (align_type == blast_type_blastx)
11359 	{
11360 		*query_is_na = TRUE;
11361 		*db_is_na = FALSE;
11362 	}
11363 	else if (align_type == blast_type_tblastn)
11364 	{
11365 		*query_is_na = FALSE;
11366 		*db_is_na = TRUE;
11367 	}
11368         else if (align_type == blast_type_psitblastn)
11369 	{
11370                 *query_is_na = FALSE;
11371                 *db_is_na = TRUE;
11372 	}
11373 	else if (align_type == blast_type_tblastx)
11374 	{
11375 		*query_is_na = TRUE;
11376 		*db_is_na = TRUE;
11377 	}
11378 
11379 	return align_type;
11380 }
11381 
11382 
11383 /*
11384 	Find the word hits for a nucl. query.  No neighbors are found here.
11385 	If no indices are saved, then return 1, indicating that the
11386 	search should not be performed.
11387 
11388 */
11389 
11390 Int2
BlastNtFindWords(BlastSearchBlkPtr search,Int4 start,Int4 len,Int1 context_index)11391 BlastNtFindWords(BlastSearchBlkPtr search, Int4 start, Int4 len, Int1 context_index)
11392 {
11393 	register Int4 offset, initial_wordsize, reduced_wordsize;
11394 	Boolean found_ambig, saved_index=FALSE;
11395 	BLAST_WordFinderPtr	wfp;
11396 	Int4 end, index, index_addition, lookup_index, stop;
11397 	LookupTablePtr		lookup;
11398 	Uint1Ptr str;
11399 	ValNodePtr              vnp, vnp_start=NULL;
11400 
11401 
11402 	if (search == NULL)
11403 	{
11404 		return -1;
11405 	}
11406 
11407 	wfp = search->wfp;
11408 	if (wfp == NULL)
11409 	{
11410 		return -2;
11411 	}
11412 
11413 	lookup = wfp->lookup;
11414 	if (lookup == NULL)
11415 	{
11416 		return -3;
11417 	}
11418 
11419 	initial_wordsize = (lookup->wordsize)*(wfp->compression_ratio);
11420 	reduced_wordsize = (lookup->reduced_wordsize);
11421 
11422 	vnp = search->context[context_index].location;
11423 	if (vnp == NULL)
11424 	{
11425 		ValNodeAddInt(&vnp, 1, -1);
11426 		vnp_start = vnp;
11427 		ValNodeAddInt(&vnp, 0, len);
11428 	}
11429 
11430         while (vnp)
11431 	{
11432 	    if (vnp->choice == 1)
11433 	    {
11434 	    	start = vnp->data.intvalue + 1;
11435 	    	vnp = vnp->next;
11436 	    	if (vnp == NULL)
11437 	    		end = len;
11438 	    }
11439 	    if (vnp && vnp->choice == 0)
11440 	    {
11441 	    	end = vnp->data.intvalue - initial_wordsize;
11442 	    	vnp = vnp->next;
11443 	    }
11444 
11445 	    end = MIN(end, len-initial_wordsize);
11446 
11447 	    str = (Uint1Ptr) search->context[context_index].query->sequence + start;
11448 
11449 	    for (offset=start; offset<end; offset++, str++)
11450 	    {
11451 		found_ambig= FALSE;
11452 		lookup_index = 0;
11453 		stop = reduced_wordsize;
11454 		index_addition = 0;
11455 		for (index=0; index<stop; index++)
11456 		{
11457 			if (*(str+index_addition) > 3 || *(str+index_addition+1) > 3 || *(str+index_addition+2) > 3 || *(str+index_addition+3) > 3)
11458 			{
11459 				found_ambig = TRUE;
11460 				break;
11461 			}
11462 
11463 			lookup_index += (*(str+index_addition)   << 6);
11464 			lookup_index += (*(str+1+index_addition) << 4);
11465 			lookup_index += (*(str+2+index_addition) << 2);
11466 			lookup_index += *(str+3+index_addition);
11467 
11468 			if (index != stop-1)
11469 			{	/* 8 bits/byte */
11470 					lookup_index <<= 8;
11471 			  		index_addition += 4;
11472 			}
11473 		}
11474 
11475 		if (found_ambig == FALSE)
11476 		{
11477 			lookup_add_index(lookup, (Int4) lookup_index, offset+(reduced_wordsize*(wfp->compression_ratio)), context_index);
11478 			saved_index = TRUE;
11479 		}
11480 	    }
11481 	}
11482 
11483 	if (vnp_start)
11484 	{
11485 		vnp_start = ValNodeFree(vnp_start);
11486 	}
11487 
11488 	if (saved_index == FALSE)
11489 		return 1;
11490 
11491 	return 0;
11492 }
11493 
11494 /*
11495 	This functions finds the words.
11496 	return values:
11497 		0: success, words saved
11498 		1: no words saved, no error
11499 		-1: error
11500 
11501 */
11502 
11503 
11504 Int2 LIBCALL
BlastFindWords(BlastSearchBlkPtr search,Int4 start,Int4 len,BLAST_Score threshold,Int1 context_index)11505 BlastFindWords(BlastSearchBlkPtr search, Int4 start, Int4 len, BLAST_Score threshold, Int1 context_index)
11506 
11507 {
11508 	register Uint1		last_char, last_char2;
11509 	Uint1Ptr		words, PNTR array;
11510 	Uint1Ptr		s_string_start, s_string;
11511 	register Uint1Ptr	str;
11512 	BLAST_Score		best_total, delta_score, diff, diff2, first_score;
11513 	BLAST_Score		second_score, start_score, start_score2, score;
11514 	BLAST_ScoreBlkPtr 	sbp;
11515 	register BLAST_ScorePtr PNTR	matrix;
11516 	BLAST_WordFinderPtr	wfp;
11517 	Boolean			exact_match, saved_index=FALSE;
11518 	LookupTablePtr		lookup;
11519 	register Int4		index1, index3, offset;
11520 	register Int1		index2;
11521 	Int4			num_of_cols, alphabet_size, wordsize;
11522 	Int4 			loop_increment, loop_increment2, stop;
11523 	SeqCodeTablePtr 	sctp;
11524 	ValNodePtr		vnp, vnp_start;
11525 
11526 	sbp = search->sbp;
11527 	matrix = sbp->matrix;
11528 	str = (Uint1Ptr) search->context[context_index].query->sequence + start;
11529 	wfp = search->wfp;
11530 	if (wfp == NULL)
11531 		return -2;
11532 	lookup = wfp->lookup;
11533 	if (lookup == NULL)
11534 		return -3;
11535 	wordsize = wfp->wordsize;
11536 
11537 
11538 	sctp = SeqCodeTableFindObj(sbp->alphabet_code);
11539 	alphabet_size=sctp->num;
11540 	if (search->all_words == NULL)
11541 	{
11542 		search->all_words = BlastPopulateAllWordArrays(wordsize, alphabet_size);
11543 		if (search->all_words == NULL)
11544 		{
11545 			return -1;
11546 		}
11547 		num_of_cols = search->all_words->num_of_cols;
11548 		array = search->all_words->array;
11549 	}
11550 	else
11551 	{
11552 		num_of_cols = search->all_words->num_of_cols;
11553 		array = search->all_words->array;
11554 	}
11555 
11556 	/* Index a specific small set, such as one db sequence. */
11557 	if (search->all_words->specific)
11558 	{
11559 		len -= (wordsize-1);
11560 		for (offset=start; offset<len; offset++, str++)
11561 		{
11562 			for (index1=0; index1<num_of_cols; index1++)
11563 			{
11564 				Boolean		ExactMatch = TRUE;
11565 				words = array[index1];
11566 				score = 0;
11567 				for (index2=0; index2<wordsize; index2++)
11568 				{
11569 					score += matrix[*(str+index2)][*(words+index2)];
11570 					if (*(str+index2) != *(words+index2))
11571 					    ExactMatch = FALSE;
11572 				}
11573 	/* If score is above threshold or an exact match gives a non-zero value. */
11574 		     	     	if (score >= threshold || (ExactMatch && score > 0))
11575 			     	{
11576 					lookup_add(lookup, (CharPtr) words, offset+wordsize-1, context_index);
11577 					saved_index = TRUE;
11578 				}
11579 			}
11580 		}
11581 
11582 		if (saved_index)
11583 		    return 0;
11584 		else
11585 		    return 1;
11586 	}
11587 
11588 	s_string_start = s_string = MemNew((wordsize+2)*sizeof(Uint1));
11589 
11590 	if (s_string_start == NULL)
11591 		return -1;
11592 
11593 /* Amounts to advance loops if the same character is to be checked again. */
11594 	loop_increment=(long) (Nlm_Powi((Nlm_FloatHi)alphabet_size,(wordsize-2)));
11595 	loop_increment2=loop_increment/alphabet_size;
11596 /* Shorten len so up to the last complete word is checked. */
11597 	len -= (wordsize-1);
11598 
11599 	vnp_start = NULL;
11600 	vnp = search->context[context_index].location;
11601 	if (vnp == NULL)
11602 	{
11603 		ValNodeAddInt(&vnp, 1, -1);
11604 		ValNodeAddInt(&vnp, 0, len+wordsize);
11605 		vnp_start = vnp;
11606 	}
11607 
11608     while (vnp)
11609 	{
11610 		if (vnp->choice == 1)
11611 		{
11612 			start = vnp->data.intvalue + 1;
11613 			vnp = vnp->next;
11614 			if (vnp == NULL)
11615 				stop = len;
11616 		}
11617 		if (vnp && vnp->choice == 0)
11618 		{
11619 			stop = vnp->data.intvalue - (wordsize-1);
11620 			vnp = vnp->next;
11621 		}
11622 
11623 		stop = MIN(stop, len);
11624 
11625 		str = (Uint1Ptr) search->context[context_index].query->sequence + start;
11626 
11627 	for (offset=start; offset<stop; offset++, str++)
11628 	{
11629 /* Put query into the lookup table, after checking that word would give
11630 a positive value. */
11631 	/* These are the exact matches. */
11632 		best_total=0;
11633 		for (index1=0; index1<wordsize; index1++)
11634 		{
11635 		    best_total += matrix[(Int4) *(str+index1)][(Int4) *(str+index1)];
11636 		}
11637 		if (best_total > 0)
11638 		{
11639 			lookup_add(lookup, (CharPtr) str, offset+wordsize-1, context_index);
11640 			saved_index = TRUE;
11641 		}
11642 
11643 /* Check if a match with a non-identical word could give a score above T. */
11644 		best_total=0;
11645 		for (index1=0; index1<wordsize; index1++)
11646 		{
11647 			best_total += sbp->maxscore[str[index1]];
11648 		}
11649 
11650 		if (best_total < threshold)
11651 		{	/* no chance of a match! */
11652 			continue;
11653 		}
11654 
11655 		delta_score = best_total-threshold;
11656 
11657 /* pick a last_char that is at end of the array, could this be improved? */
11658 		last_char=array[num_of_cols-1][wordsize-2];
11659 		last_char2=array[num_of_cols-1][wordsize-2];
11660 
11661 		for (index1=0; index1<num_of_cols; index1++)
11662 		{
11663 			words = array[index1];
11664 
11665 /*
11666 only do this check if the letter has changed from last time.  See if
11667 the new letter, matched with the first letter of the word, changes the
11668 total possible score to below threshold.  If so, move ahead to the next letter.
11669 This is repeated with the second letter in the word.
11670 
11671 The order of the letters in the first and second columns of array is
11672 important here!
11673 */
11674 			if (last_char != *words)
11675 			{
11676 				last_char = *words;
11677 				first_score = matrix[(Int4) *str][(Int4) *words];
11678 				diff = delta_score + first_score - sbp->maxscore[*str];
11679 				if (diff < 0)
11680 				{
11681 /* index1 should be advanced by loop_increment, decrement by one as the "for"
11682 loop above increments by one.	*/
11683 					index1 += loop_increment;
11684 					index1--;
11685 					continue;
11686 				}
11687 				start_score = first_score;
11688 			}
11689 
11690 			if (wordsize > 2 && last_char2 != *(words+1) && wordsize != 1)
11691 			{
11692 				last_char2 = *(words+1);
11693 				second_score = matrix[(Int4) *(str+1)][(Int4) *(words+1)];
11694 				diff2 =  second_score - sbp->maxscore[*(str+1)];
11695 				diff2 += diff;
11696 				if (diff2 < 0)
11697 				{
11698 /* index1 should be advanced by loop_increment2, decrement by one as the "for"
11699 loop above increments by one.	*/
11700 					index1 += loop_increment2;
11701 					index1--;
11702 					continue;
11703 				}
11704 				start_score = second_score+first_score;
11705 			}
11706 
11707 			start_score2 = start_score;
11708 
11709 			for (index2=2; index2<wordsize-1; index2++)
11710 			{
11711 				start_score2 += matrix[(Int4) *(str+index2)][*(words+index2)];
11712 			}
11713 
11714 			for (index2=0; index2<alphabet_size; index2++)
11715 			{
11716 			     score = start_score2;
11717 			     score += matrix[(Int4) *(str+wordsize-1)][index2];
11718 
11719 		     	     if (score >= threshold)
11720 			     {
11721 				exact_match=TRUE;
11722 				for (index3=0; index3<wordsize-1; index3++)
11723 				{
11724 					if (*(str+index3) != *(words+index3))
11725 					{
11726 						exact_match=FALSE;
11727 						break;
11728 					}
11729 				}
11730 				if (*(str+wordsize-1) != index2)
11731 				{
11732 					exact_match=FALSE;
11733 				}
11734 
11735 /* Exact matches were done above, exclude here.  Is this really needed? */
11736 /* Could exact matches just be done here? */
11737 				if (exact_match == FALSE)
11738 				{
11739 					s_string = s_string_start;
11740 					for (index3=0; index3<wordsize-1; index3++)
11741 					{
11742 						*s_string = *(words+index3);
11743 						s_string++;
11744 					}
11745 					*s_string = index2;
11746 					lookup_add(lookup, (CharPtr) s_string_start, offset+wordsize-1, context_index);
11747 					saved_index = TRUE;
11748 				}
11749 			     }
11750 			}
11751 		}
11752 	}
11753 	}
11754 
11755 	if (vnp_start)
11756 	{
11757 		vnp_start = ValNodeFree(vnp_start);
11758 	}
11759 
11760 	s_string_start = MemFree(s_string_start);
11761 
11762 	if (saved_index == FALSE)
11763 		return 1;
11764 
11765 	return 0;
11766 }
11767 
11768 /*AAS position-based version of BlastFindWords*/
11769 Int2 LIBCALL
BlastNewFindWords_Old(BlastSearchBlkPtr search,Int4 start,Int4 len,BLAST_Score threshold,Int1 context_index)11770 BlastNewFindWords_Old(BlastSearchBlkPtr search, Int4 start, Int4 len, BLAST_Score threshold, Int1 context_index)
11771 
11772 {
11773 	register Uint1		last_char, last_char2;
11774 	Uint1Ptr		words, PNTR array;
11775 	Uint1Ptr		s_string_start, s_string;
11776 	register Uint1Ptr	str;
11777 	BLAST_Score		best_total, delta_score, diff, diff2, first_score;
11778 	BLAST_Score		second_score, start_score, start_score2, score;
11779 	BLAST_ScoreBlkPtr 	sbp;
11780 	BLAST_WordFinderPtr	wfp;
11781 	Boolean			exact_match;
11782 	LookupTablePtr		lookup;
11783 	register Int4		index1, index3, offset;
11784 	register Int1		index2;
11785 	Int4			num_of_cols, alphabet_size, wordsize;
11786 	Int4 			loop_increment, loop_increment2;
11787 	SeqCodeTablePtr 	sctp;
11788 
11789 	sbp = search->sbp;
11790 	str = (Uint1Ptr) search->context[context_index].query->sequence + start;
11791 	wfp = search->wfp;
11792 	lookup = wfp->lookup;
11793 	wordsize = wfp->wordsize;
11794 
11795 	sctp = SeqCodeTableFindObj(sbp->alphabet_code);
11796 	alphabet_size=sctp->num;
11797 	if (search->all_words == NULL)
11798 	{
11799 		search->all_words = BlastPopulateAllWordArrays(wordsize, alphabet_size);
11800 		if (search->all_words == NULL)
11801 		{
11802 			return -1;
11803 		}
11804 		num_of_cols = search->all_words->num_of_cols;
11805 		array = search->all_words->array;
11806 	}
11807 	else
11808 	{
11809 		num_of_cols = search->all_words->num_of_cols;
11810 		array = search->all_words->array;
11811 	}
11812 
11813 	/* Index a specific small set, such as one db sequence. */
11814 	if (search->all_words->specific)
11815 	{
11816 		len -= (wordsize-1);
11817 		for (offset=start; offset<len; offset++, str++)
11818 		{
11819 			for (index1=0; index1<num_of_cols; index1++)
11820 			{
11821 				words = array[index1];
11822 				score = 0;
11823 				for (index2=0; index2<wordsize; index2++)
11824 				{
11825 					score += MtrxScorePosSearch(search->sbp,
11826 							offset + index2,
11827 							*(words+index2));
11828 				}
11829 		     	     	if (score >= threshold)
11830 			     	{
11831 					lookup_add(lookup, (CharPtr) words, offset+wordsize-1, context_index);
11832 				}
11833 			}
11834 		}
11835 		return 0;
11836 	}
11837 
11838 	s_string_start = s_string = MemNew((wordsize+2)*sizeof(Uint1));
11839 
11840 	if (s_string_start == NULL)
11841 		return 1;
11842 
11843 /* Amounts to advance loops if the same character is to be checked again. */
11844 	loop_increment=(long) (Nlm_Powi((Nlm_FloatHi)alphabet_size,(wordsize-2)));
11845 	loop_increment2=loop_increment/alphabet_size;
11846 /* Shorten len so up to the last complete word is checked. */
11847 	len -= (wordsize-1);
11848 	for (offset=start; offset<len; offset++, str++)
11849 	{
11850 /* Put query into the lookup table, after checking that word would give
11851 a positive value. */
11852 		best_total=0;
11853 		for (index1=0; index1<wordsize; index1++)
11854 		{
11855 		    best_total += MtrxScorePosSearch(search->sbp,
11856 					offset+index1,(Int4) *(str+index1));
11857 		}
11858 		if (best_total > 0)
11859 			lookup_add(lookup, (CharPtr) str, offset+wordsize-1, context_index);
11860 
11861 /* Check if a match with a non-identical word could give a score above T. */
11862 		best_total=0;
11863 		for (index1=0; index1<wordsize; index1++)
11864 		{
11865 			best_total += sbp->maxscore[str[index1]];
11866 		}
11867 
11868 		if (best_total < threshold)
11869 		{	/* no chance of a match! */
11870 			continue;
11871 		}
11872 
11873 		delta_score = best_total-threshold;
11874 
11875 /* pick a last_char that is at end of the array, could this be improved? */
11876 		last_char=array[num_of_cols-1][wordsize-2];
11877 		last_char2=array[num_of_cols-1][wordsize-2];
11878 
11879 		for (index1=0; index1<num_of_cols; index1++)
11880 		{
11881 			words = array[index1];
11882 
11883 /*
11884 only do this check if the letter has changed from last time.  See if
11885 the new letter, matched with the first letter of the word, changes the
11886 total possible score to below threshold.  If so, move ahead to the next letter.
11887 This is repeated with the second letter in the word.
11888 
11889 The order of the letters in the first and second columns of array is
11890 important here!
11891 */
11892 			if (last_char != *words)
11893 			{
11894 				last_char = *words;
11895 				first_score = MtrxScorePosSearch(search->sbp,
11896 					offset,(Int4) *words);
11897 				diff = delta_score + first_score - sbp->maxscore[*str];
11898 				if (diff < 0)
11899 				{
11900 /* index1 should be advanced by loop_increment, decrement by one as the "for"
11901 loop above increments by one.	*/
11902 					index1 += loop_increment;
11903 					index1--;
11904 					continue;
11905 				}
11906 				start_score = first_score;
11907 			}
11908 
11909 			if (last_char2 != *(words+1) && wordsize != 1)
11910 			{
11911 				last_char2 = *(words+1);
11912 				second_score = MtrxScorePosSearch(search->sbp,
11913 						offset+1,(Int4) *(words+1));
11914 				diff2 =  second_score - sbp->maxscore[*(str+1)];
11915 				diff2 += diff;
11916 				if (diff2 < 0)
11917 				{
11918 /* index1 should be advanced by loop_increment2, decrement by one as the "for"
11919 loop above increments by one.	*/
11920 					index1 += loop_increment2;
11921 					index1--;
11922 					continue;
11923 				}
11924 				start_score = second_score+first_score;
11925 			}
11926 
11927 			start_score2 = start_score;
11928 
11929 			for (index2=2; index2<wordsize-1; index2++)
11930 			{
11931 				start_score2 += MtrxScorePosSearch(search->sbp,
11932 						offset+index2,*(words+index2));
11933 			}
11934 
11935 			for (index2=0; index2<alphabet_size; index2++)
11936 			{
11937 			     score = start_score2;
11938 			     score += MtrxScorePosSearch(search->sbp,
11939 					offset+wordsize-1,index2);
11940 
11941 		     	     if (score >= threshold)
11942 			     {
11943 				exact_match=TRUE;
11944 				for (index3=0; index3<wordsize-1; index3++)
11945 				{
11946 					if (*(str+index3) != *(words+index3))
11947 					{
11948 						exact_match=FALSE;
11949 						break;
11950 					}
11951 				}
11952 				if (*(str+wordsize-1) != index2)
11953 				{
11954 					exact_match=FALSE;
11955 				}
11956 
11957 				if (exact_match == FALSE)
11958 				{
11959 					s_string = s_string_start;
11960 					for (index3=0; index3<wordsize-1; index3++)
11961 					{
11962 						*s_string = *(words+index3);
11963 						s_string++;
11964 					}
11965 					*s_string = index2;
11966 					lookup_add(lookup, (CharPtr) s_string_start, offset+wordsize-1, context_index);
11967 
11968 				}
11969 			     }
11970 			}
11971 		}
11972 	}
11973 
11974 	s_string_start = MemFree(s_string_start);
11975 	return 0;
11976 }
11977 
11978 /* SSH position-based version of BlastFindWords
11979    Lookup structure should be allocated at this point*/
11980 
BlastNewFindWordsEx(LookupTablePtr lookup,BLAST_ScorePtr PNTR posMatrix,Int4 start,Int4 len,BlastAllWordPtr all_words,BLAST_Score threshold,Int4 wordsize,Int1 context_index)11981 Int2 BlastNewFindWordsEx(LookupTablePtr lookup, BLAST_ScorePtr PNTR posMatrix,
11982                          Int4 start, Int4 len, BlastAllWordPtr all_words,
11983                          BLAST_Score threshold, Int4 wordsize,
11984                          Int1 context_index)
11985 {
11986     register Uint1	last_char, last_char2;
11987     Uint1Ptr		words, PNTR array;
11988     Uint1Ptr		s_string_start, s_string;
11989     BLAST_Score		best_total, delta_score, diff, diff2, first_score;
11990     BLAST_Score		second_score, start_score, start_score2, score;
11991     register Int4	index1, index3, offset;
11992     register Int1	index2;
11993     Int4		num_of_cols, alphabet_size;
11994     Int4 		loop_increment, loop_increment2;
11995     Boolean all_words_allocated = FALSE;
11996     BLAST_ScorePtr      maxscore;
11997 
11998     if(lookup == NULL || posMatrix == NULL)
11999         return -1;
12000 
12001     alphabet_size = PSI_ALPHABET_SIZE; /* 26 */
12002 
12003     if (all_words == NULL) {
12004         all_words = BlastPopulateAllWordArrays(wordsize, alphabet_size);
12005         if (all_words == NULL) {
12006             return -1;
12007         }
12008 
12009         all_words_allocated = TRUE;
12010         num_of_cols = all_words->num_of_cols;
12011         array = all_words->array;
12012     } else {
12013         num_of_cols = all_words->num_of_cols;
12014         array = all_words->array;
12015     }
12016 
12017     /* Index a specific small set, such as one db sequence.
12018        This is used when all_words actually a subset of "all_words" found
12019        in some sequence. Used for ex. in BlastTwoSequences */
12020 
12021     if (all_words->specific) {
12022         len -= (wordsize-1);
12023         for (offset = start; offset < len; offset++) {
12024             for (index1 = 0; index1 < num_of_cols; index1++) {
12025                 words = array[index1];
12026                 score = 0;
12027                 for (index2 = 0; index2 < wordsize; index2++)
12028                     score += posMatrix[offset + index2][*(words+index2)];
12029                 if (score >= threshold) {
12030                     lookup_add(lookup, (CharPtr) words, offset + wordsize - 1,
12031                                context_index);
12032                 }
12033             }
12034         }
12035 
12036         if(all_words_allocated)
12037             BlastAllWordDestruct(all_words);
12038 
12039         return 0;
12040     }
12041     /* ----------- End of "specific" word finding ------------ */
12042 
12043     /* maxscore matrix will be used position-specific -
12044        of length = (len - start) */
12045 
12046     maxscore = BlastPSIMaxScoreGet(posMatrix, start, len);
12047 
12048     s_string_start = s_string = MemNew((wordsize+2)*sizeof(Uint1));
12049 
12050     if (s_string_start == NULL)
12051         return 1;
12052 
12053     /* Amounts to advance loops if the same character is to be checked again. */
12054     loop_increment=(long) (Nlm_Powi((Nlm_FloatHi)alphabet_size,(wordsize-2)));
12055     loop_increment2=loop_increment/alphabet_size;
12056 
12057     /* Shorten len so up to the last complete word is checked. */
12058     len -= (wordsize-1);
12059     for (offset = start; offset < len; offset++) {
12060 
12061         /* Check if a match with a non-identical word could give a score above T. */
12062         best_total = 0;
12063         for (index1 = 0; index1 < wordsize; index1++) {
12064             best_total += maxscore[offset+index1];
12065         }
12066 
12067         if (best_total < threshold) {	/* no chance of a match! */
12068             continue;
12069         }
12070 
12071         delta_score = best_total - threshold;
12072 
12073         /* pick a last_char that is at end of the array, could this be improved? */
12074         last_char=array[num_of_cols-1][wordsize-2];
12075         last_char2=array[num_of_cols-1][wordsize-2];
12076 
12077         for (index1 = 0; index1 < num_of_cols; index1++) {
12078             words = array[index1];
12079 
12080 /*
12081 only do this check if the letter has changed from last time.  See if
12082 the new letter, matched with the first letter of the word, changes the
12083 total possible score to below threshold.  If so, move ahead to the next letter.
12084 This is repeated with the second letter in the word.
12085 
12086 The order of the letters in the first and second columns of array is
12087 important here!
12088 */
12089             if (last_char != *words) {
12090                 last_char = *words;
12091 
12092                 first_score = posMatrix[offset][*words];
12093                 diff = delta_score + first_score - maxscore[offset];
12094 
12095                 if (diff < 0) {
12096 /* index1 should be advanced by loop_increment, decrement by one as the "for"
12097 loop above increments by one.	*/
12098                     index1 += loop_increment;
12099                     index1--;
12100                     continue;
12101                 }
12102                 start_score = first_score;
12103             }
12104 
12105             if (last_char2 != *(words+1) && wordsize != 1) {
12106                 last_char2 = *(words+1);
12107                 second_score = posMatrix[offset+1][*(words+1)];
12108 
12109                 diff2 =  second_score - maxscore[offset+1];
12110                 diff2 += diff;
12111                 if (diff2 < 0) {
12112 /* index1 should be advanced by loop_increment2, decrement by one as the "for"
12113 loop above increments by one.	*/
12114                     index1 += loop_increment2;
12115                     index1--;
12116                     continue;
12117                 }
12118                 start_score = second_score + first_score;
12119             }
12120 
12121             start_score2 = start_score;
12122 
12123             for (index2 = 2; index2 < wordsize - 1; index2++) {
12124                 start_score2 += posMatrix[offset+index2][*(words+index2)];
12125             }
12126 
12127             for (index2 = 0; index2 < alphabet_size; index2++) {
12128                 score = start_score2;
12129                 score += posMatrix[offset+wordsize-1][index2];
12130 
12131                 if (score >= threshold) {
12132                     s_string = s_string_start;
12133                     for (index3=0; index3 < wordsize-1; index3++) {
12134                         *s_string = *(words+index3);
12135                         s_string++;
12136                     }
12137                     *s_string = index2;
12138                     lookup_add(lookup, (CharPtr) s_string_start,
12139                                offset+wordsize-1, context_index);
12140                 }
12141             }
12142         }
12143     }
12144 
12145     s_string_start = MemFree(s_string_start);
12146 
12147     if(all_words_allocated)
12148         BlastAllWordDestruct(all_words);
12149 
12150     MemFree(maxscore);
12151 
12152     return 0;
12153 }
12154 /* SSH position-based version of BlastFindWords*/
12155 Int2 LIBCALL
BlastNewFindWords(BlastSearchBlkPtr search,Int4 start,Int4 len,BLAST_Score threshold,Int1 context_index)12156 BlastNewFindWords(BlastSearchBlkPtr search, Int4 start, Int4 len, BLAST_Score threshold, Int1 context_index)
12157 {
12158     Int2 status;
12159 
12160     status = BlastNewFindWordsEx(search->wfp->lookup, search->sbp->posMatrix,
12161                                  start, len, search->all_words, threshold,
12162                                  search->wfp->wordsize, context_index);
12163     return status;
12164 }
12165 
12166 /*******************************************************************
12167 
12168 This function allocates and populates an array containing every possible
12169 letter combination for an alphabet of size alphabet_size for the first
12170 wordsize-1 letters.   The last letter of the word is done on the fly.
12171 The approach is best described with a table that demonstrates
12172 the strategy with a two-letter alphabet and a wordsize of three:
12173 
12174 	index   1	2
12175 	col.	1	0
12176 
12177 		A	A
12178 		A	B
12179 		B	A
12180 		B	B
12181 		A	A
12182 		A	B
12183 		B	A
12184 		B	B
12185 
12186 col. 0: basic pattern repeated N**(W-1) times, where N is the size of the
12187 	alphabet and W is the wordsize.
12188 col. 1: AABB repeated N**(W-2) times.
12189 
12190 Each pattern is repeated N**(W-1-C) times, where "C" is the column number.
12191 The number of repeats of a given letter is N**C.
12192 The total number of rows in the array is then N**(W-1-C) * N**C * N = N**W,
12193 as we expect.
12194 
12195 NOTE:  The order of the columns is important, as it is used in
12196 BlastWFContextNeighborhoodAdd above.  In particular it is useful to have
12197 all the letters grouped together.
12198 *********************************************************************/
12199 
12200 BlastAllWordPtr
BlastPopulateAllWordArrays(Int4 wordsize,Int4 alphabet_size)12201 BlastPopulateAllWordArrays(Int4 wordsize, Int4 alphabet_size)
12202 
12203 {
12204 	BlastAllWordPtr all_words;
12205 	Uint1Ptr *array_ptr, *array;
12206 	Uint1Ptr array_storage;
12207 	register Int4 index, index1, index3, num_of_cols, times, repeat_num;
12208 	register Int1 index2;
12209 	num_of_cols = (Int4) Nlm_Powi((Nlm_FloatHi)alphabet_size, wordsize-1);
12210 	array = (Uint1Ptr *) MemNew(num_of_cols*sizeof(Uint1Ptr));
12211 
12212 	array_storage = (Uint1Ptr) MemNew(num_of_cols*(wordsize-1)*sizeof(Uint1));
12213 	for (index=0; index<num_of_cols; index++)
12214 	{
12215 	    array[index] = array_storage+(index*(wordsize-1));
12216 	}
12217 
12218 	for (index=0; index<wordsize-1; index++)
12219 	{
12220 	    array_ptr = array;
12221 	    repeat_num= (Int4) Nlm_Powi((Nlm_FloatHi)alphabet_size,(wordsize-2-index));
12222 	    times = (Int4) Nlm_Powi((Nlm_FloatHi)alphabet_size, index);
12223 	    for (index1=0; index1<times; index1++)
12224 	    {
12225 	    	for (index2=0; index2<alphabet_size; index2++)
12226 	    	{
12227 	    	    for (index3=0; index3<repeat_num; index3++)
12228 		    {
12229 			(*array_ptr)[index] = index2;
12230 			array_ptr++;
12231 		    }
12232 		 }
12233 	     }
12234 	}
12235 
12236 	all_words = BlastAllWordNew(num_of_cols, wordsize, TRUE, FALSE);
12237 	if (all_words)
12238 	{
12239 		all_words->array = array;
12240 		all_words->array_storage = array_storage;
12241 	}
12242 
12243 	return all_words;
12244 }
12245 
12246 /**************************************************************************
12247 *
12248 *	Get the "ScoreSet" ScorePtr from the BLAST data, which is provided
12249 *	by "hsp".  "score_set" should be NULL, a chain of scores is added.
12250 **************************************************************************/
12251 
12252 ScorePtr LIBCALL
GetScoreSetFromBlastResultHsp(BLASTResultHspPtr hsp,SeqIdPtr gi_list)12253 GetScoreSetFromBlastResultHsp(BLASTResultHspPtr hsp, SeqIdPtr gi_list)
12254 
12255 {
12256 	ScorePtr	score_set=NULL;
12257 	Nlm_FloatHi	prob;
12258 	Int4		score;
12259 	CharPtr		scoretype;
12260 
12261 	score = hsp->score;
12262 	if (score > 0)
12263 		MakeBlastScore(&score_set, "score", 0.0, score);
12264 
12265 	score = hsp->number;
12266 	scoretype = "sum_n";
12267 
12268 	if (score > 1)
12269 		MakeBlastScore(&score_set, scoretype, 0.0, score);
12270 
12271 	prob = hsp->e_value;
12272 	if (hsp->number <= 1)
12273 	{
12274 		scoretype = "e_value";
12275 	}
12276 	else
12277 	{
12278 		scoretype = "sum_e";
12279 	}
12280 	if (prob >= 0.)
12281 	{
12282 		if (prob < 1.0e-180)
12283 			prob = 0.0;
12284 		MakeBlastScore(&score_set, scoretype, prob, 0);
12285 	}
12286 
12287 	prob = hsp->bit_score;
12288 	if (prob >= 0.)
12289 		MakeBlastScore(&score_set, "bit_score", prob, 0);
12290 
12291         if (hsp->num_ident > 0)
12292            MakeBlastScore(&score_set, "num_ident", 0.0, hsp->num_ident);
12293 
12294 	if (hsp->number > 1 && hsp->ordering_method == BLAST_SMALL_GAPS)
12295 	{
12296 		MakeBlastScore(&score_set, "small_gap", 0.0, 1);
12297 	} else if (hsp->ordering_method > 3) {
12298            /* In new tblastn this means splice junction was found */
12299            MakeBlastScore(&score_set, "splice_junction", 0.0, 1);
12300         }
12301 
12302 	while (gi_list)
12303 	{
12304 		MakeBlastScore(&score_set, "use_this_gi", 0.0, gi_list->data.intvalue);
12305 		gi_list = gi_list->next;
12306 	}
12307 
12308 	return score_set;
12309 }
12310 
12311 /** Configure the database chunk size adaptively.
12312  * Note: Must be called from a single-threaded context
12313  * @param search the The search block to configure [inout]
12314  * @param num_seq The number of sequences in the database [in]
12315  */
ConfigureDbChunkSize(BlastSearchBlkPtr search,Int4 num_seq)12316 void ConfigureDbChunkSize(BlastSearchBlkPtr search, Int4 num_seq)
12317 {
12318 /* Emit a tick after how many sequences? */
12319 search->thr_info->db_incr = num_seq / BLAST_NTICKS;
12320 
12321 /* Divide the search into chunks */
12322 search->thr_info->db_chunk_size = MAX(num_seq / 100,1);
12323 
12324 /* Loadbalance more finely for multithreaded searches. */
12325 if (NlmThreadsAvailable() && search->pbp->process_num > 1)
12326     search->thr_info->db_chunk_size = MAX(num_seq/(100*(search->pbp->process_num
12327 )), 1);
12328 
12329 return;
12330 }
12331