1 static char const rcsid[] = "$Id: blastutl.c,v 6.472 2010/12/10 18:47:12 madden Exp $"; /* RCS "$Id$" keyword string recording this file's name, revision, date and last committer */
2
3 /* ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================*/
26
27 /*****************************************************************************
28
29 File name: blastutl.c
30
31 Author: Tom Madden
32
33 Contents: Utilities for BLAST
34
35 $Revision: 6.472 $
36
37 ******************************************************************************/
38 /*
39 *
40 * $Log: blastutl.c,v $
41 * Revision 6.472 2010/12/10 18:47:12 madden
42 * Do not change db_length on options
43 *
44 * Revision 6.471 2007/05/08 19:03:33 kans
45 * in FilterWithSeg added SeqDataPtr and ByteStorePtr casts for seq_data
46 *
47 * Revision 6.470 2007/03/13 20:40:24 madden
48 * - In s_ComputeAverageLength, compute the floating point value retval
49 * using floating point division.
50 *
51 * - In BioseqBlastEngineCore, call blast_set_parameters for rounds > 1
52 * of PSI-BLAST.
53 *
54 * - In GetDbSubjRatio, use floating point operations to compute the
55 * floating point value db_subj_ratio.
56 * [from Mike Gertz]
57 *
58 * Revision 6.469 2007/03/05 14:51:24 camacho
59 * - Make s_ComputeAverageLength static.
60 *
61 * Revision 6.468 2007/01/23 15:25:44 madden
62 * Use SeqLocDustEx rather than SeqLocDust
63 *
64 * Revision 6.467 2007/01/17 15:46:00 madden
65 * remove FilterDNA
66 *
67 * Revision 6.466 2006/08/10 17:34:38 merezhuk
68 * Fix for reading -z advanced option by StringToInt8; RT # 15187990
69 *
70 * Revision 6.465 2006/02/15 18:23:47 madden
71 * Made changes so that CheckStartForGappedAlignment by default
72 * checks ungapped alignments of length 11, rather than length 10.
73 * Made changes to the rules used when the starting point is close to
74 * the edge of the preliminary gapped alignment.
75 * (from Mike Gertz)
76 *
77 * Revision 6.464 2005/12/01 15:10:23 madden
78 * Gave BLASTCheckHSPInclusion external linkage (i.e. removed the static specifier).
79 *
80 * Revision 6.463 2005/10/13 15:59:06 camacho
81 * Add code to fix cutoff scores in PSI-BLAST.
82 *
83 * Revision 6.462 2005/07/28 14:57:09 coulouri
84 * remove dead code
85 *
86 * Revision 6.461 2005/07/27 15:51:54 coulouri
87 * remove unused queue_callback
88 *
89 * Revision 6.460 2005/05/02 16:03:14 coulouri
90 * refactor code to set db_chunk_size
91 *
92 * Revision 6.459 2005/04/25 14:16:36 coulouri
93 * set db_chunk_size adaptively
94 *
95 * Revision 6.458 2005/04/04 20:44:27 camacho
96 * Do not overwrite the effective search space in Pssm2Sequences if specified in the options structure
97 *
98 * Revision 6.457 2005/02/07 15:30:08 dondosha
99 * Removed restriction on the value of longest intron option
100 *
101 * Revision 6.456 2005/01/24 20:37:37 camacho
102 * Added conditional compilation to structs needed for BLAST_CLUSTER_HITS
103 *
104 * Revision 6.455 2005/01/18 14:54:13 camacho
105 * Change in tie-breakers for score comparison, suggestion by Mike Gertz
106 *
107 * Revision 6.454 2004/12/20 15:22:16 camacho
108 * Calculate kbp_ideal values rather than loading them from pre-computed values
109 *
110 * Revision 6.453 2004/12/01 17:24:15 coulouri
111 * do not dereference null pointer
112 *
113 * Revision 6.452 2004/11/22 16:10:11 dondosha
114 * Minor fix to make sure that "evalue" score type is always used when hsp is not part of a linked set
115 *
116 * Revision 6.451 2004/11/04 15:51:55 bealer
117 * - bl2seq should use dblen as average length if database is not available.
118 *
119 * Revision 6.450 2004/11/01 14:07:56 madden
120 * From Mike Gertz:
121 *
122 * - In query_offset_compare_hsp and query_end_compare_hsp, use the
123 * subject query/offset as a tie-breaker. Without this tie-breaker
124 * CheckGappedAlignmentsForOverlap won't work properly.
125 *
126 * - In CheckGappedAlignmentsForOverlap check that hsp_array, rather
127 * than *hsp_array, is not nil.
128 *
129 * - In BlastSaveCurrentHsp, rewrote the binary search to use
130 * score_compare_hsps, so that the answers are consistent with the
131 * heap code used in the algo/blast/core code.
132 *
133 * - In BlastGappedScoreInternal delete gapped extensions that don't
134 * reach the cutoff score (cutoff_s1).
135 *
136 * Revision 6.449 2004/10/25 18:36:17 papadopo
137 * From Michael Gertz: remove unneeded decrement of alignment offsets in BlastNtSaveCurrentHsp
138 *
139 * Revision 6.448 2004/10/19 19:42:17 dondosha
140 * Optimized algorithm in BlastPruneSeqAlignByGiList to make it up to 25 times faster; Added new function BlastPruneSeqAlignBySortedGiList
141 *
142 * Revision 6.447 2004/10/18 13:02:41 madden
143 * Changes from Mike Gertz:
144 * - In score_compare_hsps, query_offset_compare_hsp and
145 * query_end_compare_hsp, change the comparison tests so that
146 * nil HSPs are less than any non-nil HSP. Previously, these
147 * comparison functions would return 0 if either HSP was nil,
148 * which would result in sort routines terminating before the
149 * non-nil HSPs in the list were fully sorted.
150 *
151 * - In score_compare_hsps, copied the set of tie-breakers from
152 * the corresponding routine in algo/blast/core/blast_hits.c.
153 *
154 * - In RealBlastGetGappedAlignmentTraceback, the HSP list must
155 * be sorted before BLASTCheckHSPInclusion is invoked.
156 *
157 * Revision 6.446 2004/09/28 16:05:32 papadopo
158 * From Michael Gertz: In BlastGappedScoreInternal, changed a
159 * reference to the sumscore field of an HSP to a reference to the
160 * xsum field of an HSP.
161 *
162 * Revision 6.445 2004/08/23 17:05:42 papadopo
163 * From Michael Gertz: make CopyResultHspToHSP public
164 *
165 * Revision 6.444 2004/08/16 19:37:26 dondosha
166 * Enabled uneven gap HSP linking for blastx
167 *
168 * Revision 6.443 2004/08/05 21:52:28 camacho
169 * Gracefully handle inability to calculate ungapped lambda for PSSM in psiblast2sequences
170 *
171 * Revision 6.442 2004/07/24 18:55:29 camacho
172 * Fix to GetSequenceWithDenseSeg when sequence cannot be found
173 *
174 * Revision 6.441 2004/07/19 17:05:36 papadopo
175 * specify (unused) 'output-to-scoremat' parameter
176 *
177 * Revision 6.440 2004/06/30 12:29:39 madden
178 * Moved some functions to blfmtutl.c
179 *
180 * Revision 6.439 2004/06/22 14:16:55 camacho
181 * Changed invocation of posFreqsToMatrix to conform with new signature
182 *
183 * Revision 6.438 2004/06/01 20:34:06 dondosha
184 * Fix in previous change; memory leak fix
185 *
186 * Revision 6.437 2004/05/27 17:36:24 dondosha
187 * Minor fix for previous 2 changes
188 *
189 * Revision 6.436 2004/05/25 21:42:47 dondosha
190 * Fix in previous change: in some cases edit block should not be freed when BLAST_HSP is freed
191 *
192 * Revision 6.435 2004/05/21 13:53:04 dondosha
193 * Use BLAST_HSPFree to free BLAST_HSP structures, hence no need to call GapXEditBlockDelete in multiple places
194 *
195 * Revision 6.434 2004/04/22 16:40:32 dondosha
196 * Set search->subject_id to correct ordinal id, needed for finding splice junctions in HSP links at traceback stage
197 *
198 * Revision 6.433 2004/03/22 22:10:38 dondosha
199 * Use kbp_gap instead of kbp pointers in megablast traceback
200 *
201 * Revision 6.432 2004/02/26 15:52:30 papadopo
202 * Mike Gertz' modifications to unify handling of gapped Karlin blocks between protein and nucleotide searches
203 *
204 * Revision 6.431 2004/02/04 15:35:03 camacho
205 * Rollback to fix problems in release 2.2.7
206 *
207 * Revision 6.429 2004/01/30 16:54:45 dondosha
208 * Check if HSP needs to be deleted after reevaluation with ambiguities, after greedy traceback
209 *
210 * Revision 6.428 2004/01/28 16:54:03 dondosha
211 * Restored the code that shifts subject coordinates for blastn traceback with long subject sequences
212 *
213 * Revision 6.427 2004/01/25 05:06:21 dondosha
214 * Translate only relevant parts of long subject sequences for tblastn traceback
215 *
216 * Revision 6.426 2004/01/16 23:43:44 dondosha
217 * No more need for special argument for partial search: it is set in options
218 *
219 * Revision 6.425 2004/01/14 17:01:06 dondosha
220 * Gapped alignment is position based only if posMatrix exists
221 *
222 * Revision 6.424 2004/01/09 18:13:24 dondosha
223 * In [Get,Check]StartForGappedAlignment: if posMatrix not available, use square matrix for calculations
224 *
225 * Revision 6.423 2004/01/06 22:37:40 dondosha
226 * Use BLAST_HSPfree function; in particular fixes a bug with wrong memory being freed
227 *
228 * Revision 6.422 2003/12/11 23:46:28 dondosha
229 * Correction in setting hit ranges after repeats filtering
230 *
231 * Revision 6.421 2003/12/10 17:05:28 dondosha
232 * Added function ReevaluateScoreWithAmbiguities to reevaluate score for one HSP; use it after greedy traceback
233 *
234 * Revision 6.420 2003/11/24 22:06:41 madden
235 * Tblastn optimization, only fetch part of sequence needed
236 *
237 * Revision 6.419 2003/10/30 18:37:19 dondosha
238 * Fix for megablast with non-greedy traceback
239 *
240 * Revision 6.418 2003/10/29 17:46:59 dondosha
241 * Allow 2-stage greedy extension in megablast
242 *
243 * Revision 6.417 2003/08/20 22:14:08 dondosha
244 * Little correction in call to OOFBlastHSPGetNumIdentical
245 *
246 * Revision 6.416 2003/08/04 16:19:16 dondosha
247 * Added effective HSP length (length adjustment) to other returns, so it can be reported in XML output
248 *
249 * Revision 6.415 2003/05/30 17:25:36 coulouri
250 * add rcsid
251 *
252 * Revision 6.414 2003/05/23 22:12:11 camacho
253 * Fix memory leak in PsiBlast2Sequences
254 *
255 * Revision 6.413 2003/04/22 21:52:13 dondosha
256 * Added function OOFBlastHSPGetNumIdentical
257 *
258 * Revision 6.412 2003/04/10 19:21:16 dondosha
259 * Memory leak fix for megablast with limited number of HSPs per hit
260 *
261 * Revision 6.411 2003/03/24 19:42:14 madden
262 * Changes to support query concatenation for blastn and tblastn
263 *
264 * Revision 6.410 2003/03/11 14:33:48 madden
265 * Sort HSPs after array is no longer reallocated
266 *
267 * Revision 6.409 2003/02/21 02:52:16 madden
268 * Ensure stable sorting in score_compare_hsp (change from Morgulis)
269 *
270 * Revision 6.408 2003/01/24 22:26:03 camacho
271 * RPSInit is deprecated, use RPSInitEx instead
272 *
273 * Revision 6.407 2002/12/09 17:22:16 dondosha
274 * When alignment jumps beyond a strand boundary, keep the part of it where initial word is
275 *
276 * Revision 6.406 2002/12/04 23:32:50 camacho
277 * Do not set use_this_gi with nucleotide dbs (redundant)
278 *
279 * Revision 6.405 2002/12/04 18:42:22 camacho
280 * Minor change to previous commit
281 *
282 * Revision 6.404 2002/12/04 18:38:58 camacho
283 * Use correct effective search space in B2SPssmMultipleQueries
284 *
285 * Revision 6.403 2002/12/04 17:08:33 camacho
286 * Minor change to B2SPssmCleanUpSearch
287 *
288 * Revision 6.402 2002/11/27 15:41:51 dondosha
289 * Added -t, -g and -n megablast options to parse_blast_options
290 *
291 * Revision 6.401 2002/11/26 23:02:07 madden
292 * Add w option to parse_blast_options (OOF for blastx)
293 *
294 * Revision 6.400 2002/11/25 19:57:30 dondosha
295 * Further fix to the HSP limit (-H) megablast option
296 *
297 * Revision 6.399 2002/11/22 23:31:43 dondosha
298 * 1. Use array of structures instead of array of pointers for initial offset pairs;
299 * 2. Sort the HSP array when maximal number of HSPs is reached for a sequence
300 *
301 * Revision 6.398 2002/11/13 23:23:53 dondosha
302 * Correction for getting number of identities in tblastn
303 *
304 * Revision 6.397 2002/11/07 22:25:34 dondosha
305 * Correction in calculating number of identities for very long database sequences
306 *
307 * Revision 6.396 2002/11/04 23:00:54 dondosha
308 * Calculate number of identities while computing the traceback, and save it in the seqalign
309 *
310 * Revision 6.395 2002/10/22 21:03:42 camacho
311 * Calculate the effective search space correctly for rpsblast in BlastOtherReturnsPrepare
312 *
313 * Revision 6.394 2002/10/22 17:57:48 camacho
314 * Changes to B2SPssmMultipleQueries
315 *
316 * Revision 6.393 2002/10/22 15:28:45 kans
317 * SeqAlignCompare takes LIBCALLBACK
318 *
319 * Revision 6.392 2002/10/21 23:13:36 camacho
320 * Added B2SPssmOnTheFly functions
321 *
322 * Revision 6.391 2002/10/18 15:08:28 dondosha
323 * Correction in SaveCurrentHsp functions when maximal number of HSPs is reached
324 *
325 * Revision 6.390 2002/10/17 14:33:12 dondosha
326 * Correction for the maximal number of HSPs option
327 *
328 * Revision 6.389 2002/09/19 22:22:18 camacho
329 * Sanity checks in BlastTwoSequencesByLocWithCallback
330 *
331 * Revision 6.388 2002/09/16 15:54:59 camacho
332 * Turn off RedoAlignmentCore from psi-bl2seq
333 *
334 * Revision 6.387 2002/09/13 20:05:43 camacho
335 * Set the dbseq_num to 1 in BlastTwoSequencesByLocWithCallback
336 *
337 * Revision 6.386 2002/09/11 20:46:25 camacho
338 * Removed deprecated BlastSeqIdListPtr code
339 *
340 * Revision 6.385 2002/09/03 14:22:45 camacho
341 * Changes to pacify mac compiler
342 *
343 * Revision 6.384 2002/09/02 21:54:41 camacho
344 * Correction to previous revision
345 *
346 * Revision 6.383 2002/09/02 20:44:56 camacho
347 * Allow pssm rescaling if scalingFactor is non-zero
348 *
349 * Revision 6.382 2002/08/30 15:42:49 dondosha
350 * In blastn, use ewp structure only for the first context
351 *
352 * Revision 6.381 2002/08/29 19:22:20 camacho
353 * Save karlinK parameter when rescaling pssm
354 *
355 * Revision 6.380 2002/08/29 16:23:42 camacho
356 * Removed debugging code
357 *
358 * Revision 6.379 2002/08/29 15:49:56 camacho
359 * Added matrix rescaling code for psi-blast2sequences
360 *
361 * Revision 6.378 2002/08/26 16:55:52 madden
362 * Fix for scaling with translated searches
363 *
364 * Revision 6.376 2002/08/05 20:07:37 dondosha
365 * Correction for bl2seq with megablast option: convert gap info to seqalign after search
366 *
367 * Revision 6.375 2002/08/02 21:49:56 vakatov
368 * + LIBCALL
369 *
370 * Revision 6.374 2002/08/01 21:33:12 madden
371 * Do not put p-value and small_gap into SeqAlign
372 *
373 * Revision 6.373 2002/08/01 20:45:34 dondosha
374 * Changed prototype of the BLASTPostSearchLogic function to make it
375 * more convenient
376 *
377 * Revision 6.372 2002/07/18 19:40:45 dondosha
378 * Added an option to restrict number of HSPs per database sequence
379 *
380 * Revision 6.371 2002/07/11 22:31:54 camacho
381 * Added sanity check to BlastTwoSequencesByLocWithCallback with PSSM
382 *
383 * Revision 6.370 2002/07/02 17:08:01 dondosha
384 * Reverse previous change - not needed
385 *
386 * Revision 6.369 2002/07/02 01:41:31 dondosha
387 * Typo fix
388 *
389 * Revision 6.368 2002/07/02 01:36:40 dondosha
390 * For megablast use larger window in CheckStartForGappedAlignment
391 *
392 * Revision 6.367 2002/06/21 21:43:01 camacho
393 * Removed obsolete BlastSeqIdList structure and functions
394 *
395 * Revision 6.366 2002/06/13 16:51:41 madden
396 * BlastTwoSequencesCore and BlastTwoSequencesCoreEx return status instead of SearchBlk
397 *
398 * Revision 6.365 2002/06/12 20:34:50 coulouri
399 * Don't dereference possibly NULL pointer
400 *
401 * Revision 6.364 2002/06/11 20:40:05 dondosha
402 * Correction to previous change
403 *
404 * Revision 6.363 2002/06/11 14:44:46 dondosha
405 * Return status from some functions instead of search block pointer
406 *
407 * Revision 6.362 2002/05/31 16:06:20 kans
408 * changed MemSet (..., NULL, ...) to MemSet (..., 0, ...) for Mac compiler
409 *
410 * Revision 6.361 2002/05/29 17:14:49 dondosha
411 * Check whether an id found by SeqIdFindBest is indeed a gi
412 *
413 * Revision 6.360 2002/05/28 22:00:12 camacho
414 * *** empty log message ***
415 *
416 * Revision 6.359 2002/05/13 13:51:32 dondosha
417 * Made two functions public
418 *
419 * Revision 6.358 2002/05/08 22:51:11 dondosha
420 * Do the starting positions check for final gapped alignment in Mega BLAST case as well
421 *
422 * Revision 6.357 2002/04/23 20:41:21 dondosha
423 * In case of non-affine extension in megablast, check percent identity cutoff after the traceback is obtained
424 *
425 * Revision 6.356 2002/04/19 17:26:07 madden
426 * Fix for last update
427 *
428 * Revision 6.355 2002/04/18 20:16:52 madden
429 * Fix problem with FUM for SeqLoc
430 *
431 * Revision 6.354 2002/04/17 20:42:23 madden
432 * Fix typo for mask1
433 *
434 * Revision 6.353 2002/04/04 21:19:15 dondosha
435 * Corrections for megablast with non-greedy extensions
436 *
437 * Revision 6.352 2002/03/28 18:51:39 madden
438 * All threads get access to (query) masking seqloc, merge overlapping segments for seg
439 *
440 * Revision 6.351 2002/03/26 23:18:00 dondosha
441 * Duplicate mb_endpoint_results structure on all threads
442 *
443 * Revision 6.350 2002/03/26 16:49:33 madden
444 * Use scaled up/down Lambda
445 *
446 * Revision 6.349 2002/03/14 16:11:40 camacho
447 * Extended BlastTwoSequences to allow comparison between sequence and PSSM
448 *
449 * Revision 6.348 2002/03/05 17:58:56 dondosha
450 * Set same offsets for the traceback as for preliminary extension for megablast with non-greedy extensions
451 *
452 * Revision 6.347 2002/02/15 23:36:22 dondosha
453 * Correction for megablast with non-greedy extensions
454 *
455 * Revision 6.346 2002/01/11 20:14:28 madden
456 * Put the use_this_gi into the SeqAlign
457 *
458 * Revision 6.345 2002/01/07 23:16:00 dondosha
459 * Fixed several memory leaks and allocation/freeing bugs in multithreaded megablast
460 *
461 * Revision 6.344 2001/12/28 20:38:40 dondosha
462 * Moved Mega BLAST related parameters into a separate structure
463 *
464 * Revision 6.343 2001/12/13 16:06:54 dondosha
465 * Use separate mb_endpoint_results list for each of multiple threads
466 *
467 * Revision 6.342 2001/11/26 20:19:25 madden
468 * Add call to BLASTOptionValidateEx to BlastTwoSequencesWithCallback
469 *
470 * Revision 6.341 2001/11/16 15:44:26 dondosha
471 * In BlastPruneSeqAlignByGiList: retrieve bioseq only if seqid in seqalign is not a gi
472 *
473 * Revision 6.340 2001/11/14 00:31:44 camacho
474 * Updated BlastGetAllowedGis and BlastGetFirstGiofSubset functions
475 * to return the correct seqid's when dealing with the new database
476 * format and mask (subset) databases.
477 *
478 * Revision 6.339 2001/11/13 18:20:33 dondosha
479 * Use GapxEditScript structure instead of edit_script_t in higher level function calls
480 *
481 * Revision 6.338 2001/10/12 16:10:07 dondosha
482 * 1. Made BLASTResultFreeHsp public
483 * 2. Added BioseqBlastEngineCoreEx with partial search option
484 *
485 * Revision 6.337 2001/10/05 18:10:29 madden
486 * Add threshold_second to parse_blast_options
487 *
488 * Revision 6.336 2001/09/19 17:24:17 kans
489 * removed extra parameter from BioseqMegaBlastEngineCore
490 *
491 * Revision 6.335 2001/09/07 14:46:43 dondosha
492 * Roll back removal of threshold_first from functions and structures
493 *
494 * Revision 6.334 2001/09/06 20:24:33 dondosha
495 * Removed threshold_first
496 *
497 * Revision 6.333 2001/07/27 20:04:09 dondosha
498 * Small correction in passing effective db length for two sequences engine
499 *
500 * Revision 6.332 2001/07/26 18:19:03 dondosha
501 * Added a few more letter options in parse_blast_options
502 *
503 * Revision 6.331 2001/07/20 18:55:58 dondosha
504 * 1. Use effective db length option in 2 sequences engine
505 * 2. Create diagonal array for megablast when needed
506 *
507 * Revision 6.330 2001/07/09 14:17:24 madden
508 * Fix PC-lint complaints from R. Williams
509 *
510 * Revision 6.329 2001/07/09 13:12:03 madden
511 * Removed unused variables
512 *
513 * Revision 6.328 2001/06/25 18:30:24 madden
514 * Add define for NLM_GENERATED_CODE_PROTO to get prototypes in fdlobj.h
515 *
516 * Revision 6.327 2001/06/25 16:03:31 madden
517 * Comment out CheckGappedAlignmentsForOverlap
518 *
519 * Revision 6.326 2001/06/12 19:48:55 madden
520 * Introduce total_hsp_limit, check before making SeqAlign
521 *
522 * Revision 6.325 2001/06/04 21:29:42 dondosha
523 * Add message about deleted hits with e-value below the low threshold
524 *
525 * Revision 6.324 2001/05/07 13:18:24 madden
526 * Fix to really remove deleted HSPs from (culling) heap
527 *
528 * Revision 6.323 2001/05/04 19:50:45 dondosha
529 * Improved error message when all queries are shorter than word size
530 *
531 * Revision 6.322 2001/05/03 21:48:28 dondosha
532 * Handle some cases when memory allocation fails
533 *
534 * Revision 6.321 2001/04/16 21:28:11 dondosha
535 * Added function BlastPruneSeqAlignByEvalueRange
536 *
537 * Revision 6.320 2001/04/12 21:34:50 dondosha
538 * Added function BlastPruneSeqAlignByGiList
539 *
540 * Revision 6.319 2001/04/12 17:17:15 madden
541 * Fixes core-dump for small query
542 *
543 * Revision 6.318 2001/04/12 15:01:25 madden
544 * change repeat filtering db
545 *
546 * Revision 6.317 2001/04/11 20:56:06 madden
547 * Added scalingFactor for rpsblast
548 *
549 * Revision 6.316 2001/04/11 18:22:13 dondosha
550 * Copy query_slp in BlastSearchBlkDuplicate for all programs
551 *
552 * Revision 6.315 2001/04/03 21:59:49 dondosha
553 * Implemented tabulated output for non-megablast bl2seq
554 *
555 * Revision 6.314 2001/03/28 21:05:23 dondosha
556 * Set dbinfo->is_protein in other returns
557 *
558 * Revision 6.313 2001/03/27 21:27:01 madden
559 * Minor efficiency in how lookup table is made
560 *
561 * Revision 6.312 2001/03/27 21:13:56 dondosha
562 * Do not print error if OID list exists without CommonIndex
563 *
564 * Revision 6.311 2001/03/27 20:35:10 dondosha
565 * Small bug fix
566 *
567 * Revision 6.310 2001/03/26 15:03:25 madden
568 * Fix number warnings and two bugs found by PC compiler
569 *
570 * Revision 6.309 2001/03/21 15:46:32 dondosha
571 * Added missing parentheses in previous change
572 *
573 * Revision 6.308 2001/03/20 20:06:13 dondosha
574 * Added protection from crossing strand boundary for blastn
575 *
576 * Revision 6.307 2001/03/19 18:51:39 madden
577 * HitRangeToSeqLoc returns values appropriate for subsequences
578 *
579 * Revision 6.306 2001/03/12 14:53:46 dondosha
580 * Uninitialized variable corrections
581 *
582 * Revision 6.305 2001/03/08 22:05:48 dondosha
583 * Split very long database sequences in all BLAST programs
584 *
585 * Revision 6.304 2001/02/16 18:45:39 dondosha
586 * Fixed minor purify errors
587 *
588 * Revision 6.303 2001/02/08 20:41:16 dondosha
589 * Implemented tabulated output for all translated programs
590 *
591 * Revision 6.302 2001/02/07 21:12:05 dondosha
592 * 1. Added Blast Engine functions with callback argument
593 * 2. Pass output stream from options block to search
594 *
595 * Revision 6.301 2001/01/29 22:23:00 madden
596 * Do not recreate hsp_array
597 *
598 * Revision 6.300 2001/01/26 17:43:09 madden
599 * Comment out unneeded memset
600 *
601 * Revision 6.299 2001/01/23 20:25:43 dondosha
602 * 1. Renamed BlastParceInputString to BlastParseInputString
603 * 2. Recognize a double quoted string as an option value in
604 * BlastParseInputString
605 *
606 * Revision 6.298 2001/01/23 18:23:57 madden
607 * Fix memory leak
608 *
609 * Revision 6.297 2001/01/19 16:49:37 madden
610 * Added helper array to BlastNtGappedScoreInternal
611 *
612 * Revision 6.296 2001/01/16 23:16:51 dondosha
613 * Added 2 arguments and several options to parse_blast_options
614 *
615 * Revision 6.295 2001/01/16 20:32:46 kans
616 * included simutil.h to suppress Mac error
617 *
618 * Revision 6.294 2001/01/12 17:10:04 dondosha
619 * If subject SeqLoc is on a single strand and query on both, swap the strands
620 *
621 * Revision 6.293 2001/01/11 18:34:20 dondosha
622 * Changed error level for nonexistent database from ERROR to FATAL
623 *
624 * Revision 6.292 2001/01/09 20:16:27 dondosha
625 * Implemented from-to location options for both sequences in bl2seq
626 *
627 * Revision 6.291 2001/01/05 17:12:48 dondosha
628 * Correction in previous memory leak fix
629 *
630 * Revision 6.290 2001/01/04 15:01:25 dondosha
631 * Fix for tblastx in blast two sequences engine
632 *
633 * Revision 6.289 2001/01/03 21:45:30 dondosha
634 * Fixed a memory leak - some edit blocks not freed in megablast
635 *
636 * Revision 6.288 2000/12/28 18:23:05 madden
637 * Add -P and -A to parse_blast_options
638 *
639 * Revision 6.287 2000/12/19 15:52:47 dondosha
640 * Forbid reversing query and subject for two sequences megablast
641 *
642 * Revision 6.286 2000/12/19 14:52:59 dondosha
643 * Previous change wrong
644 *
645 * Revision 6.285 2000/12/15 15:38:38 dondosha
646 * Call AdjustOffSetsInSeqAlign with correct query and subject SeqLocs
647 *
648 * Revision 6.284 2000/12/15 14:25:41 madden
649 * Optimization to BlastTranslateUnambiguousSequence
650 *
651 * Revision 6.283 2000/12/15 14:23:34 madden
652 * Use readdb_get_sequence_ex to get sequence faster
653 *
654 * Revision 6.282 2000/12/13 22:26:44 dondosha
655 * Free the ncbi4na-encoded subject sequence after search in two sequences megablast engine
656 *
657 * Revision 6.281 2000/12/13 13:51:35 madden
658 * Free SeqLocPtr in BlastSequencesOnTheFly
659 *
660 * Revision 6.280 2000/12/07 17:46:56 dondosha
661 * Call AdjustOffSetsInSeqAlign for megablast too
662 *
663 * Revision 6.279 2000/12/04 18:51:24 madden
664 * Fix memory leaks
665 *
666 * Revision 6.278 2000/11/29 23:05:00 dondosha
667 * Keep ncbi4na-encoded subject sequence in search->subject for megablast
668 *
669 * Revision 6.277 2000/11/16 19:15:31 dondosha
670 * Pass back endpoint results in other_returns for Mega BLAST with no traceback
671 *
672 * Revision 6.276 2000/11/09 17:28:35 dondosha
673 * Set block_width to 0 for Mega BLAST in BlastTwoSequences engine
674 *
675 * Revision 6.275 2000/11/08 22:21:33 dondosha
676 * Enabled new tblastn by adding a longest_intron option
677 *
678 * Revision 6.274 2000/11/08 20:20:31 dondosha
679 * Do not free subject in BlastTwoSequencesCore for new tblastn - done elsewhere
680 *
681 * Revision 6.273 2000/11/07 16:30:27 madden
682 * Introduce intermediate score (before linking of HSPs) for blastx and tblastn
683 *
684 * Revision 6.272 2000/11/03 20:15:19 dondosha
685 * Pass the subject sequence to new_link_hsps from two sequences engine
686 *
687 * Revision 6.271 2000/11/02 20:15:38 dondosha
688 * Added functions BlastTwoSequencesByLocWithCallback and BlastTwoSequencesWithCallback
689 *
690 * Revision 6.270 2000/11/02 16:36:12 madden
691 * Fixed another minor problem from merge
692 *
693 * Revision 6.269 2000/11/02 16:12:37 madden
694 * fix Errors during merge of code
695 *
696 * Revision 6.268 2000/11/01 16:25:57 madden
697 * Changes from Futamura for psitblastn
698 *
699 * Revision 6.267 2000/10/31 17:51:44 dondosha
700 * Copy the necessary search block data for multi-threaded megablast
701 *
702 * Revision 6.266 2000/10/23 22:17:54 shavirin
703 * Added creation of "no database found" message in case the database is
704 * not found.
705 *
706 * Revision 6.265 2000/10/18 19:46:29 dondosha
707 * Fixed bug in BlastTwoSequencesCore for partial subject sequence search
708 *
709 * Revision 6.264 2000/10/16 19:34:16 shavirin
710 * Added possibility to run RPS Blast search from function BioseqBlastEngineByLocEx().
711 *
712 * Revision 6.263 2000/10/13 17:32:50 shavirin
713 * Adjusted calls to readdb_get_header for ASN.1 structured deflines.
714 *
715 * Revision 6.262 2000/10/13 16:05:44 shavirin
716 * Fixed minor bug with reporting database name.
717 *
718 * Revision 6.261 2000/10/12 14:45:34 madden
719 * Break out of loop if hsp is freed
720 *
721 * Revision 6.260 2000/10/11 17:14:02 dondosha
722 * For tblastn traceback convert subject sequence to ncbi4na encoding in BlastTwoSequencesCore
723 *
724 * Revision 6.259 2000/10/10 16:11:15 shavirin
725 * Added check for NULL in the function BLASTCheckHSPInclusion().
726 *
727 * Revision 6.258 2000/10/06 19:32:02 shavirin
728 * Added call to SeqMgrAddToBioseqIndex() for created fake Bioseq.
729 *
730 * Revision 6.257 2000/10/05 22:43:10 dondosha
731 * Use mb_result_struct for Mega BLAST results in two sequences functions
732 *
733 * Revision 6.256 2000/10/05 19:57:08 dondosha
734 * In Mega BLAST, results are saved in and freed from mb_result_struct, not result_struct
735 *
736 * Revision 6.255 2000/10/03 21:28:54 shavirin
737 * Added check for search->pbp for not NULL in BlastSearchBlkDestruct().
738 *
739 * Revision 6.254 2000/09/29 21:14:47 shavirin
740 * Added additional check for inclusion of HSPs after traceback for
741 * OOF gapped alignment case.
742 *
743 * Revision 6.253 2000/09/28 14:57:50 dondosha
744 * Initialize exact match array for megablast in BlastHitListNew
745 *
746 * Revision 6.252 2000/09/25 15:43:36 madden
747 * Fix for rpsblast, too high expect values getting through
748 *
749 * Revision 6.251 2000/09/14 15:05:46 dondosha
750 * For new tblastn, reset evalues to individual ones before relinking HSPs
751 *
752 * Revision 6.250 2000/09/07 13:41:42 madden
753 * Fix if first start is -1 in DenseSeg
754 *
755 * Revision 6.249 2000/09/01 18:29:12 dondosha
756 * Removed calls to ReadDBFreeSharedInfo and ReadDBCloseMHdrAndSeqFiles
757 *
758 * Revision 6.248 2000/08/31 18:37:21 shavirin
759 * Added check for NULL in BlastMakeCopyQueryDNAP().
760 *
761 * Revision 6.247 2000/08/31 16:55:17 shavirin
762 * Fixed problem with OOF alignment of negative strand HSPs.
763 *
764 * Revision 6.246 2000/08/28 21:53:12 shavirin
765 * Added function BlastOtherReturnsFree(). Cleaned memory in case of
766 * tweak_parameters = TRUE. (Freed SeqAlign calculated before RedoAlignmentCore.)
767 *
768 * Revision 6.245 2000/08/22 20:02:27 dondosha
769 * Previous change not quite right: use real subject length for all programs
770 *
771 * Revision 6.244 2000/08/22 19:42:25 dondosha
772 * Divide search->subject->length by 3 for tblastn in RealBlastGetGappedAlignmentTraceback
773 *
774 * Revision 6.243 2000/08/18 21:27:59 madden
775 * undo change 6.240 when smith_waterman is not set, the extra alignment is needed when only tweak_parameters is set
776 *
777 * Revision 6.242 2000/08/18 20:12:29 dondosha
778 * Do not use search->query_id in megablast, use only qid_array
779 *
780 * Revision 6.241 2000/08/08 21:43:35 shavirin
781 * Initialized GapAlignBlkPtr for the value of discontinuous parameters.
782 *
783 * Revision 6.240 2000/08/03 22:25:36 shavirin
784 * Removed redundant gapped Traceback in case when tweak_parameters or
785 * smith_waterman is set.
786 *
787 * Revision 6.239 2000/07/31 23:08:13 dondosha
788 * Do not go over the end of the HSP in subject sequence when computing start for gapped alignment
789 *
790 * Revision 6.238 2000/07/25 18:12:03 shavirin
791 * WARNING: This is a no-turning-back change related to S&W Blast from
792 * Alejandro Schaffer
793 *
794 * Revision 6.237 2000/07/25 16:54:26 shavirin
795 * Corrected functions initializing gap_align in case of OOF gapping.
796 *
797 * Revision 6.236 2000/07/18 22:33:02 shavirin
798 * Adjusted start for gapped alignment in OOF case.
799 *
800 * Revision 6.235 2000/07/17 14:26:08 shavirin
801 * Added support for Out of frame gapping.
802 *
803 * Revision 6.234 2000/07/13 18:33:28 madden
804 * Fix for exploded hits with pdb
805 *
806 * Revision 6.233 2000/07/11 18:38:02 madden
807 * decreased size of helper array, added prefetch to BlastGappedScoreInternal
808 *
809 * Revision 6.232 2000/07/10 15:23:30 dondosha
810 * Moved check query_invalid from BlastTwoSequencesCoreEx to BlastTwoSequencesCore
811 *
812 * Revision 6.231 2000/07/10 15:06:23 madden
813 * Use helper array in BlastGappedScoreInternal to reduce cache misses
814 *
815 * Revision 6.230 2000/06/30 17:52:44 madden
816 * Move AWAKE_THR_MIN_SIZE to blastdef.h
817 *
818 * Revision 6.229 2000/06/29 21:27:02 dondosha
819 * Fixed memory leaks in culling by similarity
820 *
821 * Revision 6.228 2000/06/29 19:19:39 madden
822 * Fix minus strand offset in BlastConvertDNASeqLoc
823 *
824 * Revision 6.227 2000/06/26 20:15:34 shavirin
825 * Fixed coordinates transfer in the function BlastConvertDNASeqLoc().
826 *
827 * Revision 6.226 2000/06/23 20:17:42 madden
828 * Optimization for CheckGappedAlignmentsForOverlap (remove n-squared hsp check)
829 *
830 * Revision 6.225 2000/06/23 15:22:43 madden
831 * Fix problem with removing translated hits with different frames
832 *
833 * Revision 6.224 2000/06/21 18:02:25 dondosha
834 * In BlastSaveCurrentHspGapped no need to allocate new memory for hsp_array
835 *
836 * Revision 6.223 2000/06/21 15:10:27 madden
837 * efficiency in BlastGappedScoreInternal
838 *
839 * Revision 6.222 2000/06/21 12:53:22 madden
840 * Do each frame separately in CheckGappedScoreInternal for efficiency
841 *
842 * Revision 6.221 2000/06/20 16:45:36 dondosha
843 * Fixed a minor bug in revision 6.219
844 *
845 * Revision 6.220 2000/06/19 20:07:19 madden
846 * Skip transferring sequence to blastna format
847 *
848 * Revision 6.219 2000/06/19 19:16:19 dondosha
849 * Optimized reallocation of hsp array when it is overflowing
850 *
851 * Revision 6.218 2000/06/15 15:31:26 dondosha
852 * Added two sequences BLAST functions returning SearchBlk instead of SeqAlign; added code to cluster hits and keep only one hit per cluster - disabled so far; enabled two sequences BLAST for tblastn
853 *
854 * Revision 6.217 2000/06/13 20:54:38 shavirin
855 * Added return of EFF_SEARCH_SPACE in the function BlastOtherReturnsPrepare
856 *
857 * Revision 6.216 2000/06/08 20:34:15 madden
858 * add explode_seqids option to show all ids in a defline
859 *
860 * Revision 6.215 2000/05/24 20:53:48 dondosha
861 * Fixed a bug in previous change
862 *
863 * Revision 6.214 2000/05/24 19:49:07 dondosha
864 * Create qid_array for the new search in BlastSearchDuplicate, if megablast
865 *
866 * Revision 6.213 2000/05/22 19:49:35 dondosha
867 * Initialize vnp to NULL in BlastSeqLocFilterEx
868 *
869 * Revision 6.212 2000/05/16 20:00:02 madden
870 * fix for formatting db names
871 *
872 * Revision 6.211 2000/05/12 19:41:54 dondosha
873 * Free qid_array in BlastSearchBlkDestruct
874 *
875 * Revision 6.210 2000/05/05 20:10:22 madden
876 * Add vecscreen filtering capability
877 *
878 * Revision 6.209 2000/04/29 18:55:53 wheelan
879 * temporary fix for BlastTwoSequences NULL return problem
880 *
881 * Revision 6.208 2000/04/28 16:52:31 madden
882 * Fix for ungapped search of subset databases
883 *
884 * Revision 6.207 2000/04/10 17:26:28 madden
885 * Add BLASTResultFreeHsp to free memory as it is no longer needed
886 *
887 * Revision 6.206 2000/04/10 15:24:49 dondosha
888 * Enabled use of MegaBlast for BlastTwoSequences
889 *
890 * Revision 6.205 2000/04/07 16:57:45 shavirin
891 * Transferred queue parameters in BlastSearchBlkDuplicate() function.
892 *
893 * Revision 6.204 2000/04/06 17:33:57 madden
894 * Check if pointer is NULL in BlastGetAllowedGis
895 *
896 * Revision 6.203 2000/04/03 21:23:18 dondosha
897 * Do not construct ewp_params and ewp for MegaBlast search
898 *
899 * Revision 6.202 2000/04/03 20:05:27 madden
900 * Free lh_helper on tmp_hitlist, fixes leak
901 *
902 * Revision 6.201 2000/03/31 19:11:06 dondosha
903 * Changed some names related to MegaBlast
904 *
905 * Revision 6.200 2000/03/31 16:45:43 dondosha
906 * Enabled blastx for BlastTwoSequences search
907 *
908 * Revision 6.199 2000/03/30 21:44:22 madden
909 * Add BLASTResultHitlistFreeEx that checks Heap integrity
910 *
911 * Revision 6.198 2000/03/29 22:18:02 dondosha
912 * Moved adjustment of offsets in blastn to BlastSaveCurrentHitlist, added gap info processing for MegaBlast
913 *
914 * Revision 6.197 2000/03/22 17:58:54 dondosha
915 * Duplicate entire list of query_ids in BlastSearchBlkDuplicate
916 *
917 * Revision 6.196 2000/03/08 20:34:30 madden
918 * Add BlastGetFirstGiofSubset, BlastGetAllowedGis returns primary SeqId
919 *
920 * Revision 6.195 2000/03/03 18:15:52 dondosha
921 * Fixed bugs and memory leaks in MegaBlast related code
922 *
923 * Revision 6.194 2000/03/03 17:58:23 shavirin
924 * Added new function BlastConvertDNASeqLoc()
925 *
926 * Revision 6.193 2000/03/01 14:37:45 dondosha
927 * Adjust query offsets after search for all 3 versions of blastn
928 *
929 * Revision 6.192 2000/02/29 18:06:07 dondosha
930 * In case of MegaBlast save correct query ids in seqaligns
931 *
932 * Revision 6.191 2000/02/24 23:21:27 dondosha
933 * Adjust context offsets before gapped alignment to avoid strand crossover
934 *
935 * Revision 6.190 2000/02/23 20:51:05 dondosha
936 * Modifications for blastn to concatenate strands - handling of query offsets
937 *
938 * Revision 6.189 2000/02/17 21:23:10 shavirin
939 * Added parameter is_rps_blast.
940 *
941 * Revision 6.188 2000/02/17 19:02:09 shavirin
942 * Removed all references to obsolete theCacheSize variable.
943 *
944 * Revision 6.187 2000/02/17 18:30:56 shavirin
945 * Added translated DNA filtering for RPS Blast
946 *
947 * Revision 6.186 2000/02/17 14:38:27 madden
948 * Duplicate filter_string for multiple threads
949 *
950 * Revision 6.185 2000/02/16 21:49:16 shavirin
951 * Fixed some memory leaks.
952 *
953 * Revision 6.184 2000/02/15 19:16:26 shavirin
954 * MemFree(pbp->filter_string) in BlastSearchBlkDestruct
955 *
956 * Revision 6.183 2000/02/14 16:15:50 madden
957 * Revert to 6.179
958 *
959 * Revision 6.182 2000/02/11 22:03:03 shavirin
960 * Returned back previous change.
961 *
962 * Revision 6.181 2000/02/11 21:25:58 shavirin
963 * Removed call to BlastLinkHsps() function for tblastn program.
964 *
965 * Revision 6.180 2000/02/11 20:45:54 dondosha
966 * Adjust the second strand offsets after blastn search
967 *
968 * Revision 6.179 2000/02/11 16:40:53 egorov
969 * The parse_blast_options is made public.
970 *
971 * Revision 6.178 2000/02/04 22:31:38 kans
972 * test subject_bsp for NULL before dereferencing in BlastTwoSequencesByLocEx
973 *
974 * Revision 6.177 2000/02/04 16:13:15 shavirin
975 * Returned changes done in Revision 6.172.
976 *
977 * Revision 6.176 2000/02/02 18:22:05 madden
978 * Free memory for LinkHelpStruct
979 *
980 * Revision 6.175 2000/02/01 22:13:26 dondosha
981 * Added code related to greedy basic gapped alignment
982 *
983 * Revision 6.174 2000/01/28 16:45:53 madden
984 * HitRangeToSeqLoc called with combine TRUE
985 *
986 * Revision 6.173 2000/01/26 22:01:56 madden
987 * Add function BlastGetProgramName
988 *
989 * Revision 6.172 2000/01/14 18:28:11 shavirin
990 * Some WordExtention* functions made external.
991 *
992 * Revision 6.171 2000/01/12 21:46:19 dondosha
993 * Minor memory leak clean-up (routine BlastSeqLocFilterEx)
994 *
995 * Revision 6.170 2000/01/12 18:54:44 madden
996 * Do not free bestid to fix problem
997 *
998 * Revision 6.169 2000/01/11 17:12:51 shavirin
999 * Added handling of the new parameter theCacheSize.
1000 *
1001 * Revision 6.168 2000/01/11 15:32:47 dondosha
1002 * Fixed memory leaks in opening shared header and sequence file memory maps
1003 *
1004 * Revision 6.167 2000/01/04 21:56:59 madden
1005 * Add NULLB to both ends of db sequence before gap extend, use dynamic buffer for blast options in repeat filtering
1006 *
1007 * Revision 6.166 2000/01/03 17:38:33 shavirin
1008 * Added check for rdfp in BlastGetAllowedGis() function.
1009 *
1010 * Revision 6.165 1999/12/31 14:23:20 egorov
1011 * Add support for using mixture of real and mask database with gi-list files:
1012 * 1. Change logic of creating rdfp list.
1013 * 2. BlastGetDbChunk gets real databases first, then masks.
1014 * 3. Proper calculation of database sizes using alias files.
1015 * 4. Change to CommonIndex to support using of mask databases.
1016 * 5. Use correct gis in formatted output (BlastGetAllowedGis()).
1017 * 6. Other small changes
1018 *
1019 * Revision 6.164 1999/12/22 22:00:35 dondosha
1020 * Destruct the header and sequence memory maps separately before destructing the search structure
1021 *
1022 * Revision 6.163 1999/12/22 21:08:36 shavirin
1023 * Rewritten function BlastNewFindWords() added function BlastNewFindWordsEx()
1024 *
1025 * Revision 6.160 1999/12/21 20:02:45 egorov
1026 * Fix memory leak.
1027 *
1028 * Revision 6.159 1999/12/17 22:22:57 madden
1029 * New masking parameters from Wojtek
1030 *
1031 * Revision 6.158 1999/12/16 19:08:36 egorov
1032 * Check rdfp for NULL before using. Bug reported by Patrick and Sergei Sh.
1033 *
1034 * Revision 6.157 1999/12/15 17:42:26 egorov
1035 * Change BlastGetAllowedGis() to handle gi's belonging to a database alias.
1036 *
1037 * Revision 6.156 1999/12/13 21:53:02 madden
1038 * Some fixes for repeat masking
1039 *
1040 * Revision 6.155 1999/11/26 22:11:26 madden
1041 * Added BlastNT functions for nucl. extensions
1042 *
1043 * Revision 6.154 1999/11/24 15:21:38 egorov
1044 * Avoid GCC warning
1045 *
1046 * Revision 6.153 1999/11/09 14:14:12 madden
1047 * Start alive thread for masking only if query is above min size
1048 *
1049 * Revision 6.152 1999/11/02 15:32:36 madden
1050 * Allow setting of repeat filtering options and database
1051 *
1052 * Revision 6.151 1999/11/01 20:18:22 egorov
1053 * New format of filter_string
1054 *
1055 * Revision 6.150 1999/10/27 21:33:02 madden
1056 * Use housekeeping threads only for larger sequences
1057 *
1058 * Revision 6.149 1999/10/18 20:06:52 shavirin
1059 * evalue_compare_hits() : In case of equal scores and E-values order
1060 * will be determined by subject id
1061 *
1062 * Revision 6.148 1999/10/18 16:15:04 egorov
1063 * Bug fixed
1064 *
1065 * Revision 6.147 1999/10/15 20:52:10 shavirin
1066 * Fixed bug with seq_id_list initialization
1067 *
1068 * Revision 6.146 1999/10/12 21:50:47 shavirin
1069 * Added initialization of db_chunk_size in BlastThrInfoNew().
1070 *
1071 * Revision 6.145 1999/10/05 17:42:55 shavirin
1072 * Removed global variables from blast.c
1073 *
1074 * Revision 6.144 1999/10/01 18:26:56 madden
1075 * Check for search->rdfp before search->rdfp->oidlist
1076 *
1077 * Revision 6.143 1999/09/28 20:14:33 madden
1078 * Joerg changes to minimize cache misses
1079 *
1080 * Revision 6.142 1999/09/22 20:58:49 egorov
1081 * OID list change
1082 *
1083 * Revision 6.141 1999/09/16 16:55:12 madden
1084 * Changes for long words in blastn
1085 *
1086 * Revision 6.140 1999/09/03 17:23:25 madden
1087 * Fixed bug in CheckStartForGappedAlignment
1088 *
1089 * Revision 6.139 1999/09/01 19:21:06 shavirin
1090 * Added propagation of the score for discontinuous alignment in
1091 * functions: RealBlastGetGappedAlignmentTraceback() and BioseqBlastEngineCore()
1092 *
1093 * Revision 6.138 1999/08/27 18:07:34 shavirin
1094 * Passed parameter decline_align from top to the engine.
1095 *
1096 * Revision 6.137 1999/08/20 20:54:12 madden
1097 * place sentinel byte at beginning of nt sequence for ALIGN
1098 *
1099 * Revision 6.136 1999/08/20 19:48:13 madden
1100 * Changed call to BlastSearchBlkNew(Extra), removed use of version array
1101 *
1102 * Revision 6.135 1999/08/20 16:35:25 shavirin
1103 * Added protection against invalid program name in BlastGetTypes().
1104 *
1105 * Revision 6.134 1999/08/06 18:53:57 madden
1106 * Added calls to lookup_position_aux_destruct
1107 *
1108 * Revision 6.133 1999/08/05 19:01:29 madden
1109 * Add check for NULL search or invalid query in BlastTwoSequencesCore
1110 *
1111 * Revision 6.132 1999/07/01 13:03:24 sicotte
1112 * Updated for DenseDiag and Moved seqalign_reverse_strand from blastutl.c(blast.h) to SeqAlignListReverseStrand in salpedit.ch and fixed call in salutil.c
1113 *
1114 * Revision 6.131 1999/06/24 17:24:12 madden
1115 * Fix bug in GetSeqAlignCount when SeqAlignPtr is NULL
1116 *
1117 * Revision 6.130 1999/06/18 21:17:58 madden
1118 * Check that an exact match gives a positive value when making words for blast2seqs
1119 *
1120 * Revision 6.129 1999/06/14 15:20:26 madden
1121 * Produce temporary BLAST_HitList to fix blastx core-dump
1122 *
1123 * Revision 6.128 1999/05/27 17:33:05 madden
1124 * Fixed Int2 (should have been Int4) problem
1125 *
1126 * Revision 6.127 1999/05/25 13:37:49 madden
1127 * Make smallest float 1.0e-180
1128 *
1129 * Revision 6.126 1999/05/19 12:44:00 madden
1130 * Change in longest_db_seq for multiple db search
1131 *
1132 * Revision 6.125 1999/05/13 13:48:11 madden
1133 * Only filter out hits if on same strand
1134 *
1135 * Revision 6.124 1999/04/15 13:24:35 madden
1136 * Fix for sum stats problems
1137 *
1138 * Revision 6.123 1999/04/13 19:16:47 madden
1139 * Check that two HSPs are on same strand before deleting one
1140 *
1141 * Revision 6.122 1999/04/12 20:24:54 egorov
1142 * Fix MT problem
1143 *
1144 * Revision 6.121 1999/04/01 21:42:46 madden
1145 * Fix memory leaks when gi list is used
1146 *
1147 * Revision 6.120 1999/04/01 14:18:58 madden
1148 * Fixed memory leaks with gi_list
1149 *
1150 * Revision 6.119 1999/03/31 15:46:52 madden
1151 * Removed unused code and variables
1152 *
1153 * Revision 6.118 1999/03/17 13:21:06 madden
1154 * Fix comment in comment problem
1155 *
1156 * Revision 6.117 1999/03/16 19:27:36 egorov
1157 * More type castings
1158 *
1159 * Revision 6.116 1999/03/12 17:19:59 egorov
1160 * More type casting fixes
1161 *
1162 * Revision 6.115 1999/03/12 15:03:45 egorov
1163 * Add proper Int4-long type casting
1164 *
1165 * Revision 6.114 1999/03/04 14:18:09 egorov
1166 * Do correct filter masking when query is seqloc
1167 * The only BlastMaskTheResidues() function is changed:
1168 *
1169 * Revision 6.113 1999/02/22 21:59:05 madden
1170 * binary search in GetAllowedGis function
1171 *
1172 * Revision 6.112 1999/02/22 17:32:46 madden
1173 * Fix memory leak
1174 *
1175 * Revision 6.111 1999/02/18 21:18:23 madden
1176 * Optimization
1177 *
1178 * Revision 6.110 1999/02/17 13:23:01 madden
1179 * Added hsp_num_max
1180 *
1181 * Revision 6.109 1999/02/11 13:53:46 madden
1182 * Added combine Boolean to HitRangeToSeqLoc, fixed mem leak
1183 *
1184 * Revision 6.108 1999/01/28 17:20:57 madden
1185 * Check do_sum_stats for linking, Int2 to Int4, UMR
1186 *
1187 * Revision 6.107 1999/01/28 16:05:49 madden
1188 * HspArrayPurge change, HSPs saved more efficiently
1189 *
1190 * Revision 6.106 1999/01/26 18:27:23 madden
1191 * handle delta sequences correctly
1192 *
1193 * Revision 6.105 1999/01/26 17:59:26 madden
1194 * ContextToFrame no longer static
1195 *
1196 * Revision 6.104 1999/01/25 21:31:25 madden
1197 * Check for illegal chars when nucl. query is translated
1198 *
1199 * Revision 6.103 1999/01/25 19:04:37 madden
1200 * prevent core-dump when query is empty
1201 *
1202 * Revision 6.102 1999/01/20 21:05:33 madden
1203 * Look for repeats on both strands
1204 *
1205 * Revision 6.101 1999/01/19 13:29:24 madden
1206 * Change to HspArrayPurge
1207 *
1208 * Revision 6.100 1998/12/31 18:17:08 madden
1209 * Added strand option
1210 *
1211 * Revision 6.99 1998/12/31 15:36:07 victorov
1212 * filtering internals is now based on SeqLoc instead of Bioseq
1213 *
1214 * Revision 6.98 1998/12/18 16:20:18 madden
1215 * efficiencies
1216 *
1217 * Revision 6.97 1998/12/15 14:11:29 madden
1218 * Change to permit an arbitrary number of HSPs
1219 *
1220 * Revision 6.96 1998/11/30 15:58:20 madden
1221 * Added CheckStartForGappedAlignment
1222 *
1223 * Revision 6.95 1998/11/27 15:24:12 madden
1224 * Duplicated handle_results and query_id if SearchBlk duplicated
1225 *
1226 * Revision 6.94 1998/11/16 17:39:23 kans
1227 * added FALSE for new parameter to FilterCC
1228 *
1229 * Revision 6.93 1998/11/06 14:13:01 madden
1230 * Added call to AdjustOffSetsInSeqAlign in BioseqBlastEngineByLocEx
1231 *
1232 * Revision 6.92 1998/10/21 13:44:16 madden
1233 * Fixed UMR found by purify
1234 *
1235 * Revision 6.91 1998/10/20 19:57:21 madden
1236 * Run dust if filtering is selected for nt
1237 *
1238 * Revision 6.90 1998/10/13 20:37:53 madden
1239 * Use IS_residue after call to SeqPortGetResidue
1240 *
1241 * Revision 6.89 1998/09/24 15:26:38 egorov
1242 * Fix lint complaints
1243 *
1244 * Revision 6.88 1998/09/16 19:00:16 madden
1245 * Added subset Boolean
1246 *
1247 * Revision 6.87 1998/09/15 13:12:29 madden
1248 * Fixed memory leak
1249 *
1250 * Revision 6.86 1998/09/14 15:11:18 egorov
1251 * Add support for Int8 length databases; remove unused variables
1252 *
1253 * Revision 6.85 1998/09/04 20:48:48 madden
1254 * typo fix (= instead of ==)
1255 *
1256 * Revision 6.84 1998/09/03 20:23:42 madden
1257 * Copied seq_ext and seq_ext_type in MakeFakeBioseq
1258 *
1259 * Revision 6.83 1998/09/03 19:41:09 madden
1260 * do not switch sequences for Blast2Sequences if filtering is performed
1261 *
1262 * Revision 6.82 1998/08/24 14:59:59 madden
1263 * readdb_get_sequence_ex function
1264 *
1265 * Revision 6.81 1998/07/30 19:00:56 madden
1266 * Fix memory leak
1267 *
1268 * Revision 6.80 1998/07/29 21:29:45 madden
1269 * Fixed UMR with longest_db_seq that showed up in Blast 2 sequences
1270 *
1271 * Revision 6.79 1998/07/28 21:18:35 madden
1272 * Change to BLAST_ExtendWordParamsNew saves memory
1273 *
1274 * Revision 6.78 1998/07/24 14:58:53 madden
1275 * Jinqhuis call to SeqLocRevCmp put back
1276 *
1277 * Revision 6.77 1998/07/22 20:31:51 madden
1278 * Replaced cutvalue of 1000000 with INT4_MAX
1279 *
1280 * Revision 6.76 1998/07/22 12:17:03 madden
1281 * Added BioseqHitRange call for repeat filtering
1282 *
1283 * Revision 6.75 1998/07/21 20:58:10 madden
1284 * Changes to allow masking at hash only
1285 *
1286 * Revision 6.74 1998/07/20 15:51:28 zjing
1287 * add a check for plus-minus before SeqLocRevCmp
1288 *
1289 * Revision 6.73 1998/07/17 15:39:59 madden
1290 * Changes for Effective search space.
1291 *
1292 * Revision 6.72 1998/07/14 21:31:43 madden
1293 * Fix for incorrectly sorted HSP bug and speed-up of CheckHspOverlap
1294 *
1295 * Revision 6.71 1998/07/06 13:39:04 madden
1296 * Fixed improper use of Int4 in parse_seg_options
1297 *
1298 * Revision 6.70 1998/07/02 21:00:39 egorov
1299 * Remove memory leak in threaded version
1300 *
1301 * Revision 6.69 1998/06/12 22:09:14 madden
1302 * Added call to SegParamsFree
1303 *
1304 * Revision 6.68 1998/06/12 16:08:51 madden
1305 * BlastHitRange stuff
1306 *
1307 * Revision 6.67 1998/06/08 15:07:32 madden
1308 * Fixed bug in BlastConvertProteinSeqLoc
1309 *
1310 * Revision 6.66 1998/06/04 16:23:17 madden
1311 * Use new seg
1312 *
1313 * Revision 6.65 1998/05/28 19:59:58 madden
1314 * Zhengs new culling code
1315 *
1316 * Revision 6.64 1998/05/22 20:20:38 madden
1317 * Added BlastTwoSequencesByLocEx and BlastTwoSequencesEx
1318 *
1319 * Revision 6.63 1998/05/18 17:58:31 madden
1320 * fixed parsing of coil-coil options, added parsing of dust options
1321 *
1322 * Revision 6.62 1998/05/17 16:28:41 madden
1323 * Allow changes to filter options and cc filtering.
1324 *
1325 * Revision 6.61 1998/05/05 14:05:35 madden
1326 * Added functions BlastStartAwakeThread and BlastStopAwakeThread
1327 *
1328 * Revision 6.60 1998/04/28 21:04:19 madden
1329 * Reset number of HSPs to zero if relinking
1330 *
1331 * Revision 6.59 1998/04/24 21:52:09 madden
1332 * Protection against NULL pointers
1333 *
1334 * Revision 6.58 1998/04/24 19:10:59 egorov
1335 * Fix bug when if wordsize == 2 blastall produces extra alignments
1336 *
1337 * Revision 6.57 1998/04/23 21:15:09 egorov
1338 * Show exact matching even if score is below threshold (case of two sequences)
1339 *
1340 * Revision 6.56 1998/04/15 20:24:54 madden
1341 * BlastMaskTheResidues optimized
1342 *
1343 * Revision 6.55 1998/04/10 17:46:58 madden
1344 * Changed FALSE to NULL in BioseqSeg
1345 *
1346 * Revision 6.54 1998/04/02 21:12:55 madden
1347 * Properly set value for linking HSPs in blastx and tblastn
1348 *
1349 * Revision 6.53 1998/04/01 22:47:35 madden
1350 * Check for query_invalid flag
1351 *
1352 * Revision 6.52 1998/03/26 14:20:20 madden
1353 * Changed GetScoreSetFromBlastResultHsp1 from static to LIBCALL
1354 *
1355 * Revision 6.51 1998/03/25 22:28:16 madden
1356 * Changes to allow random access BLAST by gi
1357 *
1358 * Revision 6.50 1998/03/24 15:38:25 madden
1359 * Use BlastDoubleInt4Ptr to keep track of gis and ordinal_ids
1360 *
1361 * Revision 6.49 1998/03/19 22:16:24 madden
1362 * Changes to allow blasting by gi list
1363 *
1364 * Revision 6.48 1998/03/18 14:14:11 madden
1365 * Support random access by gi list
1366 *
1367 * Revision 6.47 1998/03/16 17:41:59 madden
1368 * Fixed leaks
1369 *
1370 * Revision 6.46 1998/03/14 18:28:10 madden
1371 * Added BioseqBlastEngineEx
1372 *
1373 * Revision 6.45 1998/03/09 16:35:10 madden
1374 * Fixed bug with tblastn and blastx gapped searches
1375 *
1376 * Revision 6.44 1998/02/27 14:32:33 madden
1377 * Functions moved to blastool.c
1378 *
1379 * Revision 6.43 1998/02/26 22:34:27 madden
1380 * Changes for 16 bit windows
1381 *
1382 * Revision 6.42 1998/02/26 19:12:39 madden
1383 * Removed AdjustOffSetsInSeqAlign, added BlastNtFindWords BlastPopulateAllWordArrays BlastFindWords and BlastNewFindWords
1384 *
1385 * Revision 6.41 1998/02/24 22:47:06 madden
1386 * Fixed problem with Option validation
1387 *
1388 * Revision 6.40 1998/02/23 16:09:57 madden
1389 * Corrected from offset for subject in tblastx search
1390 *
1391 * Revision 6.39 1998/02/19 17:17:05 madden
1392 * Use of Int4 rather than Int2 when pruning SeqAlign
1393 *
1394 * Revision 6.38 1998/02/12 21:50:39 madden
1395 * protection against NULL hitlist in blastx and tblastn
1396 *
1397 * Revision 6.37 1998/02/11 17:18:19 madden
1398 * Made BlastGetGappedAlignmentTraceback functions to BlastGetGapAlgnTbck (shorter than 32 chars)
1399 *
1400 * Revision 6.36 1998/01/31 21:34:09 madden
1401 * Fix to SeqAlign pruning
1402 *
1403 * Revision 6.35 1998/01/06 18:26:22 madden
1404 * Use SeqLocLen rather than bsp->length, wordsize done properly for nucl
1405 *
1406 * Revision 6.34 1998/01/05 22:41:40 madden
1407 * Added seqalign_reverse_strand
1408 *
1409 * Revision 6.33 1998/01/05 20:53:16 madden
1410 * Added ability to align minus-minus or plus-minus in BlastTwoSeqsByLoc
1411 *
1412 * Revision 6.32 1998/01/05 16:46:55 madden
1413 * One or both strands can be searched, as opposed to only both, changes to number of contexts
1414 *
1415 * Revision 6.31 1997/12/31 17:52:09 madden
1416 * Change to BLAST_WordFinderNew
1417 *
1418 * Revision 6.30 1997/12/23 19:16:52 madden
1419 * Minor efficiency in ExtendWordExit
1420 *
1421 * Revision 6.29 1997/12/23 18:12:34 madden
1422 * Changes for range-dependent blast
1423 *
1424 * Revision 6.28 1997/12/12 20:38:55 madden
1425 * ContextToFrame lost last parameter, fix to sprintf
1426 *
1427 * Revision 6.27 1997/12/11 22:22:24 madden
1428 * Proper casting of variables
1429 *
1430 * Revision 6.26 1997/12/10 22:43:09 madden
1431 * proper casting
1432 *
1433 * Revision 6.25 1997/12/01 22:07:10 madden
1434 * Changed call to BLASTOptionValidateEx
1435 *
1436 * Revision 6.24 1997/11/28 18:19:33 madden
1437 * Changes to TxDfDbInfoNew
1438 *
1439 * Revision 6.23 1997/11/18 22:23:20 madden
1440 * Added BLASTOptionSetGapParams
1441 *
1442 * Revision 6.22 1997/11/14 17:15:29 madden
1443 * Realign matches when they contain ambiguities in blastx/tblastn
1444 *
1445 * Revision 6.21 1997/11/07 00:49:02 madden
1446 * Added call to BLAST_MatrixFill
1447 *
1448 * Revision 6.20 1997/10/29 22:11:13 madden
1449 * ABS value of frames
1450 *
1451 * Revision 6.19 1997/10/24 20:44:52 madden
1452 * Removed BlastSetReadDB and BlastGetReadDB_ID
1453 *
1454 * Revision 6.18 1997/10/22 21:46:34 madden
1455 * Changed default values
1456 *
1457 * Revision 6.17 1997/10/21 20:39:18 madden
1458 * Fix for more alignments than descriptions.
1459 *
1460 * Revision 6.16 1997/10/21 19:50:00 madden
1461 * Fix for no valid query sequence and hitlist_max of 1
1462 *
1463 * Revision 6.15 1997/10/03 21:27:28 madden
1464 * Added BlastGetTypes
1465 *
1466 * Revision 6.14 1997/10/02 17:29:29 madden
1467 * Added PrintDbInformationBasic
1468 *
1469 * Revision 6.13 1997/10/01 13:35:31 madden
1470 * Changed BLAST_VERSION to BLAST_ENGINE_VERSION
1471 *
1472 * Revision 6.12 1997/09/30 20:03:07 madden
1473 * Saved db filename in dbinfo
1474 *
1475 * Revision 6.11 1997/09/24 22:36:35 madden
1476 * Fixes for MT multidb searches
1477 *
1478 * Revision 6.10 1997/09/23 16:43:41 madden
1479 * removed unneeded DenseSegPtr
1480 *
1481 * Revision 6.9 1997/09/22 18:18:35 madden
1482 * Added umlaut to Schaffer in reference
1483 *
1484 * Revision 6.8 1997/09/18 22:22:03 madden
1485 * Added prune functions
1486 *
1487 * Revision 6.7 1997/09/16 16:54:09 kans
1488 * return FALSE instead of NULL for Boolean value
1489 *
1490 * Revision 6.6 1997/09/16 16:31:28 madden
1491 * More changes for multiple db runs
1492 *
1493 * Revision 6.5 1997/09/11 18:49:31 madden
1494 * Changes to enable searches against multiple databases.
1495 *
1496 * Revision 6.4 1997/09/10 21:28:00 madden
1497 * Changes to set CPU limits
1498 *
1499 * Revision 6.3 1997/09/08 16:25:32 madden
1500 * Fixed bug that did not mask low-complexity regions at the end of a query
1501 *
1502 * Revision 6.2 1997/08/27 14:46:51 madden
1503 * Changes to enable multiple DB searches
1504 *
1505 * Revision 6.1 1997/08/26 15:05:26 madden
1506 * Fix for negative effective search space
1507 *
1508 * Revision 6.0 1997/08/25 18:52:49 madden
1509 * Revision changed to 6.0
1510 *
1511 * Revision 1.105 1997/08/22 18:37:43 madden
1512 * Added function BlastOtherReturnsPrepare
1513 *
1514 * Revision 1.104 1997/08/20 21:43:34 madden
1515 * Added page numbers
1516 *
1517 * Revision 1.103 1997/08/14 21:07:08 madden
1518 * ignored gapped for tblastx
1519 *
1520 * Revision 1.102 1997/08/14 14:30:35 madden
1521 * BlastNewFindWords called with range set for ranged blast
1522 *
1523 * Revision 1.101 1997/07/31 21:18:11 madden
1524 * Removed left-over file from seg
1525 *
1526 * Revision 1.100 1997/07/30 16:39:30 madden
1527 * Print gap existence and extension parameters for blastn
1528 *
1529 * Revision 1.99 1997/07/30 16:31:37 madden
1530 * tblastx prepares StdSeg
1531 *
1532 * Revision 1.98 1997/07/29 17:07:27 madden
1533 * better tblastx error messages.
1534 *
1535 * Revision 1.97 1997/07/25 15:39:49 madden
1536 * Corrected citation
1537 *
1538 * Revision 1.96 1997/07/25 13:47:46 madden
1539 * Made buffer longer to avoid ABR
1540 *
1541 * Revision 1.95 1997/07/23 20:59:02 madden
1542 * Changed blastn defaults for gap opening and extension
1543 *
1544 * Revision 1.94 1997/07/22 17:22:41 madden
1545 * Added NULL arg (for index callback) to BLASTSetUpSearch funcs
1546 *
1547 * Revision 1.93 1997/07/21 17:36:42 madden
1548 * Added BlastGetReleaseDate
1549 *
1550 * Revision 1.92 1997/07/18 20:57:02 madden
1551 * Added functions BlastGetVersionNumber and BlastGetReference
1552 *
1553 * Revision 1.91 1997/07/18 14:26:20 madden
1554 * call to AcknowledgeBlastQuery changed, SeqId no longer deleted there.
1555 *
1556 * Revision 1.90 1997/07/16 20:34:35 madden
1557 * Added function BlastConvertProteinSeqLoc
1558 *
1559 * Revision 1.89 1997/07/15 20:36:14 madden
1560 * Added BioseqSeg and SeqLocSeg
1561 *
1562 * Revision 1.88 1997/07/14 20:11:10 madden
1563 * Removed unused variables
1564 *
1565 * Revision 1.87 1997/07/14 16:15:41 madden
1566 * call to BLASTOptionValidateEx in BlastBioseqEngine
1567 *
1568 * Revision 1.86 1997/07/14 15:31:49 madden
1569 * Added BlastErrorMessage functions
1570 *
1571 * Revision 1.85 1997/07/11 19:29:37 madden
1572 * Added function BioseqBlastEngineByLoc
1573 *
1574 * Revision 1.84 1997/07/10 20:35:43 madden
1575 * Changed parameter output
1576 *
1577 * Revision 1.83 1997/07/02 20:18:39 madden
1578 * Made continuous SeqAlign the default
1579 *
1580 * Revision 1.82 1997/07/02 18:31:39 madden
1581 * changed defaults
1582 *
1583 * Revision 1.81 1997/07/01 19:15:44 madden
1584 * More changes to FormatBlastParameters
1585 *
1586 * Revision 1.80 1997/07/01 17:51:36 madden
1587 * changed gap_decay rate, gap_prob
1588 *
1589 * Revision 1.79 1997/07/01 15:44:44 madden
1590 * Changes to FormatBlastParameters per S. Altschul
1591 *
1592 * Revision 1.78 1997/06/30 15:50:06 madden
1593 * Changes to FormatBlastParameters
1594 *
1595 * Revision 1.77 1997/06/27 22:18:51 madden
1596 * Updated default parameters
1597 *
1598 * Revision 1.76 1997/06/27 14:31:08 madden
1599 * Added functions BlastAddSeqIdToList and BlastSeqIdListDestruct
1600 *
1601 * Revision 1.75 1997/06/24 13:51:27 madden
1602 * Fixed SeqLoc leak
1603 *
1604 * Revision 1.74 1997/06/23 20:49:31 madden
1605 * BLASTOptionValidate checks for proper gapping parameters
1606 *
1607 * Revision 1.73 1997/06/20 13:11:33 madden
1608 * Made AdjustOffSetsInSeqAlign non-static, Fixed purify error
1609 *
1610 * Revision 1.72 1997/06/06 21:29:48 madden
1611 * Added Boolean html to AcknowledgeBlastQuery and PrintDbInformation
1612 *
1613 * Revision 1.71 1997/06/06 19:49:46 madden
1614 * Added BlastMakeFakeBioseq and BlastDeleteFakeBioseq
1615 *
1616 * Revision 1.70 1997/05/30 21:05:59 madden
1617 * corrected call to readdb_new
1618 *
1619 * Revision 1.69 1997/05/27 20:20:02 madden
1620 * Added function BlastMaskTheResidues
1621 *
1622 * Revision 1.68 1997/05/22 21:24:55 madden
1623 * Added support for final gapX dropoff value
1624 *
1625 * Revision 1.67 1997/05/20 17:52:58 madden
1626 * Added functions BlastTwoSequencesByLoc and BlastSequencesOnTheFlyByLoc
1627 *
1628 * Revision 1.66 1997/05/12 21:34:16 madden
1629 * readdb_new allows indeterminate database type
1630 *
1631 * Revision 1.65 1997/05/06 22:17:59 madden
1632 * Duplicate dblen_eff, dbseq_num, and length_adjustment
1633 *
1634 * Revision 1.64 1997/05/01 15:53:19 madden
1635 * Addition of extra KarlinBlk's for psi-blast
1636 *
1637 * Revision 1.63 1997/04/29 14:07:45 madden
1638 * Fixed problem with hits failing PreliminaryGapping; fixed UMR.
1639 *
1640 * Revision 1.62 1997/04/25 20:23:06 madden
1641 * Freed SeqPort to clear mem leak.
1642 *
1643 * Revision 1.61 1997/04/24 14:43:07 madden
1644 * Fix for minus strand (ungapped) tblastn runs.
1645 *
1646 * Revision 1.60 1997/04/23 21:56:07 madden
1647 * Changes in BlastGetGappedAlignmentTraceback for in-frame gapping tblastn.
1648 *
1649 * Revision 1.59 1997/04/22 14:00:14 madden
1650 * Removed unused variables.
1651 *
1652 * Revision 1.58 1997/04/22 13:04:19 madden
1653 * Changes for in-frame blastx gapping.
1654 *
1655 * Revision 1.57 1997/04/21 15:35:26 madden
1656 * Fixes for 'gapped' StdSegs.
1657 *
1658 * Revision 1.56 1997/04/18 17:08:35 madden
1659 * Corrected printing of threshold values.
1660 *
1661 * Revision 1.55 1997/04/17 22:12:43 madden
1662 * Fix for offset in GetStartForGappedAlignment.
1663 *
1664 * Revision 1.54 1997/04/17 22:07:48 madden
1665 * Changes to allow in-frame gapped tblastn.
1666 *
1667 * Revision 1.53 1997/04/15 22:02:59 madden
1668 * Set original_length1 for translating searches.
1669 *
1670 * Revision 1.52 1997/04/14 21:31:58 madden
1671 * Checking for NULL pointer.
1672 *
1673 * Revision 1.51 1997/04/14 15:59:47 madden
1674 * Changes for ungapped psi-blast.
1675 *
1676 * Revision 1.50 1997/04/11 21:18:45 madden
1677 * Added GetSequenceWithDenseSeg.
1678 *
1679 * Revision 1.49 1997/04/11 19:02:49 madden
1680 * Changes for in-frame blastx, tblastn gapping.
1681 *
1682 * Revision 1.48 1997/04/09 20:01:53 madden
1683 * Copied seqid_list from search structure to duplicate, for use on threads.
1684 *
1685 * Revision 1.47 1997/04/08 16:27:28 madden
1686 * Fixed leaks; fix for blastn formatting of parameters.
1687 *
1688 * Revision 1.46 1997/04/07 21:42:56 madden
1689 * Freed SeqLocPtr used for dust.
1690 *
1691 * Revision 1.45 1997/04/07 18:17:09 madden
1692 * Formatted parameters for Stephen.
1693 *
1694 * Revision 1.44 1997/04/04 20:44:09 madden
1695 * Added check for NULL return.
1696 *
1697 * Revision 1.43 1997/04/04 20:42:35 madden
1698 * Added function BioseqBlastEngineCore.
1699 *
1700 * Revision 1.42 1997/04/03 19:50:56 madden
1701 * Changes to use effective database length instead of the length of each
1702 * sequence in statistical calculations.
1703 *
1704 * Revision 1.41 1997/03/27 22:30:51 madden
1705 * Correctly checked for overlapping HSP's.
1706 *
1707 * Revision 1.40 1997/03/20 22:56:24 madden
1708 * Added gap_info to hsp.
1709 *
1710 * Revision 1.39 1997/03/20 21:52:10 madden
1711 * Fix for segmented query BioseqPtr when gapped alignment is performed.
1712 *
1713 * Revision 1.39 1997/03/20 21:52:10 madden
1714 * Fix for segmented query BioseqPtr when gapped alignment is performed.
1715 *
1716 * Revision 1.38 1997/03/14 22:06:11 madden
1717 * fixed MT bug in BlastReevaluateWithAmbiguities.
1718 *
1719 * Revision 1.37 1997/03/14 15:57:23 madden
1720 * Removed superfluous call to SeqAlignNew
1721 *
1722 * Revision 1.36 1997/03/14 15:22:11 madden
1723 * Fixed UMR of seqalign in BlastTwoSequencesCore.
1724 *
1725 * Revision 1.35 1997/03/11 14:38:40 madden
1726 * Added BlastSequencesOnTheFly and BlastTwoSequencesCore.
1727 *
1728 * Revision 1.34 1997/03/07 22:35:54 madden
1729 * Fix for BLASTOptionNew.
1730 *
1731 * Revision 1.33 1997/03/07 21:58:36 madden
1732 * Added Boolean gapped argument to BLASTOptionNew.
1733 *
1734 * Revision 1.32 1997/03/07 21:11:22 madden
1735 * Added in check for blastn on gapped calculations.
1736 *
1737 * Revision 1.31 1997/03/06 21:47:27 madden
1738 * Made FormatBlastParameters non-static.
1739 *
1740 * Revision 1.30 1997/03/05 18:16:16 madden
1741 * SeqIdFree replaced by SeqIdSetFree, fixed memory leak.
1742 *
1743 * Revision 1.29 1997/03/05 14:29:46 madden
1744 * Moved BlastSaveCurrentHsp from blast.c; Added function CheckHspOverlap.
1745 *
1746 * Revision 1.28 1997/03/04 21:34:59 madden
1747 * Added in HspArrayPurge.
1748 *
1749 * Revision 1.27 1997/03/04 20:08:19 madden
1750 * Moved gapped alignment code from blast.c to blastutl.c
1751 *
1752 * Revision 1.26 1997/03/03 22:39:45 madden
1753 * Moved code from blast.c to blastutl.c.
1754 *
1755 * Revision 1.25 1997/03/03 21:47:22 madden
1756 * Moved functions from blast.c to blastutl.c for 16-bit windows.
1757 *
1758 * Revision 1.24 1997/03/03 20:58:09 madden
1759 * Fixed offsets for minus strands.
1760 *
1761 * Revision 1.23 1997/03/03 17:30:21 madden
1762 * Set SeqAlignPtr to NULL in BlastTwoSequences and BlastBioseqEngine, possible UMR.
1763 *
1764 * Revision 1.22 1997/03/01 18:25:33 madden
1765 * reverse flag added to BlastGetGappedAlignmentTraceback functions.
1766 *
1767 * Revision 1.21 1997/02/27 22:47:07 madden
1768 * Replaced tblastx with tblastn in BioseqBlastEngine.
1769 *
1770 * Revision 1.20 1997/02/26 23:39:54 madden
1771 * Added Txdfline stuff.
1772 *
1773 * Revision 1.19 1997/02/26 20:37:31 madden
1774 * Added *error_returns to BioseqBlastEngine.
1775 *
1776 * Revision 1.18 1997/02/25 19:17:05 madden
1777 * Changes to BioseqBlastEngine.
1778 *
1779 * Revision 1.17 1997/02/20 23:00:34 madden
1780 * Checked for NULL return in BlastTwoSequences.
1781 *
1782 * Revision 1.16 1997/02/20 18:38:34 madden
1783 * Set Default db_length to zero in Options.
1784 *
1785 * Revision 1.15 1997/02/19 16:25:22 madden
1786 * Reset gapped_calculation for blastn; returned proper SeqAlign for blastx, tblastn
1787 * in BioseqBlastEngine.
1788 *
1789 * Revision 1.14 1997/02/19 13:45:13 madden
1790 * replaced zero in call to BlastGetGappedAlignmentTraceback with FALSE.
1791 *
1792 * Revision 1.13 1997/02/18 22:09:02 madden
1793 * Removed unused variable.
1794 *
1795 * Revision 1.12 1997/02/18 21:03:00 madden
1796 * Changes to BioseqBlastEngine for gapped calculations.
1797 *
1798 * Revision 1.11 1997/02/18 18:31:34 madden
1799 * Used SeqIdFindBest in BlastTwoSequences.
1800 *
1801 * Revision 1.10 1997/02/18 17:58:52 madden
1802 * Added BioseqBlastEngine.
1803 *
1804 * Revision 1.9 1997/02/14 17:17:59 madden
1805 * Changes to default options and BlastTwoSequences for nucl.
1806 * sequences with ambiguites.
1807 *
1808 * Revision 1.8 1997/02/13 18:23:56 madden
1809 * Fixed ID type from BlastTwoSequences.
1810 *
1811 * Revision 1.7 1997/02/11 19:30:54 madden
1812 * Changes to BlastTwoSequences for gapped alignments.
1813 *
1814 * Revision 1.6 1997/02/10 20:03:58 madden
1815 * BlastTwoSequences indexes only the subject.
1816 *
1817 * Revision 1.5 1997/02/10 15:24:26 madden
1818 * Removed unused variable.
1819 *
1820 * Revision 1.4 1997/02/07 22:43:03 madden
1821 * Moved BLAST_WordFinderNew and Destruct from blast.c to blastutl.c, made
1822 * non-static.
1823 *
1824 * Revision 1.3 1997/02/07 22:32:40 madden
1825 * Changed prototypes for BlastGetSubjectId and GetSeqAlignForResultHitList.
1826 *
1827 * Revision 1.2 1997/02/05 13:36:48 madden
1828 * Removed Unused variable.
1829 *
1830 * Revision 1.1 1997/02/04 18:23:58 madden
1831 * Initial revision
1832 *
1833 */
1834
1835 #define NLM_GENERATED_CODE_PROTO
1836 #include <ncbi.h>
1837 #include <blast.h>
1838 #include <blastpri.h>
1839 #include <objcode.h>
1840 #include <objseq.h>
1841 #include <sequtil.h>
1842 #include <tofasta.h>
1843 #include <seqport.h>
1844 #include <readdb.h>
1845 #include <ncbithr.h>
1846 #include <blast_dust.h>
1847 #include <urkpcc.h>
1848 #include <txalign.h>
1849 #include <seg.h>
1850 #include <salpedit.h>
1851 #include <mbalign.h>
1852 #include <mblast.h>
1853 #include <vecscrn.h>
1854 #include <rpsutil.h>
1855 #include <simutil.h>
1856 #include <blfmtutl.h>
1857
1858 typedef struct _pgp_blast_options {
1859 BLAST_OptionsBlkPtr options;
1860 CharPtr blast_database;
1861 BioseqPtr query_bsp, fake_bsp;
1862 Int4 number_of_descriptions, number_of_alignments;
1863 FILE *infp, *outfp;
1864 AsnIoPtr aip_out;
1865 Boolean html;
1866 Boolean believe_query;
1867 Uint4 align_options, print_options;
1868 /* PHI-PSI Blast variables */
1869 Uint1 featureOrder[FEATDEF_ANY];
1870 Uint1 groupOrder[FEATDEF_ANY];
1871 Int4 program_flag;
1872 CharPtr patfile;
1873 FILE *patfp;
1874 seedSearchItems *seedSearch;
1875 } PGPBlastOptions, PNTR PGPBlastOptionsPtr;
1876
/* Window size used to scan HSP for highest score region, where gapped
   extension starts. */
#define HSP_MAX_WINDOW 11

/* Hard-coded filesystem path.
   NOTE(review): presumably the default directory holding the BLAST filter
   programs -- confirm against the code that consumes this macro. */
#define BLASTFILTER_DIR "/usr/ncbi/blast/filter"
1882
1883 static SeqIdPtr
BlastGetFirstGiofSubset(ReadDBFILEPtr rdfp,Int4 ordinal_id,Int2 aliasfilebit)1884 BlastGetFirstGiofSubset(ReadDBFILEPtr rdfp, Int4 ordinal_id, Int2 aliasfilebit)
1885 {
1886 Boolean not_done = TRUE;
1887 SeqIdPtr bestid = NULL, tmp_seqid, seqid=NULL;
1888 Uint4 header_index = 0;
1889 Int4 gi = 0;
1890 Int4 alias_mask;
1891 BlastDefLinePtr bdfp;
1892
1893 if (!rdfp->cih && rdfp->formatdb_ver < FORMATDB_VER) {
1894 /* FORMATDB_VER_TEXT version requires the common index
1895 * to determine the subset databases */
1896 ErrPostEx(SEV_ERROR, 0, 0, "Database mask cannot be used without CommonIndex");
1897 return NULL;
1898 }
1899
1900 alias_mask = (0x1 << rdfp->aliasfilebit);
1901
1902 bdfp = NULL;
1903 if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
1904 bdfp = FDReadDeflineAsn(rdfp, ordinal_id);
1905 if(bdfp == NULL) {
1906 ErrPostEx(SEV_ERROR, 0, 0, "Failure to read defline ASN for %d",
1907 ordinal_id);
1908 return NULL;
1909 }
1910
1911 bestid = SeqIdFindBest(bdfp->seqid, SEQID_GI);
1912 if (bestid->choice == SEQID_GI) {
1913 gi = bestid->data.intvalue;
1914 ValNodeAddInt(&seqid, SEQID_GI, gi);
1915 }
1916 bdfp = BlastDefLineSetFree(bdfp);
1917
1918 return seqid;
1919 }
1920
1921 while (not_done) {
1922 CommonIndexPtr cigi;
1923
1924 /* get seqid from database headers file */
1925 not_done = readdb_get_header (rdfp, ordinal_id, &header_index, &tmp_seqid, NULL);
1926
1927 if (not_done == FALSE)
1928 break;
1929
1930 if (not_done) {
1931 /* get gi number */
1932 bestid = SeqIdFindBest(tmp_seqid, SEQID_GI);
1933 if (bestid->choice != SEQID_GI) {
1934 tmp_seqid = SeqIdSetFree(tmp_seqid);
1935 break;
1936 }
1937 gi = bestid->data.intvalue;
1938
1939 /* get database commonindex mask */
1940 cigi = rdfp->cih->ci + gi;
1941 if (alias_mask & SwapUint4(cigi->dbmask)) {
1942 ValNodeAddInt(&seqid, SEQID_GI, gi);
1943 break;
1944 }
1945 tmp_seqid = SeqIdSetFree(tmp_seqid);
1946 }
1947 }
1948 tmp_seqid = SeqIdSetFree(tmp_seqid);
1949
1950 return seqid;
1951 }
1952
1953 #define BLAST_ITER_MAX 30
1954
1955 /*
1956 Goes through the list of gi's/ordinal id's looking for matches
1957 to the ordinal ID. Returns those acceptable gi's as SeqIdPtr's.
1958 */
1959 SeqIdPtr
BlastGetAllowedGis(BlastSearchBlkPtr search,Int4 ordinal_id,SeqIdPtr PNTR seqid)1960 BlastGetAllowedGis (BlastSearchBlkPtr search, Int4 ordinal_id, SeqIdPtr PNTR seqid)
1961 {
1962 BlastGiListPtr blast_gi_list;
1963 Boolean found=FALSE;
1964 BlastDoubleInt4Ptr *gi_list_pointer;
1965 Int4 index, total, first, last, current;
1966 ValNodePtr gi_list=NULL;
1967
1968 if (seqid)
1969 *seqid = NULL;
1970 gi_list = NULL;
1971 if (search->thr_info->blast_gi_list) {
1972 blast_gi_list = search->thr_info->blast_gi_list;
1973 total = blast_gi_list->total;
1974 found = FALSE;
1975 gi_list_pointer = blast_gi_list->gi_list_pointer;
1976 first = 0;
1977 last = total;
1978 for (index=0; index<BLAST_ITER_MAX; index++) {
1979 current = (first+last)/2;
1980 if (ordinal_id < gi_list_pointer[current]->ordinal_id)
1981 last = current;
1982 else if (ordinal_id > gi_list_pointer[current]->ordinal_id)
1983 first = current;
1984 else { /* back up looking for all gi's associated with this oid. */
1985 while (current > 0 &&
1986 ordinal_id == gi_list_pointer[current-1]->ordinal_id)
1987 current--;
1988 found = TRUE;
1989 break;
1990 }
1991 }
1992
1993 if (found) {
1994 while (current < total) {
1995 if (ordinal_id == gi_list_pointer[current]->ordinal_id) {
1996 ValNodeAddInt(&gi_list, SEQID_GI, blast_gi_list->gi_list_pointer[current]->gi);
1997 } else {
1998 break;
1999 }
2000 current++;
2001 }
2002 }
2003
2004 if (seqid && search->rdfp && search->rdfp->aliasfilebit != 0) {
2005 *seqid = BlastGetFirstGiofSubset(search->rdfp, ordinal_id, search->rdfp->aliasfilebit);
2006 }
2007 return (SeqIdPtr) gi_list;
2008 } else if (search->rdfp != NULL && search->rdfp->oidlist != NULL) {
2009 /* if we have at least one mask, then we need print only those gis, which
2010 are in the database list (reals and masks) */
2011
2012 Boolean not_done = TRUE;
2013 SeqIdPtr bestid = NULL, tmp_seqid = NULL;
2014 Uint4 header_index = 0;
2015 Int4 gi = 0;
2016 Int4 mask;
2017 Int2 firstpos, curfirstpos;
2018 ReadDBFILEPtr rdfp = search->rdfp, tmprdfp;
2019 BlastDefLinePtr bdfp, bdfp_head;
2020
2021 if (!rdfp->cih && rdfp->formatdb_ver < FORMATDB_VER) {
2022 /* FORMATDB_VER_TEXT version requires the common index
2023 * to determine the subset databases */
2024 /*ErrPostEx(SEV_ERROR, 0, 0, "Database mask cannot be used without CommonIndex");*/
2025 return NULL;
2026 }
2027
2028 /* kludge: only protein databases are non-redundant */
2029 if (readdb_is_prot(search->rdfp) == FALSE)
2030 return NULL;
2031
2032 bdfp = NULL; bdfp_head = NULL;
2033 if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
2034 /* just chain the seqid's returned, as they are filtered in
2035 * FDReadDeflineAsn according to the membership_bit in the
2036 * rdfp */
2037 bdfp = FDReadDeflineAsn(rdfp, ordinal_id);
2038 if(bdfp == NULL) {
2039 ErrPostEx(SEV_ERROR, 0, 0, "Failure to read defline ASN for %d", ordinal_id);
2040 return NULL;
2041 }
2042 for (bdfp_head = bdfp; bdfp; bdfp = bdfp->next) {
2043 bestid = SeqIdFindBest(bdfp->seqid, SEQID_GI);
2044 if (bestid->choice == SEQID_GI) {
2045 gi = bestid->data.intvalue;
2046 ValNodeAddInt(&gi_list, SEQID_GI, gi);
2047 }
2048 }
2049
2050 BlastDefLineSetFree(bdfp_head);
2051
2052 } else {
2053
2054 while (not_done) {
2055 CommonIndexPtr cigi;
2056
2057 /* get seqid from database headers file */
2058 not_done = readdb_get_header (search->rdfp, ordinal_id, &header_index, &tmp_seqid, NULL);
2059
2060 if (not_done == FALSE)
2061 break;
2062
2063 if (not_done) {
2064 /* get gi number */
2065 bestid = SeqIdFindBest(tmp_seqid, SEQID_GI);
2066 if (bestid->choice != SEQID_GI) {
2067 tmp_seqid = SeqIdSetFree(tmp_seqid);
2068 break;
2069 }
2070 gi = bestid->data.intvalue;
2071
2072 /* get database commonindex mask */
2073 cigi = search->rdfp->cih->ci + gi;
2074 mask = SwapUint4(cigi->dbmask);
2075
2076 firstpos = 0;
2077 while (((curfirstpos = bit_engine_firstbit(mask)) != -1)) {
2078 CharPtr dbname;
2079
2080 firstpos += curfirstpos;
2081
2082 dbname = DBName(search->rdfp->cih->num_of_DBs,
2083 search->rdfp->cih->dbids, firstpos);
2084
2085 /* search in rdfp list this database */
2086 tmprdfp = search->rdfp;
2087 while (tmprdfp) {
2088 if (tmprdfp->aliasfilename) {
2089 /* use mask name, if exists */
2090 if (!StrCmp(dbname, tmprdfp->aliasfilename)) {
2091 ValNodeAddInt(&gi_list, SEQID_GI, gi);
2092 }
2093 } else {
2094 /* use real file name */
2095 if (!StrCmp(dbname, tmprdfp->filename)) {
2096 ValNodeAddInt(&gi_list, SEQID_GI, gi);
2097 }
2098 }
2099 tmprdfp = tmprdfp->next;
2100 }
2101 mask >>= (curfirstpos + 1);
2102 firstpos++;
2103 }
2104 }
2105
2106 if (tmp_seqid) {
2107 tmp_seqid = SeqIdSetFree(tmp_seqid);
2108 }
2109 }
2110 }
2111 if (seqid)
2112 *seqid = BlastGetFirstGiofSubset(search->rdfp, ordinal_id, search->rdfp->aliasfilebit);
2113
2114 return (SeqIdPtr) gi_list;
2115 }
2116
2117 return NULL;
2118 }
2119
2120 /*
2121 SOME FUNCTIONS TO PRODUCE A SeqAlign from the BLAST results.
2122 */
2123
2124 /*****************************************************************************
2125
2126 Finds the best SeqId for the SeqAlign. Looks for the GI, then takes
2127 anything if that's not found and makes up a local ID if no ID is
2128 found at all.
2129 *****************************************************************************/
2130
2131 SeqIdPtr
GetTheSeqAlignID(SeqIdPtr seq_id)2132 GetTheSeqAlignID(SeqIdPtr seq_id)
2133 {
2134 SeqIdPtr new_id, ret_id;
2135 ObjectIdPtr obidp;
2136
2137 ret_id = NULL;
2138 if (seq_id)
2139 {
2140 /* Get the gi from the chain, if it's there. */
2141 new_id = SeqIdFindBest(seq_id, SEQID_GI);
2142 if (new_id)
2143 {
2144 ret_id = SeqIdDup(new_id);
2145 }
2146 else
2147 { /* No Gi was found, use any ID. */
2148 ret_id = SeqIdDup(seq_id);
2149 }
2150 }
2151
2152 if (ret_id == NULL)
2153 { /* make up an ID. */
2154 obidp = ObjectIdNew();
2155 obidp->str = StringSave("lcl|unknown");
2156 ValNodeAddPointer(&ret_id, SEQID_LOCAL, obidp);
2157 }
2158
2159 return ret_id;
2160 }
2161 static SeqAlignPtr
FillInSegsInfo(SeqAlignPtr sap_head,StdSegPtr ssp_head,DenseDiagPtr ddp_head)2162 FillInSegsInfo(SeqAlignPtr sap_head, StdSegPtr ssp_head, DenseDiagPtr ddp_head)
2163
2164 {
2165 SeqAlignPtr sap;
2166
2167 if (ddp_head || ssp_head)
2168 {
2169 if (sap_head)
2170 {
2171 sap = sap_head;
2172 while (sap->next)
2173 sap = sap->next;
2174 sap->next = SeqAlignNew();
2175 sap = sap->next;
2176 }
2177 else
2178 {
2179 sap_head = sap = SeqAlignNew();
2180 }
2181
2182 if (ddp_head)
2183 {
2184 sap->type = 2;
2185 sap->segs = ddp_head;
2186 sap->segtype = 1;
2187 }
2188 else if (ssp_head)
2189 {
2190 sap->type = 2;
2191 sap->segs = ssp_head;
2192 sap->segtype = 3;
2193 }
2194 }
2195 return sap_head;
2196 }
2197
2198
2199 /*************************************************************************
2200 *
2201 * This function fills in the DenseDiag Information from the variable
2202 * hsp. On the first call to this function *old should be
2203 * NULL, after that pass in the head of the DenseDiagPtr chain.
2204 * The newest DenseDiagPtr is returned.
2205 *
2206 ************************************************************************/
2207
2208 static DenseDiagPtr
FillInDenseDiagInfo(DenseDiagPtr PNTR old,BLASTResultHspPtr hsp,Boolean reverse,Int4 query_length,Int4 subject_length,SeqIdPtr gi_list)2209 FillInDenseDiagInfo(DenseDiagPtr PNTR old, BLASTResultHspPtr hsp, Boolean reverse, Int4 query_length, Int4 subject_length, SeqIdPtr gi_list)
2210
2211 {
2212 DenseDiagPtr ddp, new;
2213
2214 new = DenseDiagNew();
2215
2216 new->dim = 2; /* Only 2 is supported in spec. */
2217 new->len = hsp->query_length;
2218 new->starts = (Int4Ptr) MemNew(2 * sizeof(Int4));
2219 new->strands = (Uint1Ptr) MemNew(2 * sizeof(Uint1));
2220 if (reverse)
2221 {
2222 if (hsp->subject_frame >= 0)
2223 {
2224 new->strands[0] = Seq_strand_plus;
2225 new->starts[0] = hsp->subject_offset;
2226 }
2227 else
2228 {
2229 new->strands[0] = Seq_strand_minus;
2230 new->starts[0] = subject_length - hsp->subject_offset - hsp->subject_length;
2231 }
2232 if (hsp->query_frame >= 0)
2233 {
2234 new->strands[1] = Seq_strand_plus;
2235 new->starts[1] = hsp->query_offset;
2236 }
2237 else
2238 {
2239 new->strands[1] = Seq_strand_minus;
2240 new->starts[1] = query_length - hsp->query_offset - hsp->query_length;
2241 }
2242 }
2243 else
2244 {
2245 if (hsp->query_frame >= 0)
2246 {
2247 new->strands[0] = Seq_strand_plus;
2248 new->starts[0] = hsp->query_offset;
2249 }
2250 else
2251 {
2252 new->strands[0] = Seq_strand_minus;
2253 new->starts[0] = query_length - hsp->query_offset - hsp->query_length;
2254 }
2255 if (hsp->subject_frame >= 0)
2256 {
2257 new->strands[1] = Seq_strand_plus;
2258 new->starts[1] = hsp->subject_offset;
2259 }
2260 else
2261 {
2262 new->strands[1] = Seq_strand_minus;
2263 new->starts[1] = subject_length - hsp->subject_offset - hsp->subject_length;
2264 }
2265 }
2266 new->scores = GetScoreSetFromBlastResultHsp(hsp, gi_list);
2267
2268 /* Go to the end of the chain, and then attach "new" */
2269 if (*old)
2270 {
2271 ddp = *old;
2272 while (ddp->next)
2273 ddp = ddp->next;
2274 ddp->next = new;
2275 }
2276 else
2277 {
2278 *old = new;
2279 }
2280
2281 new->next = NULL;
2282
2283 return new;
2284 }
2285
2286 /*************************************************************************
2287 *
2288 * This function fills in the StdSeg Information from the variable
2289 * hsp. On the first call to this function *old should be
2290 * NULL, after that pass in the head of the DenseDiagPtr chain.
2291 * The newest StdSegPtr is returned.
2292 *
2293 ************************************************************************/
2294 static StdSegPtr
FillInStdSegInfo(BlastSearchBlkPtr search,Int4 subject_id,Int4 length,StdSegPtr PNTR old,BLASTResultHspPtr hsp,SeqIdPtr sip,Boolean reverse,SeqIdPtr gi_list)2295 FillInStdSegInfo(BlastSearchBlkPtr search, Int4 subject_id, Int4 length, StdSegPtr PNTR old, BLASTResultHspPtr hsp, SeqIdPtr sip, Boolean reverse, SeqIdPtr gi_list)
2296
2297 {
2298 Int4 subject_length;
2299 StdSegPtr ssp, new;
2300 SeqIdPtr query_sip, subject_sip;
2301 SeqIntPtr seq_int1, seq_int2;
2302 SeqLocPtr slp=NULL;
2303
2304 new = StdSegNew();
2305 /* Duplicate the id and split it up into query and subject parts */
2306 query_sip = SeqIdDup(sip);
2307 subject_sip = SeqIdDup(sip->next);
2308
2309 new->dim = 2; /* Only 2 is supported in spec. */
2310 seq_int1 = SeqIntNew();
2311 if (hsp->query_frame == 0)
2312 {
2313 seq_int1->from = hsp->query_offset;
2314 seq_int1->to = hsp->query_offset + hsp->query_length - 1;
2315 seq_int1->strand = Seq_strand_unknown;
2316 }
2317 else if (hsp->query_frame < 0)
2318 {
2319 seq_int1->to = search->context[hsp->context].query->original_length - CODON_LENGTH*hsp->query_offset + hsp->query_frame;
2320 seq_int1->from = search->context[hsp->context].query->original_length - CODON_LENGTH*(hsp->query_offset+hsp->query_length) + hsp->query_frame + 1;
2321 seq_int1->strand = Seq_strand_minus;
2322 }
2323 else if (hsp->query_frame > 0)
2324 {
2325 seq_int1->from = CODON_LENGTH*(hsp->query_offset) + hsp->query_frame - 1;
2326 seq_int1->to = CODON_LENGTH*(hsp->query_offset+hsp->query_length) + hsp->query_frame - 2;
2327 seq_int1->strand = Seq_strand_plus;
2328 }
2329 seq_int1->id = query_sip;
2330 seq_int2 = SeqIntNew();
2331 if (hsp->subject_frame == 0)
2332 {
2333 seq_int2->from = hsp->subject_offset;
2334 seq_int2->to = hsp->subject_offset + hsp->subject_length - 1;
2335 seq_int2->strand = Seq_strand_unknown;
2336 }
2337 else if (hsp->subject_frame < 0)
2338 {
2339 if (search->rdfp)
2340 subject_length = readdb_get_sequence_length(search->rdfp, subject_id);
2341 else
2342 subject_length = length;
2343
2344 seq_int2->from = subject_length - CODON_LENGTH*(hsp->subject_offset + hsp->subject_length) + hsp->subject_frame + 1;
2345 seq_int2->to = subject_length - CODON_LENGTH*(hsp->subject_offset) + hsp->subject_frame;
2346 seq_int2->strand = Seq_strand_minus;
2347 }
2348 else if (hsp->subject_frame > 0)
2349 {
2350 seq_int2->from = CODON_LENGTH*(hsp->subject_offset) + hsp->subject_frame - 1;
2351 seq_int2->to = CODON_LENGTH*(hsp->subject_offset + hsp->subject_length) + hsp->subject_frame - 2;
2352 seq_int2->strand = Seq_strand_plus;
2353 }
2354 seq_int2->id = subject_sip;
2355
2356 if (reverse)
2357 {
2358 ValNodeAddPointer(&slp, SEQLOC_INT, seq_int2);
2359 ValNodeAddPointer(&slp, SEQLOC_INT, seq_int1);
2360 }
2361 else
2362 {
2363 ValNodeAddPointer(&slp, SEQLOC_INT, seq_int1);
2364 ValNodeAddPointer(&slp, SEQLOC_INT, seq_int2);
2365 }
2366 new->loc = slp;
2367
2368 search->subject->sequence = MemFree(search->subject->sequence);
2369 new->scores = GetScoreSetFromBlastResultHsp(hsp, gi_list);
2370
2371 /* Go to the end of the chain, and then attach "new" */
2372 if (*old)
2373 {
2374 ssp = *old;
2375 while (ssp->next)
2376 ssp = ssp->next;
2377 ssp->next = new;
2378 }
2379 else
2380 {
2381 *old = new;
2382 }
2383
2384 new->next = NULL;
2385
2386 return new;
2387 }
2388
2389 /************************************************************************
2390 *
2391 * This function assembles all the components of the Seq-align from
2392 * a "sparse" BLAST HitList. "sparse" means that the hitlist
2393 * may contain no sequence and not even a descriptor. It is only
2394 * required to contain the sequence_number that readdb refers to
2395 * and scoring/alignment information.
2396 *
2397 * If dbname is non-NULL, then only a general ("gnl") ID is
2398 * issued, with the ordinal number of the subject sequence in
2399 * the ObjectIdPtr.
2400 *
2401 * Boolean reverse: reverse the query and db order in SeqAlign.
2402 *
2403 ************************************************************************/
2404 SeqAlignPtr LIBCALL
GetSeqAlignForResultHitList(BlastSearchBlkPtr search,Boolean getdensediag,Boolean ordinal_number,Boolean discontinuous,Boolean reverse,Boolean get_redundant_seqs)2405 GetSeqAlignForResultHitList(BlastSearchBlkPtr search, Boolean getdensediag, Boolean ordinal_number, Boolean discontinuous, Boolean reverse, Boolean get_redundant_seqs)
2406
2407 {
2408 BLASTResultHspPtr hsp;
2409 BLASTResultHitlistPtr results;
2410 BLASTResultsStructPtr result_struct;
2411 DenseDiagPtr ddp_head=NULL, ddp;
2412 SeqIdPtr gi_list=NULL, sip, sip_subject,
2413 sip_subject_start, query_id, new_sip;
2414 StdSegPtr ssp_head=NULL, ssp;
2415 SeqAlignPtr last, seqalign_head, seqalign, sap_head;
2416 Int4 hsp_cnt, index, index2, hspset_cnt_old, i;
2417 Int4 hitlist_count;
2418 Int4 subject_length;
2419 ValNodePtr vnp, vnp_start;
2420
2421 ddp_head = NULL;
2422 ssp_head = NULL;
2423 sap_head = NULL;
2424 seqalign_head = NULL;
2425
2426 /* discontinuous = FALSE; */
2427 result_struct = search->result_struct;
2428 hitlist_count = result_struct->hitlist_count;
2429
2430 last = NULL;
2431 sip = NULL;
2432 sip_subject_start = NULL;
2433 for (index=0; index<hitlist_count; index++)
2434 {
2435 results = result_struct->results[index];
2436 sip_subject_start = NULL;
2437 if (get_redundant_seqs)
2438 {
2439 vnp = NULL;
2440 sip = BlastGetSubjectId(search, index, ordinal_number, &vnp);
2441 vnp_start = vnp;
2442 while (vnp)
2443 {
2444 sip = GetTheSeqAlignID(vnp->data.ptrvalue);
2445 SeqIdFree(vnp->data.ptrvalue);
2446 if (sip_subject_start == NULL)
2447 {
2448 sip_subject_start = sip;
2449 }
2450 else
2451 {
2452 sip_subject = sip_subject_start;
2453 while (sip_subject->next)
2454 sip_subject = sip_subject->next;
2455 sip_subject->next = sip;
2456 }
2457 vnp = vnp->next;
2458 }
2459 vnp_start = vnp = ValNodeFree(vnp_start);
2460 }
2461 else
2462 {
2463 sip = BlastGetSubjectId(search, index, ordinal_number, NULL);
2464 sip_subject_start = sip_subject = GetTheSeqAlignID(sip);
2465 sip = SeqIdSetFree(sip);
2466 }
2467
2468 results = result_struct->results[index];
2469 if (search->rdfp)
2470 subject_length = readdb_get_sequence_length(search->rdfp, results->subject_id);
2471 else if (results->subject_info)
2472 subject_length = results->subject_info->length;
2473 else
2474 subject_length = 0;
2475
2476 gi_list = BlastGetAllowedGis(search, results->subject_id, &new_sip);
2477 /* right now sip_subject should only contain one ID. At some
2478 point it will contain multiple ID's for identical sequences. */
2479 if (new_sip != NULL)
2480 sip_subject = new_sip;
2481 else
2482 sip_subject = sip_subject_start;
2483 while (sip_subject)
2484 {
2485 seqalign = SeqAlignNew();
2486 seqalign->type = 2; /* alignment is diags */
2487 if (last == NULL) /* First sequence. */
2488 seqalign_head = seqalign;
2489 else
2490 last->next = seqalign;
2491
2492 last = seqalign;
2493
2494 hspset_cnt_old = -1;
2495 hsp_cnt = results->hspcnt;
2496 for (index2=0; index2<hsp_cnt; index2++)
2497 {
2498 hsp = &(results->hsp_array[index2]);
2499 if (discontinuous && hspset_cnt_old != hsp->hspset_cnt)
2500 {
2501 hspset_cnt_old = hsp->hspset_cnt;
2502 if (index2 != 0)
2503 { /* nothing to save on first pass. */
2504 if (getdensediag)
2505 {
2506 sap_head = FillInSegsInfo(sap_head, NULL, ddp_head);
2507 ddp_head = NULL;
2508 }
2509 else
2510 {
2511 sap_head = FillInSegsInfo(sap_head, ssp_head, NULL);
2512 ssp_head = NULL;
2513 }
2514 }
2515 }
2516
2517 query_id = search->query_id;
2518 if (search->prog_number==blast_type_blastn) {
2519 for (i=0; i<hsp->context/2; i++)
2520 query_id = query_id->next;
2521 }
2522 if (reverse)
2523 {
2524 sip = SeqIdDup(sip_subject);
2525 sip->next = GetTheSeqAlignID(query_id);
2526 }
2527 else
2528 {
2529 sip = GetTheSeqAlignID(query_id);
2530 sip->next = SeqIdDup(sip_subject);
2531 }
2532
2533 if (getdensediag)
2534 {
2535 ddp = FillInDenseDiagInfo(&ddp_head, hsp, reverse, search->context[hsp->context].query->length, subject_length, gi_list);
2536 ddp->id = sip;
2537 }
2538 else
2539 {
2540 Int4 length = 0;
2541
2542 if (results->subject_info)
2543 length = results->subject_info->length;
2544
2545 ssp = FillInStdSegInfo(search, results->subject_id, length, &ssp_head, hsp, sip, reverse, gi_list);
2546 ssp->ids = sip;
2547 }
2548 sip = NULL; /* This SeqIdPtr is now on the SeqAlign. */
2549 }
2550
2551 if (discontinuous)
2552 {
2553 if (getdensediag)
2554 {
2555 sap_head = FillInSegsInfo(sap_head, NULL, ddp_head);
2556 ddp_head = NULL;
2557 }
2558 else
2559 {
2560 sap_head = FillInSegsInfo(sap_head, ssp_head, NULL);
2561 ssp_head = NULL;
2562 }
2563 seqalign->segs = sap_head;
2564 seqalign->segtype = 5; /* Discontinuous */
2565 }
2566 else
2567 {
2568 if (getdensediag)
2569 {
2570 seqalign->segs = ddp_head;
2571 seqalign->segtype = 1; /* DenseDiag */
2572 ddp_head = NULL;
2573 }
2574 else
2575 {
2576 seqalign->segs = ssp_head;
2577 seqalign->segtype = 3; /* StdSeg */
2578 ssp_head = NULL;
2579 }
2580 }
2581
2582 sap_head = NULL;
2583
2584 sip_subject = sip_subject->next;
2585 }
2586 if (sip_subject_start)
2587 sip_subject_start = SeqIdFree(sip_subject_start);
2588 if (new_sip)
2589 new_sip = SeqIdFree(new_sip);
2590 gi_list = SeqIdSetFree(gi_list);
2591 }
2592
2593 return seqalign_head;
2594 }
2595
/*
    "Core" function to compare two sequences, for use by
    BlastTwoSequences and BlastSequencesOnTheFly.

    The subject_bsp is redundant with subject_seq and subject_length
    (or vice versa), but the subject must be extracted from the
    subject_bsp for BlastTwoSequences anyway, while the title and ID
    are needed from subject_bsp.

    search:         search block; heavily modified (subject_info, result
                    handling, word finders for the PSSM case, subject
                    sequence pointers, query length for blastn).
    subject_bsp:    supplies the subject's id(s) and title.
    subject_seq:    subject sequence data handed to BLASTPerformSearch.
    subject_length: length of the subject sequence.

    Returns the non-zero status of BLASTPerformSearch if the search
    fails (an error message is then added to search->error_return);
    otherwise the status of BlastReapHitlistByEvalue.
*/
static Int2
BlastTwoSequencesCoreEx (BlastSearchBlkPtr search, BioseqPtr subject_bsp, Uint1Ptr subject_seq, Int4 subject_length)
{
    Int2 status=0;

    /* Replace the subject description.  Without a handle_results callback
       only the best id (asked for SEQID_GI) is kept, otherwise the whole
       id set is duplicated. */
    search->subject_info = BLASTSubjectInfoDestruct(search->subject_info);
    if (!search->handle_results)
        search->subject_info = BLASTSubjectInfoNew(SeqIdDup(SeqIdFindBest(subject_bsp->id, SEQID_GI)), StringSave(BioseqGetTitle(subject_bsp)), subject_length);
    else
        search->subject_info = BLASTSubjectInfoNew(SeqIdSetDup(subject_bsp->id), StringSave(BioseqGetTitle(subject_bsp)), subject_length);

    /*CC: is search->sbp->posMatrix, we're comparing a pssm with a subject
     * sequence, thus we need to do some set up */
    if (search->sbp->posMatrix && search->prog_number == blast_type_blastp) {
        Int4 hitlist_max;
        BLAST_ScoreBlkPtr sbp = search->sbp;
        BLAST_ParameterBlkPtr pbp = search->pbp;

        /* Switch to the PSI Karlin blocks and start from an empty result
           structure of the same capacity. */
        search->positionBased = TRUE;
        sbp->kbp = sbp->kbp_psi;
        sbp->kbp_gap = sbp->kbp_gap_psi;
        hitlist_max = search->result_struct->hitlist_max;
        search->result_struct =
            BLASTResultsStructDelete(search->result_struct);
        search->result_struct = BLASTResultsStructNew(hitlist_max,
                pbp->max_pieces, pbp->hsp_range_max);

        /* Rebuild only the word finders this search block allocated. */
        if (search->allocated & BLAST_SEARCH_ALLOC_WFP_FIRST) {
            search->wfp_first = BLAST_WordFinderDestruct(search->wfp_first);
            search->wfp_first = BLAST_WordFinderNew(sbp->alphabet_size,
                    search->all_words->wordsize, 1, FALSE);
        }

        if (search->allocated & BLAST_SEARCH_ALLOC_WFP_SECOND) {
            search->wfp_second = BLAST_WordFinderDestruct(search->wfp_second);
            search->wfp_second = BLAST_WordFinderNew(sbp->alphabet_size,
                    search->all_words->wordsize, 1, FALSE);
        }

        /* threshold_first is defunct ! */
        search->wfp = search->wfp_first;
        if (search->whole_query == TRUE)
            BlastNewFindWords(search, 0, search->context[0].query->length,
                    pbp->threshold_second, (Uint1) 0);
        else
            BlastNewFindWords(search, search->required_start,
                    search->required_end, pbp->threshold_second, (Uint1) 0);
        lookup_position_aux_destruct(search->wfp->lookup);
        /* NOTE(review): this overwrites wfp_second, including the one that
           may have been freshly created above -- confirm that is intended
           and not a leak. */
        search->wfp_second = search->wfp_first;
    }
    status = BLASTPerformSearch(search, subject_length, subject_seq);

    if (status) {
        BlastConstructErrorMessage("BlastTwoSequencesCoreEx", "non-zero status", 2, &(search->error_return));
        return status;
    }

    if (search->prog_number == blast_type_tblastn &&
        search->pbp->longest_intron > 0) {
        Uint1 rem;
        Uint1Ptr seq_4na, seq_2na, subject;
        Int4 i;
        /* Need to convert from ncbi2na to ncbi4na encoding */
        subject = (Uint1Ptr) MemNew(subject_length + 1);
        seq_4na = subject;
        seq_2na = subject_seq;
        /* rem counts down 3..0 within each input byte, i.e. four bases are
           unpacked per byte of subject_seq. */
        rem = 3;
        for (i=0; i<subject_length; i++) {
            *seq_4na = (Uint1) (1 << READDB_UNPACK_BASE_N(*seq_2na, rem));
            seq_4na++;
            if (rem>0) rem--;
            else {
                rem = 3;
                seq_2na++;
            }
        }
        /* NOTE(review): the start pointer passed is subject-1, one byte
           before the allocation, and at the end of this function
           sequence_start is freed only when mb_params is set -- verify who
           releases "subject" on this path. */
        BlastSequenceAddSequence(search->subject, NULL, subject-1, subject_length, subject_length, 0);
        status = BlastLinkHsps(search);
    }

    /* E-value assignment for blastn or ungapped searches: sum statistics
       via HSP linking (not for megablast), else per-HSP e-values. */
    if (StringCmp(search->prog_name, "blastn") == 0 || search->pbp->gapped_calculation == FALSE)
    {
        if (search->pbp->do_sum_stats == TRUE &&
            !search->pbp->mb_params)
            status = BlastLinkHsps(search);
        else
            status = BlastGetNonSumStatsEvalue(search);
    }
    if (search->pbp->mb_params) {
        search->subject->sequence = subject_seq;
        MegaBlastReevaluateWithAmbiguities(search);
    }
    /* NOTE(review): any status set by the linking / e-value calls above is
       overwritten here without having been checked. */
    status = BlastReapHitlistByEvalue(search);

    /* Hand the hits to the caller's callback if there is one, otherwise
       save them with the (mega)blast specific routine. */
    if (search->handle_results)
        search->handle_results((VoidPtr) search);
    else if (!search->pbp->mb_params)
        BlastSaveCurrentHitlist(search);
    else
        MegaBlastSaveCurrentHitlist(search);
    if (search->pbp->mb_params)
        /* Free the ncbi4na-encoded sequence */
        search->subject->sequence_start = (Uint1Ptr)
            MemFree(search->subject->sequence_start);

    /* Detach the subject data from the search block. */
    search->subject->sequence = NULL;
    search->subject->sequence_start = NULL;
    if (search->prog_number==blast_type_blastn) {
        /* Unconcatenate the strands by adjusting the query offsets in
           all hsps */
        search->context[search->first_context].query->length =
            search->query_context_offsets[search->first_context+1] - 1;
    }

    return status;
}
2721
RPS2SeqImpalaStatCorrections(BlastSearchBlkPtr search,Uint1Ptr subject_seq,Int4 subject_length)2722 static BLAST_ScorePtr *RPS2SeqImpalaStatCorrections
2723 (BlastSearchBlkPtr search, Uint1Ptr subject_seq, Int4 subject_length)
2724 {
2725 BLAST_ScorePtr *retval = NULL;
2726 Nlm_FloatHi *scoreArray; /*array of score probabilities*/
2727 Nlm_FloatHi *resProb; /*array of probabilities for each residue*/
2728 BLAST_ScoreFreqPtr this_sfp, return_sfp; /*score frequency pointers to compute lambda*/
2729 BLAST_ScorePtr *posMatrix; /* position-specific matrix. */
2730 Nlm_FloatHi initialUngappedLambda, scaledInitialUngappedLambda,
2731 correctUngappedLambda, scalingFactor, lambdaRatio;
2732 Nlm_FloatHi temp1; /*intermediate variable for adjusting matrix*/
2733 Int4 temp2; /*intermediate variable for adjusting matrix*/
2734 Int4 seqlength; /* length of posMatrix (or target sequence). */
2735 Int4 i, j; /* loop indices */
2736
2737 if (search == NULL)
2738 return retval;
2739
2740 posMatrix = search->sbp->posMatrix;
2741 scalingFactor = search->pbp->scalingFactor;
2742
2743 resProb = (Nlm_FloatHi *) MemNew (PRO_ALPHABET_SIZE * sizeof(Nlm_FloatHi));
2744 scoreArray = (Nlm_FloatHi *) MemNew(scoreRange * sizeof(Nlm_FloatHi));
2745 return_sfp = (BLAST_ScoreFreqPtr) MemNew(1 * sizeof(BLAST_ScoreFreq));
2746
2747 seqlength = search->sbp->query_length;
2748
2749 IMPALAfillResidueProbability(subject_seq, subject_length, resProb);
2750 this_sfp = IMPALAfillSfp(posMatrix, seqlength, resProb, scoreArray,
2751 return_sfp, scoreRange);
2752 initialUngappedLambda = IMPALAfindUngappedLambda(search->sbp->name);
2753 scaledInitialUngappedLambda = initialUngappedLambda/scalingFactor;
2754 correctUngappedLambda = impalaKarlinLambdaNR(this_sfp, scaledInitialUngappedLambda);
2755 if(correctUngappedLambda == -1.0) {
2756 ErrPostEx(SEV_ERROR, 0, 0,
2757 "RPS2SeqImpalaStatCorrections: Could not calculate ungapped "
2758 "lambda for PSSM");
2759 MemFree(resProb);
2760 MemFree(scoreArray);
2761 MemFree(return_sfp);
2762 return retval;
2763 }
2764
2765 lambdaRatio = correctUngappedLambda/scaledInitialUngappedLambda;
2766
2767 retval = (BLAST_Score **) MemNew((seqlength+1) * sizeof(BLAST_Score *));
2768 for (i = 0; i < seqlength+1; i++)
2769 retval[i] = (BLAST_Score *)MemNew(PRO_ALPHABET_SIZE *
2770 sizeof(BLAST_Score));
2771
2772 for (i = 0; i < seqlength+1; i++) {
2773 for (j = 0; j < PRO_ALPHABET_SIZE; j++) {
2774 if ((posMatrix[i][j] == BLAST_SCORE_MIN) || (Xchar == j))
2775 retval[i][j] = posMatrix[i][j];
2776 else {
2777 temp1 = ((Nlm_FloatHi) (posMatrix[i][j]));
2778 temp1 = temp1 * (lambdaRatio);
2779 temp2 = Nlm_Nint(temp1);
2780 retval[i][j] = temp2;
2781 }
2782 }
2783 }
2784
2785 resProb = MemFree(resProb);
2786 scoreArray = MemFree(scoreArray);
2787 return_sfp = MemFree(return_sfp);
2788
2789 return retval;
2790 }
2791
2792 static SeqAlignPtr
BlastTwoSequencesCore(BlastSearchBlkPtr search,SeqLocPtr slp,Uint1Ptr subject_seq,Int4 subject_length,Boolean reverse)2793 BlastTwoSequencesCore (BlastSearchBlkPtr search, SeqLocPtr slp, Uint1Ptr subject_seq, Int4 subject_length, Boolean reverse)
2794
2795 {
2796 BLASTResultsStructPtr result_struct;
2797 BioseqPtr subject_bsp;
2798 Int2 status;
2799 Int4 index, hitlist_count, rev_subject_length=0;
2800 SeqAlignPtr seqalign=NULL;
2801 SeqPortPtr spp;
2802 Uint1 residue;
2803 Uint1Ptr sequence, sequence_start, rev_subject=NULL;
2804 SeqIdPtr sip;
2805 BLAST_ScorePtr *scaledMatrix = NULL, *copyMatrix = NULL;
2806
2807 if (search == NULL || search->query_invalid)
2808 return NULL;
2809
2810 sip = SeqLocId(slp);
2811 subject_bsp = BioseqLockById(sip);
2812
2813 /* Save subject sequence location for tabulated output */
2814 if (search->handle_results && SeqLocLen(slp) < subject_bsp->length)
2815 search->query_slp->next = slp;
2816
2817 status = BlastTwoSequencesCoreEx(search, subject_bsp, subject_seq,
2818 subject_length);
2819
2820 if (status == 0) {
2821 /*CC: if we're emulating rpsblast, do the impala style matrix
2822 * rescaling */
2823 if (search->positionBased && search->pbp->scalingFactor != 0.0) {
2824 scaledMatrix = RPS2SeqImpalaStatCorrections(search, subject_seq,
2825 subject_length);
2826 if ( !scaledMatrix ) {
2827 BioseqUnlock(subject_bsp);
2828 return NULL;
2829 }
2830 copyMatrix = search->sbp->posMatrix;
2831 search->sbp->posMatrix = scaledMatrix;
2832
2833 if (search->sbp->karlinK != 0.0)
2834 search->sbp->kbp_gap[0]->K =
2835 PRO_K_MULTIPLIER*search->sbp->karlinK;
2836 search->sbp->kbp_gap[0]->logK = log(search->sbp->kbp_gap[0]->K);
2837 search->sbp->kbp_gap[0]->Lambda /= search->pbp->scalingFactor;
2838 }
2839 if (search->pbp->mb_params && !search->pbp->mb_params->no_traceback
2840 && !search->pbp->mb_params->use_dyn_prog) {
2841 seqalign = MegaBlastGapInfoToSeqAlign(search, 0, 0);
2842 } else if (StringCmp(search->prog_name, "blastn") == 0 &&
2843 search->pbp->gapped_calculation == TRUE) {
2844 result_struct = search->result_struct;
2845 hitlist_count = result_struct->hitlist_count;
2846 if (hitlist_count > 0)
2847 {
2848 spp = SeqPortNewByLoc(slp, Seq_code_ncbi4na);
2849 if (subject_bsp->repr == Seq_repr_delta)
2850 SeqPortSet_do_virtual(spp, TRUE);
2851
2852 /* make one longer to "protect" ALIGN. */
2853 sequence_start = MemNew((2+SeqLocLen(slp))*sizeof(Uint1));
2854 sequence_start[0] = ncbi4na_to_blastna[0];
2855 sequence = sequence_start+1;
2856 index=0;
2857 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
2858 {
2859 if (IS_residue(residue))
2860 {
2861 sequence[index] = ncbi4na_to_blastna[residue];
2862 index++;
2863 }
2864 }
2865 /* Gap character in last space. */
2866 sequence[index] = ncbi4na_to_blastna[0];
2867
2868 if (!search->pbp->mb_params) {
2869 /* Traditional Blastn */
2870 seqalign = SumBlastGetGappedAlignmentTraceback(
2871 search, 0, reverse, FALSE, sequence,
2872 SeqLocLen(slp));
2873 } else if (!search->pbp->mb_params->no_traceback) {
2874 /* Mega BLAST with non-greedy extension */
2875 SumBlastGetGappedAlignmentEx(search, 0, FALSE, FALSE,
2876 sequence, SeqLocLen(slp), TRUE, &seqalign, NULL, 0);
2877 }
2878
2879 sequence_start = MemFree(sequence_start);
2880 spp = SeqPortFree(spp);
2881 }
2882 }
2883 else if (search->pbp->gapped_calculation == TRUE)
2884 {
2885 result_struct = search->result_struct;
2886 hitlist_count = result_struct->hitlist_count;
2887 if (hitlist_count > 0) {
2888
2889 if (!StringCmp(search->prog_name, "tblastn")
2890 || !StringCmp(search->prog_name, "psitblastn")) {
2891 Uint1Ptr subject = NULL;
2892 SeqPortPtr rev_spp;
2893 if (slp->choice == SEQLOC_WHOLE) {
2894 spp = SeqPortNew(subject_bsp, 0, -1, Seq_strand_plus,
2895 Seq_code_ncbi4na);
2896 rev_spp = SeqPortNew(subject_bsp, 0, -1, Seq_strand_minus,
2897 Seq_code_ncbi4na);
2898 } else {
2899 spp = SeqPortNew(subject_bsp, SeqLocStart(slp),
2900 SeqLocStop(slp), Seq_strand_plus,
2901 Seq_code_ncbi4na);
2902 rev_spp = SeqPortNew(subject_bsp, SeqLocStart(slp),
2903 SeqLocStop(slp), Seq_strand_minus,
2904 Seq_code_ncbi4na);
2905 }
2906 /* make one longer to "protect" ALIGN. */
2907 subject = (Uint1Ptr) MemNew((1+subject_length)*sizeof(Uint1));
2908 rev_subject = (Uint1Ptr) MemNew((1+subject_length)*sizeof(Uint1));
2909 for (index=0; index<subject_length; index++) {
2910 subject[index] = SeqPortGetResidue(spp);
2911 rev_subject[index] = SeqPortGetResidue(rev_spp);
2912 }
2913 /* Gap character in last space. */
2914 subject[subject_length] = NULLB;
2915 rev_subject[subject_length] = NULLB;
2916 rev_subject_length = subject_length;
2917 spp = SeqPortFree(spp);
2918 rev_spp = SeqPortFree(rev_spp);
2919
2920
2921 seqalign = BlastGetGapAlgnTbck(search, 0, reverse,
2922 FALSE, subject, subject_length,
2923 rev_subject, rev_subject_length);
2924
2925 if (search->pbp->longest_intron <= 0)
2926 MemFree(subject);
2927 MemFree(rev_subject);
2928 } else {
2929 seqalign = BlastGetGapAlgnTbck(search, 0, reverse,
2930 FALSE, subject_seq, subject_length,
2931 rev_subject, rev_subject_length);
2932 result_struct->results[0]->seqalign = seqalign;
2933 }
2934 }
2935 }
2936 else /* Ungapped case, any program */
2937 {
2938 if (search->prog_number == blast_type_blastn ||
2939 search->prog_number == blast_type_blastp)
2940 seqalign = GetSeqAlignForResultHitList(search, TRUE, FALSE,
2941 search->pbp->discontinuous, reverse, FALSE);
2942 else
2943 seqalign = GetSeqAlignForResultHitList(search, FALSE, FALSE,
2944 search->pbp->discontinuous, reverse, FALSE);
2945 }
2946 /*CC: Revert changes done for psi-blast2sequences */
2947 if (search->positionBased && search->pbp->scalingFactor != 0.0) {
2948 if (scaledMatrix) {
2949 for (index = 0; index < search->sbp->query_length + 1; index++)
2950 MemFree(scaledMatrix[index]);
2951 MemFree(scaledMatrix);
2952 search->sbp->posMatrix = copyMatrix;
2953 }
2954 if (search->sbp->karlinK != 0.0)
2955 search->sbp->kbp_gap[0]->K = search->sbp->karlinK;
2956 search->sbp->kbp_gap[0]->logK = log(search->sbp->kbp_gap[0]->K);
2957 }
2958 }
2959 BioseqUnlock(subject_bsp);
2960
2961 return seqalign;
2962 }
2963
2964 BlastSearchBlkPtr LIBCALL
BlastQuerySequenceSetUp(BioseqPtr bsp,CharPtr progname,BLAST_OptionsBlkPtr options)2965 BlastQuerySequenceSetUp(BioseqPtr bsp, CharPtr progname,
2966 BLAST_OptionsBlkPtr options)
2967 {
2968 BlastSearchBlkPtr search;
2969 SeqLocPtr slp=NULL;
2970
2971 if (bsp == NULL)
2972 return NULL;
2973
2974 ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
2975 if (progname == NULL && options == NULL)
2976 return NULL;
2977
2978 if (progname == NULL)
2979 progname = options->program_name;
2980
2981 if (!StringCmp(progname, "blastp") ||
2982 !StringCmp(progname, "blastx")) {
2983 if (options->gapped_calculation == TRUE) {
2984 options->two_pass_method = FALSE;
2985 options->multiple_hits_only = TRUE;
2986 }
2987 }
2988
2989 search = BLASTSetUpSearchByLoc(slp, progname, bsp->length, 0, NULL, options, NULL);
2990
2991 search->allocated += BLAST_SEARCH_ALLOC_QUERY_SLP;
2992
2993 if (search == NULL)
2994 return NULL;
2995
2996 return search;
2997 }
2998
2999 /*
3000 Runs blast between two sequences
3001 */
3002 SeqAlignPtr LIBCALL
BlastTwoSequencesByLocEx(SeqLocPtr slp1,SeqLocPtr slp2,CharPtr progname,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns)3003 BlastTwoSequencesByLocEx(SeqLocPtr slp1, SeqLocPtr slp2, CharPtr progname, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns)
3004 {
3005 return BlastTwoSequencesByLocWithCallback(slp1, slp2, progname, options,
3006 other_returns, error_returns, NULL, NULL);
3007 }
3008
3009 /************************************************************************/
3010 /* PSIBLAST2Sequences API */
3011 /************************************************************************/
3012
B2SAllocateScoreMatrix(Int4 rows,Int4 cols)3013 static BLAST_ScorePtr *B2SAllocateScoreMatrix(Int4 rows, Int4 cols)
3014 {
3015 BLAST_ScorePtr *matrix = NULL;
3016 Int4 i;
3017
3018 if (!(matrix = (BLAST_ScorePtr *) MemNew(rows*sizeof(BLAST_ScorePtr)))) {
3019 return NULL;
3020 }
3021
3022 for (i = 0; i < rows; i++) {
3023 matrix[i] = (BLAST_ScorePtr) MemNew(cols*sizeof(BLAST_Score));
3024 if (matrix[i] == NULL) {
3025 while (--i >= 0)
3026 MemFree(matrix[i]);
3027 MemFree(matrix);
3028 return NULL;
3029 }
3030 }
3031 return matrix;
3032 }
3033
3034 /* Convert a set of residue frequencies into a scaled PSSM (using
3035 * scalingFactor). */
B2SCalculateScaledPSSM(BlastSearchBlkPtr search,Nlm_FloatHiPtr * posFreqs,compactSearchItems * compactSearch,Nlm_FloatHiPtr karlinK)3036 static BLAST_ScorePtr *B2SCalculateScaledPSSM(BlastSearchBlkPtr search,
3037 Nlm_FloatHiPtr *posFreqs, compactSearchItems *compactSearch,
3038 Nlm_FloatHiPtr karlinK)
3039 {
3040 BLAST_ScorePtr *retval = NULL;
3041 posSearchItems *posSearch = NULL;
3042 Int4 qlen, alphabet_sz, rv;
3043 Nlm_FloatHi scalingFactor = search->pbp->scalingFactor;
3044 BLAST_ScoreBlkPtr sbp = NULL;
3045 ValNodePtr error_return;
3046 Int4 i, gap_open, gap_extend;
3047
3048 if (!search || !compactSearch || !posFreqs)
3049 return NULL;
3050
3051 if (!(posSearch = (posSearchItems *)MemNew(sizeof(posSearchItems)))) {
3052 ErrPostEx(SEV_ERROR, 0, 0, "B2SCalculateScaledPSSM: Out of memory");
3053 return NULL;
3054 }
3055
3056 qlen = compactSearch->qlength;
3057 alphabet_sz = compactSearch->alphabetSize;
3058 gap_open = search->pbp->gap_open / scalingFactor;
3059 gap_extend = search->pbp->gap_extend / scalingFactor;
3060
3061 if (!(sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa, 1))) {
3062 ErrPostEx(SEV_ERROR, 0, 0, "B2SCalculateScaledPSSM: Out of memory");
3063 MemFree(posSearch);
3064 return NULL;
3065 }
3066 sbp->read_in_matrix = TRUE;
3067 sbp->protein_alphabet = TRUE;
3068 sbp->posMatrix = NULL;
3069 sbp->number_of_contexts = 1;
3070 BlastScoreBlkMatFill(sbp, search->sbp->name);
3071 compactSearch->matrix = sbp->matrix;
3072 compactSearch->gapped_calculation = TRUE;
3073 compactSearch->pseudoCountConst = search->pbp->pseudoCountConst;
3074 compactSearch->ethresh = 0.001;
3075 BlastScoreBlkFill(sbp, (CharPtr) compactSearch->query, qlen, 0);
3076
3077 sbp->kbp_gap_std[0] = BlastKarlinBlkCreate();
3078 rv = BlastKarlinBlkGappedCalc(sbp->kbp_gap_std[0], gap_open, gap_extend,
3079 sbp->name, &error_return);
3080 if (rv == 1) {
3081 BlastErrorPrint(error_return);
3082 BLAST_ScoreBlkDestruct(sbp);
3083 MemFree(posSearch);
3084 return NULL;
3085 }
3086 sbp->kbp_gap_psi[0] = BlastKarlinBlkCreate();
3087 rv = BlastKarlinBlkGappedCalc(sbp->kbp_gap_psi[0], gap_open, gap_extend,
3088 sbp->name, &error_return);
3089 if (rv == 1) {
3090 BlastErrorPrint(error_return);
3091 BLAST_ScoreBlkDestruct(sbp);
3092 MemFree(posSearch);
3093 return NULL;
3094 }
3095
3096 if (sbp->kbp_ideal == NULL)
3097 sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(sbp);
3098 compactSearch->lambda = sbp->kbp_gap_std[0]->Lambda;
3099 compactSearch->kbp_std = sbp->kbp_std;
3100 compactSearch->kbp_psi = sbp->kbp_psi;
3101 compactSearch->kbp_gap_psi = sbp->kbp_gap_psi;
3102 compactSearch->kbp_gap_std = sbp->kbp_gap_std;
3103 compactSearch->lambda_ideal = sbp->kbp_ideal->Lambda;
3104 compactSearch->K_ideal = sbp->kbp_ideal->K;
3105
3106 /* Initialize the posSearch structure */
3107 posSearch->posFreqs = posFreqs;
3108 posSearch->posMatrix = B2SAllocateScoreMatrix(qlen+1, alphabet_sz);
3109 posSearch->posPrivateMatrix = B2SAllocateScoreMatrix(qlen+1, alphabet_sz);
3110 if (!posSearch->posMatrix || !posSearch->posPrivateMatrix) {
3111 ErrPostEx(SEV_ERROR, 0, 0, "B2SCalculateScaledPSSM: Out of memory");
3112 BLAST_ScoreBlkDestruct(sbp);
3113 MemFree(posSearch->posMatrix); MemFree(posSearch->posPrivateMatrix);
3114 MemFree(posSearch);
3115 return NULL;
3116 }
3117
3118 posFreqsToMatrix(posSearch, compactSearch);
3119 impalaScaling(posSearch, compactSearch, scalingFactor, TRUE);
3120 if (karlinK)
3121 *karlinK = compactSearch->kbp_gap_psi[0]->K;
3122
3123 for (i = 0; i <= qlen; i++)
3124 MemFree(posSearch->posMatrix[i]);
3125 MemFree(posSearch->posMatrix);
3126 BLAST_ScoreBlkDestruct(sbp);
3127 retval = posSearch->posPrivateMatrix;
3128 MemFree(posSearch);
3129
3130 return retval;
3131 }
3132
3133 /* Calculates the PSSM for a given SeqLocPtr */
B2SCalculatePSSM(SeqLocPtr slp,BlastSearchBlkPtr search,BLAST_MatrixPtr matrix,Nlm_FloatHiPtr karlinK)3134 static BLAST_ScorePtr *B2SCalculatePSSM(SeqLocPtr slp, BlastSearchBlkPtr search,
3135 BLAST_MatrixPtr matrix, Nlm_FloatHiPtr karlinK)
3136 {
3137 BLAST_ScorePtr *posMatrix = NULL;
3138 compactSearchItems *compactSearch = NULL;
3139 Boolean replaced_sequence = FALSE;
3140 Int4 query_length, full_query_length;
3141 SeqLocPtr filter_slp = NULL, full_slp = NULL;
3142 Uint1Ptr sequence = NULL;
3143 BlastSequenceBlk bseq;
3144 Nlm_FloatHi scalingFactor = search->pbp->scalingFactor;
3145
3146 query_length = SeqLocLen(slp);
3147
3148 /* if the slp is not the whole sequence, retrieve the whole sequence and
3149 * use it to compute the pssm */
3150 if (matrix->rows != (query_length+1)) {
3151 SeqPortPtr spp = NULL;
3152 SeqIdPtr sip = NULL;
3153 Uint1 residue;
3154 BioseqPtr bsp = NULL;
3155 Char tmp[256];
3156 Int4 index = 0;
3157
3158 sip = SeqLocId(slp);
3159 if ((bsp = BioseqLockById(SeqIdFindBest(sip, SEQID_GI))) == NULL) {
3160 SeqIdWrite(SeqLocId(slp),tmp,PRINTID_FASTA_LONG,
3161 sizeof(tmp));
3162
3163 ErrPostEx(SEV_ERROR,0,0,"Could not retrieve full bioseq "
3164 "for %s",tmp);
3165 BioseqUnlock(bsp);
3166 return NULL;
3167 }
3168
3169 /* get full sequence to be used in WposComputation */
3170 spp = SeqPortNew(bsp, FIRST_RESIDUE, LAST_RESIDUE, Seq_strand_unknown,
3171 Seq_code_ncbistdaa);
3172
3173 full_query_length = bsp->length;
3174 sequence = (Uint1Ptr) MemNew(2*((bsp->length)+2)*sizeof(Char));
3175 BioseqUnlock(bsp);
3176
3177 sequence[index++] = NULLB;
3178 while ((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF) {
3179 if (IS_residue(residue)) {
3180 if (residue == 24) { /* change selenocysteine to X */
3181 residue = 21;
3182 ErrPostEx(SEV_WARNING,0,0, "Selenocysteine (U) at "
3183 "position %ld replaced by X", (long) index+1);
3184 }
3185 sequence[index++] = residue;
3186 }
3187 }
3188 sequence[index] = NULLB;
3189 spp = SeqPortFree(spp);
3190
3191 /* Filter the sequence if necessary */
3192 ValNodeAddPointer(&full_slp, SEQLOC_WHOLE, SeqIdDup(SeqLocId(slp)));
3193 filter_slp = BlastSeqLocFilter(full_slp, search->pbp->filter_string);
3194 if(search->pbp->query_lcase_mask != NULL)
3195 filter_slp = blastMergeFilterLocs(filter_slp,
3196 search->pbp->query_lcase_mask, FALSE, 0, 0);
3197
3198 BlastMaskTheResidues(sequence+1, full_query_length, 21, filter_slp,
3199 FALSE, SeqLocStart(full_slp));
3200
3201 /* Save the current query sequence */
3202 MemCpy(&bseq, search->context[0].query, sizeof(BlastSequenceBlk));
3203
3204 BlastSequenceAddSequence(search->context[0].query, NULL, sequence,
3205 full_query_length, full_query_length, 0);
3206
3207 SeqLocSetFree(full_slp);
3208 SeqLocSetFree(filter_slp);
3209 replaced_sequence = TRUE;
3210 }
3211
3212 compactSearch = compactSearchNew(compactSearch);
3213 copySearchItems(compactSearch, search, search->sbp->name);
3214 compactSearch->pseudoCountConst = search->pbp->pseudoCountConst;
3215 if (scalingFactor != 0.0 && scalingFactor != 1.0) {
3216 /* build pssm {make,copy}mat/rpsblast style */
3217 posMatrix = B2SCalculateScaledPSSM(search, search->sbp->posFreqs,
3218 compactSearch, karlinK);
3219 } else {
3220 /* build pssm psiblast style */
3221 posMatrix = WposComputation(compactSearch, NULL, search->sbp->posFreqs);
3222 }
3223 compactSearchDestruct(compactSearch);
3224
3225 if (replaced_sequence) {
3226 MemCpy(search->context[0].query, &bseq, sizeof(BlastSequenceBlk));
3227 MemFree(sequence);
3228 }
3229
3230 return posMatrix;
3231 }
3232
3233 /* Checks if the dimensions of the pssm attached to the search->sbp are
3234 * consistent with the length of the master query (slp), and trims the matrix
3235 * if necessary */
B2SVerifyPSSM(SeqLocPtr slp,BlastSearchBlkPtr search,BLAST_MatrixPtr matrix)3236 static Boolean B2SVerifyPSSM(SeqLocPtr slp, BlastSearchBlkPtr search,
3237 BLAST_MatrixPtr matrix)
3238 {
3239 Int4 i, query_length = SeqLocLen(slp);
3240
3241 if ((query_length+1) > matrix->rows) {
3242 ErrPostEx(SEV_WARNING,0,0,"Ignoring PSSM because it seems not to "
3243 "correspond to query sequence (query length = %ld, PSSM's "
3244 "number of rows = %ld)", query_length+1, matrix->rows);
3245 search->positionBased = FALSE;
3246
3247 if (matrix->matrix == NULL) {
3248 BLAST_ScorePtr *posMatrix = search->sbp->posMatrix;
3249
3250 for (i = 0; i < matrix->rows; i++)
3251 posMatrix[i] = MemFree(posMatrix[i]);
3252 posMatrix = MemFree(posMatrix);
3253 }
3254 search->sbp->posMatrix = NULL;
3255 search->sbp->posFreqs = NULL;
3256 return FALSE;
3257 } else if ((query_length+1) < matrix->rows) {
3258 /* Assume BLAST_Matrix corresponds to the entire sequence, so trim
3259 * it */
3260 Int4 from, to, i, j, alphabet_sz;
3261 BLAST_ScorePtr *pssm = NULL;
3262
3263 if (slp->choice != SEQLOC_INT) {
3264 ErrPostEx(SEV_ERROR,0,0,"B2SVerifyPSSM: SeqLocPtr is not a "
3265 "SEQLOC_INT, cannot trim matrix");
3266 return FALSE;
3267 }
3268
3269 from = SeqLocStart(slp);
3270 to = SeqLocStop(slp);
3271 alphabet_sz = matrix->columns;
3272
3273 /* Adjust the pssm */
3274 pssm = (BLAST_ScorePtr *)MemNew(sizeof(BLAST_ScorePtr) *
3275 (query_length+1));
3276 for (i = 0; i <= query_length; i++) {
3277 pssm[i] = (BLAST_ScorePtr)MemNew(sizeof(BLAST_Score) *
3278 alphabet_sz);
3279 }
3280
3281 for (i = from; i <= to; i++) {
3282 for (j = 0; j < alphabet_sz; j++)
3283 pssm[(i-from)][j] = search->sbp->posMatrix[i][j];
3284 }
3285 for (j = 0; j < alphabet_sz; j++)
3286 pssm[query_length][j] = BLAST_SCORE_MIN;
3287
3288 if (matrix->matrix == NULL) {
3289 /* Free the matrix we calculated originally */
3290 BLAST_ScorePtr *posMatrix = search->sbp->posMatrix;
3291
3292 for (i = 0; i < matrix->rows; i++)
3293 posMatrix[i] = MemFree(posMatrix[i]);
3294 posMatrix = MemFree(posMatrix);
3295 }
3296 search->sbp->posMatrix = pssm;
3297
3298 }
3299 return TRUE;
3300 }
3301
3302 /* psi-blast2sequences setup: matrix must contain at least the residue
3303 * frequencies to calculate the PSSM. Otherwise, if the PSSM is given, that
3304 * will be used. */
B2SPssmSetupSearch(BlastSearchBlkPtr search,SeqLocPtr pssm_slp,BLAST_MatrixPtr matrix)3305 Boolean LIBCALL B2SPssmSetupSearch(BlastSearchBlkPtr search,
3306 SeqLocPtr pssm_slp, BLAST_MatrixPtr matrix)
3307 {
3308 Nlm_FloatHi karlinK = 0.0;
3309 Int4 npos, alphabet_size;
3310
3311 if (!search || !matrix)
3312 return FALSE;
3313
3314 if (search->prog_number != blast_type_blastp) {
3315 ErrPostEx(SEV_ERROR, 0, 0, "B2SPssmSetupSearch: only blastp is "
3316 "supported");
3317 return FALSE;
3318 }
3319
3320 search->positionBased = TRUE;
3321 npos = SeqLocLen(pssm_slp);
3322 alphabet_size = search->sbp->alphabet_size;
3323
3324 if (npos <= 0) {
3325 ErrPostEx(SEV_ERROR, 0, 0, "B2SPssmSetupSearch: length of pssm_slp "
3326 "must be positive");
3327 return FALSE;
3328 }
3329
3330 /* save the residue frequencies, we might need them later */
3331 if (matrix->posFreqs) {
3332 search->sbp->posFreqs = allocatePosFreqs(npos, alphabet_size);
3333 copyPosFreqs(matrix->posFreqs, search->sbp->posFreqs, npos,
3334 alphabet_size);
3335 }
3336
3337 if (matrix->posFreqs && !matrix->matrix) {
3338 search->sbp->posMatrix = B2SCalculatePSSM(pssm_slp, search, matrix,
3339 &karlinK);
3340 /* if we calculated the pssm, and use did not provide one, save it*/
3341 if (matrix->karlinK == 0.0 && karlinK != 0.0)
3342 matrix->karlinK = karlinK;
3343 } else {
3344 search->sbp->posMatrix = matrix->matrix;
3345 }
3346
3347 search->sbp->mat_dim1 = search->sbp->query_length + 1;
3348 search->sbp->mat_dim2 = search->sbp->alphabet_size;
3349
3350 /* Sanity check */
3351 if (!search->sbp->posMatrix) {
3352 ErrPostEx(SEV_ERROR, 0, 0, "B2SPssmSetupSearch: "
3353 "Could not create or obtain PSSM! Please verify "
3354 "BLAST_Matrix parameter");
3355 search->positionBased = FALSE;
3356 return FALSE;
3357 }
3358
3359 /* Make sure the BLAST_Matrix number of rows is consistent with
3360 * pssm_slp */
3361 B2SVerifyPSSM(pssm_slp, search, matrix);
3362
3363 if (matrix->karlinK != 0.0) {
3364 search->sbp->karlinK = matrix->karlinK;
3365 search->sbp->kbp_gap_psi[0]->K = matrix->karlinK;
3366 search->sbp->kbp_gap_psi[0]->logK = log(matrix->karlinK);
3367 }
3368
3369 return TRUE;
3370 }
3371
3372 /* clean up psi-blast2sequences */
B2SPssmCleanUpSearch(BlastSearchBlkPtr search,BLAST_MatrixPtr matrix)3373 Boolean LIBCALL B2SPssmCleanUpSearch(BlastSearchBlkPtr search,
3374 BLAST_MatrixPtr matrix)
3375 {
3376 Int4 i, rows = search->sbp->query_length + 1;
3377 BLAST_ScorePtr *posMatrix = search->sbp->posMatrix;
3378 Nlm_FloatHiPtr *posFreqs = search->sbp->posFreqs;
3379
3380 if (!matrix)
3381 return FALSE;
3382
3383 if ((matrix->matrix == NULL) || /* B2SPssmSetupSearch created PSSM */
3384 (posMatrix != matrix->matrix)) { /* B2SVerifyPSSM trimmed PSSM */
3385 for (i = 0; i < rows; i++)
3386 posMatrix[i] = MemFree(posMatrix[i]);
3387 posMatrix = MemFree(posMatrix);
3388 }
3389 if (matrix->posFreqs) {
3390 for (i = 0; i < rows; i++)
3391 posFreqs[i] = MemFree(posFreqs[i]);
3392 posFreqs = MemFree(posFreqs);
3393 }
3394 search->sbp->posMatrix = NULL;
3395 search->sbp->posFreqs = NULL;
3396 search->positionBased = FALSE;
3397 return TRUE;
3398 }
3399
B2SPssmOnTheFlyByLoc(BlastSearchBlkPtr search,SeqLocPtr subj_slp)3400 SeqAlignPtr LIBCALL B2SPssmOnTheFlyByLoc(BlastSearchBlkPtr search,
3401 SeqLocPtr subj_slp)
3402 {
3403 Int4 index, subject_length;
3404 SeqAlignPtr seqalign = NULL;
3405 Uint1Ptr subject_seq = NULL, subject_seq_start = NULL;
3406 SeqPortPtr spp;
3407 Uint1 residue;
3408
3409 if (!search || search->query_invalid || !subj_slp)
3410 return NULL;
3411
3412 if (search->result_struct)
3413 search->result_struct = BLASTResultsStructDelete(search->result_struct);
3414 search->result_struct = BLASTResultsStructNew(search->result_size,
3415 search->pbp->max_pieces, search->pbp->hsp_range_max);
3416 BlastHitListPurge(search->current_hitlist);
3417
3418 subject_length = SeqLocLen(subj_slp);
3419
3420 if (search->prog_number == blast_type_blastp) {
3421 subject_seq_start = (Uint1Ptr) MemNew(
3422 ((subject_length)+2)*sizeof(Uint1));
3423 /* The first residue is the sentinel. */
3424 subject_seq_start[0] = NULLB;
3425 subject_seq = subject_seq_start+1;
3426 index = 0;
3427 spp = SeqPortNewByLoc(subj_slp, Seq_code_ncbistdaa);
3428 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
3429 if (IS_residue(residue))
3430 subject_seq[index++] = residue;
3431 }
3432 subject_seq[index] = NULLB;
3433 spp = SeqPortFree(spp);
3434 } else {
3435 return NULL;
3436 }
3437
3438 seqalign = BlastTwoSequencesCore(search, subj_slp, subject_seq,
3439 subject_length, FALSE);
3440
3441 MemFree(subject_seq_start);
3442 AdjustOffSetsInSeqAlign(seqalign, search->query_slp, subj_slp);
3443
3444 return seqalign;
3445 }
3446
B2SPssmOnTheFly(BlastSearchBlkPtr search,BioseqPtr subj_bsp)3447 SeqAlignPtr LIBCALL B2SPssmOnTheFly(BlastSearchBlkPtr search,
3448 BioseqPtr subj_bsp)
3449 {
3450 SeqAlignPtr salp = NULL;
3451 SeqLocPtr slp = NULL;
3452
3453 if (!search || search->query_invalid || !subj_bsp)
3454 return NULL;
3455
3456 ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(subj_bsp->id,
3457 SEQID_GI)));
3458 salp = B2SPssmOnTheFlyByLoc(search, slp);
3459 SeqLocFree(slp);
3460 return salp;
3461 }
3462
B2SPssmMultipleQueries(SeqLocPtr pssm_slp,BLAST_MatrixPtr matrix,SeqLocPtr * target_seqs,Int4 ntargets,BLAST_OptionsBlkPtr options)3463 SeqAlignPtr * LIBCALL B2SPssmMultipleQueries(SeqLocPtr pssm_slp,
3464 BLAST_MatrixPtr matrix, SeqLocPtr *target_seqs, Int4 ntargets,
3465 BLAST_OptionsBlkPtr options)
3466 {
3467 SeqAlignPtr *sa_array = NULL;
3468 BlastSearchBlkPtr search = NULL;
3469 Int4 i;
3470
3471 if (!matrix || !pssm_slp || !target_seqs || ntargets <= 0 || !options)
3472 return NULL;
3473
3474 /* Set up search structure */
3475 search = BLASTSetUpSearchByLoc(pssm_slp, options->program_name,
3476 SeqLocLen(pssm_slp), 0, NULL, options, NULL);
3477 B2SPssmSetupSearch(search, pssm_slp, matrix);
3478
3479 /* Allocate memory for return value */
3480 if (!(sa_array = (SeqAlignPtr*)MemNew(sizeof(SeqAlignPtr)*ntargets))) {
3481 ErrPostEx(SEV_ERROR, 0, 0, "B2SPssmMultipleQueries: Out of memory");
3482 BlastSearchBlkDestruct(search);
3483 return NULL;
3484 }
3485
3486
3487 /* Iterate over seqlocs in target_seqs, using effective search space in
3488 * rpsblast style */
3489 for (i = 0; i < ntargets; i++) {
3490 Int8 dblen = (options->db_length != 0) ?
3491 options->db_length : SeqLocLen(pssm_slp);
3492 Int4 nseqs = (options->dbseq_num != 0) ? options->dbseq_num : 1;
3493
3494 /* If search space has been specified in the options structure, the it
3495 * must have been set in BLASTSetUpSearchEx, so don't overwrite it */
3496 if ( ! (options->searchsp_eff > 0) ) {
3497 search->searchsp_eff = BLASTCalculateSearchSpace(options, nseqs,
3498 dblen, SeqLocLen(target_seqs[i]));
3499 }
3500 sa_array[i] = B2SPssmOnTheFlyByLoc(search, target_seqs[i]);
3501 }
3502
3503 /* Clean up */
3504 B2SPssmCleanUpSearch(search, matrix);
3505 BlastSearchBlkDestruct(search);
3506
3507 return sa_array;
3508 }
3509
3510 /************************************************************************/
3511 /* END PSIBLAST2Sequences API */
3512 /************************************************************************/
3513
3514 /* Note that the matrix parameter should correspond to the full master
3515 * sequence */
3516 SeqAlignPtr LIBCALL
BlastTwoSequencesByLocWithCallback(SeqLocPtr slp1,SeqLocPtr slp2,CharPtr progname,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * handle_results)PROTO ((VoidPtr srch)),BLAST_MatrixPtr matrix)3517 BlastTwoSequencesByLocWithCallback(SeqLocPtr slp1, SeqLocPtr slp2, CharPtr
3518 progname, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns,
3519 ValNodePtr *error_returns, int (LIBCALLBACK
3520 *handle_results)PROTO((VoidPtr srch)), BLAST_MatrixPtr matrix)
3521 {
3522 BlastAllWordPtr all_words;
3523 BlastSearchBlkPtr search;
3524 BioseqPtr subject_bsp;
3525 Boolean complement=FALSE, reverse, reverse_forbidden, options_alloc;
3526 Int2 status;
3527 Int4 index, subject_length, num_of_cols;
3528 SeqAlignPtr seqalign=NULL;
3529 SeqLocPtr query_slp, subject_slp;
3530 SeqPortPtr spp;
3531 SPCompressPtr spc=NULL;
3532 Uint1 residue;
3533 Uint1Ptr subject_seq, subject_seq_start;
3534 Uint1Ptr *array;
3535 Boolean db_length_changed = FALSE;
3536
3537 if (slp1 == NULL || slp2 == NULL)
3538 return NULL;
3539
3540 if (error_returns)
3541 {
3542 *error_returns = NULL;
3543 }
3544
3545 if (other_returns)
3546 {
3547 *other_returns = NULL;
3548 }
3549
3550 if (progname == NULL && options == NULL)
3551 return NULL;
3552
3553 /* If filtering is performed, do not reverse the sequence.
3554 In this case the wrong sequence would be filtered. */
3555 reverse_forbidden = FALSE;
3556 if ((options && ((options->filter_string &&
3557 StringCmp(options->filter_string, "F")) ||
3558 options->is_megablast_search)) ||
3559 matrix != NULL)
3560 {
3561 reverse_forbidden = TRUE;
3562 }
3563
3564 /* Select the shorter sequence as the query, provided they are
3565 of the same type. */
3566 if ((StringCmp(progname, "blastn") && StringCmp(progname, "blastp")) ||
3567 (reverse_forbidden || SeqLocLen(slp1) < SeqLocLen(slp2)))
3568 {
3569 query_slp = slp1;
3570 subject_slp = slp2;
3571 reverse = FALSE;
3572 }
3573 else
3574 {
3575 query_slp = slp2;
3576 subject_slp = slp1;
3577 reverse = TRUE;
3578 }
3579
3580 /* Make sure strands are handled correctly */
3581 if (!StringCmp(progname, "blastn") &&
3582 SeqLocStrand(query_slp) != Seq_strand_both &&
3583 SeqLocStrand(subject_slp) == Seq_strand_both) {
3584 Change_Loc_Strand(subject_slp, SeqLocStrand(query_slp));
3585 Change_Loc_Strand(query_slp, Seq_strand_both);
3586 }
3587
3588 if (progname == NULL)
3589 {
3590 progname = options->program_name;
3591 }
3592
3593 /* If the subject strand is minus, turn it into plus for blastn. */
3594 /* Complement the other strand to keep things straight. */
3595 if (StringCmp(progname, "blastn") == 0 && SeqLocStrand(subject_slp) == Seq_strand_minus)
3596 {
3597 complement = TRUE;
3598 if(SeqLocStrand(query_slp) == Seq_strand_plus ||
3599 SeqLocStrand(query_slp) == Seq_strand_minus)
3600 SeqLocRevCmp(query_slp);
3601 SeqLocRevCmp(subject_slp);
3602 }
3603
3604 subject_seq_start = subject_seq = NULL;
3605
3606 /* Allocate default options if none are allocated yet. */
3607 options_alloc = FALSE;
3608 if (options == NULL)
3609 {
3610 options = BLASTOptionNew(progname, FALSE);
3611 options_alloc = TRUE;
3612 }
3613
3614 status = BLASTOptionValidateEx(options, progname, error_returns);
3615 if (status != 0)
3616 { /* error messages in other_returns? */
3617 return NULL;
3618 }
3619
3620 all_words = NULL;
3621
3622 subject_length = SeqLocLen(subject_slp);
3623
3624 if (!StringCmp(progname, "blastp") ||
3625 !StringCmp(progname, "blastx"))
3626 {
3627 subject_seq_start = (Uint1Ptr) MemNew(((subject_length)+2)*sizeof(Uint1));
3628 /* The first residue is the sentinel. */
3629 subject_seq_start[0] = NULLB;
3630 subject_seq = subject_seq_start+1;
3631 index = 0;
3632 spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbistdaa);
3633 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
3634 {
3635 if (IS_residue(residue))
3636 {
3637 subject_seq[index] = residue;
3638 index++;
3639 }
3640 }
3641 subject_seq[index] = NULLB;
3642
3643 num_of_cols = subject_length+1-options->wordsize;
3644 all_words = BlastAllWordNew(num_of_cols, options->wordsize, FALSE, TRUE);
3645 array = (Uint1Ptr *) MemNew(num_of_cols*sizeof(Uint1Ptr));
3646 for (index=0; index<num_of_cols; index++)
3647 {
3648 array[index] = subject_seq+index;
3649 }
3650 all_words->array = array;
3651 spp = SeqPortFree(spp);
3652 if (options->gapped_calculation == TRUE)
3653 {
3654 options->two_pass_method = FALSE;
3655 options->multiple_hits_only = TRUE;
3656 }
3657 }
3658 else if (!StringCmp(progname, "blastn") ||
3659 !StringCmp(progname, "tblastn") ||
3660 !StringCmp(progname, "psitblastn") ||
3661 !StringCmp(progname, "tblastx"))
3662 {
3663 spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbi4na);
3664 subject_bsp = BioseqFindCore(SeqLocId(subject_slp));
3665 if (subject_bsp != NULL && subject_bsp->repr == Seq_repr_delta)
3666 SeqPortSet_do_virtual(spp, TRUE);
3667 spc = SPCompressDNA(spp);
3668 if (spc == NULL)
3669 return NULL;
3670 subject_seq_start = subject_seq = spc->buffer;
3671 spp = SeqPortFree(spp);
3672 }
3673 else /* Impossible! */
3674 {
3675 return NULL;
3676 }
3677
3678 if (options->is_megablast_search)
3679 /* This has a different meaning in Mega BLAST and must be 0 */
3680 options->block_width = 0;
3681
3682 if (options->db_length == 0)
3683 {
3684 options->db_length = subject_length;
3685 db_length_changed = TRUE;
3686 }
3687
3688 options->dbseq_num = 1;
3689
3690 search = BLASTSetUpSearchByLoc(query_slp, progname, SeqLocLen(query_slp), subject_length, all_words, options, NULL);
3691
3692 /* Change length back, change only happens if zero. */
3693 if(db_length_changed)
3694 options->db_length = 0;
3695
3696
3697 if (search == NULL)
3698 return NULL;
3699
3700 if (search->query_invalid) {
3701 search = BlastSearchBlkDestruct(search);
3702 return NULL;
3703 }
3704
3705 if (!StringCmp(progname, "tblastn") ||
3706 !StringCmp(progname, "tblastx") ||
3707 !StringCmp(progname, "psitblastn")) {
3708 MemFree(search->translation_buffer);
3709 search->translation_buffer = MemNew((3+(subject_length/3))*sizeof(Uint1));
3710 search->translation_buffer_size = 1+(subject_length/3);
3711 }
3712
3713 B2SPssmSetupSearch(search, slp1, matrix);
3714
3715 search->handle_results = handle_results;
3716 search->output = options->output;
3717
3718 seqalign = BlastTwoSequencesCore(search, subject_slp, subject_seq, subject_length, reverse);
3719
3720 if (complement)
3721 {
3722 seqalign = SeqAlignListReverseStrand(seqalign);
3723 SeqLocRevCmp(query_slp);
3724 SeqLocRevCmp(subject_slp);
3725 }
3726
3727 if (spc)
3728 {
3729 SPCompressFree(spc);
3730 spc = NULL;
3731 }
3732 else
3733 {
3734 subject_seq_start = MemFree(subject_seq_start);
3735 }
3736
3737 if (search->error_return)
3738 {
3739 ValNodeLink(error_returns, search->error_return);
3740 search->error_return = NULL;
3741 }
3742
3743 if (other_returns)
3744 { /* format dbinfo etc. */
3745 *other_returns = BlastOtherReturnsPrepare(search);
3746 }
3747
3748 if (options_alloc)
3749 options = BLASTOptionDelete(options);
3750
3751 AdjustOffSetsInSeqAlign(seqalign, slp1, slp2);
3752
3753 B2SPssmCleanUpSearch(search, matrix);
3754
3755 search = BlastSearchBlkDestruct(search);
3756
3757 return seqalign;
3758 }
3759
3760 SeqAlignPtr LIBCALL
BlastTwoSequencesByLoc(SeqLocPtr slp1,SeqLocPtr slp2,CharPtr progname,BLAST_OptionsBlkPtr options)3761 BlastTwoSequencesByLoc(SeqLocPtr slp1, SeqLocPtr slp2, CharPtr progname, BLAST_OptionsBlkPtr options)
3762 {
3763 return BlastTwoSequencesByLocEx(slp1, slp2, progname, options, NULL, NULL);
3764 }
3765
3766 SeqAlignPtr LIBCALL
BlastTwoSequencesEx(BioseqPtr bsp1,BioseqPtr bsp2,CharPtr progname,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns)3767 BlastTwoSequencesEx(BioseqPtr bsp1, BioseqPtr bsp2, CharPtr progname, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns)
3768 {
3769 return BlastTwoSequencesWithCallback(bsp1, bsp2, progname, options,
3770 other_returns, error_returns, NULL);
3771 }
3772
3773 SeqAlignPtr LIBCALL
BlastTwoSequencesWithCallback(BioseqPtr bsp1,BioseqPtr bsp2,CharPtr progname,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * handle_results)PROTO ((VoidPtr search)))3774 BlastTwoSequencesWithCallback(BioseqPtr bsp1, BioseqPtr bsp2, CharPtr progname, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *handle_results)PROTO((VoidPtr search)))
3775 {
3776 SeqAlignPtr seqalign;
3777 SeqLocPtr slp1=NULL, slp2=NULL;
3778
3779 if (bsp1 == NULL || bsp2 == NULL)
3780 return NULL;
3781
3782 slp1 = NULL;
3783 slp2 = NULL;
3784 if (!handle_results) {
3785 ValNodeAddPointer(&slp1, SEQLOC_WHOLE,
3786 SeqIdDup(SeqIdFindBest(bsp1->id, SEQID_GI)));
3787 ValNodeAddPointer(&slp2, SEQLOC_WHOLE,
3788 SeqIdDup(SeqIdFindBest(bsp2->id, SEQID_GI)));
3789 } else {
3790 ValNodeAddPointer(&slp1, SEQLOC_WHOLE,
3791 SeqIdDup(SeqIdFindBestAccession(bsp1->id)));
3792 ValNodeAddPointer(&slp2, SEQLOC_WHOLE,
3793 SeqIdDup(SeqIdFindBestAccession(bsp2->id)));
3794 }
3795 seqalign = BlastTwoSequencesByLocWithCallback(slp1, slp2, progname,
3796 options, other_returns, error_returns, handle_results, NULL);
3797
3798 slp1 = SeqLocFree(slp1);
3799 slp2 = SeqLocFree(slp2);
3800
3801 return seqalign;
3802 }
3803
3804 SeqAlignPtr LIBCALL
BlastTwoSequences(BioseqPtr bsp1,BioseqPtr bsp2,CharPtr progname,BLAST_OptionsBlkPtr options)3805 BlastTwoSequences(BioseqPtr bsp1, BioseqPtr bsp2, CharPtr progname, BLAST_OptionsBlkPtr options)
3806 {
3807 return BlastTwoSequencesEx(bsp1, bsp2, progname, options, NULL, NULL);
3808 }
3809
3810 /*
3811 Runs blast on the fly between the query BioseqPtr (specified with a
3812 call to BLASTSetUpSearch) and the subject BioseqPtr.
3813 */
3814
3815
3816 BlastSearchBlkPtr LIBCALL
BlastSequencesOnTheFlyEx(BlastSearchBlkPtr search,BioseqPtr subject_bsp)3817 BlastSequencesOnTheFlyEx(BlastSearchBlkPtr search, BioseqPtr subject_bsp)
3818 {
3819 Int4 index, subject_length;
3820 SeqPortPtr spp;
3821 SPCompressPtr spc=NULL;
3822 Uint1Ptr subject_seq, subject_seq_start;
3823 Uint1 residue;
3824
3825 if (subject_bsp == NULL)
3826 return NULL;
3827
3828 if (search == NULL || search->query_invalid)
3829 return NULL;
3830
3831 if (!search->pbp->mb_params) {
3832 if (search->result_struct)
3833 search->result_struct =
3834 BLASTResultsStructDelete(search->result_struct);
3835 search->result_struct =
3836 BLASTResultsStructNew(search->result_size,
3837 search->pbp->max_pieces, search->pbp->hsp_range_max);
3838 } else {
3839 if (search->mb_result_struct && search->mb_result_struct[0])
3840 search->mb_result_struct[0] =
3841 BLASTResultsStructDelete(search->mb_result_struct[0]);
3842 if (!search->mb_result_struct)
3843 search->mb_result_struct = (BLASTResultsStructPtr PNTR)
3844 MemNew(sizeof(BLASTResultsStructPtr));
3845 }
3846
3847 BlastHitListPurge(search->current_hitlist);
3848
3849 subject_seq_start = subject_seq = NULL;
3850
3851 subject_length = subject_bsp->length;
3852
3853 if (StringCmp(search->prog_name, "blastp") == 0)
3854 {
3855 subject_seq_start = (Uint1Ptr) MemNew(((subject_length)+2)*sizeof(Uint1));
3856 /* The first residue is the sentinel. */
3857 subject_seq_start[0] = NULLB;
3858 subject_seq = subject_seq_start+1;
3859 index = 0;
3860 spp = SeqPortNew(subject_bsp, FIRST_RESIDUE, LAST_RESIDUE,
3861 0, Seq_code_ncbistdaa);
3862 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
3863 {
3864 if (IS_residue(residue))
3865 {
3866 subject_seq[index] = residue;
3867 index++;
3868 }
3869 }
3870 subject_seq[index] = NULLB;
3871 spp = SeqPortFree(spp);
3872 }
3873 else if (StringCmp(search->prog_name, "blastn") == 0)
3874 {
3875 spp = SeqPortNew(subject_bsp, FIRST_RESIDUE, LAST_RESIDUE,
3876 0, Seq_code_ncbi4na);
3877 spc = SPCompressDNA(spp);
3878 subject_seq = spc->buffer;
3879 spp = SeqPortFree(spp);
3880 }
3881 else
3882 {
3883 return NULL;
3884 }
3885
3886 BlastTwoSequencesCoreEx(search, subject_bsp, subject_seq,
3887 subject_length);
3888
3889 if (spc)
3890 {
3891 SPCompressFree(spc);
3892 spc = NULL;
3893 }
3894 else
3895 {
3896 subject_seq_start = MemFree(subject_seq_start);
3897 }
3898
3899 return search;
3900 }
3901
3902 SeqAlignPtr LIBCALL
BlastSequencesOnTheFlyByLoc(BlastSearchBlkPtr search,SeqLocPtr subject_slp)3903 BlastSequencesOnTheFlyByLoc(BlastSearchBlkPtr search, SeqLocPtr subject_slp)
3904 {
3905 Int4 index, subject_length;
3906 SeqAlignPtr seqalign=NULL;
3907 SeqPortPtr spp;
3908 SPCompressPtr spc=NULL;
3909 Uint1Ptr subject_seq, subject_seq_start;
3910 Uint1 residue;
3911
3912 if (subject_slp == NULL)
3913 return NULL;
3914
3915 if (search == NULL || search->query_invalid)
3916 return NULL;
3917
3918
3919 if (!search->pbp->mb_params) {
3920 if (search->result_struct)
3921 search->result_struct = BLASTResultsStructDelete(search->result_struct);
3922 search->result_struct =
3923 BLASTResultsStructNew(search->result_size,
3924 search->pbp->max_pieces, search->pbp->hsp_range_max);
3925 } else {
3926 if (search->mb_result_struct && search->mb_result_struct[0])
3927 search->mb_result_struct[0] =
3928 BLASTResultsStructDelete(search->mb_result_struct[0]);
3929 if (!search->mb_result_struct)
3930 search->mb_result_struct = (BLASTResultsStructPtr PNTR)
3931 MemNew(sizeof(BLASTResultsStructPtr));
3932 }
3933 BlastHitListPurge(search->current_hitlist);
3934
3935 subject_seq_start = subject_seq = NULL;
3936
3937 subject_length = SeqLocLen(subject_slp);
3938
3939 if (StringCmp(search->prog_name, "blastp") == 0)
3940 {
3941 subject_seq_start = (Uint1Ptr) MemNew(((subject_length)+2)*sizeof(Uint1));
3942 /* The first residue is the sentinel. */
3943 subject_seq_start[0] = NULLB;
3944 subject_seq = subject_seq_start+1;
3945 index = 0;
3946 spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbistdaa);
3947 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
3948 {
3949 if (IS_residue(residue))
3950 {
3951 subject_seq[index] = residue;
3952 index++;
3953 }
3954 }
3955 subject_seq[index] = NULLB;
3956 spp = SeqPortFree(spp);
3957 }
3958 else if (StringCmp(search->prog_name, "blastn") == 0)
3959 {
3960 spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbi4na);
3961 spc = SPCompressDNA(spp);
3962 subject_seq = spc->buffer;
3963 spp = SeqPortFree(spp);
3964 }
3965 else
3966 {
3967 return NULL;
3968 }
3969
3970 seqalign = BlastTwoSequencesCore(search, subject_slp, subject_seq, subject_length, FALSE);
3971
3972 if (spc)
3973 {
3974 SPCompressFree(spc);
3975 spc = NULL;
3976 }
3977 else
3978 {
3979 subject_seq_start = MemFree(subject_seq_start);
3980 }
3981
3982 AdjustOffSetsInSeqAlign(seqalign, search->query_slp, subject_slp);
3983
3984 return seqalign;
3985 }
3986
3987 SeqAlignPtr LIBCALL
BlastSequencesOnTheFly(BlastSearchBlkPtr search,BioseqPtr subject_bsp)3988 BlastSequencesOnTheFly(BlastSearchBlkPtr search, BioseqPtr subject_bsp)
3989 {
3990 SeqAlignPtr seqalign;
3991 SeqLocPtr slp;
3992
3993 slp = NULL;
3994 ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(subject_bsp->id, SEQID_GI)));
3995 seqalign = BlastSequencesOnTheFlyByLoc(search, slp);
3996 SeqLocFree(slp);
3997 return seqalign;
3998 }
/*
	Translate a nucleotide sequence without ambiguity codes.
	This is used for the first-pass translation of the database.

	BlastSearchBlkPtr search: overall BLAST structure.
	Int4 length: length of the nucl. sequence
	Uint1Ptr prot_seq: the (translated) protein sequence, with NULLB
		sentinels on either end.  This array should be allocated
		with sufficient memory before the function is called.
	Uint1Ptr nt_seq: the original nucl. sequence.
	Int2 frame: the reading frame.  A positive frame is translated
		forward through nt_seq with search->translation_table; any
		other value is translated backward from the end of nt_seq
		with search->translation_table_rc.

	Returns the number of residues written between the two sentinels.
	Zero is returned (and nothing is written) if nt_seq or prot_seq is
	NULL or if (length - ABS(frame) + 1) is less than CODON_LENGTH.

	The genetic code to be used is determined by the translation_table
	on the BlastSearchBlkPtr.

	This function translates a packed (ncbi2na) nucl. alphabet.  It
	views a basepair as being in one of four sets of 2-bits:

	|0|1|2|3||0|1|2|3||0|1|2|3||...

	1st byte | 2nd byte | 3rd byte...

	A codon that starts at the beginning of the above sequence starts in
	state "0" and includes basepairs 0, 1, and 2.  The next codon, in the
	same frame, after that starts in state "3" and includes 3, 0, and 1.

	** Optimization:
	changed the single main loop to
	- advance to state 0,
	- optimized inner loop does two (3 byte->4 codon) translation per iteration
	  (loads are moved earlier so they can be done in advance.)
	- do remainder
*/

Int4 LIBCALL
BlastTranslateUnambiguousSequence(BlastSearchBlkPtr search, Int4 length, Uint1Ptr prot_seq, Uint1Ptr nt_seq, Int2 frame)

{
	register int state;
	Int2 total_remainder;
	Int4 prot_length;
	register int byte_value, codon=0;
	Uint1 last_remainder, last_byte, remainder;
	register Uint1Ptr translation, nt_seq_end, nt_seq_start;
	Uint1Ptr prot_seq_start;
	int byte_value1,byte_value2,byte_value3,byte_value4,byte_value5;

	prot_length=0;
	/* Nothing to translate: bad pointers or less than one codon. */
	if (nt_seq == NULL || prot_seq == NULL || (length-ABS(frame)+1) < CODON_LENGTH)
		return prot_length;

	/* Leading sentinel. */
	*prot_seq = NULLB;
	prot_seq++;

	/* record to determine protein length. */
	prot_seq_start = prot_seq;

	/* Pick the codon lookup table for the strand being translated. */
	if (frame > 0)
		translation = search->translation_table;
	else
		translation = search->translation_table_rc;

	/* Number of bases in the final, partially filled byte. */
	remainder = length%4;

	if (frame > 0)
	{
		/* Last completely filled byte of the packed sequence. */
		nt_seq_end = nt_seq + (length)/4 - 1;
		last_remainder = (4*(length/4) - frame + 1)%CODON_LENGTH;
		total_remainder = last_remainder+remainder;

		/* Frames 1, 2, 3 start at 2-bit position 0, 1, 2 of the first byte. */
		state = frame-1;
		byte_value = *nt_seq;

		/* If there's lots to do, advance to state 0, then enter fast loop */
		while (nt_seq < nt_seq_end)
		{
			switch (state)
			{
				case 0:
					/* Codon is the upper six bits of the current byte. */
					codon = (byte_value >> 2);
					*prot_seq = translation[codon];
					prot_seq++;
				/* do state = 3 now, break is NOT missing. */
				case 3:
					/* Last base of this byte plus first two of the next. */
					codon = ((byte_value & 3) << 4);
					nt_seq++;
					byte_value = *nt_seq;
					codon += (byte_value >> 4);
					*prot_seq = translation[codon];
					prot_seq++;
					if (nt_seq >= nt_seq_end)
					{
						state = 2;
						break;
					}
				/* Go on to state = 2 if not at end. */
				case 2:
					/* Last two bases of this byte plus first of the next. */
					codon = ((byte_value & 15) << 2);
					nt_seq++;
					byte_value = *nt_seq;
					codon += (byte_value >> 6);
					*prot_seq = translation[codon];
					prot_seq++;
					if (nt_seq >= nt_seq_end)
					{
						state = 1;
						break;
					}
				/* Go on to state = 1 if not at end. */
				case 1:
					/* Codon is the lower six bits of the current byte. */
					codon = byte_value & 63;
					*prot_seq = translation[codon];
					prot_seq++;
					nt_seq++;
					byte_value = *nt_seq;
					state = 0;
					break;
			} /* end switch */
			/* switch ends at state 0, except when at end */


			/********************************************/
			/* optimized loop: start in state 0. continue til near end */
			while (nt_seq < (nt_seq_end-10))
			{
				/* Loads are issued ahead of their use (see header). */
				byte_value1 = *(++nt_seq);
				byte_value2 = *(++nt_seq);
				byte_value3 = *(++nt_seq);
				/* case 0: */
				codon = (byte_value >> 2);
				*prot_seq = translation[codon];
				prot_seq++;

				/* case 3: */
				codon = ((byte_value & 3) << 4);
				codon += (byte_value1 >> 4);
				*prot_seq = translation[codon];
				prot_seq++;

				byte_value4 = *(++nt_seq);
				/* case 2: */
				codon = ((byte_value1 & 15) << 2);

				codon += (byte_value2 >> 6);
				*prot_seq = translation[codon];
				prot_seq++;
				/* case 1: */
				codon = byte_value2 & 63;
				byte_value5 = *(++nt_seq);
				*prot_seq = translation[codon];
				prot_seq++;

				/* case 0: */
				codon = (byte_value3 >> 2);
				*prot_seq = translation[codon];
				prot_seq++;
				/* case 3: */
				byte_value = *(++nt_seq);
				codon = ((byte_value3 & 3) << 4);
				codon += (byte_value4 >> 4);
				*prot_seq = translation[codon];
				prot_seq++;
				/* case 2: */
				codon = ((byte_value4 & 15) << 2);
				codon += (byte_value5 >> 6);
				*prot_seq = translation[codon];
				prot_seq++;
				/* case 1: */
				codon = byte_value5 & 63;
				*prot_seq = translation[codon];
				prot_seq++;
				state=0;
			} /* end optimized while */
			/********************************************/
		} /* end while */


		if (state == 1)
		{
			/* This doesn't get done above, DON'T do the state = 0
			   below if this is done. */
			byte_value = *nt_seq;
			codon = byte_value & 63;
			state = 0;
			*prot_seq = translation[codon];
			prot_seq++;
		}
		else if (state == 0)
		{	/* This one doesn't get done above. */
			byte_value = *nt_seq;
			codon = ((byte_value) >> 2);
			state = 3;
			*prot_seq = translation[codon];
			prot_seq++;
		}

		/* One more codon that reaches into the byte after nt_seq_end. */
		if (total_remainder >= CODON_LENGTH)
		{
			byte_value = *(nt_seq_end);
			last_byte = *(nt_seq_end+1);
			if (state == 0)
			{
				codon = (last_byte >> 2);
			}
			else if (state == 2)
			{
				codon = ((byte_value & 15) << 2);
				codon += (last_byte >> 6);
			}
			else if (state == 3)
			{
				codon = ((byte_value & 3) << 4);
				codon += (last_byte >> 4);
			}
			*prot_seq = translation[codon];
			prot_seq++;
		}
		*prot_seq = NULLB;
	}
	else
	{
		/* Reverse strand: walk from the end of nt_seq back to its start. */
		nt_seq_start = nt_seq;
		nt_seq += length/4;
		state = remainder+frame;
		/* Do we start in the last byte?  This one has the lowest order
		   bits set to represent the remainder, hence the odd coding here. */
		if (state >= 0)
		{
			last_byte = *nt_seq;
			nt_seq--;
			if (state == 0)
			{
				codon = (last_byte >> 6);
				byte_value = *nt_seq;
				codon += ((byte_value & 15) << 2);
				state = 1;
			}
			else if (state == 1)
			{
				codon = (last_byte >> 4);
				byte_value = *nt_seq;
				codon += ((byte_value & 3) << 4);
				state = 2;
			}
			else if (state == 2)
			{
				codon = (last_byte >> 2);
				state = 3;
			}
			*prot_seq = translation[codon];
			prot_seq++;

		}
		else
		{
			/* The first codon lies entirely in the preceding byte(s). */
			state = 3 + (remainder + frame + 1);
			nt_seq--;
		}

		byte_value = *nt_seq;

		/* If there's lots to do, advance to state 3, then enter fast loop */
		while (nt_seq > nt_seq_start)
		{
			switch (state)
			{
				case 3:
					codon = (byte_value & 63);
					*prot_seq = translation[codon];
					prot_seq++;
				/* do state = 0 now, break is NOT missing. */
				case 0:
					codon = (byte_value >> 6);
					nt_seq--;
					byte_value = *nt_seq;
					codon += ((byte_value & 15) << 2);
					*prot_seq = translation[codon];
					prot_seq++;
					if (nt_seq <= nt_seq_start)
					{
						state = 1;
						break;
					}
				/* Go on to state = 1 if not at end. */
				case 1:
					codon = (byte_value >> 4);
					nt_seq--;
					byte_value = *nt_seq;
					codon += ((byte_value & 3) << 4);
					*prot_seq = translation[codon];
					prot_seq++;
					if (nt_seq <= nt_seq_start)
					{
						state = 2;
						break;
					}
				/* Go on to state = 2 if not at end. */
				case 2:
					codon = (byte_value >> 2);
					*prot_seq = translation[codon];
					prot_seq++;
					nt_seq--;
					byte_value = *nt_seq;
					state = 3;
					break;
			} /* end switch */
			/* switch ends at state 3, except when at end */


			/********************************************/
			/* optimized area: start in state 3. continue til near end */
			while (nt_seq > (nt_seq_start+10))
			{
				/* Loads are issued ahead of their use (see header). */
				byte_value1 = *(--nt_seq);
				byte_value2 = *(--nt_seq);
				byte_value3 = *(--nt_seq);

				codon = (byte_value & 63);
				*prot_seq = translation[codon];
				prot_seq++;
				codon = (byte_value >> 6);
				codon += ((byte_value1 & 15) << 2);
				*prot_seq = translation[codon];
				prot_seq++;
				byte_value4 = *(--nt_seq);
				codon = (byte_value1 >> 4);
				codon += ((byte_value2 & 3) << 4);
				*prot_seq = translation[codon];
				prot_seq++;
				codon = (byte_value2 >> 2);
				*prot_seq = translation[codon];
				prot_seq++;
				byte_value5 = *(--nt_seq);

				codon = (byte_value3 & 63);
				*prot_seq = translation[codon];
				prot_seq++;
				byte_value = *(--nt_seq);
				codon = (byte_value3 >> 6);
				codon += ((byte_value4 & 15) << 2);
				*prot_seq = translation[codon];
				prot_seq++;
				codon = (byte_value4 >> 4);
				codon += ((byte_value5 & 3) << 4);
				*prot_seq = translation[codon];
				prot_seq++;
				codon = (byte_value5 >> 2);
				*prot_seq = translation[codon];
				prot_seq++;
			} /* end optimized while */
			/********************************************/

		} /* end while */

		/* A codon contained wholly in the first byte is not done above. */
		byte_value = *nt_seq;
		if (state == 3)
		{
			codon = (byte_value & 63);
			*prot_seq = translation[codon];
			prot_seq++;
		}
		else if (state == 2)
		{
			codon = (byte_value >> 2);
			*prot_seq = translation[codon];
			prot_seq++;
		}
	}

	/* Trailing sentinel. */
	*prot_seq = NULLB;

	return (prot_seq - prot_seq_start);
} /* BlastTranslateUnambiguousSequence */
4371
4372
4373
4374 /*
4375 Gets an appropriate ID for the database (subject) sequence.
4376 Int4 hit_number is the index into the BLASTResultHitlistPtr,
4377 Boolean ordinal_number specifies whether an ordinal number (the
4378 db sequence number) or a real ID should be used.
4379 */
4380 SeqIdPtr LIBCALL
BlastGetSubjectIdEx(BlastSearchBlkPtr search,Int4 hit_number,Boolean ordinal_number,ValNodePtr * vnpp,Int2 query_number)4381 BlastGetSubjectIdEx(BlastSearchBlkPtr search, Int4 hit_number, Boolean ordinal_number, ValNodePtr *vnpp, Int2 query_number)
4382 {
4383 BLASTResultHitlistPtr results;
4384 DbtagPtr dbtagptr;
4385 ObjectIdPtr obidp;
4386 SeqIdPtr subject_id=NULL, sip;
4387 Uint4 header;
4388 BLASTResultsStructPtr result_struct;
4389
4390 if (search->pbp->mb_params)
4391 result_struct = search->mb_result_struct[query_number];
4392 else
4393 result_struct = search->result_struct;
4394
4395 results = result_struct->results[hit_number];
4396 if (ordinal_number) {
4397
4398 obidp = ObjectIdNew();
4399 obidp->str = NULL;
4400 obidp->id = results->subject_id;
4401 dbtagptr = DbtagNew();
4402 if (search->rdfp) {
4403 dbtagptr->db = StringSave(search->rdfp->filename);
4404 }
4405 dbtagptr->tag = obidp;
4406 ValNodeAddPointer(&subject_id, SEQID_GENERAL, dbtagptr);
4407 } else if (search->rdfp) {
4408 if (vnpp == NULL) {
4409 readdb_get_descriptor(search->rdfp, results->subject_id, &subject_id, NULL);
4410 } else {
4411 header = 0;
4412 sip = NULL;
4413
4414 if(search->rdfp->formatdb_ver == FORMATDB_VER_TEXT) {
4415 while (readdb_get_header(search->rdfp, results->subject_id, &header, &sip, NULL) == TRUE)
4416 ValNodeAddPointer(vnpp, 0, sip);
4417 } else {
4418 BlastDefLinePtr bdfp, bdfp_head;
4419
4420 bdfp_head = FDReadDeflineAsn(search->rdfp, results->subject_id);
4421
4422 if(bdfp_head == NULL) {
4423 ErrPostEx(SEV_ERROR, 0, 0, "Failure to read defline ASN for %d", results->subject_id);
4424 return NULL;
4425 }
4426
4427 for(bdfp = bdfp_head; bdfp != NULL; bdfp = bdfp->next) {
4428 sip = SeqIdSetDup(bdfp->seqid);
4429 ValNodeAddPointer(vnpp, 0, sip);
4430 }
4431
4432 BlastDefLineSetFree(bdfp_head);
4433 }
4434 }
4435 } else {
4436 if (results->subject_info)
4437 subject_id = SeqIdDup(results->subject_info->sip);
4438 }
4439
4440 return subject_id;
4441 }
4442
4443 SeqIdPtr LIBCALL
BlastGetSubjectId(BlastSearchBlkPtr search,Int4 hit_number,Boolean ordinal_number,ValNodePtr * vnpp)4444 BlastGetSubjectId(BlastSearchBlkPtr search, Int4 hit_number, Boolean ordinal_number, ValNodePtr *vnpp)
4445 {
4446 return BlastGetSubjectIdEx(search, hit_number, ordinal_number, vnpp, 0);
4447 }
4448
4449 /*
4450 Use by HeapSort (in BioseqBlastEngine) to rank Hitlist's.
4451 */
4452
4453 int LIBCALLBACK
evalue_compare_hits(VoidPtr v1,VoidPtr v2)4454 evalue_compare_hits(VoidPtr v1, VoidPtr v2)
4455
4456 {
4457 BLASTResultHitlistPtr h1, h2;
4458 BLASTResultHitlistPtr *hp1, *hp2;
4459
4460 hp1 = (BLASTResultHitlistPtr *) v1;
4461 hp2 = (BLASTResultHitlistPtr *) v2;
4462 h1 = *hp1;
4463 h2 = *hp2;
4464
4465 /* Sort first by evalue, then by score in case all evalues are zero. */
4466
4467 if (h1->best_evalue < h2->best_evalue)
4468 return -1;
4469 if (h1->best_evalue > h2->best_evalue)
4470 return 1;
4471 if (h1->high_score > h2->high_score)
4472 return -1;
4473 if (h1->high_score < h2->high_score)
4474 return 1;
4475
4476 /* In case of equal scores and E-values order will be determined by
4477 subject id */
4478
4479 if (h1->subject_id > h2->subject_id)
4480 return -1;
4481 if (h1->subject_id < h2->subject_id)
4482 return 1;
4483
4484 return 0;
4485 }
4486
4487 /* Code in BLAST_CLUSTER_HITS is not currently in use */
4488
4489 #ifdef BLAST_CLUSTER_HITS
4490 typedef struct _blast_result_with_subject_id {
4491 BLASTResultHspPtr hsp;
4492 Int4 hitlist_index, hsp_index;
4493 } BlastResultHspWithId, PNTR BlastResultHspWithIdPtr;
4494
BLASTResultHspScoreCmp(VoidPtr v1,VoidPtr v2)4495 static int LIBCALLBACK BLASTResultHspScoreCmp(VoidPtr v1, VoidPtr v2)
4496 {
4497 BLASTResultHspPtr h1, h2;
4498
4499 h1 = (*(BlastResultHspWithIdPtr PNTR) v1)->hsp;
4500 h2 = (*(BlastResultHspWithIdPtr PNTR) v2)->hsp;
4501
4502 if (h1->score < h2->score)
4503 return 1;
4504 else if (h1->score > h2->score)
4505 return -1;
4506 else return 0;
4507 }
4508
ResultHspWithIdIndexCmp(VoidPtr v1,VoidPtr v2)4509 static int LIBCALLBACK ResultHspWithIdIndexCmp(VoidPtr v1, VoidPtr v2)
4510 {
4511 BlastResultHspWithIdPtr h1, h2;
4512
4513 h1 = *(BlastResultHspWithIdPtr PNTR) v1;
4514 h2 = *(BlastResultHspWithIdPtr PNTR) v2;
4515
4516 if (h1->hitlist_index < h2->hitlist_index)
4517 return -1;
4518 else if (h1->hitlist_index > h2->hitlist_index)
4519 return 1;
4520 else if (h1->hsp_index < h2->hsp_index)
4521 return -1;
4522 else if (h1->hsp_index > h2->hsp_index)
4523 return 1;
4524 else /* Should never happen */
4525 return 0;
4526 }
4527 #endif
4528
/* Thresholds used by the BLAST_CLUSTER_HITS code in BioseqBlastEngineCore. */

/* A hit is skipped as a cluster candidate when the two subject lengths
   differ by more than this fraction of the shorter one. */
#define CLUSTER_LENGTH_THRESH 0.1
/* A hit is skipped as a cluster candidate when the overlap of the two HSPs
   on the query is less than this fraction of the longer query extent. */
#define CLUSTER_OVERLAP_THRESH 0.9
/* NOTE(review): presumably a score cutoff for joining a cluster; its use is
   not in the code reviewed here -- confirm before relying on this. */
#define CLUSTER_SCORE_THRESH 1.6
4532
4533 static Nlm_FloatHi
s_ComputeAverageLength(const BlastSearchBlk * search)4534 s_ComputeAverageLength(const BlastSearchBlk* search)
4535 {
4536 Nlm_FloatHi retval = 0.0;
4537
4538 if (StringCmp(search->prog_name, "blastn") != 0) {
4539 retval = BLAST_AA_AVGLEN;
4540 } else {
4541 retval = BLAST_NT_AVGLEN;
4542 }
4543
4544 if (search->rdfp) {
4545 Int4 total_number = 0;
4546 Int8 total_length = 0;
4547
4548 readdb_get_totals(search->rdfp, &total_length, &total_number);
4549 if (total_number > 0)
4550 retval = ((Nlm_FloatHi) total_length)/total_number;
4551 } else if (search->dblen > 0 && search->dbseq_num == 1) {
4552 retval = search->dblen;
4553 }
4554
4555 return retval;
4556 }
4557
4558 SeqAlignPtr LIBCALL
BioseqBlastEngineCore(BlastSearchBlkPtr search,BLAST_OptionsBlkPtr options,Int4Ptr * pos_matrix)4559 BioseqBlastEngineCore(BlastSearchBlkPtr search, BLAST_OptionsBlkPtr options,
4560 Int4Ptr *pos_matrix)
4561 {
4562 Int4 hitlist_max;
4563 SeqAlignPtr head, seqalign;
4564 #ifdef BLAST_CLUSTER_HITS
4565 BLASTResultHspPtr hsp, hsp1;
4566 BlastResultHspWithIdPtr PNTR hspp;
4567 BLASTResultsStructPtr result_struct;
4568 BLASTResultHitlistPtr result_hitlist;
4569 Int4 hspcnt, index, index1, index2;
4570 Int4 q_overlap;
4571 BioseqPtr bsp1, bsp2, PNTR bspp;
4572 BlastSearchBlkPtr search1;
4573 BLAST_KarlinBlkPtr kbp;
4574 FloatHi bit_score;
4575 #endif
4576
4577 head = seqalign = NULL;
4578
4579 if (search == NULL || search->query_invalid)
4580 return NULL;
4581
4582 /* If pos_matrix is not NULL, then psi-blast iterations are being
4583 performed. The first psi-blast iteration should be with normal
4584 blast. */
4585 if (pos_matrix)
4586 {
4587 search->sbp->posMatrix = pos_matrix;
4588 search->positionBased = TRUE;
4589 search->sbp->kbp = search->sbp->kbp_psi;
4590 search->sbp->kbp_gap = search->sbp->kbp_gap_psi;
4591 hitlist_max = search->result_struct->hitlist_max;
4592 search->result_struct = BLASTResultsStructDelete(search->result_struct);
4593 search->result_struct = BLASTResultsStructNew(hitlist_max, search->pbp->max_pieces, search->pbp->hsp_range_max);
4594 if (search->allocated & BLAST_SEARCH_ALLOC_WFP_FIRST)
4595 {
4596 search->wfp_first = BLAST_WordFinderDestruct(search->wfp_first);
4597 search->wfp_first = BLAST_WordFinderNew(search->sbp->alphabet_size,options->wordsize,1, FALSE);
4598 }
4599
4600 if (search->allocated & BLAST_SEARCH_ALLOC_WFP_SECOND)
4601 {
4602 search->wfp_second = BLAST_WordFinderDestruct(search->wfp_second);
4603 search->wfp_second = BLAST_WordFinderNew(search->sbp->alphabet_size,options->wordsize,1, FALSE);
4604 }
4605
4606
4607 /* Only find words once if thresholds are the same. */
4608 search->wfp = search->wfp_first;
4609 if (search->whole_query == TRUE) {
4610 BlastNewFindWords(search, 0, search->context[search->first_context].query->length, search->pbp->threshold_second, (Uint1) 0);
4611 } else {
4612 BlastNewFindWords(search, search->required_start, search->required_end, search->pbp->threshold_second, (Uint1) 0);
4613 }
4614 lookup_position_aux_destruct(search->wfp->lookup);
4615 search->wfp_second = search->wfp_first;
4616
4617 /* Unless search->pbp->cutoff_s[2]_set is set, we wish to calculate
4618 cutoff_s[2] from cutoff_e[2], rather than the other way around.
4619 Setting cutoff_s[2] to zero, as was the case in the first call to
4620 blast_set_parameters, accomplishes this.
4621 */
4622 if (!search->pbp->cutoff_s_set) {
4623 search->pbp->cutoff_s = 0;
4624 }
4625 if (!search->pbp->cutoff_s2_set) {
4626 search->pbp->cutoff_s2 = 0;
4627 }
4628 /* recalculate the cutoff scores with the newly calculated
4629 Karlin-Altschul parameters. */
4630 blast_set_parameters(search,
4631 options->dropoff_1st_pass,
4632 options->dropoff_2nd_pass,
4633 s_ComputeAverageLength(search),
4634 search->searchsp_eff,
4635 options->window_size);
4636 }
4637
4638 /* Starting awake thread if multithreaded. */
4639 if (search->searchsp_eff > AWAKE_THR_MIN_SIZE)
4640 BlastStartAwakeThread(search->thr_info);
4641
4642 /* THE BLAST SEARCH IS HERE */
4643 do_the_blast_run(search);
4644
4645 #ifdef BLAST_CLUSTER_HITS
4646 if (!search->pbp->mb_params) {
4647 /* Cluster hits by region within the query */
4648 /* Assume that hits are already sorted in each hitlist by score */
4649 ValNodePtr mask;
4650 result_struct = search->result_struct;
4651 hspcnt = 0;
4652 /* Collect all HSPs in one array */
4653
4654 bspp = (BioseqPtr PNTR) Malloc(result_struct->hitlist_count*
4655 sizeof(BioseqPtr));
4656 for (index=0; index<result_struct->hitlist_count; index++) {
4657 hspcnt += result_struct->results[index]->hspcnt;
4658 bspp[index] = readdb_get_bioseq(search->rdfp,
4659 result_struct->results[index]->subject_id);
4660 }
4661
4662 hspp = (BlastResultHspWithIdPtr PNTR)
4663 Malloc(hspcnt*sizeof(BlastResultHspWithIdPtr));
4664 index2 = 0;
4665 for (index=0; index<result_struct->hitlist_count; index++) {
4666 result_hitlist = result_struct->results[index];
4667 for (index1=0; index1<result_hitlist->hspcnt; index1++) {
4668 hspp[index2] = (BlastResultHspWithIdPtr)
4669 Malloc(sizeof(BlastResultHspWithId));
4670 hspp[index2]->hitlist_index = index;
4671 hspp[index2]->hsp_index = index1;
4672 hspp[index2++]->hsp = &(result_hitlist->hsp_array[index1]);
4673 }
4674 }
4675 /* Sort by score */
4676 HeapSort((VoidPtr)hspp, hspcnt, sizeof(BLASTResultHspPtr),
4677 BLASTResultHspScoreCmp);
4678 index = 0;
4679 while (index<hspcnt) {
4680 hsp = hspp[index]->hsp;
4681 index2 = 0;
4682
4683 result_hitlist =
4684 search->result_struct->results[hspp[index]->hitlist_index];
4685 bsp1 = bspp[hspp[index]->hitlist_index];
4686
4687 search1 =
4688 BlastQuerySequenceSetUp(bsp1, search->prog_name,
4689 options);
4690 for (index1=index+1; index1<hspcnt; index1++) {
4691 /* Check if the next hit passes a simple test to be a
4692 candidate to belong to this cluster */
4693 if (hspp[index1]->hsp==NULL)
4694 continue;
4695 hsp1 = hspp[index1]->hsp;
4696 result_hitlist =
4697 search->result_struct->results[hspp[index1]->hitlist_index];
4698 bsp2 = bspp[hspp[index1]->hitlist_index];
4699 if (((FloatHi)ABS(bsp1->length - bsp2->length)) /
4700 MIN(bsp1->length, bsp2->length) > CLUSTER_LENGTH_THRESH)
4701 continue;
4702 q_overlap =
4703 MIN(hsp->query_offset+hsp->query_length,
4704 hsp1->query_offset+hsp1->query_length) -
4705 MAX(hsp->query_offset, hsp1->query_offset);
4706 if (((FloatHi)q_overlap) /
4707 MAX(hsp->query_length, hsp1->query_length) <
4708 CLUSTER_OVERLAP_THRESH)
4709 continue;
4710
4711 /* We have a candidate for attaching to the cluster */
4712 if (hspp[index]->hitlist_index == hspp[index1]->hitlist_index) {
4713 /* Almost identical hit from same subject in the same
4714 area of the query - remove! */
4715 result_hitlist =
4716 search->result_struct->results[hspp[index1]->hitlist_index];
4717 hspp[index1]->hsp = NULL;
4718 }
4719
4720 /* Do the two sequences search to determine whether this
4721 candidate in fact belongs to this cluster */
4722 search1 = BlastSequencesOnTheFlyEx(search1, bsp2);
4723
4724 if (search1 && search1->result_struct->results[0]) {
4725 if (search1->pbp->gapped_calculation)
4726 kbp = search1->sbp->kbp_gap[search1->first_context];
4727 else
4728 kbp = search1->sbp->kbp[search1->first_context];
4729 bit_score = ((search1->result_struct->results[0]->high_score *
4730 kbp->Lambda) - kbp->logK)/NCBIMATH_LN2;
4731 if (bit_score > CLUSTER_SCORE_THRESH *
4732 MAX(bsp1->length, bsp2->length)) {
4733 /* remove the respective hit */
4734 hspp[index1]->hsp = NULL;
4735 }
4736 }
4737 }
4738 mask = search1->mask;
4739 while (mask) {
4740 SeqLocSetFree(mask->data.ptrvalue);
4741 mask = mask->next;
4742 }
4743 ValNodeFree(search1->mask);
4744 search1 = BlastSearchBlkDestruct(search1);
4745 for (++index; index<hspcnt && hspp[index]->hsp==NULL; index++);
4746 }
4747
4748 for (index=0; index<result_struct->hitlist_count; index++)
4749 BioseqFree(bspp[index]);
4750 MemFree(bspp);
4751 /* Remove all NULLs from hspp array */
4752 for (index=0, index1=0; index<hspcnt; index++) {
4753 if (hspp[index]->hsp != NULL) {
4754 if (index != index1)
4755 hspp[index1] = hspp[index];
4756 index1++;
4757 } else
4758 hspp[index] = MemFree(hspp[index]);
4759 }
4760 hspcnt = index1;
4761 /* Sort according to original hitlist and hsp indices */
4762 HeapSort((VoidPtr)hspp, hspcnt, sizeof(BLASTResultHspPtr),
4763 ResultHspWithIdIndexCmp);
4764
4765 /* Rearrange the hsp_arrays for all hitlists */
4766 index = 0;
4767 for (index2=0; index2<result_struct->hitlist_count; index2++) {
4768 index1 = 0;
4769 while (index<hspcnt && hspp[index]->hitlist_index == index2) {
4770 result_struct->results[index2]->hsp_array[index1] =
4771 *(hspp[index]->hsp);
4772 index++;
4773 index1++;
4774 }
4775 result_struct->results[index2]->hspcnt = index1;
4776 }
4777
4778 for (index=0; index<hspcnt; index++)
4779 hspp[index] = MemFree(hspp[index]);
4780 hspp = MemFree(hspp);
4781 }
4782 #endif /* Clustering hits */
4783
4784 if (options->no_traceback) {
4785 BlastStopAwakeThread(search->thr_info);
4786 return NULL;
4787 }
4788
4789 BLASTPostSearchLogic(search, options, &head, TRUE);
4790
4791 /* Stop the awake thread. */
4792 BlastStopAwakeThread(search->thr_info);
4793
4794 return head;
4795 }
4796
4797 /*
4798 Deallocates all memory involved with the BlastHitRangePtr.
4799 */
4800
4801 BlastHitRangePtr LIBCALL
BlastHitRangeDestruct(BlastHitRangePtr old)4802 BlastHitRangeDestruct(BlastHitRangePtr old)
4803
4804 {
4805 if (old == NULL)
4806 return NULL;
4807
4808 MemFree(old->range_list);
4809 MemFree(old->range_list_pointer);
4810
4811 return MemFree(old);
4812 }
4813
4814 /*
4815 Allocates a a BlastHitRangePtr, with two 'total'
4816 BlastDoubleInt4Ptr's.
4817 */
4818
4819 BlastHitRangePtr LIBCALL
BlastHitRangeNew(Int4 total)4820 BlastHitRangeNew(Int4 total)
4821
4822 {
4823 BlastHitRangePtr bhrp;
4824 Int4 index;
4825
4826 bhrp = MemNew(sizeof(BlastHitRange));
4827
4828 bhrp->range_list = (BlastDoubleInt4Ptr) MemNew(total*sizeof(BlastDoubleInt4));
4829 bhrp->range_list_pointer = (BlastDoubleInt4Ptr PNTR) MemNew(total*sizeof(BlastDoubleInt4Ptr));
4830 for (index=0; index<total; index++)
4831 {
4832 bhrp->range_list_pointer[index] = &(bhrp->range_list[index]);
4833 }
4834
4835 bhrp->current = 0;
4836 bhrp->total = total;
4837
4838 return bhrp;
4839 }
4840
4841 static int LIBCALLBACK
bhrp_compare(VoidPtr v1,VoidPtr v2)4842 bhrp_compare(VoidPtr v1, VoidPtr v2)
4843
4844 {
4845 BlastDoubleInt4Ptr h1, h2;
4846 BlastDoubleInt4Ptr *hp1, *hp2;
4847
4848 hp1 = (BlastDoubleInt4Ptr PNTR) v1;
4849 hp2 = (BlastDoubleInt4Ptr PNTR) v2;
4850 h1 = *hp1;
4851 h2 = *hp2;
4852
4853 if (h1->gi < h2->gi)
4854 return -1;
4855 if (h1->gi > h2->gi)
4856 return 1;
4857
4858 return 0;
4859 }
4860
4861 BlastHitRangePtr LIBCALL
BioseqHitRangeEngineCore(BlastSearchBlkPtr search,BLAST_OptionsBlkPtr options)4862 BioseqHitRangeEngineCore(BlastSearchBlkPtr search, BLAST_OptionsBlkPtr options)
4863
4864 {
4865 BlastHitRangePtr bhrp=NULL;
4866 BLASTResultsStructPtr result_struct;
4867 Int4 hitlist_count, index, total_hsps;
4868 Int4 sequence_length, length;
4869 Uint1Ptr sequence;
4870
4871 if (search == NULL || search->query_invalid)
4872 return NULL;
4873
4874 /* Starting awake thread if multithreaded. */
4875 if (search->searchsp_eff > AWAKE_THR_MIN_SIZE)
4876 BlastStartAwakeThread(search->thr_info);
4877
4878 do_the_blast_run(search);
4879
4880 if (search->prog_number==blast_type_blastn) {
4881 /* Unconcatenate the strands by adjusting the query offsets in
4882 all hsps */
4883 search->context[search->first_context].query->length =
4884 search->query_context_offsets[search->first_context+1] - 1;
4885 /*BlastAdjustHitOffsets(search);*/
4886 }
4887
4888 if (StringCmp(search->prog_name, "blastn") == 0 &&
4889 search->pbp->gapped_calculation)
4890 {
4891 search->pbp->gap_open = options->gap_open;
4892 search->pbp->gap_extend = options->gap_extend;
4893 /*
4894 search->pbp->gap_x_dropoff = (BLAST_Score) (options->gap_x_dropoff*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
4895 search->pbp->gap_x_dropoff_final = (BLAST_Score) (options->gap_x_dropoff_final*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
4896 */
4897
4898
4899 result_struct = search->result_struct;
4900 hitlist_count = result_struct->hitlist_count;
4901 total_hsps = 0;
4902 for (index=0; index<hitlist_count; index++)
4903 {
4904 total_hsps += result_struct->results[index]->hspcnt;
4905 }
4906 bhrp = BlastHitRangeNew(total_hsps);
4907 bhrp->query_id = search->query_id;
4908
4909 result_struct = search->result_struct;
4910 hitlist_count = result_struct->hitlist_count;
4911
4912 sequence=NULL;
4913 sequence_length=0;
4914
4915 for (index=0; index<hitlist_count; index++)
4916 {
4917 length = readdb_get_sequence_ex(search->rdfp, result_struct->results[index]->subject_id, &sequence, &sequence_length, TRUE);
4918 SumBlastGetGappedAlignmentEx(search, index, FALSE, FALSE, sequence+1, length, FALSE, NULL, bhrp, 0);
4919 }
4920 sequence = MemFree(sequence);
4921 }
4922 else
4923 {
4924 return NULL;
4925 }
4926
4927 HeapSort(bhrp->range_list_pointer, bhrp->current, sizeof(BlastHitRangePtr PNTR), bhrp_compare);
4928
4929 /* Stop the awake thread. */
4930 BlastStopAwakeThread(search->thr_info);
4931
4932 return bhrp;
4933 }
4934
4935 SeqAlignPtr LIBCALL
BioseqBlastEngineEx(BioseqPtr bsp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total)4936 BioseqBlastEngineEx(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total)
4937
4938 {
4939 SeqLocPtr slp;
4940 SeqAlignPtr seqalign;
4941
4942 slp = NULL;
4943 ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
4944 seqalign = BioseqBlastEngineByLocEx(slp, progname, database, options, other_returns, error_returns, callback, seqid_list, gi_list, gi_list_total);
4945 SeqLocFree(slp);
4946
4947 return seqalign;
4948 }
4949
4950 SeqAlignPtr LIBCALL
BioseqBlastEngine(BioseqPtr bsp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)))4951 BioseqBlastEngine(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)))
4952 {
4953 /* --KM added NULL mult_queries param to call */
4954 return BioseqBlastEngineWithCallbackMult(bsp, progname, database, options, other_returns, error_returns, callback, NULL, NULL);
4955 }
4956
4957 SeqAlignPtr LIBCALL
BioseqBlastEngineWithCallback(BioseqPtr bsp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),int (LIBCALLBACK * handle_results)PROTO ((VoidPtr srch)))4958 BioseqBlastEngineWithCallback(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), int (LIBCALLBACK *handle_results)PROTO((VoidPtr srch)))
4959 {
4960 return BioseqBlastEngineWithCallbackMult(bsp, progname, database, options, other_returns, error_returns, callback, NULL, NULL);
4961 }
4962
4963 /* --KM added mult_queries parameter */
4964 SeqAlignPtr LIBCALL
BioseqBlastEngineWithCallbackMult(BioseqPtr bsp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),int (LIBCALLBACK * handle_results)PROTO ((VoidPtr srch)),QueriesPtr mult_queries)4965 BioseqBlastEngineWithCallbackMult(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), int (LIBCALLBACK *handle_results)PROTO((VoidPtr srch)), QueriesPtr mult_queries)
4966 {
4967 SeqLocPtr slp;
4968 SeqAlignPtr seqalign;
4969
4970 slp = NULL;
4971 ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
4972 seqalign = BioseqBlastEngineByLocWithCallbackMult(slp, progname, database, options, other_returns, error_returns, callback, NULL, NULL, 0, handle_results, mult_queries);/* --KM pass mult_queries */
4973 SeqLocFree(slp);
4974
4975 return seqalign;
4976 }
4977
4978
4979
4980 SeqAlignPtr LIBCALL
BioseqBlastEngineByLoc(SeqLocPtr slp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)))4981 BioseqBlastEngineByLoc(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)))
4982
4983 {
4984 return BioseqBlastEngineByLocEx(slp, progname, database, options, other_returns, error_returns, callback, NULL, NULL, 0);
4985
4986 }
4987
4988 SeqAlignPtr LIBCALL
BioseqBlastEngineByLocEx(SeqLocPtr slp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total)4989 BioseqBlastEngineByLocEx(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total)
4990
4991 {
4992 return BioseqBlastEngineByLocWithCallback(slp, progname, database, options, other_returns, error_returns, callback, seqid_list, gi_list, gi_list_total, NULL); /* --KM pass NULL mult_queries */
4993 }
4994
4995 SeqAlignPtr LIBCALL
BioseqBlastEngineByLocWithCallback(SeqLocPtr slp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,int (LIBCALLBACK * handle_results)PROTO ((VoidPtr srch)))4996 BioseqBlastEngineByLocWithCallback(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total, int (LIBCALLBACK *handle_results)PROTO((VoidPtr srch)))
4997 {
4998 return BioseqBlastEngineByLocWithCallbackMult(slp, progname, database, options, other_returns, error_returns, callback, seqid_list, gi_list, gi_list_total, handle_results, NULL);
4999 }
5000
5001 /* --KM added mult_queries param */
5002 SeqAlignPtr LIBCALL
BioseqBlastEngineByLocWithCallbackMult(SeqLocPtr slp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,int (LIBCALLBACK * handle_results)PROTO ((VoidPtr srch)),QueriesPtr mult_queries)5003 BioseqBlastEngineByLocWithCallbackMult(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total, int (LIBCALLBACK *handle_results)PROTO((VoidPtr srch)), QueriesPtr mult_queries)
5004 {
5005 Boolean options_allocated=FALSE;
5006 BlastSearchBlkPtr search;
5007 Int2 status;
5008 SeqAlignPtr head;
5009 SeqLocPtr whole_slp=NULL;
5010 /* Futamura */
5011 posSearchItems *posSearch;
5012 compactSearchItems *compactSearch = NULL;
5013 Boolean checkReturn = FALSE;
5014
5015 head = NULL;
5016
5017 if (error_returns)
5018 {
5019 *error_returns = NULL;
5020 }
5021
5022 if (other_returns)
5023 {
5024 *other_returns = NULL;
5025 }
5026
5027 if (progname == NULL)
5028 return NULL;
5029
5030 /* If no options, use default. */
5031 if (options == NULL)
5032 {
5033 options = BLASTOptionNew(progname, FALSE);
5034 options_allocated = TRUE;
5035 }
5036
5037 status = BLASTOptionValidateEx(options, progname, error_returns);
5038 if (status != 0)
5039 { /* error messages in other_returns? */
5040 return NULL;
5041 }
5042
5043 if (slp == NULL || database == NULL)
5044 return NULL;
5045
5046 if(options->is_rps_blast) {
5047 RPSInfoPtr rpsinfo;
5048 BioseqPtr bsp, fake_bsp;
5049 Boolean query_is_na;
5050
5051 if((bsp = BioseqLockById(SeqLocId(slp))) == NULL)
5052 return NULL;
5053
5054 /* RPS Blast discard program name and use specific RPS Blast
5055 logic for this */
5056
5057 if(bsp->mol == Seq_mol_aa) {
5058 query_is_na = FALSE;
5059 progname = "blastp";
5060 } else {
5061 query_is_na = TRUE;
5062 progname = "tblastn";
5063 }
5064 if((rpsinfo = RPSInitEx(database, !query_is_na, options)) == NULL) {
5065
5066 ErrPostEx(SEV_ERROR, 0, 0, "Failure to initialize RPS: %s %s",
5067 progname, database);
5068 return NULL;
5069 }
5070 /* Update size of the database in accordance with RPS Database size */
5071 RPSUpdateDbSize(options, rpsinfo, bsp->length);
5072
5073 if(!query_is_na)
5074 fake_bsp = bsp;
5075 else {
5076 options->db_genetic_code = options->genetic_code;
5077 fake_bsp = createFakeProtein();
5078 }
5079 search = BLASTSetUpSearch (fake_bsp, progname, fake_bsp->length, 0,
5080 NULL, options, NULL);
5081
5082 if (search == NULL)
5083 return NULL;
5084
5085 search->thr_info->tick_callback = NULL;
5086 search->thr_info->star_callback = NULL;
5087
5088 head = RPSBlastSearch(search, bsp, rpsinfo);
5089
5090 if(query_is_na)
5091 BioseqFree(fake_bsp);
5092 BioseqUnlock(bsp);
5093 RPSClose(rpsinfo);
5094 } else {
5095
5096 search = BLASTSetUpSearchByLocWithReadDbEx(slp, progname, SeqLocLen(slp), database, options, NULL, seqid_list, gi_list, gi_list_total, mult_queries);
5097 /* --KM pass mult_queries */
5098
5099 if (search == NULL) {
5100 /* We need to veryfy if database name is wrong and to set error
5101 returns correctly */
5102 Boolean is_prot;
5103 BlastErrorMsgPtr error_msg;
5104 CharPtr chptr;
5105 ReadDBFILEPtr rdfp=NULL;
5106
5107 if(!StringICmp(progname, "blastp") ||
5108 !StringICmp(progname, "blastx")) {
5109 is_prot = TRUE;
5110 } else {
5111 is_prot = FALSE;
5112 }
5113
5114 rdfp = readdb_new(database, is_prot);
5115 if(rdfp == NULL) {
5116 error_msg = MemNew(sizeof(BlastErrorMsg));
5117 chptr = MemNew(StringLen(database) + 256);
5118 sprintf(chptr, "Database %s was not found or does not exist",
5119 database);
5120 error_msg->msg = chptr;
5121 error_msg->level = 3; /* FATAL */
5122 ValNodeAddPointer(error_returns, 0, error_msg);
5123 }
5124
5125 readdb_destruct(rdfp);
5126 return NULL;
5127 }
5128
5129 search->thr_info->tick_callback = callback;
5130 search->thr_info->star_callback = callback;
5131 search->handle_results = handle_results;
5132 search->output = options->output;
5133
5134 /* Futamura psitblastn */
5135 if (options->recoverCheckpoint)
5136 search->positionBased = TRUE;
5137 else
5138 search->positionBased = FALSE;
5139
5140 if (options->recoverCheckpoint) {
5141 posSearch = (posSearchItems *) MemNew(1 * sizeof(posSearchItems));
5142 compactSearch = compactSearchNew(compactSearch);
5143 copySearchItems(compactSearch, search, options->matrix);
5144 posInitializeInformation(posSearch,search);
5145 /*AAS*/
5146
5147 checkReturn = posReadCheckpoint(posSearch, compactSearch,
5148 options->CheckpointFileName,
5149 NO_SCOREMAT_IO,
5150 &(search->error_return));
5151 /* Reading the checkpoint changes the statistical parameters
5152 kbp_psi and kbp_gap_psi. Recalculate the cutoffs by calling
5153 blast_set_parameters. */
5154
5155 /* Unless search->pbp->cutoff_s[2]_set is set, we wish to calculate
5156 cutoff_s[2] from cutoff_e[2], rather than the other way around.
5157 Setting cutoff_s[2] to zero, as was the case in the first call to
5158 blast_set_parameters, accomplishes this.
5159 */
5160 if (!search->pbp->cutoff_s_set) {
5161 search->pbp->cutoff_s = 0;
5162 }
5163 if (!search->pbp->cutoff_s2_set) {
5164 search->pbp->cutoff_s2 = 0;
5165 }
5166 search->sbp->kbp = search->sbp->kbp_psi;
5167 search->sbp->kbp_gap = search->sbp->kbp_gap_psi;
5168 blast_set_parameters(search,
5169 options->dropoff_1st_pass,
5170 options->dropoff_2nd_pass,
5171 s_ComputeAverageLength(search),
5172 search->searchsp_eff,
5173 options->window_size);
5174
5175 search->sbp->posMatrix = posSearch->posMatrix;
5176 if (NULL == search->sbp->posFreqs)
5177 search->sbp->posFreqs = allocatePosFreqs(compactSearch->qlength,
5178 compactSearch->alphabetSize);
5179 copyPosFreqs(posSearch->posFreqs,search->sbp->posFreqs,
5180 compactSearch->qlength, compactSearch->alphabetSize);
5181
5182 if (!checkReturn) {
5183 BlastConstructErrorMessage("BioseqBlastEngineByLocEx",
5184 "Error recovering from checkpoint", 3, error_returns);
5185 return NULL;
5186 }
5187 }
5188
5189 /* ----- Here is real BLAST search done ------- */
5190 if (search->positionBased)
5191 head = BioseqBlastEngineCore(search, options, search->sbp->posMatrix);
5192 else if (options->is_megablast_search) {
5193 SeqAlignPtr PNTR seqalignp;
5194 seqalignp = BioseqMegaBlastEngineCore(search, options);
5195 head = *seqalignp;
5196 } else
5197 head = BioseqBlastEngineCore(search, options, NULL);
5198 /* end Futamura */
5199
5200 }
5201
5202 if (search->error_return) {
5203 ValNodeLink(error_returns, search->error_return);
5204 search->error_return = NULL;
5205 }
5206
5207 if (other_returns) { /* format dbinfo etc. */
5208 *other_returns = BlastOtherReturnsPrepare(search);
5209 }
5210
5211 if (options_allocated) {
5212 options = BLASTOptionDelete(options);
5213 }
5214
5215 search = BlastSearchBlkDestruct(search);
5216
5217 if(!options->is_rps_blast) {
5218
5219 /* Adjsut the offset if the query does not cover the entire sequence. */
5220 if (slp->choice != SEQLOC_WHOLE) {
5221 ValNodeAddPointer(&whole_slp, SEQLOC_WHOLE, SeqIdFindBest(SeqLocId(slp), SEQID_GI));
5222 if (SeqLocAinB(whole_slp, slp) != 0) {
5223 AdjustOffSetsInSeqAlign(head, slp, NULL);
5224 }
5225 ValNodeFree(whole_slp);
5226 }
5227 }
5228
5229 return head;
5230 }
5231
5232 SeqLocPtr LIBCALL
BioseqHitRangeEngine(BioseqPtr bsp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total)5233 BioseqHitRangeEngine(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total)
5234
5235 {
5236 SeqLocPtr slp;
5237
5238 slp = NULL;
5239 ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
5240 return BioseqHitRangeEngineByLoc(slp, progname, database, options, other_returns, error_returns, callback, seqid_list, gi_list, gi_list_total);
5241 }
5242
5243 SeqLocPtr
HitRangeToSeqLoc(BlastHitRangePtr bhrp,Int4 link_value,Boolean combine)5244 HitRangeToSeqLoc(BlastHitRangePtr bhrp, Int4 link_value, Boolean combine)
5245
5246 {
5247 Boolean make_seqloc, start=TRUE;
5248 Int4 index, total, start_pos=0, stop_pos, largest_stop_pos=0;
5249 SeqIntPtr sint;
5250 SeqLocPtr retval=NULL;
5251
5252 if (bhrp == NULL)
5253 return NULL;
5254
5255 total = bhrp->current;
5256 index=0;
5257 while (index < total)
5258 {
5259 if (combine)
5260 {
5261 if (start == TRUE)
5262 {
5263 start_pos = bhrp->range_list_pointer[index]->gi + bhrp->base_offset;
5264 start = FALSE;
5265 largest_stop_pos = 0;
5266 }
5267 else
5268 {
5269 /* Keep track of largest stop position. */
5270 largest_stop_pos = MAX(largest_stop_pos, bhrp->range_list_pointer[index]->ordinal_id + bhrp->base_offset);
5271 make_seqloc = FALSE;
5272 if (index == total-1) /* Last one. */
5273 {
5274 stop_pos = bhrp->range_list_pointer[index]->ordinal_id + bhrp->base_offset;
5275 start = TRUE;
5276 make_seqloc = TRUE;
5277 }
5278 else if (largest_stop_pos+link_value < bhrp->range_list_pointer[index+1]->gi + bhrp->base_offset)
5279 { /* Check overlap with next one. */
5280 stop_pos = bhrp->range_list_pointer[index]->ordinal_id + bhrp->base_offset;
5281 start = TRUE;
5282 make_seqloc = TRUE;
5283 }
5284
5285 if (make_seqloc)
5286 {
5287 sint = SeqIntNew();
5288 sint->from = start_pos;
5289 sint->to = MAX(largest_stop_pos, stop_pos);
5290 sint->strand = Seq_strand_plus;
5291 sint->id = SeqIdDup(SeqIdFindBest(bhrp->query_id, SEQID_GI));
5292 ValNodeAddPointer(&retval, SEQLOC_INT, sint);
5293 }
5294 index++;
5295 }
5296 }
5297 else
5298 {
5299 sint = SeqIntNew();
5300 sint->from = bhrp->range_list_pointer[index]->gi + bhrp->base_offset;
5301 sint->to = bhrp->range_list_pointer[index]->ordinal_id + bhrp->base_offset;
5302 sint->strand = Seq_strand_plus;
5303 sint->id = SeqIdDup(SeqIdFindBest(bhrp->query_id, SEQID_GI));
5304 ValNodeAddPointer(&retval, SEQLOC_INT, sint);
5305 index++;
5306 }
5307 }
5308
5309 return retval;
5310 }
5311
5312 #define HITRANGE_LINKVALUE 5
5313
5314 SeqLocPtr LIBCALL
BioseqHitRangeEngineByLoc(SeqLocPtr slp,CharPtr progname,CharPtr database,BLAST_OptionsBlkPtr options,ValNodePtr * other_returns,ValNodePtr * error_returns,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total)5315 BioseqHitRangeEngineByLoc(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total)
5316
5317 {
5318 Boolean options_allocated=FALSE;
5319 BlastHitRangePtr bhrp;
5320 BlastSearchBlkPtr search;
5321 Int2 status;
5322 SeqLocPtr seqloc, whole_slp=NULL;
5323
5324 if (error_returns)
5325 {
5326 *error_returns = NULL;
5327 }
5328
5329 if (other_returns)
5330 {
5331 *other_returns = NULL;
5332 }
5333
5334 if (progname == NULL)
5335 return NULL;
5336
5337 /* If no options, use default. */
5338 if (options == NULL)
5339 {
5340 options = BLASTOptionNew(progname, FALSE);
5341 options_allocated = TRUE;
5342 }
5343
5344 status = BLASTOptionValidateEx(options, progname, error_returns);
5345 if (status != 0)
5346 { /* error messages in other_returns? */
5347 return NULL;
5348 }
5349
5350 if (slp == NULL || database == NULL)
5351 return NULL;
5352
5353 search = BLASTSetUpSearchByLocWithReadDbEx(slp, progname, SeqLocLen(slp), database, options, NULL, seqid_list, gi_list, gi_list_total, NULL); /* --KM pass NULL mult_queries */
5354
5355 if (search == NULL)
5356 {
5357 return NULL;
5358 }
5359
5360 search->thr_info->tick_callback = callback;
5361 search->thr_info->star_callback = callback;
5362
5363 bhrp = BioseqHitRangeEngineCore(search, options);
5364 if (bhrp == NULL) /* can happen for invalid queries. */
5365 return NULL;
5366
5367 if (slp->choice != SEQLOC_WHOLE) {
5368 ValNodeAddPointer(&whole_slp, SEQLOC_WHOLE, SeqIdFindBest(SeqLocId(slp), SEQID_GI));
5369 bhrp->base_offset = GetOffsetInLoc(slp, whole_slp, SEQLOC_START);
5370 ValNodeFree(whole_slp);
5371 }
5372
5373 seqloc = HitRangeToSeqLoc(bhrp, HITRANGE_LINKVALUE, TRUE);
5374 bhrp = BlastHitRangeDestruct(bhrp);
5375 if (search->error_return)
5376 {
5377 ValNodeLink(error_returns, search->error_return);
5378 search->error_return = NULL;
5379 }
5380
5381 if (other_returns)
5382 { /* format dbinfo etc. */
5383 *other_returns = BlastOtherReturnsPrepare(search);
5384 }
5385
5386 if (options_allocated)
5387 {
5388 options = BLASTOptionDelete(options);
5389 }
5390 search = BlastSearchBlkDestruct(search);
5391
5392 return seqloc;
5393 }
5394
BlastOtherReturnsFree(ValNodePtr other_returns)5395 void LIBCALL BlastOtherReturnsFree(ValNodePtr other_returns)
5396 {
5397 BLAST_KarlinBlkPtr ka_params;
5398 BLAST_MatrixPtr matrix;
5399 CharPtr params_buffer;
5400 TxDfDbInfoPtr dbinfo;
5401 ValNodePtr mask_loc, mask_loc_start, vnp;
5402
5403 mask_loc = NULL;
5404
5405 for (vnp=other_returns; vnp; vnp = vnp->next) {
5406 switch (vnp->choice) {
5407 case TXDBINFO:
5408 dbinfo = vnp->data.ptrvalue;
5409 dbinfo = TxDfDbInfoDestruct(dbinfo);
5410 break;
5411 case TXKABLK_NOGAP:
5412 ka_params = vnp->data.ptrvalue;
5413 MemFree(ka_params);
5414 break;
5415 case TXKABLK_GAP:
5416 ka_params = vnp->data.ptrvalue;
5417 MemFree(ka_params);
5418 break;
5419 case TXPARAMETERS:
5420 params_buffer = vnp->data.ptrvalue;
5421 MemFree(params_buffer);
5422 break;
5423 case TXMATRIX:
5424 matrix = vnp->data.ptrvalue;
5425 matrix = BLAST_MatrixDestruct(matrix);
5426
5427 break;
5428 case SEQLOC_MASKING_NOTSET:
5429 case SEQLOC_MASKING_PLUS1:
5430 case SEQLOC_MASKING_PLUS2:
5431 case SEQLOC_MASKING_PLUS3:
5432 case SEQLOC_MASKING_MINUS1:
5433 case SEQLOC_MASKING_MINUS2:
5434 case SEQLOC_MASKING_MINUS3:
5435 ValNodeAddPointer(&mask_loc, vnp->choice, vnp->data.ptrvalue);
5436 break;
5437 default:
5438 break;
5439 }
5440 }
5441
5442 mask_loc_start = mask_loc;
5443 while (mask_loc) {
5444 SeqLocSetFree(mask_loc->data.ptrvalue);
5445 mask_loc = mask_loc->next;
5446 }
5447 ValNodeFree(mask_loc_start);
5448
5449 other_returns = ValNodeFree(other_returns);
5450
5451 return;
5452 }
5453
5454 ValNodePtr LIBCALL
BlastOtherReturnsPrepare(BlastSearchBlkPtr search)5455 BlastOtherReturnsPrepare(BlastSearchBlkPtr search)
5456
5457 {
5458 BLAST_KarlinBlkPtr ka_params;
5459 BLAST_MatrixPtr blast_matrix;
5460 CharPtr parameters, chptr;
5461 ReadDBFILEPtr rdfp_var;
5462 TxDfDbInfoPtr dbinfo, head, dbinfo_var=NULL;
5463 ValNodePtr other_returns=NULL;
5464
5465 head = NULL;
5466 if (search->thr_info->blast_gi_list) {
5467 dbinfo = MemNew(sizeof(TxDfDbInfo));
5468 dbinfo->total_length = search->dblen;
5469 dbinfo->number_seqs = search->dbseq_num;
5470 dbinfo->subset = TRUE;
5471 head = dbinfo;
5472 dbinfo_var = dbinfo;
5473 }
5474
5475 rdfp_var = search->rdfp;
5476 while (rdfp_var) {
5477 dbinfo = MemNew(sizeof(TxDfDbInfo));
5478 dbinfo->name = StringSave(readdb_get_filename(rdfp_var));
5479
5480 if((chptr = readdb_get_title(rdfp_var)) == NULL)
5481 chptr = readdb_get_filename(rdfp_var);
5482 dbinfo->definition = StringSave(chptr);
5483
5484 dbinfo->date = StringSave(readdb_get_date(rdfp_var));
5485
5486 dbinfo->is_protein = readdb_is_prot(rdfp_var);
5487
5488 if (rdfp_var->aliaslen)
5489 dbinfo->total_length = rdfp_var->aliaslen;
5490 else
5491 dbinfo->total_length = readdb_get_dblen(rdfp_var);
5492 if (rdfp_var->aliasnseq)
5493 dbinfo->number_seqs = rdfp_var->aliasnseq;
5494 else
5495 dbinfo->number_seqs = readdb_get_num_entries(rdfp_var);
5496 if (head == NULL) {
5497 head = dbinfo;
5498 dbinfo_var = dbinfo;
5499 } else {
5500 dbinfo_var->next = dbinfo;
5501 dbinfo_var = dbinfo_var->next;
5502 }
5503 rdfp_var = rdfp_var->next;
5504 }
5505 if (head)
5506 ValNodeAddPointer (&other_returns, TXDBINFO, head);
5507
5508 if (search->sbp->kbp && search->sbp->kbp[search->first_context]) {
5509 ka_params = BlastKarlinBlkCreate();
5510 ka_params->Lambda = search->sbp->kbp[search->first_context]->Lambda;
5511 ka_params->K = search->sbp->kbp[search->first_context]->K;
5512 ka_params->H = search->sbp->kbp[search->first_context]->H;
5513 ValNodeAddPointer (&other_returns, TXKABLK_NOGAP, ka_params);
5514 }
5515
5516 if (search->pbp->gapped_calculation == TRUE) {
5517 if (search->sbp->kbp_gap && search->sbp->kbp_gap[search->first_context]) {
5518 ka_params = BlastKarlinBlkCreate();
5519 ka_params->Lambda = search->sbp->kbp_gap[search->first_context]->Lambda;
5520 ka_params->K = search->sbp->kbp_gap[search->first_context]->K;
5521 ka_params->H = search->sbp->kbp_gap[search->first_context]->H;
5522 ValNodeAddPointer (&other_returns, TXKABLK_GAP, ka_params);
5523 }
5524 }
5525
5526 if (search->query_invalid == FALSE) {
5527 parameters = FormatBlastParameters(search);
5528 ValNodeAddPointer (&other_returns, TXPARAMETERS, parameters);
5529 }
5530
5531 blast_matrix = BLAST_MatrixFill(search->sbp, search->positionBased);
5532 ValNodeAddPointer (&other_returns, TXMATRIX, blast_matrix);
5533
5534 if (search->mask)
5535 ValNodeLink(&other_returns, search->mask);
5536
5537 if (search->pbp->is_rps_blast) {
5538 ValNodeAddFloat(&other_returns, EFF_SEARCH_SPACE,
5539 ((Nlm_FloatHi) search->dblen_eff)*
5540 ((Nlm_FloatHi) (search->rps_qlen - search->length_adjustment)));
5541 } else {
5542 ValNodeAddFloat(&other_returns, EFF_SEARCH_SPACE,
5543 ((Nlm_FloatHi) search->dblen_eff)*
5544 ((Nlm_FloatHi) search->context[search->first_context].query->effective_length));
5545 }
5546 ValNodeAddInt(&other_returns, EFF_HSP_LENGTH, search->length_adjustment);
5547
5548 /* If Mega BLAST endpoint results, save them here */
5549 if (search->mb_endpoint_results && search->pbp->mb_params &&
5550 search->pbp->mb_params->no_traceback)
5551 /* Here 21 = BlastResponse_mbalign (see file objblst3.h) */
5552 ValNodeAddPointer(&other_returns, 21,
5553 search->mb_endpoint_results->data.ptrvalue);
5554
5555 return other_returns;
5556 }
5557
5558
5559 /*
5560 Deallocates memory for BLAST_ExtendWordParamsPtr
5561
5562 */
5563
5564 static BLAST_ExtendWordParamsPtr
BLAST_ExtendWordParamsDestruct(BLAST_ExtendWordParamsPtr ewp_params)5565 BLAST_ExtendWordParamsDestruct (BLAST_ExtendWordParamsPtr ewp_params)
5566
5567 {
5568 ewp_params = MemFree(ewp_params);
5569
5570 return ewp_params;
5571 }
5572
5573
5574 /*
5575 Allocates memory for the BLAST_ExtendWordParamsPtr.
5576
5577 This function also sets many of the parametes such as min_diag_length etc.
5578
5579 Int4 qlen: length of the query.
5580 Boolean multiple_hits: specifies whether multiple hits method is used.
5581 Int4 window_size: the max. distance between two hits that are extended.
5582 */
5583
5584 BLAST_ExtendWordParamsPtr
BLAST_ExtendWordParamsNew(Int4 qlen,Boolean multiple_hits,Int4 window_size)5585 BLAST_ExtendWordParamsNew (Int4 qlen, Boolean multiple_hits, Int4 window_size)
5586
5587 {
5588 BLAST_ExtendWordParamsPtr ewp_params;
5589 Int4 min_diag_length, bits_to_shift;
5590
5591 ewp_params= MemNew(sizeof(BLAST_ExtendWordParams));
5592
5593 if (ewp_params)
5594 {
5595 min_diag_length = 1;
5596 bits_to_shift = 0;
5597 /* What power of 2 is just longer than the query? */
5598 while (min_diag_length < (qlen+window_size))
5599 {
5600 min_diag_length = min_diag_length << 1;
5601 bits_to_shift++;
5602 }
5603 /* These are used in the word finders to shift and mask
5604 rather than dividing and taking the remainder. */
5605 ewp_params->bits_to_shift = bits_to_shift;
5606 ewp_params->min_diag_length = min_diag_length;
5607 ewp_params->min_diag_mask = min_diag_length-1;
5608 ewp_params->multiple_hits = multiple_hits;
5609 ewp_params->offset = window_size;
5610 ewp_params->window = window_size;
5611 }
5612 return ewp_params;
5613 }
5614
5615 /*
5616 Deallocates memory for the BLAST_ExtendWordPtr.
5617
5618 */
5619 BLAST_ExtendWordPtr LIBCALL
BLAST_ExtendWordDestruct(BLAST_ExtendWordPtr ewp)5620 BLAST_ExtendWordDestruct (BLAST_ExtendWordPtr ewp)
5621
5622 {
5623 if (ewp)
5624 {
5625 if (ewp->_buffer)
5626 ewp->_buffer = MemFree(ewp->_buffer);
5627
5628 ewp = MemFree(ewp);
5629 }
5630
5631 return ewp;
5632
5633 }
5634
5635 /*
5636 Allocates memory for the BLAST_ExtendWordPtr.
5637
5638 All of the memory for the arrays is allocated in one chunk
5639 called "_buffer". If multiple_hits is specified them room
5640 for "diag_level", "last_hit", and "version" is allocated and
5641 pointers into the array for these are set. If multiple_hits
5642 is not set, then only room for diag_level and version is allocated;
5643 last_hit is not needed.
5644
5645 Int4 qlen, dblen: length of the query and the LONGEST subject sequence.
5646 Boolean multiple_hits: specifies whether multiple hits method is used.
5647
5648 ** CFJ
5649 ** - previously buffer contained diag_level array, last_hit array, and version array
5650 ** change to contain array of struct {dl,lh,v}.
5651 **
5652 ** - Now that version is no longer used, combining the remaining 2 is probably not a big win.
5653
5654 */
5655 BLAST_ExtendWordPtr
BLAST_ExtendWordNew(BLAST_ExtendWordParamsPtr ewp_params)5656 BLAST_ExtendWordNew (BLAST_ExtendWordParamsPtr ewp_params)
5657
5658 {
5659 BLAST_ExtendWordPtr ewp;
5660 int i;
5661
5662 ewp = MemNew(sizeof(BLAST_ExtendWord));
5663
5664 if (ewp)
5665 {
5666 /* Allocate the buffer to be used for Combo array. */
5667 ewp->_buffer = (Int4Ptr) MemNew(ewp_params->min_diag_length*sizeof(CfjModStruct));
5668
5669 if (ewp->_buffer == NULL)
5670 {
5671 ewp = BLAST_ExtendWordDestruct(ewp);
5672 return NULL;
5673 }
5674
5675 ewp->combo_array= (CfjModStruct *) ewp->_buffer;
5676 ewp_params->offset=0;
5677 for(i=0;i<ewp_params->min_diag_length;i++){
5678 ewp->combo_array[i].diag_level=0;
5679 ewp->combo_array[i].last_hit = -ewp_params->window;
5680 }
5681 }
5682
5683 return ewp;
5684 }
5685
5686 /*****************************************************************************
5687 *
5688 * Zeroe's out the memory in the array _buffer, if offset is greater than
5689 * INT4_MAX/2. The first "min_diag_length" spaces in the array are used
5690 * by the array "diag_level", the second "min_diag_length" spaces are used
5691 * by "last_hit". All of these are zeroed out. The last "min_diag_length"
5692 * spaces are used by "version"; these are not zeroed out.
5693 *
5694 * If offset is not greater than INT4_MAX/2, then the memory is not
5695 * zeroed out. Rather "offset" is used as a "zero-point" that is
5696 * always greater than the next possible value when the word finder
5697 * starts working on a new subject sequence.
5698 *
5699 ******************************************************************************/
5700 void LIBCALL
BlastExtendWordExit(BlastSearchBlkPtr search)5701 BlastExtendWordExit(BlastSearchBlkPtr search)
5702
5703 {
5704 BLAST_ExtendWordPtr ewp;
5705 BLAST_ExtendWordParamsPtr ewp_params;
5706 Int2 index;
5707 Int4 i, min_diag_length;
5708
5709 ewp_params = search->ewp_params;
5710
5711 for (index=search->first_context; index<=search->last_context; index++)
5712 {
5713
5714 if (ewp_params->offset >= INT4_MAX/2)
5715 {
5716 ewp = search->context[index].ewp;
5717 if (ewp) {
5718 min_diag_length = ewp_params->min_diag_length;
5719 for(i=0;i<min_diag_length;i++)
5720 {
5721 ewp->combo_array[i].diag_level=0;
5722 ewp->combo_array[i].last_hit = -ewp_params->window;
5723 }
5724 }
5725 }
5726 }
5727
5728 if (ewp_params->offset < INT4_MAX/2)
5729 {
5730 ewp_params->offset += search->subject->length + ewp_params->window ;
5731 }
5732 else
5733 {
5734 ewp_params->offset = 0;
5735 }
5736 }
5737
5738
5739 BlastSequenceBlkPtr LIBCALL
BlastSequenceBlkDestruct(BlastSequenceBlkPtr seq_blk)5740 BlastSequenceBlkDestruct(BlastSequenceBlkPtr seq_blk)
5741
5742 {
5743
5744 if (seq_blk == NULL)
5745 return NULL;
5746
5747 /* Free from the start of sequence if it's filled in. */
5748 if (seq_blk->sequence_start != NULL)
5749 {
5750 seq_blk->sequence_start = MemFree(seq_blk->sequence_start);
5751 }
5752 else
5753 {
5754 seq_blk->sequence = MemFree(seq_blk->sequence);
5755 }
5756
5757 seq_blk = MemFree(seq_blk);
5758
5759 return seq_blk;
5760 }
5761
5762
5763
5764 static BLASTContextStructPtr
BLASTContextFree(BLASTContextStructPtr context,Int2 number)5765 BLASTContextFree(BLASTContextStructPtr context, Int2 number)
5766
5767 {
5768 Int2 index;
5769
5770 if (context == NULL)
5771 return NULL;
5772
5773 for (index=0; index<number; index++)
5774 {
5775 context[index].ewp = BLAST_ExtendWordDestruct(context[index].ewp);
5776 if (context[index].query_allocated == TRUE)
5777 {
5778 context[index].query = BlastSequenceBlkDestruct(context[index].query);
5779 }
5780 }
5781 context = MemFree(context);
5782
5783 return context;
5784 }
5785
/*
    Shuts down and releases a BlastThrInfo.

    Joins the index thread and the awake thread if they are set, destroys
    the three mutexes, destructs the gi list and finally frees the
    structure itself.  A NULL argument is ignored.
*/
void BlastThrInfoFree(BlastThrInfoPtr thr_info)
{
    VoidPtr join_result = NULL;

    if (thr_info != NULL)
    {
        if (thr_info->index_thr)
        {
            NlmThreadJoin(thr_info->index_thr, &join_result);
            thr_info->index_thr = NULL;
        }

        if (thr_info->awake_thr)
        {
            NlmThreadJoin(thr_info->awake_thr, &join_result);
            thr_info->awake_thr = NULL;

            /* The callback mutex is cleared here, so the unconditional
               destroy further down receives NULL for it in this case. */
            if (thr_info->callback_mutex)
            {
                NlmMutexDestroy(thr_info->callback_mutex);
                thr_info->callback_mutex = NULL;
            }
        }

        BlastGiListDestruct(thr_info->blast_gi_list, TRUE);

        NlmMutexDestroy(thr_info->db_mutex);
        NlmMutexDestroy(thr_info->results_mutex);
        NlmMutexDestroy(thr_info->callback_mutex);

        MemFree(thr_info);
    }
}
5819
BlastThrInfoNew(void)5820 BlastThrInfoPtr BlastThrInfoNew(void)
5821 {
5822 BlastThrInfoPtr thr_info;
5823
5824 thr_info = MemNew(sizeof(BlastThrInfo));
5825
5826 return thr_info;
5827 }
5828
5829
5830 /*
5831 Allocates space for a copy of the BlastSearchBlk for use in
5832 multi-processing BLAST.
5833 */
5834
5835 BlastSearchBlkPtr LIBCALL
BlastSearchBlkDuplicate(BlastSearchBlkPtr search)5836 BlastSearchBlkDuplicate (BlastSearchBlkPtr search)
5837
5838 {
5839
5840 BlastSearchBlkPtr new_search;
5841 Int2 index;
5842
5843 if (search == NULL)
5844 return NULL;
5845
5846 new_search = (BlastSearchBlkPtr) MemNew(sizeof(BlastSearchBlk));
5847 if (new_search == NULL)
5848 return NULL;
5849
5850 /* What's allocated here? */
5851 new_search->allocated = 0;
5852 new_search->allocated += BLAST_SEARCH_ALLOC_SUBJECT;
5853 new_search->allocated += BLAST_SEARCH_ALLOC_PBP;
5854 new_search->allocated += BLAST_SEARCH_ALLOC_CONTEXT;
5855 new_search->allocated += BLAST_SEARCH_ALLOC_READDB;
5856 new_search->allocated += BLAST_SEARCH_ALLOC_EWPPARAMS;
5857
5858 /* AM: Support for query multiplexing. */
5859 if( search->mult_queries )
5860 new_search->mult_queries = BlastDuplicateMultQueries( search->mult_queries );
5861
5862 /* Duplicate the rfdp struct, but not the contents. */
5863 new_search->rdfp = readdb_attach(search->rdfp);
5864 if (new_search->rdfp == NULL)
5865 {
5866 new_search = BlastSearchBlkDestruct(new_search);
5867 return NULL;
5868 }
5869
5870 new_search->positionBased = search->positionBased;
5871
5872 /* Changes, need to allocate. */
5873 new_search->pbp = MemDup(search->pbp, sizeof(BLAST_ParameterBlk));
5874 if (search->pbp->mb_params)
5875 new_search->pbp->mb_params =
5876 MemDup(search->pbp->mb_params, sizeof(MegaBlastParameterBlk));
5877 new_search->pbp->filter_string = StringSave(search->pbp->filter_string);
5878 new_search->sbp = search->sbp;
5879 new_search->wfp_first = search->wfp_first;
5880 if (search->prog_number==blast_type_blastn &&
5881 search->pbp->mb_params) {
5882 new_search->wfp_second =
5883 MemDup(search->wfp_second, sizeof(BLAST_WordFinder));
5884 new_search->wfp_second->lookup =
5885 MegaBlastLookupTableDup(search->wfp_second->lookup);
5886 new_search->wfp = new_search->wfp_second;
5887 } else
5888 new_search->wfp_second = search->wfp_second;
5889 new_search->prog_name = StringSave(search->prog_name);
5890 new_search->prog_number = search->prog_number;
5891 new_search->first_context = search->first_context;
5892 new_search->last_context = search->last_context;
5893 new_search->query_slp = search->query_slp;
5894 if (search->prog_number==blast_type_blastn) {
5895 new_search->query_context_offsets =
5896 MemDup(search->query_context_offsets,
5897 (search->last_context-search->first_context+2)*sizeof(Int4));
5898 }
5899 if (search->ewp_params)
5900 new_search->ewp_params = MemDup(search->ewp_params, sizeof(BLAST_ExtendWordParams));
5901 new_search->dblen = search->dblen;
5902 new_search->dblen_eff = search->dblen_eff;
5903 new_search->dblen_eff_real = search->dblen_eff_real;
5904 new_search->dbseq_num = search->dbseq_num;
5905 new_search->length_adjustment = search->length_adjustment;
5906 new_search->searchsp_eff = search->searchsp_eff;
5907
5908 /* Allocate last_context+1 elements, even if there are only last_context-first_context
5909 being used. */
5910 new_search->context = (BLASTContextStructPtr) MemNew((search->last_context+1)*sizeof(BLASTContextStruct));
5911 for (index=new_search->first_context; index<=new_search->last_context; index++)
5912 {
5913 if (new_search->ewp_params)
5914 new_search->context[index].ewp = BLAST_ExtendWordNew(new_search->ewp_params);
5915 new_search->context[index].query = search->context[index].query;
5916 new_search->context[index].query->frame = ContextToFrame(new_search, index);
5917 new_search->context[index].query_allocated = FALSE;
5918 }
5919
5920 new_search->context_factor = search->context_factor;
5921
5922 new_search->subject = (BlastSequenceBlkPtr) MemNew(sizeof(BlastSequenceBlk));
5923 /* 100 is the size limit in the present BLAST for hsp's. */
5924 new_search->hsp_array_size = search->hsp_array_size;
5925 /* The results are held here. */
5926 new_search->result_struct = search->result_struct;
5927 new_search->mb_result_struct = search->mb_result_struct;
5928 new_search->result_size = search->result_size;
5929 new_search->worst_evalue = DBL_MAX;
5930
5931 new_search->translation_table = search->translation_table;
5932 new_search->translation_table_rc = search->translation_table_rc;
5933 new_search->genetic_code = search->genetic_code;
5934 new_search->db_genetic_code = search->db_genetic_code;
5935
5936 if (search->translation_buffer_size > 0)
5937 { /* two extra for the NULLB's on end. */
5938 new_search->translation_buffer = MemNew((2+search->translation_buffer_size)*sizeof(Uint1));
5939 new_search->translation_buffer_size = search->translation_buffer_size;
5940 }
5941
5942 new_search->gap_align = NULL; /* Allocated automatically. */
5943
5944 new_search->whole_query = search->whole_query;
5945 new_search->required_start = search->required_start;
5946 new_search->required_end = search->required_end;
5947
5948 new_search->handle_results = search->handle_results;
5949 if (!search->pbp->mb_params)
5950 new_search->query_id = SeqIdSetDup(search->query_id);
5951 else {
5952 new_search->qid_array = (SeqIdPtr PNTR)
5953 Malloc((search->last_context/2 + 1)*sizeof(SeqIdPtr));
5954
5955 for (index=0; index<=search->last_context/2; index++)
5956 new_search->qid_array[index] = SeqIdSetDup(search->qid_array[index]);
5957 }
5958
5959 /* Duplicating DNAP sequence used in OOF search */
5960 if(search->pbp->is_ooframe)
5961 new_search->query_dnap = BlastMakeCopyQueryDNAP(search->query_dnap);
5962
5963 new_search->thr_info = search->thr_info;
5964 new_search->semid = search->semid;
5965
5966 #ifdef BLAST_COLLECT_STATS
5967 new_search->first_pass_hits = 0;
5968 new_search->second_pass_hits = 0;
5969 new_search->second_pass_trys = 0;
5970 new_search->first_pass_extends = 0;
5971 new_search->second_pass_extends = 0;
5972 new_search->first_pass_good_extends = 0;
5973 new_search->second_pass_good_extends = 0;
5974 new_search->number_of_seqs_better_E = 0;
5975 new_search->prelim_gap_no_contest = 0;
5976 new_search->prelim_gap_passed = 0;
5977 new_search->prelim_gap_attempts = 0;
5978 new_search->real_gap_number_of_hsps = 0;
5979 #endif
5980 new_search->output = search->output;
5981
5982 if (search->abmp) {
5983 new_search = GreedyAlignMemAlloc(new_search);
5984 if (new_search->abmp == NULL) {
5985 new_search = BlastSearchBlkDestruct(new_search);
5986 return NULL;
5987 }
5988 }
5989 if (search->mb_endpoint_results) {
5990 new_search->mb_endpoint_results = ValNodeNew(NULL);
5991 new_search->mb_endpoint_results->data.ptrvalue =
5992 search->mb_endpoint_results->data.ptrvalue;
5993 }
5994 new_search->mask1 = search->mask1;
5995
5996 return new_search;
5997 }
5998 /*
5999 Allocates space for the new BlastSearchBlk and some sturctures
6000 attached to it.
6001 */
6002
6003 BlastSearchBlkPtr LIBCALL
BlastSearchBlkNew(Int2 wordsize,Int4 qlen,CharPtr dbname,Boolean multiple_hits,BLAST_Score threshold_first,BLAST_Score threshold_second,Int4 result_size,CharPtr prog_name,BlastAllWordPtr all_words,Int2 first_context,Int2 last_context,Int4 window_size)6004 BlastSearchBlkNew (Int2 wordsize, Int4 qlen, CharPtr dbname, Boolean multiple_hits, BLAST_Score threshold_first, BLAST_Score threshold_second, Int4 result_size, CharPtr prog_name, BlastAllWordPtr all_words, Int2 first_context, Int2 last_context, Int4 window_size)
6005
6006 {
6007 return BlastSearchBlkNewExtra(wordsize, qlen, dbname, multiple_hits, threshold_first, threshold_second, result_size, prog_name, all_words, first_context, last_context, NULL, window_size);
6008
6009 }
6010
6011 /*
6012 Allocates space for the new BlastSearchBlk and some sturctures
6013 attached to it.
6014 */
6015
6016 BlastSearchBlkPtr LIBCALL
BlastSearchBlkNewExtra(Int2 wordsize,Int4 qlen,CharPtr dbname,Boolean multiple_hits,BLAST_Score threshold_first,BLAST_Score threshold_second,Int4 result_size,CharPtr prog_name,BlastAllWordPtr all_words,Int2 first_context,Int2 last_context,ReadDBFILEPtr rdfp,Int4 window_size)6017 BlastSearchBlkNewExtra (Int2 wordsize, Int4 qlen, CharPtr dbname, Boolean multiple_hits, BLAST_Score threshold_first, BLAST_Score threshold_second, Int4 result_size, CharPtr prog_name, BlastAllWordPtr all_words, Int2 first_context, Int2 last_context, ReadDBFILEPtr rdfp, Int4 window_size)
6018
6019 {
6020
6021 BlastSearchBlkPtr search;
6022 BLASTContextStructPtr context;
6023 Uint1 is_prot;
6024 Int2 index;
6025 Uint1 alphabet;
6026 Int4 longest_db_seq=INT4_MAX;
6027 ReadDBFILEPtr rdfp_var;
6028 Int4 last_ewp_index;
6029
6030 search = (BlastSearchBlkPtr) MemNew(sizeof(BlastSearchBlk));
6031
6032 if (search != NULL)
6033 {
6034 search->allocated = 0; /* everything's allocated here. */
6035 search->allocated += BLAST_SEARCH_ALLOC_QUERY;
6036 search->allocated += BLAST_SEARCH_ALLOC_SUBJECT;
6037 search->allocated += BLAST_SEARCH_ALLOC_PBP;
6038 search->allocated += BLAST_SEARCH_ALLOC_SBP;
6039 search->allocated += BLAST_SEARCH_ALLOC_EWPPARAMS;
6040 search->allocated += BLAST_SEARCH_ALLOC_CONTEXT;
6041 search->allocated += BLAST_SEARCH_ALLOC_RESULTS;
6042 search->allocated += BLAST_SEARCH_ALLOC_READDB;
6043 search->allocated += BLAST_SEARCH_ALLOC_ALL_WORDS;
6044 search->allocated += BLAST_SEARCH_ALLOC_THRINFO;
6045 search->allocated += BLAST_SEARCH_ALLOC_MASK1;
6046
6047 search->positionBased = FALSE;
6048
6049 if (StringCmp(prog_name, "blastn") == 0)
6050 {
6051 alphabet = BLASTNA_SEQ_CODE;
6052 }
6053 else
6054 {
6055 alphabet = Seq_code_ncbistdaa;
6056 }
6057
6058 if (dbname != NULL)
6059 {
6060
6061 if (rdfp == NULL)
6062 {
6063 if (StringCmp(prog_name, "blastp") == 0 || StringCmp(prog_name, "blastx") == 0)
6064 { /* Protein DB for blastp and blastx. */
6065 is_prot = READDB_DB_IS_PROT;
6066 }
6067 else
6068 {
6069 is_prot = READDB_DB_IS_NUC;
6070 }
6071
6072 if ((search->rdfp=readdb_new(dbname, is_prot)) == NULL)
6073 {
6074 return NULL;
6075 }
6076 }
6077 else
6078 { /* Attaches to the rdfp, rather than reallocating it. */
6079 search->rdfp = readdb_attach(rdfp);
6080 }
6081
6082 rdfp_var = search->rdfp;
6083 longest_db_seq = 0;
6084 while (rdfp_var)
6085 {
6086 longest_db_seq = MAX(longest_db_seq, readdb_get_maxlen(rdfp_var));
6087 rdfp_var = rdfp_var->next;
6088 }
6089 }
6090
6091 search->first_context = first_context;
6092 search->last_context = last_context;
6093
6094 search->pbp =
6095 (BLAST_ParameterBlkPtr) MemNew(sizeof(BLAST_ParameterBlk));
6096
6097 search->sbp = BLAST_ScoreBlkNew(alphabet, last_context+1);
6098
6099 /* Only allocate these if thresholds are above zero, i.e. they will be used. */
6100 if (StringCmp(prog_name, "blastn") != 0)
6101 {
6102 if (threshold_second > 0)
6103 {
6104 search->wfp_first = BLAST_WordFinderNew(search->sbp->alphabet_size, wordsize, 1, FALSE);
6105 search->allocated += BLAST_SEARCH_ALLOC_WFP_FIRST;
6106 /* Only allocate a new WFP if 2nd th differs from 1st. */
6107 search->wfp_second = search->wfp_first;
6108 }
6109 }
6110 else
6111 {
6112 if (multiple_hits)
6113 search->wfp_second = BLAST_WordFinderNew(256, wordsize, READDB_COMPRESSION_RATIO, FALSE);
6114 else
6115 search->wfp_second = BLAST_WordFinderNew(256, wordsize, READDB_COMPRESSION_RATIO, TRUE);
6116 search->allocated += BLAST_SEARCH_ALLOC_WFP_SECOND;
6117 }
6118
6119 search->prog_name = StringSave(prog_name);
6120 search->prog_number = BlastGetProgramNumber(prog_name);
6121 if (qlen > 0)
6122 search->ewp_params = BLAST_ExtendWordParamsNew(qlen, multiple_hits, window_size);
6123 else
6124 search->ewp_params = NULL;
6125 context = search->context = (BLASTContextStructPtr)
6126 MemNew((1+search->last_context)*sizeof(BLASTContextStruct));
6127 if (search->prog_number != blast_type_blastn)
6128 last_ewp_index = search->last_context;
6129 else /* All queries (Mega BLAST) and strands are concatenated
6130 in a single sequence */
6131 last_ewp_index = search->first_context;
6132
6133 for (index=search->first_context; index<=search->last_context; index++)
6134 {
6135 if (search->ewp_params && index <= last_ewp_index)
6136 context[index].ewp = BLAST_ExtendWordNew(search->ewp_params);
6137 context[index].query = (BlastSequenceBlkPtr) MemNew(sizeof(BlastSequenceBlk));
6138 context[index].query->frame = ContextToFrame(search, index);
6139 context[index].query_allocated = TRUE;
6140 }
6141
6142 search->subject = (BlastSequenceBlkPtr) MemNew(sizeof(BlastSequenceBlk));
6143 /* 100 is the size limit in the present BLAST for hsp's. */
6144 search->hsp_array_size = 100;
6145 /* The results are held here. */
6146 search->result_size = result_size;
6147 /*
6148 search->result_struct = BLASTResultsStructNew(result_size, search->pbp->max_pieces, search->pbp->hsp_range_max);
6149 */
6150
6151 search->worst_evalue = DBL_MAX;
6152
6153 search->whole_query = TRUE;
6154 search->required_start = 0;
6155 search->required_end = -1;
6156
6157 search->all_words = all_words;
6158
6159 search->thr_info = BlastThrInfoNew();
6160 #ifdef BLAST_COLLECT_STATS
6161 search->first_pass_hits = 0;
6162 search->second_pass_hits = 0;
6163 search->second_pass_trys = 0;
6164 search->first_pass_extends = 0;
6165 search->second_pass_extends = 0;
6166 search->first_pass_good_extends = 0;
6167 search->second_pass_good_extends = 0;
6168 search->number_of_seqs_better_E = 0;
6169 search->prelim_gap_no_contest = 0;
6170 search->prelim_gap_passed = 0;
6171 search->prelim_gap_attempts = 0;
6172 search->real_gap_number_of_hsps = 0;
6173 #endif
6174 }
6175
6176 return search;
6177 }
6178
6179 /*
6180 Deallocates memory associated with the BlastSearchBlkPtr.
6181 */
6182
6183 BlastSearchBlkPtr LIBCALL
BlastSearchBlkDestruct(BlastSearchBlkPtr search)6184 BlastSearchBlkDestruct (BlastSearchBlkPtr search)
6185
6186 {
6187
6188 if (search != NULL) {
6189 if (search->allocated & BLAST_SEARCH_ALLOC_QUERY)
6190 search->original_seq = MemFree(search->original_seq);
6191
6192 if (search->allocated & BLAST_SEARCH_ALLOC_SUBJECT)
6193 search->subject = BlastSequenceBlkDestruct(search->subject);
6194
6195 if (search->allocated & BLAST_SEARCH_ALLOC_SBP)
6196 search->sbp = BLAST_ScoreBlkDestruct(search->sbp);
6197
6198 if (search->allocated & BLAST_SEARCH_ALLOC_WFP_FIRST)
6199 search->wfp_first = BLAST_WordFinderDestruct(search->wfp_first);
6200
6201 if (search->allocated & BLAST_SEARCH_ALLOC_WFP_SECOND) {
6202 search->wfp_second = BLAST_WordFinderDestruct(search->wfp_second);
6203 } else if (search->prog_number==blast_type_blastn &&
6204 search->pbp->mb_params) {
6205 search->wfp_second =
6206 MegaBlastWordFinderDeallocate(search->wfp_second);
6207 }
6208
6209 /* Freeing DNAP sequence used in OOF */
6210
6211 if(search->pbp != NULL && search->pbp->is_ooframe) {
6212 BlastFreeQueryDNAP(search->query_dnap);
6213 search->query_dnap = NULL;
6214 }
6215
6216 if (search->allocated & BLAST_SEARCH_ALLOC_EWPPARAMS) {
6217 search->ewp_params = BLAST_ExtendWordParamsDestruct(search->ewp_params);
6218 }
6219
6220 if (search->allocated & BLAST_SEARCH_ALLOC_CONTEXT) {
6221 search->context = BLASTContextFree(search->context, 1+search->last_context);
6222 }
6223
6224 if (search->allocated & BLAST_SEARCH_ALLOC_RESULTS) {
6225 if (!search->pbp->mb_params)
6226 search->result_struct =
6227 BLASTResultsStructDelete(search->result_struct);
6228 else {
6229 Int2 index;
6230 for (index=0; index<=search->last_context/2; index++)
6231 search->mb_result_struct[index] =
6232 BLASTResultsStructDelete(search->mb_result_struct[index]);
6233 search->mb_result_struct = MemFree(search->mb_result_struct);
6234 }
6235 }
6236
6237 if (search->allocated & BLAST_SEARCH_ALLOC_PBP) {
6238 search->pbp->mb_params = MemFree(search->pbp->mb_params);
6239 MemFree(search->pbp->filter_string);
6240 search->pbp = MemFree(search->pbp);
6241 }
6242
6243 if (search->allocated & BLAST_SEARCH_ALLOC_READDB) {
6244 search->rdfp = readdb_destruct(search->rdfp);
6245 }
6246
6247 if (search->current_hitlist) {
6248 search->current_hitlist = BlastHitListDestruct(search->current_hitlist);
6249 }
6250 search->subject_info = BLASTSubjectInfoDestruct(search->subject_info);
6251
6252
6253 if (search->prog_name) {
6254 search->prog_name = MemFree(search->prog_name);
6255 }
6256
6257 if (search->query_id) {
6258 search->query_id = SeqIdSetFree(search->query_id);
6259 }
6260 if (search->qid_array) {
6261 Int4 index;
6262 for (index=0; index<=search->last_context/2; index++)
6263 SeqIdSetFree(search->qid_array[index]);
6264 search->qid_array = MemFree(search->qid_array);
6265 }
6266 if (search->translation_buffer_size > 0) {
6267 search->translation_buffer = MemFree(search->translation_buffer);
6268 }
6269
6270 if (search->allocated & BLAST_SEARCH_ALLOC_TRANS_INFO) {
6271
6272 if (search->translation_table) {
6273 search->translation_table = MemFree(search->translation_table);
6274 }
6275
6276 if (search->translation_table_rc) {
6277 search->translation_table_rc = MemFree(search->translation_table_rc);
6278 }
6279 }
6280
6281 if (search->allocated & BLAST_SEARCH_ALLOC_ALL_WORDS) {
6282 search->all_words = BlastAllWordDestruct(search->all_words);
6283 }
6284
6285 search->gap_align = GapAlignBlkDelete(search->gap_align);
6286
6287 if (search->allocated & BLAST_SEARCH_ALLOC_QUERY_SLP) {
6288 if (search->query_slp)
6289 search->query_slp = SeqLocFree(search->query_slp);
6290 }
6291
6292
6293 if(search->allocated & BLAST_SEARCH_ALLOC_THRINFO)
6294 BlastThrInfoFree(search->thr_info);
6295
6296 if (search->abmp)
6297 search->abmp = GreedyAlignMemFree(search->abmp);
6298
6299 search->query_context_offsets = MemFree(search->query_context_offsets);
6300
6301 MemFree(search->mb_endpoint_results);
6302
6303 if (search->allocated & BLAST_SEARCH_ALLOC_MASK1)
6304 {
6305 if (search->mask1)
6306 {
6307 SeqLocSetFree(search->mask1->data.ptrvalue);
6308 search->mask1 = ValNodeFree(search->mask1);
6309 }
6310 }
6311
6312 search = MemFree(search);
6313 }
6314
6315 return search;
6316 }
6317
6318
6319 /*
6320 Deallocates all the memory associated with the BlastAllWordPtr.
6321 */
6322
6323 BlastAllWordPtr LIBCALL
BlastAllWordDestruct(BlastAllWordPtr all_words)6324 BlastAllWordDestruct(BlastAllWordPtr all_words)
6325
6326 {
6327 if (all_words == NULL)
6328 return NULL;
6329
6330 if (all_words->array)
6331 {
6332 all_words->array = MemFree(all_words->array);
6333 }
6334
6335 if (all_words->rows_allocated && all_words->array_storage)
6336 {
6337 all_words->array_storage = MemFree(all_words->array_storage);
6338 }
6339
6340 MemFree(all_words);
6341
6342 return NULL;
6343 }
6344
6345 /*
6346 Allocates the BlastAllWordPtr and sets some flags.
6347 */
6348 BlastAllWordPtr LIBCALL
BlastAllWordNew(Int4 num_of_cols,Int4 wordsize,Boolean rows_allocated,Boolean specific)6349 BlastAllWordNew(Int4 num_of_cols, Int4 wordsize, Boolean rows_allocated, Boolean specific)
6350
6351 {
6352 BlastAllWordPtr all_words;
6353
6354 all_words = MemNew(sizeof(BlastAllWord));
6355 if (all_words)
6356 {
6357 all_words->rows_allocated = rows_allocated;
6358 all_words->specific = specific;
6359 all_words->num_of_cols = num_of_cols;
6360 all_words->wordsize = wordsize;
6361 }
6362
6363 return all_words;
6364 }
6365
6366 BLAST_HitListPtr LIBCALL
BlastHitListDestruct(BLAST_HitListPtr hitlist)6367 BlastHitListDestruct(BLAST_HitListPtr hitlist)
6368 {
6369 BLAST_HSPPtr PNTR hsp_array;
6370 Int4 hspcnt_max, index;
6371
6372 if (hitlist == NULL)
6373 return NULL;
6374
6375 hspcnt_max = hitlist->hspcnt_max;
6376 hsp_array = hitlist->hsp_array;
6377
6378 for (index=0; index<hspcnt_max; index++)
6379 {
6380 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
6381 }
6382
6383 hitlist->hsp_array = MemFree(hsp_array);
6384 hitlist->lh_helper = MemFree(hitlist->lh_helper);
6385
6386 MemFree(hitlist->exact_match_array);
6387
6388 hitlist = MemFree(hitlist);
6389
6390 return hitlist;
6391 }
6392
6393 /****************************************************************
6394
6395 Functions to allocate and destroy the BLAST_HitList.
6396
6397 ***************************************************************/
6398 BLAST_HitListPtr LIBCALL
BlastHitListNew(BlastSearchBlkPtr search)6399 BlastHitListNew(BlastSearchBlkPtr search)
6400 {
6401 BLAST_HitListPtr hitlist;
6402
6403 hitlist = (BLAST_HitListPtr) MemNew(sizeof(BLAST_HitList));
6404
6405 if (hitlist == NULL)
6406 return hitlist;
6407
6408 hitlist->hspmax = search->hsp_array_size;
6409 hitlist->hsp_array = (BLAST_HSPPtr PNTR) MemNew(hitlist->hspmax*sizeof
6410 (BLAST_HSPPtr));
6411
6412 if (hitlist->hsp_array == NULL)
6413 {
6414 hitlist = BlastHitListDestruct(hitlist);
6415 return NULL;
6416 }
6417
6418 if (search->pbp->mb_params) {
6419 hitlist->exact_match_array = (MegaBlastExactMatchPtr)
6420 MemNew(hitlist->hspmax*sizeof(MegaBlastExactMatch));
6421 hitlist->exact_match_max = hitlist->hspmax;
6422 }
6423
6424 return hitlist;
6425 }
6426
6427
6428 /*
6429 This function translates the context number of a context into
6430 the frame of the sequence.
6431
6432 Arguments:
6433
6434 BlastSearchBlkPtr search: search structure,
6435 Int2 context_number: context number used by BLASTContextStruct array
6436 Boolean is_query: if TRUE, refers to query, otherwise the subject.
6437 */
6438
6439 Int2
ContextToFrame(BlastSearchBlkPtr search,Int2 context_number)6440 ContextToFrame(BlastSearchBlkPtr search, Int2 context_number)
6441
6442 {
6443 Int2 frame=255;
6444 Uint1 prog_number = search->prog_number;
6445
6446 if (prog_number == blast_type_blastn)
6447 {
6448 if (context_number % 2 == 0)
6449 frame = 1;
6450 else
6451 frame = -1;
6452 }
6453 else if (prog_number == blast_type_blastp ||
6454 prog_number == blast_type_tblastn ||
6455 prog_number == blast_type_psitblastn)
6456 { /* Query and subject are protein, no frame. */
6457 frame = 0;
6458 }
6459 else if (prog_number == blast_type_blastx || prog_number == blast_type_tblastx)
6460 {
6461 frame = context_number < 3 ? context_number+1 : -context_number+2;
6462 }
6463
6464 return frame;
6465 }
6466
6467 /*
6468 Allocates and fills in the BLASTSubjectInfo structure.
6469 */
6470
6471 BLASTSubjectInfoPtr LIBCALL
BLASTSubjectInfoNew(SeqIdPtr sip,CharPtr defline,Int4 length)6472 BLASTSubjectInfoNew(SeqIdPtr sip, CharPtr defline, Int4 length)
6473
6474 {
6475 BLASTSubjectInfoPtr subject_info;
6476
6477 subject_info = (BLASTSubjectInfoPtr) MemNew(sizeof(BLASTSubjectInfo));
6478
6479 if (subject_info == NULL)
6480 return NULL;
6481
6482 subject_info->sip = sip;
6483 subject_info->defline = defline;
6484 subject_info->length = length;
6485
6486 return subject_info;
6487 }
6488
6489 /*
6490 Deallocates the BLASTSubjectInfo structure and the
6491 SeqIdPtr, as well as the defline.
6492 */
6493
6494 BLASTSubjectInfoPtr LIBCALL
BLASTSubjectInfoDestruct(BLASTSubjectInfoPtr subject_info)6495 BLASTSubjectInfoDestruct(BLASTSubjectInfoPtr subject_info)
6496
6497 {
6498
6499 if (subject_info == NULL)
6500 return NULL;
6501
6502 SeqIdFree(subject_info->sip);
6503 MemFree(subject_info->defline);
6504 subject_info = MemFree(subject_info);
6505
6506 return subject_info;
6507 }
6508
6509
6510
6511 /*
6512 Destroys BLASTResultsStructure and associated memory.
6513 */
6514
6515 BLASTResultsStructPtr LIBCALL
BLASTResultsStructDelete(BLASTResultsStructPtr result_struct)6516 BLASTResultsStructDelete(BLASTResultsStructPtr result_struct)
6517
6518 {
6519 Int4 index;
6520 BLASTResultHitlistPtr PNTR results;
6521 BLASTHeapPtr hp, hpt;
6522
6523 if (result_struct == NULL)
6524 return NULL;
6525
6526 results = result_struct->results;
6527 for (index=0; index<result_struct->hitlist_max; index++)
6528 {
6529 if (results[index])
6530 {
6531 results[index] = BLASTResultHitlistFree(results[index]);
6532 }
6533 }
6534
6535
6536 for (hp = result_struct->heap_ptr; hp; )
6537 {
6538 hpt = hp->next;
6539 hp->heap = MemFree(hp->heap);
6540 hp = MemFree(hp);
6541 hp = hpt;
6542 }
6543 result_struct->results = MemFree(result_struct->results);
6544 result_struct = MemFree(result_struct);
6545
6546 return result_struct;
6547 }
6548
6549 /*
6550 returns BLASTResultsStruct.
6551 */
6552
6553 BLASTResultsStructPtr
BLASTResultsStructNew(Int4 results_size,Int4 max_pieces,Int4 range_max)6554 BLASTResultsStructNew(Int4 results_size, Int4 max_pieces, Int4 range_max)
6555
6556 {
6557 BLASTResultsStructPtr new;
6558 Int4 index;
6559
6560 new = MemNew(sizeof(BLASTResultsStruct));
6561 new->results = (BLASTResultHitlistPtr PNTR) MemNew(results_size*sizeof(BLASTResultHitlistPtr));
6562
6563 for (index=0; index<results_size; index++)
6564 new->results[index] = NULL;
6565
6566 new->hitlist_max = results_size;
6567 new->hitlist_count = 0;
6568 new->max_pieces = max_pieces;
6569 if (range_max > 0) {
6570 new->heap_ptr = (BLASTHeapPtr) MemNew(sizeof(BLASTHeapStruct));
6571 new->heap_ptr->cutvalue = INT4_MAX;
6572 new->heap_ptr->num_in_heap = new->heap_ptr->num_of_ref = 0;
6573 new->heap_ptr->prev = new->heap_ptr->next = NULL;
6574 new->heap_ptr->heap = (BLASTResultHspPtr PNTR) MemNew(sizeof(BLASTResultHspPtr)*range_max);
6575 }
6576 return new;
6577 }
6578
6579
6580 Uint1 AAForCodon (Uint1Ptr codon, CharPtr codes);
6581
6582 /*
6583 GetTranslation to get the translation of the nucl. sequence in the
6584 appropriate frame and with the appropriate GeneticCode.
6585
6586 The function return an allocated CharPtr, the caller must delete this.
6587 The first and last spaces of this CharPtr contain NULLB's.
6588 */
6589
6590 Uint1Ptr LIBCALL
GetTranslation(Uint1Ptr query_seq,Int4 nt_length,Int2 frame,Int4Ptr length,CharPtr genetic_code)6591 GetTranslation(Uint1Ptr query_seq, Int4 nt_length, Int2 frame, Int4Ptr length, CharPtr genetic_code)
6592 {
6593 Uint1 codon[CODON_LENGTH];
6594 Int4 index, index_prot;
6595 SeqMapTablePtr smtp;
6596 Uint1 residue, new_residue;
6597 Uint1Ptr prot_seq;
6598
6599 smtp = SeqMapTableFind(Seq_code_ncbistdaa, Seq_code_ncbieaa);
6600
6601 /* Allocate two extra spaces for NULLB's at beginning and end of seq. */
6602 prot_seq = (Uint1Ptr) MemNew((2+(nt_length+2)/CODON_LENGTH)*sizeof(Uint1));
6603
6604 /* The first character in the protein is the NULLB sentinel. */
6605 prot_seq[0] = NULLB;
6606 index_prot = 1;
6607 for (index=ABS(frame)-1; index<nt_length-2; index += CODON_LENGTH)
6608 {
6609 codon[0] = query_seq[index];
6610 codon[1] = query_seq[index+1];
6611 codon[2] = query_seq[index+2];
6612 residue = AAForCodon(codon, genetic_code);
6613 new_residue = SeqMapTableConvert(smtp, residue);
6614 if (IS_residue(new_residue))
6615 {
6616 prot_seq[index_prot] = new_residue;
6617 }
6618 index_prot++;
6619 }
6620 prot_seq[index_prot] = NULLB;
6621 *length = index_prot-1;
6622
6623 return prot_seq;
6624 }
6625
6626
6627 /*************************************************************************
6628 *
6629 * MaskTheResidues masks up to max_length residues in buffer.
6630 * The residue to be used for masking (generally 'N' for nucleotides
6631 * and 'X' for proteins) is mask_residue. offset tells how far
6632 * along the sequence the first residue in buffer is. mask_slp
6633 * specifies which parts of the sequence to mask. 'max_length is
6634 * the total length of the sequence.
6635 *
6636 *************************************************************************/
6637
6638 void
BlastMaskTheResidues(Uint1Ptr buffer,Int4 max_length,Uint1 mask_residue,SeqLocPtr mask_slp,Boolean reverse,Int4 offset)6639 BlastMaskTheResidues(Uint1Ptr buffer, Int4 max_length, Uint1 mask_residue, SeqLocPtr mask_slp, Boolean reverse, Int4 offset)
6640
6641 {
6642 SeqLocPtr slp=NULL;
6643 Int4 index, start, stop;
6644
6645 while (mask_slp)
6646 {
6647 slp=NULL;
6648 while((slp = SeqLocFindNext(mask_slp, slp))!=NULL)
6649 {
6650 if (reverse)
6651 {
6652 start = max_length - 1 - SeqLocStop(slp);
6653 stop = max_length - 1 - SeqLocStart(slp);
6654 }
6655 else
6656 {
6657 start = SeqLocStart(slp);
6658 stop = SeqLocStop(slp);
6659 }
6660
6661 start -= offset;
6662 stop -= offset;
6663
6664 for (index=start; index<=stop; index++)
6665 {
6666 buffer[index] = mask_residue;
6667 }
6668 }
6669 mask_slp = mask_slp->next;
6670 }
6671
6672 }
6673
6674 /*
6675 COnverts a protein (translated) SeqLocPtr from the protein
6676 coordinates to the nucl. coordinates.
6677
6678 Only works on a SeqLocPtr of type SeqIntPtr right now.
6679 */
6680
6681 Boolean
BlastConvertProteinSeqLoc(SeqLocPtr slp,Int2 frame,Int4 full_length)6682 BlastConvertProteinSeqLoc(SeqLocPtr slp, Int2 frame, Int4 full_length)
6683
6684 {
6685 SeqIntPtr seq_int;
6686 Int4 from, to;
6687
6688 if (slp == NULL)
6689 return TRUE;
6690
6691 if (slp->choice == SEQLOC_PACKED_INT)
6692 slp = slp->data.ptrvalue;
6693
6694 while (slp)
6695 {
6696 if (slp->choice != SEQLOC_INT)
6697 return FALSE;
6698
6699 seq_int = slp->data.ptrvalue;
6700 from = seq_int->from;
6701 to = seq_int->to;
6702
6703 if (frame < 0)
6704 {
6705 seq_int->to = full_length - CODON_LENGTH*from + frame;
6706 seq_int->from = full_length - CODON_LENGTH*to + frame + 1;
6707 seq_int->strand = Seq_strand_minus;
6708 }
6709 else
6710 {
6711 seq_int->from = CODON_LENGTH*from + frame - 1;
6712 seq_int->to = CODON_LENGTH*to + frame - 1;
6713 seq_int->strand = Seq_strand_plus;
6714 }
6715 slp = slp->next;
6716 }
6717
6718 return TRUE;
6719 }
6720
6721 /*
6722 COnverts a DNA SeqLocPtr from the nucl. coordinates to
6723 the protein (translated) coordinates.
6724 Only works on a SeqLocPtr of type SEQLOC_INT or SEQLOC_PACKED_INT right now.
6725 */
6726
6727 Boolean
BlastConvertDNASeqLoc(SeqLocPtr slp,Int2 frame,Int4 full_length)6728 BlastConvertDNASeqLoc(SeqLocPtr slp, Int2 frame, Int4 full_length)
6729 {
6730 SeqIntPtr seq_int;
6731 Int4 from, to;
6732
6733 if (slp == NULL)
6734 return TRUE;
6735
6736 if (slp->choice == SEQLOC_PACKED_INT)
6737 slp = slp->data.ptrvalue;
6738
6739 while (slp) {
6740 if (slp->choice != SEQLOC_INT)
6741 return FALSE;
6742
6743 seq_int = slp->data.ptrvalue;
6744 from = seq_int->from;
6745 to = seq_int->to;
6746
6747 if (frame < 0) {
6748 seq_int->from = (full_length + frame - to)/CODON_LENGTH;
6749 seq_int->to = (full_length + frame - from)/CODON_LENGTH;
6750 seq_int->strand = Seq_strand_minus;
6751 } else {
6752 seq_int->from = (from - frame + 1)/CODON_LENGTH;
6753 seq_int->to = (to-frame + 1)/CODON_LENGTH;
6754 seq_int->strand = Seq_strand_plus;
6755 }
6756 slp = slp->next;
6757 }
6758
6759 return TRUE;
6760 }
6761
6762 SeqLocPtr
BioseqSegEx(BioseqPtr bsp_unfilter,CharPtr options)6763 BioseqSegEx(BioseqPtr bsp_unfilter, CharPtr options)
6764
6765 {
6766 BioseqPtr bsp_filter;
6767 Boolean mask_state;
6768 Char cmd_buf[2*PATH_MAX], temp_file[PATH_MAX];
6769 CharPtr filter_dir;
6770 Int4 index, mask_begin=0;
6771 SeqEntryPtr sep;
6772 SeqLocPtr slp_mask;
6773 SeqPortPtr spp_filter, spp_unfilter;
6774 Uint1 res_filter, res_unfilter;
6775 FILE *fp;
6776
6777
6778 if (bsp_unfilter == NULL)
6779 return NULL;
6780
6781 #ifdef OS_UNIX
6782
6783 TmpNam(temp_file);
6784 fp = FileOpen(temp_file, "w");
6785 if (BioseqToFasta(bsp_unfilter, fp, FALSE) == FALSE)
6786 {
6787 BioseqUnlock(bsp_unfilter);
6788 FileClose(fp);
6789 return NULL;
6790 }
6791 FileClose(fp);
6792
6793 filter_dir = getenv("BLASTFILTER");
6794 if (filter_dir == NULL)
6795 filter_dir = BLASTFILTER_DIR;
6796
6797 if (options != NULL)
6798 sprintf(cmd_buf, "%s%s%s%s %s%s", filter_dir, DIRDELIMSTR, "seg ", temp_file, options, " -x");
6799 else
6800 sprintf(cmd_buf, "%s%s%s%s%s", filter_dir, DIRDELIMSTR, "seg ", temp_file, " -x");
6801
6802 fp = popen(cmd_buf, "r");
6803 if (fp == NULL)
6804 {
6805 ErrPostEx(SEV_WARNING, 0, 0, "Call to seg failed.");
6806 return NULL;
6807 }
6808
6809 sep = FastaToSeqEntry(fp, FALSE);
6810 FileClose(fp);
6811 if (sep == NULL)
6812 {
6813 ErrPostEx(SEV_WARNING, 0, 0, "Call to seg failed.");
6814 return NULL;
6815 }
6816 bsp_filter = sep->data.ptrvalue;
6817
6818 spp_filter = SeqPortNew(bsp_filter, 0, -1, Seq_strand_plus, Seq_code_ncbistdaa);
6819 spp_unfilter = SeqPortNew(bsp_unfilter, 0, -1, Seq_strand_plus, Seq_code_ncbistdaa);
6820
6821 mask_state = FALSE;
6822 index = 0;
6823 slp_mask = NULL;
6824 while ((res_filter=SeqPortGetResidue(spp_filter)) != SEQPORT_EOF)
6825 {
6826 res_unfilter=SeqPortGetResidue(spp_unfilter);
6827 if (res_filter != res_unfilter)
6828 {
6829 if (mask_state == FALSE)
6830 {
6831 mask_begin = index;
6832 mask_state = TRUE;
6833 }
6834 }
6835 else if (mask_state == TRUE)
6836 {
6837 ValNodeLink(&slp_mask, SeqLocIntNew(mask_begin, index-1, Seq_strand_plus, bsp_filter->id));
6838 mask_state = FALSE;
6839 }
6840 index++;
6841 }
6842
6843 /* If the last portion of the sequence was masked. */
6844 if (mask_state == TRUE)
6845 {
6846 ValNodeLink(&slp_mask, SeqLocIntNew(mask_begin, index-1, Seq_strand_plus, bsp_filter->id));
6847 }
6848
6849 sep = SeqEntryFree(sep);
6850 SeqPortFree(spp_filter);
6851 SeqPortFree(spp_unfilter);
6852
6853 pclose(fp);
6854 FileRemove(temp_file);
6855
6856 return slp_mask;
6857 #else
6858 return NULL;
6859 #endif
6860 }
6861
6862 /*
6863 Runs seg and obtains a SeqLocPtr from it.
6864 */
6865 static SeqLocPtr
SeqLocSegEx(SeqLocPtr slp,CharPtr instructions)6866 SeqLocSegEx(SeqLocPtr slp, CharPtr instructions)
6867
6868 {
6869 BioseqPtr bsp_unfilter;
6870 SeqLocPtr slp_mask;
6871 SeqIdPtr sip;
6872
6873
6874 if (slp == NULL)
6875 return NULL;
6876
6877 sip = SeqIdFindBest(SeqLocId(slp), SEQID_GI);
6878 bsp_unfilter = BioseqLockById(sip);
6879 slp_mask = BioseqSegEx(bsp_unfilter, instructions);
6880
6881 BioseqUnlock(bsp_unfilter);
6882
6883 return slp_mask;
6884 }
6885
6886 SeqLocPtr
SeqLocSeg(SeqLocPtr slp)6887 SeqLocSeg(SeqLocPtr slp)
6888
6889 {
6890 return SeqLocSegEx(slp, NULL);
6891 }
6892
6893 SeqLocPtr
MyBioseqSeg(BioseqPtr bsp_unfilter)6894 MyBioseqSeg(BioseqPtr bsp_unfilter)
6895
6896 {
6897 return BioseqSegEx(bsp_unfilter, NULL);
6898 }
6899
6900 #define BLASTSEQLOC_BUFFER_SIZE 128
6901
6902 Boolean
parse_blast_options(BLAST_OptionsBlkPtr options,CharPtr string_options,CharPtr PNTR error_message,CharPtr PNTR database,Int4Ptr descriptions,Int4Ptr alignments)6903 parse_blast_options(BLAST_OptionsBlkPtr options, CharPtr string_options,
6904 CharPtr PNTR error_message, CharPtr PNTR database,
6905 Int4Ptr descriptions, Int4Ptr alignments)
6906 {
6907 CharPtr opt_str = "GErqeWdyXZPAIvbYzcFsSpfwtgn", *values;
6908 Int4 index;
6909
6910 if (options == NULL)
6911 return FALSE;
6912
6913 if(!BlastParseInputString(string_options, opt_str, &values, error_message))
6914 {
6915 return FALSE;
6916 }
6917
6918 /* -G gap open cost */
6919
6920 index = BlastGetLetterIndex(opt_str, 'G');
6921 if(values[index] != NULL) {
6922 options->gap_open = atoi(values[index]);
6923 }
6924
6925 /* -E gap extend cost */
6926
6927 index = BlastGetLetterIndex(opt_str, 'E');
6928 if(values[index] != NULL) {
6929 options->gap_extend = atoi(values[index]);
6930 }
6931
6932 /* -q penalty for nucleotide mismatch. */
6933
6934 index = BlastGetLetterIndex(opt_str, 'q');
6935 if(values[index] != NULL) {
6936 options->penalty = atoi(values[index]);
6937 }
6938
6939 /* -r reward for nucleotide match. */
6940
6941 index = BlastGetLetterIndex(opt_str, 'r');
6942 if(values[index] != NULL) {
6943 options->reward = atoi(values[index]);
6944 }
6945
6946 /* -e expect value. */
6947
6948 index = BlastGetLetterIndex(opt_str, 'e');
6949 if(values[index] != NULL) {
6950 options->expect_value = atof(values[index]);
6951 }
6952
6953 /* -W wordsize. */
6954
6955 index = BlastGetLetterIndex(opt_str, 'W');
6956 if(values[index] != NULL) {
6957 options->wordsize = atoi(values[index]);
6958 }
6959
6960 /* -d database. */
6961 if (database) {
6962 index = BlastGetLetterIndex(opt_str, 'd');
6963 if(values[index] != NULL) {
6964 *database = values[index];
6965 values[index] = NULL;
6966 }
6967 }
6968
6969 /* -y Dropoff (X) for blast extensions in bits (default if zero) */
6970
6971 index = BlastGetLetterIndex(opt_str, 'y');
6972 if(values[index] != NULL) {
6973 options->dropoff_2nd_pass = atof(values[index]);
6974 }
6975
6976 /* -X X dropoff value for gapped alignment (in bits) */
6977
6978 index = BlastGetLetterIndex(opt_str, 'X');
6979 if(values[index] != NULL) {
6980 options->gap_x_dropoff = atof(values[index]);
6981 }
6982
6983 /* -Z final X dropoff value for gapped alignment (in bits) */
6984
6985 index = BlastGetLetterIndex(opt_str, 'Z');
6986 if(values[index] != NULL) {
6987 options->gap_x_dropoff_final = atof(values[index]);
6988 }
6989
6990 /* -P multiple hits/two-pass. */
6991
6992 index = BlastGetLetterIndex(opt_str, 'P');
6993 if(values[index] != NULL) {
6994 if (atoi(values[index]) == 0)
6995 {
6996 options->two_pass_method = FALSE;
6997 options->multiple_hits_only = TRUE;
6998 }
6999 else if (atoi(values[index]) == 1)
7000 {
7001 options->two_pass_method = FALSE;
7002 options->multiple_hits_only = FALSE;
7003 }
7004 else
7005 {
7006 options->two_pass_method = TRUE;
7007 options->multiple_hits_only = FALSE;
7008 }
7009 }
7010
7011 /* -A window size. */
7012
7013 index = BlastGetLetterIndex(opt_str, 'A');
7014 if(values[index] != NULL) {
7015 options->window_size = atoi(values[index]);
7016 }
7017
7018 /* -I Hitlist size */
7019 index = BlastGetLetterIndex(opt_str, 'I');
7020 if (values[index] != NULL)
7021 options->hitlist_size = atoi(values[index]);
7022
7023 /* -v Number of descriptions */
7024 if (descriptions) {
7025 *descriptions = -1;
7026 index = BlastGetLetterIndex(opt_str, 'v');
7027 if (values[index] != NULL) {
7028 *descriptions = atoi(values[index]);
7029 options->hitlist_size =
7030 MAX(options->hitlist_size, *descriptions);
7031 }
7032 }
7033
7034 /* -b Number of alignments */
7035 if (alignments) {
7036 *alignments = -1;
7037 index = BlastGetLetterIndex(opt_str, 'b');
7038 if (values[index] != NULL) {
7039 *alignments = atoi(values[index]);
7040 options->hitlist_size =
7041 MAX(options->hitlist_size, *alignments);
7042 }
7043 }
7044
7045 /* -Y Effective search space */
7046 index = BlastGetLetterIndex(opt_str, 'Y');
7047 if (values[index] != NULL)
7048 options->searchsp_eff = atof(values[index]);
7049
7050 /* -z Effective database length */
7051 index = BlastGetLetterIndex(opt_str, 'z');
7052 if (values[index] != NULL) {
7053 const char *dummy=NULL;
7054 options->db_length = StringToInt8(values[index], &dummy);
7055 }
7056
7057 /* -c Constant in pseudocounts for multipass version */
7058 index = BlastGetLetterIndex(opt_str, 'c');
7059 if (values[index] != NULL)
7060 options->pseudoCountConst = atoi(values[index]);
7061
7062 /* -F Filter string */
7063 index = BlastGetLetterIndex(opt_str, 'F');
7064 if (values[index] != NULL)
7065 options->filter_string = values[index];
7066
7067 /* -s Score cut off for megablast */
7068 index = BlastGetLetterIndex(opt_str, 's');
7069 if (values[index] != NULL)
7070 options->cutoff_s2 = atoi(values[index]);
7071
7072 /* -S Strand option */
7073 index = BlastGetLetterIndex(opt_str, 'S');
7074 if (values[index] != NULL)
7075 options->strand_option = (Uint1) atoi(values[index]);
7076
7077 /* -p Percentage of identity cut-off */
7078 index = BlastGetLetterIndex(opt_str, 'p');
7079 if (values[index] != NULL)
7080 options->perc_identity = (FloatLo) atof(values[index]);
7081
7082 /* -f threshold for hits */
7083
7084 index = BlastGetLetterIndex(opt_str, 'f');
7085 if(values[index] != NULL) {
7086 options->threshold_second = atoi(values[index]);
7087 }
7088
7089 /* -w Frame shift penalty (OOF algorithm for blastx) */
7090
7091 index = BlastGetLetterIndex(opt_str, 'w');
7092 if(values[index] != NULL) {
7093 options->shift_pen = atoi(values[index]);
7094 options->is_ooframe = TRUE;
7095 }
7096
7097 /* -t Discontiguous word template length for megablast;
7098 Longest intron length for sum statistics in tblastn */
7099
7100 index = BlastGetLetterIndex(opt_str, 't');
7101 if(values[index] != NULL) {
7102 if (options->is_megablast_search)
7103 options->mb_template_length = atoi(values[index]);
7104 else
7105 options->longest_intron = atoi(values[index]);
7106 }
7107
7108 /* -g Scan every base of the database for megablast */
7109
7110 index = BlastGetLetterIndex(opt_str, 'g');
7111 if(values[index] != NULL) {
7112 options->mb_one_base_step = (TO_UPPER(*values[index]) == 'T');
7113 }
7114
7115 /* -n Use dynamic programming algorithm in megablast for gapped
7116 extensions instead of greedy algorithm */
7117
7118 index = BlastGetLetterIndex(opt_str, 'n');
7119 if(values[index] != NULL) {
7120 options->mb_use_dyn_prog = (TO_UPPER(*values[index]) == 'T');
7121 }
7122
7123 values = MemFree(values);
7124
7125 return TRUE;
7126 }
7127
7128 static Boolean
parse_dust_options(CharPtr ptr,Int4Ptr level,Int4Ptr window,Int4Ptr cutoff,Int4Ptr linker)7129 parse_dust_options(CharPtr ptr, Int4Ptr level, Int4Ptr window, Int4Ptr cutoff, Int4Ptr linker)
7130
7131 {
7132 Char buffer[BLASTSEQLOC_BUFFER_SIZE];
7133 Int4 arg, index, index1, window_pri=-1, linker_pri=-1, level_pri=-1, cutoff_pri=-1;
7134 long tmplong;
7135
7136 arg = 0;
7137 index1 = 0;
7138 for (index=0; index<BLASTSEQLOC_BUFFER_SIZE; index++)
7139 {
7140 if (*ptr == ' ' || *ptr == NULLB)
7141 {
7142 buffer[index1] = NULLB;
7143 index1 = 0;
7144 switch(arg) {
7145 case 0:
7146 sscanf(buffer, "%ld", &tmplong);
7147 level_pri = tmplong;
7148 break;
7149 case 1:
7150 sscanf(buffer, "%ld", &tmplong);
7151 window_pri = tmplong;
7152 break;
7153 case 2:
7154 sscanf(buffer, "%ld", &tmplong);
7155 cutoff_pri = tmplong;
7156 break;
7157 case 3:
7158 sscanf(buffer, "%ld", &tmplong);
7159 linker_pri = tmplong;
7160 break;
7161 default:
7162 break;
7163 }
7164
7165 arg++;
7166 while (*ptr == ' ')
7167 ptr++;
7168
7169 /* end of the buffer. */
7170 if (*ptr == NULLB)
7171 break;
7172 }
7173 else
7174 {
7175 buffer[index1] = *ptr; ptr++;
7176 index1++;
7177 }
7178 }
7179
7180 *level = level_pri;
7181 *window = window_pri;
7182 *cutoff = cutoff_pri;
7183 *linker = linker_pri;
7184
7185 return TRUE;
7186 }
7187
7188
7189 static Boolean
parse_seg_options(CharPtr ptr,Int4Ptr window,FloatHiPtr locut,FloatHiPtr hicut)7190 parse_seg_options(CharPtr ptr, Int4Ptr window, FloatHiPtr locut, FloatHiPtr hicut)
7191
7192 {
7193 Char buffer[BLASTSEQLOC_BUFFER_SIZE];
7194 Int4 arg, index, index1;
7195 long tmplong;
7196 FloatHi tmpdouble;
7197
7198 arg = 0;
7199 index1 = 0;
7200 for (index=0; index<BLASTSEQLOC_BUFFER_SIZE; index++)
7201 {
7202 if (*ptr == ' ' || *ptr == NULLB)
7203 {
7204 buffer[index1] = NULLB;
7205 index1 = 0;
7206 switch(arg) {
7207 case 0:
7208 sscanf(buffer, "%ld", &tmplong);
7209 *window = tmplong;
7210 break;
7211 case 1:
7212 sscanf(buffer, "%le", &tmpdouble);
7213 *locut = tmpdouble;
7214 break;
7215 case 2:
7216 sscanf(buffer, "%le", &tmpdouble);
7217 *hicut = tmpdouble;
7218 break;
7219 default:
7220 break;
7221 }
7222
7223 arg++;
7224 while (*ptr == ' ')
7225 ptr++;
7226
7227 /* end of the buffer. */
7228 if (*ptr == NULLB)
7229 break;
7230 }
7231 else
7232 {
7233 buffer[index1] = *ptr; ptr++;
7234 index1++;
7235 }
7236 }
7237
7238 return TRUE;
7239 }
7240
7241 static Boolean
parse_cc_options(CharPtr ptr,Int4Ptr window,FloatHiPtr cutoff,Int4Ptr linker)7242 parse_cc_options(CharPtr ptr, Int4Ptr window, FloatHiPtr cutoff, Int4Ptr linker)
7243
7244 {
7245 Char buffer[BLASTSEQLOC_BUFFER_SIZE];
7246 Int4 arg, index, index1;
7247 long tmplong;
7248 FloatHi tmpdouble;
7249
7250 arg = 0;
7251 index1 = 0;
7252 for (index=0; index<BLASTSEQLOC_BUFFER_SIZE; index++)
7253 {
7254 if (*ptr == ' ' || *ptr == NULLB)
7255 {
7256 buffer[index1] = NULLB;
7257 index1 = 0;
7258 switch(arg) {
7259 case 0:
7260 sscanf(buffer, "%ld", &tmplong);
7261 *window = tmplong;
7262 break;
7263 case 1:
7264 sscanf(buffer, "%le", &tmpdouble);
7265 *cutoff = tmpdouble;
7266 break;
7267 case 2:
7268 sscanf(buffer, "%ld", &tmplong);
7269 *linker = tmplong;
7270 break;
7271 default:
7272 break;
7273 }
7274
7275 arg++;
7276 while (*ptr == ' ')
7277 ptr++;
7278
7279 /* end of the buffer. */
7280 if (*ptr == NULLB)
7281 break;
7282 }
7283 else
7284 {
7285 buffer[index1] = *ptr; ptr++;
7286 index1++;
7287 }
7288 }
7289
7290 return TRUE;
7291 }
7292
7293 CharPtr
load_options_to_buffer(CharPtr instructions,CharPtr buffer)7294 load_options_to_buffer(CharPtr instructions, CharPtr buffer)
7295 {
7296 Boolean not_started=TRUE;
7297 CharPtr buffer_ptr, ptr;
7298 Int4 index;
7299
7300 ptr = instructions;
7301 buffer_ptr = buffer;
7302 for (index=0; index<BLASTSEQLOC_BUFFER_SIZE && *ptr != NULLB; index++)
7303 {
7304 if (*ptr == ';')
7305 {
7306 ptr++;
7307 break;
7308 }
7309 /* Remove blanks at the beginning. */
7310 if (not_started && *ptr == ' ')
7311 {
7312 ptr++;
7313 }
7314 else
7315 {
7316 not_started = FALSE;
7317 *buffer_ptr = *ptr;
7318 buffer_ptr++; ptr++;
7319 }
7320 }
7321
7322 *buffer_ptr = NULLB;
7323
7324 if (not_started == FALSE)
7325 { /* Remove trailing blanks. */
7326 buffer_ptr--;
7327 while (*buffer_ptr == ' ' && buffer_ptr > buffer)
7328 {
7329 *buffer_ptr = NULLB;
7330 buffer_ptr--;
7331 }
7332 }
7333
7334 return ptr;
7335 }
7336
7337 #define CC_WINDOW 22
7338 #define CC_CUTOFF 40.0
7339 #define CC_LINKER 32
7340
7341 /*
7342 This function parses the 'instructions' string and then calls the appopriate
7343 filtering functions.
7344 */
7345 SeqLocPtr
BlastBioseqFilter(BioseqPtr bsp,CharPtr instructions)7346 BlastBioseqFilter(BioseqPtr bsp, CharPtr instructions)
7347
7348 {
7349 return BlastBioseqFilterEx(bsp, instructions, NULL);
7350 }
7351
7352 SeqLocPtr
BlastBioseqFilterEx(BioseqPtr bsp,CharPtr instructions,BoolPtr mask_at_hash)7353 BlastBioseqFilterEx(BioseqPtr bsp, CharPtr instructions, BoolPtr mask_at_hash)
7354
7355 {
7356 SeqLocPtr slp = NULL;
7357 SeqLocPtr slp_mask;
7358
7359 ValNodeAddPointer(&slp, SEQLOC_WHOLE,
7360 SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
7361 slp_mask = BlastSeqLocFilterEx(slp, instructions, mask_at_hash);
7362 slp = SeqLocFree(slp);
7363 return slp_mask;
7364 }
7365
7366 SeqLocPtr
BlastSeqLocFilter(SeqLocPtr slp,CharPtr instructions)7367 BlastSeqLocFilter(SeqLocPtr slp, CharPtr instructions)
7368
7369 {
7370 return BlastSeqLocFilterEx(slp, instructions, NULL);
7371 }
7372
7373 SeqLocPtr
BlastSeqLocFilterEx(SeqLocPtr slp,CharPtr instructions,BoolPtr mask_at_hash)7374 BlastSeqLocFilterEx(SeqLocPtr slp, CharPtr instructions, BoolPtr mask_at_hash)
7375
7376 {
7377 BioseqPtr bsp;
7378 BLAST_OptionsBlkPtr repeat_options, vs_options;
7379 Boolean do_all=FALSE, do_seg=FALSE, do_coil_coil=FALSE, do_dust=FALSE, do_repeats=FALSE, do_vecscreen=FALSE;
7380 Boolean myslp_allocated;
7381 CharPtr buffer=NULL;
7382 CharPtr ptr, repeat_database=NULL, vs_database=NULL, error_msg;
7383 Int2 seqloc_num;
7384 Int4 window_cc, linker_cc, window_dust, level_dust, minwin_dust, linker_dust;
7385 SeqLocPtr cc_slp=NULL, dust_slp=NULL, seg_slp=NULL, seqloc_head=NULL, repeat_slp=NULL, vs_slp=NULL;
7386 PccDatPtr pccp;
7387 Nlm_FloatHiPtr scores;
7388 Nlm_FloatHi cutoff_cc;
7389 SegParamsPtr sparamsp=NULL;
7390 SeqAlignPtr seqalign;
7391 SeqIdPtr sip;
7392 SeqLocPtr myslp, seqloc_var, seqloc_tmp;
7393 ValNodePtr vnp=NULL, vnp_var;
7394
7395 cutoff_cc = CC_CUTOFF;
7396
7397 if (instructions == NULL || StringICmp(instructions, "F") == 0)
7398 return NULL;
7399
7400 /* FALSE is the default right now. */
7401 if (mask_at_hash)
7402 *mask_at_hash = FALSE;
7403
7404 /* parameters for dust. */
7405 /* -1 indicates defaults. */
7406 level_dust = -1;
7407 window_dust = -1;
7408 minwin_dust = -1;
7409 linker_dust = -1;
7410 if (StringICmp(instructions, "T") == 0)
7411 { /* do_all actually means seg for proteins and dust for nt. */
7412 do_all = TRUE;
7413 }
7414 else
7415 {
7416 buffer = MemNew(StringLen(instructions)*sizeof(Char));
7417 ptr = instructions;
7418 /* allow old-style filters when m cannot be followed by the ';' */
7419 if (*ptr == 'm' && ptr[1] == ' ')
7420 {
7421 if (mask_at_hash)
7422 *mask_at_hash = TRUE;
7423 ptr += 2;
7424 }
7425 while (*ptr != NULLB)
7426 {
7427 if (*ptr == 'S')
7428 {
7429 sparamsp = SegParamsNewAa();
7430 sparamsp->overlaps = TRUE; /* merge overlapping segments. */
7431 ptr = load_options_to_buffer(ptr+1, buffer);
7432 if (buffer[0] != NULLB)
7433 {
7434 parse_seg_options(buffer, &sparamsp->window, &sparamsp->locut, &sparamsp->hicut);
7435 }
7436 do_seg = TRUE;
7437 }
7438 else if (*ptr == 'C')
7439 {
7440 ptr = load_options_to_buffer(ptr+1, buffer);
7441 window_cc = CC_WINDOW;
7442 cutoff_cc = CC_CUTOFF;
7443 linker_cc = CC_LINKER;
7444 if (buffer[0] != NULLB)
7445 parse_cc_options(buffer, &window_cc, &cutoff_cc, &linker_cc);
7446 do_coil_coil = TRUE;
7447 }
7448 else if (*ptr == 'D')
7449 {
7450 ptr = load_options_to_buffer(ptr+1, buffer);
7451 if (buffer[0] != NULLB)
7452 parse_dust_options(buffer, &level_dust, &window_dust, &minwin_dust, &linker_dust);
7453 do_dust = TRUE;
7454 }
7455 else if (*ptr == 'R')
7456 {
7457 repeat_options = BLASTOptionNew("blastn", TRUE);
7458 repeat_options->expect_value = 0.1;
7459 repeat_options->penalty = -1;
7460 repeat_options->wordsize = 11;
7461 repeat_options->gap_x_dropoff_final = 90;
7462 repeat_options->dropoff_2nd_pass = 40;
7463 repeat_options->gap_open = 2;
7464 repeat_options->gap_extend = 1;
7465 ptr = load_options_to_buffer(ptr+1, buffer);
7466 if (buffer[0] != NULLB)
7467 parse_blast_options(repeat_options,
7468 buffer, &error_msg, &repeat_database,
7469 NULL, NULL);
7470 if (repeat_database == NULL)
7471 repeat_database = StringSave("humlines.lib humsines.lib retrovir.lib");
7472 do_repeats = TRUE;
7473 }
7474 else if (*ptr == 'V')
7475 {
7476 vs_options = VSBlastOptionNew();
7477 ptr = load_options_to_buffer(ptr+1, buffer);
7478 if (buffer[0] != NULLB)
7479 parse_blast_options(vs_options, buffer,
7480 &error_msg, &vs_database, NULL, NULL);
7481 vs_options = BLASTOptionDelete(vs_options);
7482 if (vs_database == NULL)
7483 vs_database = StringSave("UniVec_Core");
7484 do_vecscreen = TRUE;
7485 }
7486 else if (*ptr == 'L')
7487 { /* do low-complexity filtering; dust for blastn, otherwise seg.*/
7488 do_all = TRUE;
7489 ptr++;
7490 }
7491 else if (*ptr == 'm')
7492 {
7493 if (mask_at_hash)
7494 *mask_at_hash = TRUE;
7495 ptr++;
7496 }
7497 else
7498 { /* Nothing applied. */
7499 ptr++;
7500 }
7501 }
7502 buffer = MemFree(buffer);
7503 }
7504
7505 seqloc_num = 0;
7506 seqloc_head = NULL;
7507 sip = SeqLocId(slp);
7508 bsp = BioseqLockById(SeqIdFindBest(sip, SEQID_GI));
7509 if (ISA_aa(bsp->mol))
7510 {
7511 if (do_all || do_seg)
7512 {
7513 seg_slp = SeqlocSegAa(slp, sparamsp);
7514 SegParamsFree(sparamsp);
7515 sparamsp = NULL;
7516 seqloc_num++;
7517 }
7518 if (do_coil_coil)
7519 {
7520 pccp = PccDatNew ();
7521 pccp->window = window_cc;
7522 ReadPccData (pccp);
7523 /*scores = PredictCCBioseq(bsp, 0, bsp->length-1, pccp);*/
7524 scores = PredictCCSeqLoc(slp, pccp);
7525 cc_slp = FilterCC(scores, cutoff_cc, SeqLocLen(slp), linker_cc, SeqIdDup(sip), FALSE);
7526 MemFree(scores);
7527 PccDatFree (pccp);
7528 seqloc_num++;
7529 }
7530 }
7531 else
7532 {
7533 if (do_all || do_dust)
7534 {
7535 dust_slp = SeqLocDustEx(slp, level_dust, window_dust, linker_dust);
7536 seqloc_num++;
7537 }
7538 if (do_repeats)
7539 {
7540 /* Either the SeqLocPtr is SEQLOC_WHOLE (both strands) or SEQLOC_INT (probably
7541 one strand). In that case we make up a double-stranded one as we wish to look at both strands. */
7542 myslp_allocated = FALSE;
7543 if (slp->choice == SEQLOC_INT)
7544 {
7545 myslp = SeqLocIntNew(SeqLocStart(slp), SeqLocStop(slp), Seq_strand_both, SeqLocId(slp));
7546 myslp_allocated = TRUE;
7547 }
7548 else
7549 {
7550 myslp = slp;
7551 }
7552 start_timer;
7553 repeat_slp = BioseqHitRangeEngineByLoc(myslp, "blastn", repeat_database, repeat_options, NULL, NULL, NULL, NULL, NULL, 0);
7554 stop_timer("after repeat filtering");
7555 repeat_options = BLASTOptionDelete(repeat_options);
7556 repeat_database = MemFree(repeat_database);
7557 if (myslp_allocated)
7558 SeqLocFree(myslp);
7559 seqloc_num++;
7560 }
7561 if (do_vecscreen)
7562 {
7563 /* Either the SeqLocPtr is SEQLOC_WHOLE (both strands) or SEQLOC_INT (probably
7564 one strand). In that case we make up a double-stranded one as we wish to look at both strands. */
7565 myslp_allocated = FALSE;
7566 if (slp->choice == SEQLOC_INT)
7567 {
7568 myslp = SeqLocIntNew(SeqLocStart(slp), SeqLocStop(slp), Seq_strand_both, SeqLocId(slp));
7569 myslp_allocated = TRUE;
7570 }
7571 else
7572 {
7573 myslp = slp;
7574 }
7575 VSScreenSequenceByLoc(myslp, NULL, vs_database, &seqalign, &vnp, NULL, NULL);
7576 vnp_var = vnp;
7577 while (vnp_var)
7578 {
7579 seqloc_tmp = vnp_var->data.ptrvalue;
7580 if (vs_slp == NULL)
7581 {
7582 vs_slp = seqloc_tmp;
7583 }
7584 else
7585 {
7586 seqloc_var = vs_slp;
7587 while (seqloc_var->next)
7588 seqloc_var = seqloc_var->next;
7589 seqloc_var->next = seqloc_tmp;
7590 }
7591 vnp_var->data.ptrvalue = NULL;
7592 vnp_var = vnp_var->next;
7593 }
7594 vnp = ValNodeFree(vnp);
7595 seqalign = SeqAlignSetFree(seqalign);
7596 vs_database = MemFree(vs_database);
7597 if (myslp_allocated)
7598 SeqLocFree(myslp);
7599 seqloc_num++;
7600 }
7601 }
7602
7603 if (seqloc_num == 0)
7604 { /* nothing. */
7605 ;
7606 }
7607 else if (seqloc_num == 1)
7608 {
7609 if (seg_slp)
7610 seqloc_head = seg_slp;
7611 if (cc_slp)
7612 seqloc_head = cc_slp;
7613 if (dust_slp)
7614 seqloc_head = dust_slp;
7615 if (repeat_slp)
7616 seqloc_head = repeat_slp;
7617 if (vs_slp)
7618 seqloc_head = vs_slp;
7619 }
7620 else
7621 {
7622 if (seg_slp)
7623 ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, seg_slp);
7624 if (cc_slp)
7625 ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, cc_slp);
7626 if (dust_slp)
7627 ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, dust_slp);
7628 if (repeat_slp)
7629 ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, repeat_slp);
7630 if (vs_slp)
7631 ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, vs_slp);
7632 }
7633
7634 BioseqUnlock(bsp);
7635 return seqloc_head;
7636 }
7637
7638 /*
7639 Program to run seg on a sequence. Note that this program only
7640 really works in UNIX systems.
7641 */
7642 Boolean LIBCALL
FilterWithSeg(Uint1Ptr sequence,Int4 length,Uint1 alphabet)7643 FilterWithSeg (Uint1Ptr sequence, Int4 length, Uint1 alphabet)
7644
7645 {
7646
7647 #ifdef OS_UNIX
7648
7649 BioseqPtr bsp;
7650 Char cmd_buf[2*PATH_MAX], temp_file[PATH_MAX];
7651 CharPtr filter_dir;
7652 FILE PNTR fp;
7653 Int4 byte_store_length;
7654 Nlm_ByteStorePtr byte_store;
7655 SeqEntryPtr sep;
7656
7657 if (sequence == NULL || length == 0)
7658 return FALSE;
7659
7660 byte_store = Nlm_BSNew(length);
7661
7662 byte_store_length = Nlm_BSWrite(byte_store, (VoidPtr) sequence, length);
7663 if (length != byte_store_length)
7664 {
7665 Nlm_BSDelete(byte_store, length);
7666 return FALSE;
7667 }
7668
7669 bsp = BioseqNew();
7670 bsp->seq_data = (SeqDataPtr) byte_store;
7671 bsp->length = length;
7672 bsp->seq_data_type = alphabet;
7673 bsp->mol = Seq_mol_aa;
7674 bsp->repr = Seq_repr_raw;
7675
7676 TmpNam(temp_file);
7677 fp = FileOpen(temp_file, "w");
7678 if (BioseqToFasta(bsp, fp, FALSE) == FALSE)
7679 {
7680 bsp = BioseqFree(bsp);
7681 return FALSE;
7682 }
7683 FileClose(fp);
7684
7685 bsp = BioseqFree(bsp);
7686
7687 filter_dir = getenv("BLASTFILTER");
7688 if (filter_dir != NULL)
7689 sprintf(cmd_buf, "%s%s%s%s%s", filter_dir, DIRDELIMSTR, "seg ", temp_file, " -x");
7690 else
7691 sprintf(cmd_buf, "%s%s%s%s%s", BLASTFILTER_DIR, DIRDELIMSTR, "seg ", temp_file, " -x");
7692
7693 fp = popen(cmd_buf, "r");
7694 if (fp == NULL)
7695 {
7696 ErrPostEx(SEV_WARNING, 0, 0, "Call to seg failed.");
7697 return FALSE;
7698 }
7699
7700 sep = FastaToSeqEntry(fp, FALSE);
7701 if (sep == NULL)
7702 {
7703 ErrPostEx(SEV_WARNING, 0, 0, "Call to seg failed.");
7704 return FALSE;
7705 }
7706
7707 pclose(fp);
7708
7709 bsp = sep->data.ptrvalue;
7710 BioseqRawConvert(bsp, Seq_code_ncbistdaa);
7711
7712 BSSeek((ByteStorePtr) bsp->seq_data, 0, SEEK_SET);
7713 Nlm_BSRead((ByteStorePtr) bsp->seq_data, (VoidPtr) sequence, length);
7714
7715 SeqEntryFree(sep);
7716
7717 FileRemove(temp_file);
7718
7719 return TRUE;
7720 #else
7721 return FALSE;
7722 #endif
7723 }
7724
7725
/*
	Releases an HSP: its gap_info edit block is deleted first (when the
	HSP is non-NULL), then the HSP itself is handed to MemFree.
	Returns whatever MemFree returns, cast to BLAST_HSPPtr, so callers
	can write "hsp = BLAST_HSPFree(hsp);".
*/
BLAST_HSPPtr BLAST_HSPFree(BLAST_HSPPtr hsp)
{
	if (hsp != NULL)
	{
		hsp->gap_info = GapXEditBlockDelete(hsp->gap_info);
	}

	return (BLAST_HSPPtr) MemFree(hsp);
}
7733
7734 /*
7735 Frees memory used for HSP's on the ResultHitlist.
7736 Should be called as the SeqAlignPtr for a hitlist
7737 is produced to save memory.
7738 */
7739
7740 void
BLASTResultFreeHsp(BLASTResultHitlistPtr result)7741 BLASTResultFreeHsp(BLASTResultHitlistPtr result)
7742
7743 {
7744 BLASTResultHspPtr hsp;
7745 Int4 index;
7746
7747 if (result == NULL || result->hsp_array == NULL)
7748 return;
7749
7750 for(index=0; index < result->hspcnt; index++) {
7751 hsp = &result->hsp_array[index];
7752 if (hsp)
7753 hsp->gap_info = GapXEditBlockDelete(hsp->gap_info);
7754 }
7755
7756 if (result->hspcnt != 0)
7757 result->hsp_array = MemFree(result->hsp_array);
7758
7759 result->hspcnt = 0;
7760
7761 return;
7762 }
7763
7764 /*
7765 Free's the hitlist without performing a check
7766 on the integrity of the heap (used for culling).
7767 */
7768 BLASTResultHitlistPtr LIBCALL
BLASTResultHitlistFree(BLASTResultHitlistPtr result)7769 BLASTResultHitlistFree(BLASTResultHitlistPtr result)
7770
7771 {
7772 return BLASTResultHitlistFreeEx(NULL, result);
7773
7774 }
7775
7776
7777 BLASTResultHitlistPtr LIBCALL
BLASTResultHitlistFreeEx(BlastSearchBlkPtr search,BLASTResultHitlistPtr result)7778 BLASTResultHitlistFreeEx(BlastSearchBlkPtr search, BLASTResultHitlistPtr result)
7779
7780 {
7781 BLASTHeapPtr hp;
7782 Int4 index;
7783 register Int4 subject_id;
7784
7785 if (result == NULL)
7786 return NULL;
7787
7788
7789 /*
7790 Check the integrity of the heap used for culling. Occassionally
7791 HSP's that have been saved (in the heap before the start of
7792 the HSP) are missed.
7793 Only do this if the BlastSearchBlkPtr was provided.
7794 */
7795 if (search && search->pbp->perform_culling == TRUE && result->num_ref > 0)
7796 {
7797 subject_id = result->subject_id;
7798
7799 /* result->num_ref can change in the loop. */
7800 for (hp = search->result_struct->heap_ptr; hp && result->num_ref>0; hp = hp->next)
7801 {
7802 index=0; /* Note that hp->num_in_heap can change in the loop */
7803 while (index < hp->num_in_heap)
7804 {
7805 if (hp->heap[index]->point_back->subject_id == subject_id)
7806 {
7807 BlastDeleteHeap(hp, index);
7808 }
7809 else
7810 index++;
7811 }
7812 }
7813 }
7814
7815 /* In case it was not freed before. */
7816 BLASTResultFreeHsp(result);
7817
7818 BLASTSubjectInfoDestruct(result->subject_info);
7819
7820 result = MemFree(result);
7821
7822 return result;
7823 }
7824
7825 /*
7826 Creates a new BLASTResultHitlist, with the an hsp-array of length hspcnt. If the
7827 allocation fails, then NULL is returned.
7828 */
7829
7830 BLASTResultHitlistPtr LIBCALL
BLASTResultHitlistNew(Int4 hspcnt)7831 BLASTResultHitlistNew(Int4 hspcnt)
7832
7833 {
7834
7835 BLASTResultHitlistPtr new;
7836
7837 new = (BLASTResultHitlistPtr) MemNew(sizeof(BLASTResultHitlist));
7838 if (new == NULL)
7839 return NULL;
7840
7841 new->hsp_array = (BLASTResultHspPtr) MemNew(hspcnt*sizeof(BLASTResultHsp));
7842 if (new->hsp_array == NULL)
7843 {
7844 new = BLASTResultHitlistFree(new);
7845 return NULL;
7846 }
7847 new->hspcnt = hspcnt;
7848
7849 return new;
7850 }
7851
7852
7853 static Boolean
CopyHSPToResultHsp(BLAST_KarlinBlkPtr kbp,BLAST_HSPPtr hsp,BLASTResultHspPtr result_hsp)7854 CopyHSPToResultHsp(BLAST_KarlinBlkPtr kbp, BLAST_HSPPtr hsp, BLASTResultHspPtr result_hsp)
7855 {
7856 if (result_hsp == NULL || hsp == NULL)
7857 return FALSE;
7858
7859 result_hsp->ordering_method = hsp->ordering_method;
7860 result_hsp->number = hsp->num;
7861 result_hsp->score = hsp->score;
7862 result_hsp->bit_score = ((hsp->score*kbp->Lambda) - kbp->logK)/NCBIMATH_LN2;
7863 result_hsp->e_value = hsp->evalue;
7864 result_hsp->num_ident = hsp->num_ident;
7865 result_hsp->query_offset = hsp->query.offset;
7866 result_hsp->query_length = hsp->query.length;
7867 result_hsp->query_frame = hsp->query.frame;
7868 result_hsp->query_gapped_start = hsp->query.gapped_start;
7869 result_hsp->subject_offset = hsp->subject.offset;
7870 result_hsp->subject_length = hsp->subject.length;
7871 result_hsp->subject_frame = hsp->subject.frame;
7872 result_hsp->subject_gapped_start = hsp->subject.gapped_start;
7873 result_hsp->context = hsp->context;
7874 result_hsp->gap_info = hsp->gap_info;
7875 /* Not set in the other type of HSP? */
7876 result_hsp->hspset_cnt = 0;
7877
7878 return TRUE;
7879 }
7880
7881 Boolean LIBCALL
CopyResultHspToHSP(BLASTResultHspPtr result_hsp,BLAST_HSPPtr hsp)7882 CopyResultHspToHSP(BLASTResultHspPtr result_hsp, BLAST_HSPPtr hsp)
7883 {
7884 if (result_hsp == NULL || hsp == NULL)
7885 return FALSE;
7886
7887 hsp->ordering_method = result_hsp->ordering_method;
7888 hsp->num = result_hsp->number;
7889 hsp->score = result_hsp->score;
7890 hsp->evalue = result_hsp->e_value;
7891 hsp->num_ident = result_hsp->num_ident;
7892 hsp->query.offset = result_hsp->query_offset;
7893 hsp->query.length = result_hsp->query_length;
7894 hsp->query.end = result_hsp->query_offset + result_hsp->query_length;
7895 hsp->query.frame = result_hsp->query_frame;
7896 hsp->query.gapped_start = result_hsp->query_gapped_start;
7897 hsp->subject.offset = result_hsp->subject_offset;
7898 hsp->subject.length = result_hsp->subject_length;
7899 hsp->subject.end = result_hsp->subject_offset + result_hsp->subject_length;
7900 hsp->subject.frame = result_hsp->subject_frame;
7901 hsp->subject.gapped_start = result_hsp->subject_gapped_start;
7902 hsp->context = result_hsp->context;
7903
7904 return TRUE;
7905 }
7906
7907 /* Same as FillInStdSegInfo, only taking BLAST_HSPPtr argument instead of
7908 BlastResultHspPtr */
7909 StdSegPtr
BLASTHspToStdSeg(BlastSearchBlkPtr search,Int4 subject_length,BLAST_HSPPtr hsp,SeqIdPtr sip,Boolean reverse,SeqIdPtr gi_list)7910 BLASTHspToStdSeg(BlastSearchBlkPtr search, Int4 subject_length, BLAST_HSPPtr hsp, SeqIdPtr sip, Boolean reverse, SeqIdPtr gi_list)
7911 {
7912 StdSegPtr ssp = NULL;
7913 BLASTResultHspPtr result_hsp =
7914 (BLASTResultHspPtr) Malloc(sizeof(BLASTResultHsp));
7915
7916 CopyHSPToResultHsp(search->sbp->kbp[search->first_context],
7917 hsp, result_hsp);
7918 ssp = FillInStdSegInfo(search, search->subject_id, subject_length, &ssp,
7919 result_hsp, sip, reverse, gi_list);
7920 MemFree(result_hsp);
7921 return ssp;
7922 }
7923
7924 /*
7925 Sort the HSP's by score.
7926 */
7927
7928 int LIBCALLBACK
score_compare_hsps(VoidPtr v1,VoidPtr v2)7929 score_compare_hsps(VoidPtr v1, VoidPtr v2)
7930
7931 {
7932 BLAST_HSPPtr hsp1, hsp2; /* the HSPs to be compared */
7933 int result = 0; /* the result of the comparison */
7934
7935 hsp1 = *((BLAST_HSPPtr PNTR) v1);
7936 hsp2 = *((BLAST_HSPPtr PNTR) v2);
7937
7938 /* Null HSPs are "greater" than any non-null ones, so they go to the end
7939 of a sorted list. */
7940 if (!hsp1 && !hsp2)
7941 return 0;
7942 else if (!hsp1)
7943 return 1;
7944 else if (!hsp2)
7945 return -1;
7946
7947 if (0 == (result = BLAST_CMP(hsp2->score, hsp1->score)) &&
7948 0 == (result = BLAST_CMP(hsp1->subject.offset, hsp2->subject.offset)) &&
7949 0 == (result = BLAST_CMP(hsp2->subject.end, hsp1->subject.end)) &&
7950 0 == (result = BLAST_CMP(hsp1->query .offset, hsp2->query .offset))) {
7951 /* if all other test can't distinguish the HSPs, then the final
7952 test is the result */
7953 result = BLAST_CMP(hsp2->query.end, hsp1->query.end);
7954 }
7955 return result;
7956 }
7957
7958 /*
7959 Function to look for the highest scoring window (of size HSP_MAX_WINDOW)
7960 in an HSP and return the middle of this. Used by the gapped-alignment
7961 functions to start the gapped alignments.
7962 */
7963
GetStartForGappedAlignment(BlastSearchBlkPtr search,BLAST_HSPPtr hsp,Uint1Ptr query,Uint1Ptr subject,Int4Ptr PNTR matrix)7964 Int4 GetStartForGappedAlignment (BlastSearchBlkPtr search, BLAST_HSPPtr hsp, Uint1Ptr query, Uint1Ptr subject, Int4Ptr PNTR matrix)
7965 {
7966 Int4 index1, max_offset, score, max_score, hsp_end;
7967 Uint1Ptr query_var, subject_var;
7968 Boolean positionBased = (search->positionBased && search->sbp->posMatrix);
7969
7970 if (hsp->query.length <= HSP_MAX_WINDOW) {
7971 max_offset = hsp->query.offset + hsp->query.length/2;
7972 return max_offset;
7973 }
7974
7975 hsp_end = hsp->query.offset + HSP_MAX_WINDOW;
7976 query_var = query + hsp->query.offset;
7977 subject_var = subject + hsp->subject.offset;
7978 score=0;
7979 if (!positionBased) {
7980 for (index1=hsp->query.offset; index1<hsp_end; index1++) {
7981 score += matrix[*query_var][*subject_var];
7982 query_var++; subject_var++;
7983 }
7984 } else {
7985 for (index1=hsp->query.offset; index1<hsp_end; index1++) {
7986 score += search->sbp->posMatrix[index1][*subject_var];
7987 query_var++; subject_var++;
7988 }
7989 }
7990 max_score = score;
7991 max_offset = hsp_end - 1;
7992 hsp_end = hsp->query.end -
7993 MAX(0, hsp->query.length - hsp->subject.length);
7994 for (index1=hsp->query.offset + HSP_MAX_WINDOW; index1<hsp_end; index1++) {
7995 if (!positionBased) {
7996 score -= matrix[*(query_var-HSP_MAX_WINDOW)][*(subject_var-HSP_MAX_WINDOW)];
7997 score += matrix[*query_var][*subject_var];
7998 } else {
7999 score -= search->sbp->posMatrix[index1-HSP_MAX_WINDOW][*(subject_var-HSP_MAX_WINDOW)];
8000 score += search->sbp->posMatrix[index1][*subject_var];
8001 }
8002 if (score > max_score) {
8003 max_score = score;
8004 max_offset = index1;
8005 }
8006 query_var++; subject_var++;
8007 }
8008 if (max_score > 0)
8009 max_offset -= HSP_MAX_WINDOW/2;
8010 else
8011 max_offset = hsp->query.offset;
8012
8013 return max_offset;
8014 }
8015
8016 /*
8017 Check whether the starting point for gapped alignment lies in
8018 region that has positive score. This routine is called after a
8019 preliminary gapped alignment has been computed, but before the
8020 traceback is computed. The score of the region containing the
8021 starting point may have changed due to the introduction of
8022 ambiguity characters, further filtering of the sequences or the
8023 application of composition based statistics.
8024
8025 Usually, we check an ungapped alignment of length 11 about the
8026 starting point: 5 characters to the left and 5 to the right.
8027 However, the actual region checked is occassionally shorter because
8028 we don't check characters before the start, or after the end, of
8029 the preliminarily aligned regions in the query or subject.
8030 */
8031 Boolean
CheckStartForGappedAlignment(BlastSearchBlkPtr search,BLAST_HSPPtr hsp,Uint1Ptr query,Uint1Ptr subject,Int4Ptr PNTR matrix)8032 CheckStartForGappedAlignment (BlastSearchBlkPtr search, BLAST_HSPPtr hsp,
8033 Uint1Ptr query, Uint1Ptr subject,
8034 Int4Ptr PNTR matrix)
8035 {
8036 Int4 left, right; /* Number of aligned characters to the
8037 left and right of the starting point */
8038 Int4 score; /* Score of the word alignment */
8039 Uint1Ptr subject_var; /* Current character in the subject sequence */
8040 Uint1Ptr subject_right; /* last character to be considered in the subject
8041 sequence */
8042 Boolean positionBased =
8043 (search->positionBased && search->sbp->posMatrix);
8044
8045 /* Compute the number of characters to the left of the start
8046 to include in the word */
8047 left = -HSP_MAX_WINDOW/2;
8048 if (left < hsp->query.offset - hsp->query.gapped_start) {
8049 left = hsp->query.offset - hsp->query.gapped_start;
8050 }
8051 if (left < hsp->subject.offset - hsp->subject.gapped_start) {
8052 left = hsp->subject.offset - hsp->subject.gapped_start;
8053 }
8054
8055 /* Compute the number of characters to right to include in the word,
8056 including the starting point itself. */
8057 right = HSP_MAX_WINDOW/2 + 1;
8058 if (right > hsp->query.end - hsp->query.gapped_start) {
8059 right = hsp->query.end - hsp->query.gapped_start;
8060 }
8061 if (right > hsp->subject.end - hsp->subject.gapped_start) {
8062 right = hsp->subject.end - hsp->subject.gapped_start;
8063 }
8064
8065 /* Calculate the score of the word */
8066 score = 0;
8067 subject_var = subject + hsp->subject.gapped_start + left;
8068 subject_right = subject + hsp->subject.gapped_start + right;
8069 if ( !positionBased ) {
8070 Uint1Ptr query_var; /* Current character in the query */
8071 query_var = query + hsp->query.gapped_start + left;
8072 for ( ; subject_var < subject_right; subject_var++, query_var++) {
8073 score += matrix[*query_var][*subject_var];
8074 }
8075 } else {
8076 Int4 query_index; /* Current position in the query */
8077 query_index = hsp->query.gapped_start + left;
8078 for ( ; subject_var < subject_right; subject_var++, query_index++) {
8079 score += search->sbp->posMatrix[query_index][*subject_var];
8080 }
8081 }
8082 if (score <= 0) {
8083 return FALSE;
8084 } else {
8085 return TRUE;
8086 }
8087 }
8088
8089
8090 /*
8091 Gets the ratio used to change an evalue calculated with the subject
8092 sequence length to one with a db length.
8093 */
8094
8095 Nlm_FloatHi LIBCALL
GetDbSubjRatio(BlastSearchBlkPtr search,Int4 subject_length)8096 GetDbSubjRatio(BlastSearchBlkPtr search, Int4 subject_length)
8097 {
8098 Nlm_FloatHi db_subj_ratio;
8099
8100 db_subj_ratio =
8101 ((Nlm_FloatHi) search->context_factor * search->dblen) /
8102 ((Nlm_FloatHi) subject_length);
8103 if (StringCmp(search->prog_name, "tblastn") == 0 ||
8104 StringCmp(search->prog_name, "tblastx") == 0 ||
8105 StringCmp(search->prog_name, "psitblastn") == 0)
8106 {
8107 db_subj_ratio *= 3;
8108 }
8109
8110 return db_subj_ratio;
8111 }
8112
8113 /* The following value should be divisible by 3, to make sure that frames stay
8114 the same when translations are restricted to partial sequence. */
8115 #define SUBJECT_ADJUSTMENT 2100
8116 SeqAlignPtr LIBCALL
BlastGetGapAlgnTbckWithReaddb(BlastSearchBlkPtr search,Int4 hit_number,Boolean ordinal_number)8117 BlastGetGapAlgnTbckWithReaddb (BlastSearchBlkPtr search, Int4 hit_number, Boolean ordinal_number)
8118
8119 {
8120 BLASTResultHitlistPtr result_hitlist;
8121 BioseqPtr subject_bsp;
8122 Boolean subject_allocated = FALSE;
8123 Int4 index1, subject_length, rev_subject_length;
8124 Int4 subject_start, subject_end;
8125 Int4 hsp_count;
8126 BLASTResultHspPtr hsp_array;
8127 SeqAlignPtr seqalign;
8128 SeqPortPtr spp;
8129 Uint1Ptr subject, rev_subject;
8130
8131 result_hitlist = search->result_struct->results[hit_number];
8132
8133 if (StringCmp(search->prog_name, "tblastn") == 0 ||
8134 StringCmp(search->prog_name, "psitblastn") == 0)
8135 {
8136 subject_bsp = readdb_get_bioseq(search->rdfp, result_hitlist->subject_id);
8137 spp = SeqPortNew(subject_bsp, 0, -1, Seq_strand_plus, Seq_code_ncbi4na);
8138 /* make one longer to "protect" ALIGN. */
8139 subject = MemNew((1+subject_bsp->length)*sizeof(Uint1));
8140 hsp_array = result_hitlist->hsp_array;
8141 hsp_count = result_hitlist->hspcnt;
8142 for (index1=0; index1<hsp_count; index1++)
8143 {
8144 if (hsp_array[index1].subject_frame > 0)
8145 { /* Get subsequence corresponding to this hsp. */
8146 Int4 offset;
8147
8148 subject_start = 3*hsp_array[index1].subject_offset;
8149 subject_end = subject_start + 3*hsp_array[index1].subject_length;
8150
8151 /* add SUBJECT_ADJUSTMENT bases to either end. */
8152 subject_start = MAX(subject_start - SUBJECT_ADJUSTMENT, 0);
8153 subject_end = MIN(subject_end + SUBJECT_ADJUSTMENT, subject_bsp->length);
8154
8155 SeqPortSeek(spp, subject_start, SEEK_SET);
8156
8157 for (offset=subject_start; offset<subject_end; offset++)
8158 subject[offset] = SeqPortGetResidue(spp);
8159
8160 if (subject_start == 0 && subject_end == subject_bsp->length)
8161 break; /* entire sequence has been fetched. */
8162 }
8163 }
8164 /* Gap character in last space. */
8165 subject[subject_bsp->length] = NULLB;
8166 subject_length = subject_bsp->length;
8167 spp = SeqPortFree(spp);
8168
8169 spp = SeqPortNew(subject_bsp, 0, -1, Seq_strand_minus, Seq_code_ncbi4na);
8170 /* make one longer to "protect" ALIGN. */
8171 rev_subject = MemNew((1+subject_bsp->length)*sizeof(Uint1));
8172 hsp_array = result_hitlist->hsp_array;
8173 hsp_count = result_hitlist->hspcnt;
8174 for (index1=0; index1<hsp_count; index1++)
8175 {
8176 if (hsp_array[index1].subject_frame < 0)
8177 { /* Get subsequence corresponding to this hsp. */
8178 Int4 offset;
8179
8180 subject_start = 3*hsp_array[index1].subject_offset;
8181 subject_end = subject_start + 3*hsp_array[index1].subject_length;
8182
8183 /* add SUBJECT_ADJUSTMENT bases to either end. */
8184 subject_start = MAX(subject_start - SUBJECT_ADJUSTMENT, 0);
8185 subject_end = MIN(subject_end + SUBJECT_ADJUSTMENT, subject_bsp->length);
8186
8187 SeqPortSeek(spp, subject_start, SEEK_SET);
8188
8189 for (offset=subject_start; offset<subject_end; offset++)
8190 rev_subject[offset] = SeqPortGetResidue(spp);
8191
8192 if (subject_start == 0 && subject_end == subject_bsp->length)
8193 break; /* entire sequence has been fetched. */
8194 }
8195 }
8196 /* Gap character in last space. */
8197 rev_subject[subject_bsp->length] = NULLB;
8198 rev_subject_length = subject_bsp->length;
8199 spp = SeqPortFree(spp);
8200 subject_bsp = BioseqFree(subject_bsp);
8201 subject_allocated = TRUE;
8202 }
8203 else
8204 {
8205 subject_length = readdb_get_sequence(search->rdfp, result_hitlist->subject_id, (Uint1Ptr PNTR) &subject);
8206 rev_subject = NULL;
8207 rev_subject_length = 0;
8208 }
8209
8210 seqalign = BlastGetGapAlgnTbck (search, hit_number, FALSE, ordinal_number, subject, subject_length, rev_subject, rev_subject_length);
8211
8212 if (subject_allocated)
8213 {
8214 subject = MemFree(subject);
8215 rev_subject = MemFree(rev_subject);
8216 }
8217
8218 return seqalign;
8219 }
8220
8221 int LIBCALLBACK
query_offset_compare_hsp(VoidPtr v1,VoidPtr v2)8222 query_offset_compare_hsp(VoidPtr v1, VoidPtr v2)
8223
8224 {
8225 BLAST_HSPPtr h1, h2;
8226 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8227
8228 hp1 = (BLAST_HSPPtr PNTR) v1;
8229 hp2 = (BLAST_HSPPtr PNTR) v2;
8230 h1 = *hp1;
8231 h2 = *hp2;
8232
8233 if (h1 == NULL) {
8234 return (h2 == NULL) ? 0 : 1;
8235 } else if (h2 == NULL) {
8236 return -1;
8237 }
8238
8239 if (h1->query.offset < h2->query.offset)
8240 return -1;
8241 if (h1->query.offset > h2->query.offset)
8242 return 1;
8243
8244 if (h1->subject.offset < h2->subject.offset)
8245 return -1;
8246 if (h1->subject.offset > h2->subject.offset)
8247 return 1;
8248
8249 return 0;
8250 }
8251
8252 int LIBCALLBACK
query_end_compare_hsp(VoidPtr v1,VoidPtr v2)8253 query_end_compare_hsp(VoidPtr v1, VoidPtr v2)
8254
8255 {
8256 BLAST_HSPPtr h1, h2;
8257 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8258
8259 hp1 = (BLAST_HSPPtr PNTR) v1;
8260 hp2 = (BLAST_HSPPtr PNTR) v2;
8261 h1 = *hp1;
8262 h2 = *hp2;
8263
8264 if (h1 == NULL) {
8265 return (h2 == NULL) ? 0 : 1;
8266 } else if (h2 == NULL) {
8267 return -1;
8268 }
8269
8270 if (h1->query.end < h2->query.end)
8271 return -1;
8272 if (h1->query.end > h2->query.end)
8273 return 1;
8274
8275 if (h1->subject.end < h2->subject.end)
8276 return -1;
8277 if (h1->subject.end > h2->subject.end)
8278 return 1;
8279
8280 return 0;
8281 }
8282 /*
8283 Check the gapped alignments for an overlap of two different alignments.
8284 A sufficient overlap is when two alignments have the same start values
8285 of have the same final values.
8286
8287 The number of valid alignments remaining is returned.
8288 */
8289
8290 static Int4
CheckGappedAlignmentsForOverlap(BlastSearchBlkPtr search,BLAST_HSPPtr * hsp_array,Int4 hsp_count,Int2 frame)8291 CheckGappedAlignmentsForOverlap(BlastSearchBlkPtr search, BLAST_HSPPtr *hsp_array, Int4 hsp_count, Int2 frame)
8292
8293 {
8294 Int4 index1, index, increment;
8295
8296 if (search == NULL || hsp_array == NULL || hsp_count == 0)
8297 return 0;
8298
8299 HeapSort(hsp_array, hsp_count, sizeof(BLAST_HSPPtr), query_offset_compare_hsp);
8300 index=0;
8301 increment=1;
8302 while (index < hsp_count-increment)
8303 { /* Check if both HSP's start on or end on the same digonal. */
8304 if (hsp_array[index+increment] == NULL)
8305 {
8306 increment++;
8307 continue;
8308 }
8309
8310 if (frame != 0 && hsp_array[index+increment]->subject.frame != frame)
8311 break;
8312
8313 if (hsp_array[index] && hsp_array[index]->query.offset == hsp_array[index+increment]->query.offset &&
8314 hsp_array[index]->subject.offset == hsp_array[index+increment]->subject.offset &&
8315 SIGN(hsp_array[index]->query.frame) == SIGN(hsp_array[index+increment]->query.frame))
8316 {
8317 if (hsp_array[index]->score > hsp_array[index+increment]->score)
8318 {
8319 hsp_array[index+increment] =
8320 BLAST_HSPFree(hsp_array[index+increment]);
8321 increment++;
8322 }
8323 else
8324 {
8325 hsp_array[index] =
8326 BLAST_HSPFree(hsp_array[index]);
8327 index++;
8328 increment = 1;
8329 }
8330 }
8331 else
8332 {
8333 index++;
8334 increment = 1;
8335 }
8336 }
8337
8338 HeapSort(hsp_array, hsp_count, sizeof(BLAST_HSPPtr), query_end_compare_hsp);
8339 index=0;
8340 increment=1;
8341 while (index < hsp_count-increment)
8342 { /* Check if both HSP's start on or end on the same digonal. */
8343 if (hsp_array[index+increment] == NULL)
8344 {
8345 increment++;
8346 continue;
8347 }
8348
8349 if (frame != 0 && hsp_array[index+increment]->subject.frame != frame)
8350 break;
8351
8352 if (hsp_array[index] &&
8353 hsp_array[index]->query.end == hsp_array[index+increment]->query.end &&
8354 hsp_array[index]->subject.end == hsp_array[index+increment]->subject.end &&
8355 SIGN(hsp_array[index]->query.frame) == SIGN(hsp_array[index+increment]->query.frame))
8356 {
8357 if (hsp_array[index]->score > hsp_array[index+increment]->score)
8358 {
8359 hsp_array[index+increment] =
8360 BLAST_HSPFree(hsp_array[index+increment]);
8361 increment++;
8362 }
8363 else
8364 {
8365 hsp_array[index] =
8366 BLAST_HSPFree(hsp_array[index]);
8367 index++;
8368 increment = 1;
8369 }
8370 }
8371 else
8372 {
8373 index++;
8374 increment = 1;
8375 }
8376 }
8377
8378 HeapSort(hsp_array,hsp_count,sizeof(BLAST_HSPPtr), score_compare_hsps);
8379
8380 index1 = 0;
8381 for (index=0; index<hsp_count; index++)
8382 {
8383 if (hsp_array[index] != NULL)
8384 index1++;
8385 }
8386
8387
8388 return index1;
8389
8390 }
8391
8392 /*
8393 Sort the HSP's by frame.
8394 */
8395
8396 int LIBCALLBACK
frame_compare_hsp_m3(VoidPtr v1,VoidPtr v2)8397 frame_compare_hsp_m3(VoidPtr v1, VoidPtr v2)
8398
8399 {
8400 BLAST_HSPPtr h1, h2;
8401 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8402
8403 hp1 = (BLAST_HSPPtr PNTR) v1;
8404 hp2 = (BLAST_HSPPtr PNTR) v2;
8405 h1 = *hp1;
8406 h2 = *hp2;
8407
8408 if (h1->subject.frame == -3 && h2->subject.frame != -3)
8409 return -1;
8410 if (h2->subject.frame == -3 && h1->subject.frame != -3)
8411 return 1;
8412
8413 return 0;
8414 }
8415 int LIBCALLBACK
frame_compare_hsp_m2(VoidPtr v1,VoidPtr v2)8416 frame_compare_hsp_m2(VoidPtr v1, VoidPtr v2)
8417
8418 {
8419 BLAST_HSPPtr h1, h2;
8420 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8421
8422 hp1 = (BLAST_HSPPtr PNTR) v1;
8423 hp2 = (BLAST_HSPPtr PNTR) v2;
8424 h1 = *hp1;
8425 h2 = *hp2;
8426
8427 if (h1->subject.frame == -2 && h2->subject.frame != -2)
8428 return -1;
8429 if (h2->subject.frame == -2 && h1->subject.frame != -2)
8430 return 1;
8431
8432 return 0;
8433 }
8434
8435 int LIBCALLBACK
frame_compare_hsp_m1(VoidPtr v1,VoidPtr v2)8436 frame_compare_hsp_m1(VoidPtr v1, VoidPtr v2)
8437
8438 {
8439 BLAST_HSPPtr h1, h2;
8440 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8441
8442 hp1 = (BLAST_HSPPtr PNTR) v1;
8443 hp2 = (BLAST_HSPPtr PNTR) v2;
8444 h1 = *hp1;
8445 h2 = *hp2;
8446
8447 if (h1->subject.frame == -1 && h2->subject.frame != -1)
8448 return -1;
8449 if (h2->subject.frame == -1 && h1->subject.frame != -1)
8450 return 1;
8451
8452 return 0;
8453 }
8454 int LIBCALLBACK
frame_compare_hsp_p1(VoidPtr v1,VoidPtr v2)8455 frame_compare_hsp_p1(VoidPtr v1, VoidPtr v2)
8456
8457 {
8458 BLAST_HSPPtr h1, h2;
8459 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8460
8461 hp1 = (BLAST_HSPPtr PNTR) v1;
8462 hp2 = (BLAST_HSPPtr PNTR) v2;
8463 h1 = *hp1;
8464 h2 = *hp2;
8465
8466 if (h1->subject.frame == 1 && h2->subject.frame != 1)
8467 return -1;
8468 if (h2->subject.frame == 1 && h1->subject.frame != 1)
8469 return 1;
8470
8471 return 0;
8472 }
8473 int LIBCALLBACK
frame_compare_hsp_p2(VoidPtr v1,VoidPtr v2)8474 frame_compare_hsp_p2(VoidPtr v1, VoidPtr v2)
8475
8476 {
8477 BLAST_HSPPtr h1, h2;
8478 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8479
8480 hp1 = (BLAST_HSPPtr PNTR) v1;
8481 hp2 = (BLAST_HSPPtr PNTR) v2;
8482 h1 = *hp1;
8483 h2 = *hp2;
8484
8485 if (h1->subject.frame == 2 && h2->subject.frame != 2)
8486 return -1;
8487 if (h2->subject.frame == 2 && h1->subject.frame != 2)
8488 return 1;
8489
8490 return 0;
8491 }
8492 int LIBCALLBACK
frame_compare_hsp_p3(VoidPtr v1,VoidPtr v2)8493 frame_compare_hsp_p3(VoidPtr v1, VoidPtr v2)
8494
8495 {
8496 BLAST_HSPPtr h1, h2;
8497 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8498
8499 hp1 = (BLAST_HSPPtr PNTR) v1;
8500 hp2 = (BLAST_HSPPtr PNTR) v2;
8501 h1 = *hp1;
8502 h2 = *hp2;
8503
8504 if (h1->subject.frame == 3 && h2->subject.frame != 3)
8505 return -1;
8506 if (h2->subject.frame == 3 && h1->subject.frame != 3)
8507 return 1;
8508
8509 return 0;
8510 }
/*
	Engine to get the gapped scores from an array of HSP's.

	search:         search block; supplies the query sequences, scoring
	                matrix, cutoff_s1, the out-of-frame flag and the
	                statistical parameters.
	subject:        subject sequence.
	subject_length: length of subject.
	gap_align:      gapped-alignment block, filled in and run once per
	                HSP that is extended.
	hsp_array:      HSP's to process; may be freed and replaced (see
	                the return value).
	hspcnt:         in: number of HSP's to examine; out: number of
	                HSP's left in the examined range.
	hspcnt_max:     in: number of slots scanned when the array is
	                compacted; out: number of HSP's copied (only
	                changed when the array is compacted).
	hspmax:         size of the array allocated when compacting.
	frame:          if non-zero, processing stops at the first HSP
	                whose subject frame differs.

	Each HSP is compared against the earlier, still existing HSP's.  If
	both its start and its end test as contained (CONTAINED_IN_HSP) in
	such an HSP with the same frame signs and a score at least as high,
	it is deleted.  Otherwise a gapped alignment is performed and the
	HSP's offsets, ends, lengths and score are replaced by the gapped
	ones; an HSP whose gapped score is below cutoff_s1 is deleted, the
	others get an e-value.  Finally CheckGappedAlignmentsForOverlap is
	run, and if HSP's were removed the survivors are copied into a new
	array.

	Returns the array holding the HSP's: either hsp_array itself or the
	newly allocated replacement (hsp_array has then been freed).
*/
static BLAST_HSPPtr PNTR
BlastGappedScoreInternal(BlastSearchBlkPtr search, Uint1Ptr subject, Int4 subject_length, GapAlignBlkPtr gap_align, BLAST_HSPPtr *hsp_array, Int4Ptr hspcnt, Int4Ptr hspcnt_max, Int4 hspmax, Int2 frame)

{
	BLAST_HSPPtr hsp, hsp1=NULL;
	BLAST_HSPPtr PNTR hsp_array_new;
	BLAST_HSP_helperPtr helper;
	Boolean hsp_start_is_contained, hsp_end_is_contained;
	Int4 hsp_cnt=0, index, index1;
	Int4 max_offset = 0, next_offset;
	Int4 query_num; /* AM: Added to support query concatenation */

	/* helper contains most frequently used information to speed up access.
	   An entry is only filled in for an HSP that survives the gapped
	   extension; the entries of deleted HSP's are never read because
	   their array slot is NULL. */
	helper = Malloc((*hspcnt)*sizeof(BLAST_HSP_helper));
	for (index=0; index<(*hspcnt); index++)
	{
		hsp_start_is_contained = FALSE;
		hsp_end_is_contained = FALSE;
		hsp = hsp_array[index];
		/* This prefetches this value for the test below. */
		next_offset = hsp->query.offset;

		/* When restricted to one frame, stop at the first HSP of another frame. */
		if (frame != 0 && hsp->subject.frame != frame)
			break;

		/* Compare with every earlier HSP.  The two flags and hsp1 keep
		   the state of the last HSP looked at (or of the one that
		   caused the break) and are tested after the loop. */
		for (index1=0; index1<index; index1++)
		{
			hsp_start_is_contained = FALSE;
			hsp_end_is_contained = FALSE;

			hsp1 = hsp_array[index1];
			if (hsp1 == NULL)
				continue;

			/* Check with the helper array whether further
			   tests are warranted.  Having only two ints
			   in the helper array speeds up access. */
			if (helper[index1].qoffset <= next_offset &&
			    helper[index1].qend >= next_offset)
			{
				if (CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.offset, hsp1->subject.offset, hsp1->subject.end, hsp->subject.offset) == TRUE)

				{	/* The start only counts as contained if the query frames
					   have the same sign and so do the subject frames. */
					if (SIGN(hsp1->query.frame) == SIGN(hsp->query.frame) &&
					    SIGN(hsp1->subject.frame) == SIGN(hsp->subject.frame))
						hsp_start_is_contained = TRUE;
				}
				if (hsp_start_is_contained && CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.end, hsp1->subject.offset, hsp1->subject.end, hsp->subject.end) == TRUE)

				{	/* Same frame-sign requirement for the end. */
					if (SIGN(hsp1->query.frame) == SIGN(hsp->query.frame) &&
					    SIGN(hsp1->subject.frame) == SIGN(hsp->subject.frame))
						hsp_end_is_contained = TRUE;
					/* Fully contained in an HSP that scores at least as well:
					   no need to look further. */
					if (hsp_start_is_contained && hsp_end_is_contained && hsp->score <= hsp1->score)
					{
						break;
					}
				}
			}
		}

		/* Extend unless the HSP is contained in one that scores at least as well. */
		if (hsp_start_is_contained == FALSE ||
		    hsp_end_is_contained == FALSE ||
		    (hsp1 == NULL) || (hsp->score > hsp1->score))
		{
			gap_align->include_query = 0;

			/* The start point is only computed for in-frame gapping. */
			if(!search->pbp->is_ooframe) {
				max_offset = GetStartForGappedAlignment(search, hsp, search->context[hsp->context].query->sequence, subject, search->sbp->matrix);
			}

#ifdef BLAST_COLLECT_STATS
			search->real_gap_number_of_hsps++;
#endif
			/* Reset the linking information of this HSP. */
			Nlm_MemSet((VoidPtr) &(hsp_array[index]->hsp_link), 0, sizeof(BLAST_HSP_LINK));
			hsp_array[index]->linked_set = FALSE;
			hsp_array[index]->start_of_chain = FALSE;
			hsp_array[index]->num = 0;
			hsp_array[index]->xsum = 0.0;

			if(search->pbp->is_ooframe) {
				/* Out-of-frame: the roles are swapped, the subject goes in
				   as gap_align's query and the query DNA (chosen by the
				   sign of the query frame) as gap_align's subject. */
				gap_align->is_ooframe = TRUE;
				gap_align->query = subject;
				if(hsp->query.frame > 0) {
					gap_align->subject = search->query_dnap[0]->sequence;
					gap_align->subject_length = search->query_dnap[0]->length;
				} else {
					gap_align->subject = search->query_dnap[1]->sequence;
					gap_align->subject_length = search->query_dnap[1]->length;
				}

				gap_align->query_length = subject_length;

				gap_align->q_start = hsp->subject.offset;
				gap_align->s_start = hsp->query.offset;

				hsp->query.gapped_start = gap_align->s_start;
				hsp->subject.gapped_start = gap_align->q_start;

			} else {
				gap_align->query = search->context[hsp->context].query->sequence;
				gap_align->query_length = search->context[hsp->context].query->length;
				gap_align->q_start = max_offset;
				/* Same diagonal as the HSP, at the chosen query offset. */
				gap_align->s_start =
					(hsp->subject.offset - hsp->query.offset) + max_offset;
				hsp->query.gapped_start = gap_align->q_start;
				hsp->subject.gapped_start = gap_align->s_start;

				gap_align->subject = subject;
				gap_align->subject_length = subject_length;
			}

			/* For out-of frame gapping - query is protein
			   and subject is DNA translated into 3 frames */

			PerformGappedAlignment(gap_align);

			/* Copy the extent of the gapped alignment back into the HSP
			   (swapped back for out-of-frame gapping). */
			if(search->pbp->is_ooframe) {
				hsp->query.offset = gap_align->subject_start;
				hsp->subject.offset = gap_align->query_start;
				/* The end is one further for BLAST than for the gapped align. */
				hsp->query.end = gap_align->subject_stop + 1;
				hsp->subject.end = gap_align->query_stop + 1;
			} else {
				hsp->query.offset = gap_align->query_start;
				hsp->query.end = gap_align->query_stop + 1;
				hsp->subject.offset = gap_align->subject_start;
				hsp->subject.end = gap_align->subject_stop + 1;
				/* The end is one further for BLAST than for the gapped align. */
			}

			hsp->query.length = hsp->query.end - hsp->query.offset;
			hsp->subject.length = hsp->subject.end - hsp->subject.offset;
			hsp->score = gap_align->score;
			if( hsp->score >= search->pbp->cutoff_s1 ) {
				/* AM: Changed to support query concatenation */
				if( !search->mult_queries )
					hsp->evalue =
						BlastKarlinStoE_simple(hsp->score,
								       search->sbp->
								       kbp_gap[search->first_context],
								       search->searchsp_eff);
				else {
					/* With concatenated queries, use the effective search
					   space of the query this HSP belongs to. */
					query_num = GetQueryNum( search->mult_queries,
								 hsp->query.offset,
								 hsp->query.end,
								 hsp->query.frame );
					hsp->evalue =
						BlastKarlinStoE_simple(hsp->score,
								       search->sbp->
								       kbp_gap[search->first_context],
								       search->mult_queries->
								       SearchSpEff[query_num]);
				}

				hsp_cnt++;
				/* Fill in the helper structure. */
				helper[index].qoffset = hsp->query.offset;
				helper[index].qend = hsp->query.end;
			} else {
				/* Score of the gapped extension is below the required
				   cutoff, delete this hsp */
				hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
			}
		}
		else
		{	/* Contained within another HSP, delete. */
			hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
		}
	}
	helper = MemFree(helper);

	/* The count kept in the loop above is replaced by the number of
	   HSP's remaining after the overlap check. */
	hsp_cnt = CheckGappedAlignmentsForOverlap(search, hsp_array, *hspcnt, frame);

	if (hsp_cnt < (*hspcnt))
	{
		/* Save HSP's again, discarding those that have been NULLed out. */
		hsp_array_new = MemNew(hspmax*sizeof(BLAST_HSPPtr));
		index1 = 0;
		for (index=0; index<(*hspcnt_max); index++)
		{
			if (hsp_array[index] != NULL)
			{
				hsp_array_new[index1] = hsp_array[index];
				index1++;
			}
		}

		hsp_array = MemFree(hsp_array);

		*hspcnt = index1;
		*hspcnt_max = index1;
		hsp_array = hsp_array_new;
	}
	/* Whatever was stored above, the final count is the one from the overlap check. */
	*hspcnt = hsp_cnt;

	return hsp_array;
}
8712
8713 /*
8714 Engine to get the gapped scores from an array of HSP's.
8715 */
8716 static Boolean
BlastNtGappedScoreInternal(BlastSearchBlkPtr search,Uint1Ptr subject,Int4 subject_length,GapAlignBlkPtr gap_align,BLAST_HSPPtr * hsp_array,Int4Ptr hspcnt,Int4Ptr hspcnt_max,Int4 hspmax)8717 BlastNtGappedScoreInternal(BlastSearchBlkPtr search, Uint1Ptr subject, Int4 subject_length, GapAlignBlkPtr gap_align, BLAST_HSPPtr *hsp_array, Int4Ptr hspcnt, Int4Ptr hspcnt_max, Int4 hspmax)
8718
8719 {
8720 BLAST_HSPPtr hsp, hsp1=NULL;
8721 BLAST_HSP_helperPtr helper;
8722 Boolean hsp_start_is_contained, hsp_end_is_contained;
8723 Int4 hsp_cnt=0, index, index1, next_offset, query_length;
8724 /* AM: Added to support query concatenation. */
8725 Int4 query_num;
8726
8727 /* helper contains most frequently used information to speed up access. */
8728 helper = Malloc((*hspcnt)*sizeof(BLAST_HSP_helper));
8729
8730 for (index=0; index<(*hspcnt); index++)
8731 {
8732 hsp_start_is_contained = FALSE;
8733 hsp_end_is_contained = FALSE;
8734 hsp = hsp_array[index];
8735 /* This prefetches this value for the test below. */
8736 next_offset = hsp->query.offset;
8737
8738 for (index1=0; index1<index; index1++)
8739 {
8740 hsp_start_is_contained = FALSE;
8741 hsp_end_is_contained = FALSE;
8742
8743 hsp1 = hsp_array[index1];
8744 if (hsp1 == NULL)
8745 continue;
8746
8747
8748 /* Check with the helper array whether further
8749 tests are warranted. Having only two ints
8750 in the helper array speeds up access. */
8751 if (helper[index1].qoffset <= next_offset &&
8752 helper[index1].qend >= next_offset)
8753 {
8754
8755 if (CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.offset, hsp1->subject.offset, hsp1->subject.end, hsp->subject.offset) == TRUE)
8756 { /* Check that it's on diff. strands. */
8757 hsp_start_is_contained = TRUE;
8758 }
8759 if (hsp_start_is_contained && CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.end, hsp1->subject.offset, hsp1->subject.end, hsp->subject.end) == TRUE)
8760 { /* Check that it's on diff. strands. */
8761 hsp_end_is_contained = TRUE;
8762 }
8763 if (hsp_start_is_contained && hsp_end_is_contained && hsp->score <= hsp1->score)
8764 {
8765 break;
8766 }
8767 }
8768 }
8769
8770 if (hsp_start_is_contained == FALSE ||
8771 hsp_end_is_contained == FALSE ||
8772 hsp->score > hsp1->score)
8773 {
8774 gap_align->include_query = 0;
8775 #ifdef BLAST_COLLECT_STATS
8776 search->real_gap_number_of_hsps++;
8777 #endif
8778 /*
8779 Nlm_MemSet((VoidPtr) &(hsp_array[index]->hsp_link), 0, sizeof(BLAST_HSP_LINK));
8780 hsp_array[index]->linked_set = FALSE;
8781 hsp_array[index]->start_of_chain = FALSE;
8782 hsp_array[index]->num = 0;
8783 hsp_array[index]->sumscore = 0;
8784 */
8785
8786 gap_align->query = search->context[hsp->context].query->sequence;
8787 gap_align->query_length = search->context[hsp->context].query->length;
8788 gap_align->q_start = hsp->query.gapped_start;
8789 gap_align->s_start = hsp->subject.gapped_start;
8790
8791 gap_align->subject = subject;
8792 gap_align->subject_length = subject_length;
8793
8794 if (hsp->subject.gapped_start >= 0) {
8795 if (!PerformNtGappedAlignment(gap_align))
8796 return FALSE;
8797 }
8798
8799 query_length =
8800 search->query_context_offsets[search->first_context+1] - 1;
8801 if (gap_align->query_start / query_length !=
8802 (gap_align->query_stop - 1) / query_length) {
8803 if (gap_align->q_start < query_length) {
8804 gap_align->subject_stop -=
8805 (gap_align->query_stop - query_length + 1);
8806 gap_align->query_stop = query_length - 1;
8807 } else {
8808 gap_align->subject_start +=
8809 (query_length + 1 - gap_align->query_start);
8810 gap_align->query_start = query_length + 1;
8811 }
8812 }
8813 hsp->query.offset = gap_align->query_start;
8814 hsp->subject.offset = gap_align->subject_start;
8815 /* The end is one further for BLAST than for the gapped align. */
8816 hsp->query.end = gap_align->query_stop + 1;
8817 hsp->subject.end = gap_align->subject_stop + 1;
8818 hsp->query.length = hsp->query.end - hsp->query.offset;
8819 hsp->subject.length = hsp->subject.end - hsp->subject.offset;
8820 hsp->score = gap_align->score;
8821 /* TLM */
8822 /* AM: Changed to support query concatenation. */
8823 if( !search->mult_queries )
8824 hsp->evalue = BlastKarlinStoE_simple(hsp->score, search->sbp->kbp[search->first_context], search->searchsp_eff);
8825 else
8826 {
8827 query_num = GetQueryNum( search->mult_queries,
8828 hsp->query.offset,
8829 hsp->query.end,
8830 hsp->query.frame );
8831 hsp->evalue = BlastKarlinStoE_simple( hsp->score,
8832 search->sbp->kbp[search->first_context],
8833 search->mult_queries->SearchSpEff[query_num] );
8834 }
8835
8836 hsp_cnt++;
8837 /* Fill in the helper structure. */
8838 helper[index].qoffset = hsp->query.offset;
8839 helper[index].qend = hsp->query.end;
8840 }
8841 else
8842 { /* Contained within another HSP, delete. */
8843 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
8844 }
8845 }
8846 helper = MemFree(helper);
8847
8848 /*
8849 hsp_cnt = CheckGappedAlignmentsForOverlap(search, hsp_array, *hspcnt, 0);
8850 */
8851
8852 if (hsp_cnt < (*hspcnt))
8853 {
8854 /* Save HSP's again, discarding those that have been NULLed out. */
8855 index1 = 0;
8856 for (index=0; index<(*hspcnt); index++)
8857 {
8858 if (hsp_array[index] != NULL)
8859 {
8860 hsp_array[index1] = hsp_array[index];
8861 index1++;
8862 }
8863 }
8864
8865 }
8866 *hspcnt = hsp_cnt;
8867
8868 return TRUE;
8869 }
8870
8871 /*
8872 Loads the HSP's into the BlastHitRangePtr.
8873 */
8874 static Boolean
BlastHitRangeLoad(BlastSearchBlkPtr search,BLAST_HSPPtr * hsp_array,Int4 hspcnt,BlastHitRangePtr bhrp)8875 BlastHitRangeLoad (BlastSearchBlkPtr search, BLAST_HSPPtr *hsp_array, Int4 hspcnt, BlastHitRangePtr bhrp)
8876
8877 {
8878 BlastDoubleInt4Ptr tmp;
8879 BLAST_HSPPtr hsp;
8880 Int4 index, query_length;
8881
8882 if (bhrp->current+hspcnt > bhrp->total)
8883 return FALSE;
8884
8885 if (hspcnt <= 0)
8886 return TRUE;
8887
8888 tmp = bhrp->range_list;
8889
8890 tmp += bhrp->current;
8891
8892 for (index=0; index<hspcnt; index++)
8893 {
8894 hsp = hsp_array[index];
8895 query_length = search->context[hsp->context].query->length;
8896 if (hsp->query.frame >= 0)
8897 {
8898 tmp->gi = hsp->query.offset;
8899 tmp->ordinal_id = hsp->query.end - 1;
8900 }
8901 else
8902 {
8903 tmp->gi = query_length - hsp->query.end;
8904 tmp->ordinal_id = query_length - hsp->query.offset - 1;
8905 }
8906 tmp++;
8907 }
8908
8909 bhrp->current += hspcnt;
8910
8911 return TRUE;
8912 }
8913
/*
    Masks a sequence with the filter locations stored for one frame.

    mask is a list whose nodes are tagged (in "choice") with the value
    FrameToDefine() yields for a frame, and carry a SeqLocPtr in
    data.ptrvalue.  The first node matching "frame" is used; if it holds a
    non-NULL SeqLoc, BlastMaskTheResidues is applied to sequence+1 over
    "length" residues with mask residue 21.  Nothing happens if there is
    no matching node.

    dna_length is currently unused (it was only needed by a disabled
    BlastConvertProteinSeqLoc call).
*/
static void rpsFilterSequenceByMask(ValNodePtr mask, Uint1Ptr sequence, Int4 length, Int4 frame, Int4 dna_length)
{
    ValNodePtr node = mask;
    SeqLocPtr frame_slp;

    /* Advance to the first node that belongs to this frame. */
    while (node != NULL && node->choice != FrameToDefine(frame))
        node = node->next;

    if (node == NULL)
        return;

    frame_slp = (SeqLocPtr) node->data.ptrvalue;
    if (frame_slp == NULL)
        return;

    BlastMaskTheResidues(sequence+1, length, 21, frame_slp, FALSE, 0);

    /* BlastConvertProteinSeqLoc(frame_slp, frame, dna_length); */
}
8937
/*
    Removes every HSP that is included in an HSP placed before it in the
    array.

    An HSP is freed (and its slot set to NULL) when an earlier, non-NULL
    HSP exists such that
      - both belong to the same context (or, for out-of-frame alignments,
        have query frames of the same sign),
      - both the start point and the end point of the later HSP pass the
        CONTAINED_IN_HSP test against the earlier one, and
      - query and subject frames of the two HSP's have the same signs.

    hsp_array:  array of hspcnt HSP pointers; NULL entries are skipped.
    is_ooframe: TRUE to compare query frame signs instead of contexts.
*/
void BLASTCheckHSPInclusion(BLAST_HSPPtr *hsp_array, Int4 hspcnt,
                            Boolean is_ooframe)
{
    Int4 cur, prev;
    BLAST_HSPPtr candidate, earlier;

    for (cur = 0; cur < hspcnt; cur++) {

        candidate = hsp_array[cur];
        if (candidate == NULL)
            continue;

        for (prev = 0; prev < cur; prev++) {

            earlier = hsp_array[prev];
            if (earlier == NULL)
                continue;

            /* Only HSP's of the same context / strand are comparable. */
            if (is_ooframe) {
                if (SIGN(earlier->query.frame) != SIGN(candidate->query.frame))
                    continue;
            } else if (candidate->context != earlier->context) {
                continue;
            }

            /* Start point of the candidate must be inside the earlier HSP. */
            if (CONTAINED_IN_HSP(earlier->query.offset, earlier->query.end, candidate->query.offset, earlier->subject.offset, earlier->subject.end, candidate->subject.offset) != TRUE)
                continue;

            /* ... and so must its end point. */
            if (CONTAINED_IN_HSP(earlier->query.offset, earlier->query.end, candidate->query.end, earlier->subject.offset, earlier->subject.end, candidate->subject.end) != TRUE)
                continue;

            /* Both HSP's have to be on the same strands. */
            if (SIGN(earlier->query.frame) != SIGN(candidate->query.frame) ||
                SIGN(earlier->subject.frame) != SIGN(candidate->subject.frame))
                continue;

            /* All tests passed: the candidate is redundant, drop it and
               go on with the next HSP. */
            hsp_array[cur] = BLAST_HSPFree(hsp_array[cur]);
            break;
        }
    }
}
8988
8989
8990 /*
8991 Take a BLAST_HSPPtr (array of HSP's) and get a traceback for them.
8992 */
8993
8994 Int4
RealBlastGetGappedAlignmentTraceback(BlastSearchBlkPtr search,Uint1Ptr subject,Int4 subject_length,Uint1Ptr rev_subject,Int4 rev_subject_length,SeqIdPtr subject_id,BLAST_HSPPtr * hsp_array,Int4 hspcnt,SeqAlignPtr * head,BlastHitRangePtr bhrp,Int4 min_score_to_keep,Boolean reverse,Int4 ordinal_id,Boolean do_traceback)8995 RealBlastGetGappedAlignmentTraceback(BlastSearchBlkPtr search, Uint1Ptr subject, Int4 subject_length, Uint1Ptr rev_subject, Int4 rev_subject_length, SeqIdPtr subject_id, BLAST_HSPPtr *hsp_array, Int4 hspcnt, SeqAlignPtr *head, BlastHitRangePtr bhrp, Int4 min_score_to_keep, Boolean reverse, Int4 ordinal_id, Boolean do_traceback)
8996
8997 {
8998 BLAST_HSPPtr hsp, hsp1, hsp2;
8999 BLAST_ParameterBlkPtr pbp;
9000 BLASTResultHsp result_hsp;
9001 Boolean hsp_start_is_contained, hsp_end_is_contained, keep;
9002 Boolean do_not_do;
9003 GapAlignBlkPtr gap_align;
9004 Int4 new_hspcnt=0;
9005 Int4 index, index1, index2, query_length, max_offset;
9006 Int4Ptr translated_subject_length=NULL;
9007 Int4Ptr translated_subject_length_orig=NULL;
9008 SeqAlignPtr seqalign, seqalign_var, *seqalign_array;
9009 Uint1Ptr query, PNTR translated_subject=NULL, PNTR translated_subject_orig=NULL;
9010 ValNodePtr gi_list=NULL;
9011 BLAST_HitListPtr tmp_hitlist;
9012 BLAST_HitListPtr real_hitlist;
9013 SeqIdPtr query_id, new_subject_seqid = NULL, seqid_tmp;
9014 Int4 max_start = MAX_DBSEQ_LEN / 2, start_shift;
9015 Int4 align_length;
9016 Int4 query_num; /* AM: Added to support query concatenation. */
9017 Boolean partial_translation;
9018 Int4 translation_length;
9019
9020 pbp = search->pbp;
9021 MemSet(&result_hsp, 0, sizeof(BLASTResultHsp));
9022
9023 seqalign=NULL;
9024 if (do_traceback)
9025 seqalign_array = MemNew(hspcnt*sizeof(SeqAlignPtr));
9026
9027 if (search->gap_align == NULL) {
9028 search->gap_align = GapAlignBlkNew(1, 1);
9029 }
9030
9031 gap_align = search->gap_align;
9032
9033 gi_list = BlastGetAllowedGis(search, ordinal_id, &new_subject_seqid);
9034
9035 #if 1
9036 if (gi_list) {
9037 /* change subject's gi with this 'use_this_gi' gi */
9038 subject_id->data.intvalue = gi_list->data.intvalue;
9039 }
9040 #endif
9041
9042 gap_align->is_ooframe = pbp->is_ooframe; /* For OOF: blastx and tblastn */
9043 gap_align->shift_pen = pbp->shift_pen;
9044
9045 gap_align->discontinuous = pbp->discontinuous;
9046 gap_align->positionBased =
9047 (search->positionBased && search->sbp->posMatrix);
9048 gap_align->gap_open = pbp->gap_open;
9049 gap_align->gap_extend = pbp->gap_extend;
9050 gap_align->decline_align = pbp->decline_align;
9051 gap_align->x_parameter = pbp->gap_x_dropoff_final;
9052 gap_align->matrix = search->sbp->matrix;
9053 gap_align->posMatrix = search->sbp->posMatrix;
9054 partial_translation = (subject_length > SUBJECT_ADJUSTMENT);
9055
9056 for (index=0; index<hspcnt; index++) {
9057 hsp_start_is_contained = FALSE;
9058 hsp_end_is_contained = FALSE;
9059 hsp = hsp_array[index];
9060
9061 for (index1=0; index1<index; index1++) {
9062 hsp_start_is_contained = FALSE;
9063 hsp_end_is_contained = FALSE;
9064
9065 hsp1 = hsp_array[index1];
9066 if (hsp1 == NULL)
9067 continue;
9068
9069 if(pbp->is_ooframe) {
9070 if (SIGN(hsp1->query.frame) != SIGN(hsp->query.frame))
9071 continue;
9072 } else {
9073 if (hsp->context != hsp1->context)
9074 continue;
9075 }
9076
9077 if (CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.offset, hsp1->subject.offset, hsp1->subject.end, hsp->subject.offset) == TRUE) {
9078 if (SIGN(hsp1->query.frame) == SIGN(hsp->query.frame) &&
9079 SIGN(hsp1->subject.frame) == SIGN(hsp->subject.frame))
9080 hsp_start_is_contained = TRUE;
9081 }
9082 if (CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.end, hsp1->subject.offset, hsp1->subject.end, hsp->subject.end) == TRUE) {
9083 if (SIGN(hsp1->query.frame) == SIGN(hsp->query.frame) &&
9084 SIGN(hsp1->subject.frame) == SIGN(hsp->subject.frame))
9085 hsp_end_is_contained = TRUE;
9086 }
9087 if (hsp_start_is_contained && hsp_end_is_contained && hsp->score <= hsp1->score) {
9088 break;
9089 }
9090 }
9091
9092 do_not_do = FALSE;
9093 /* Check whether this part of query has already been covered. */
9094 /* Commented out by TLM as this seems buggy.
9095 if (bhrp) {
9096 total = bhrp->current;
9097 for (index1=0; index1<total; index1++) {
9098 if (hsp->query.offset >= bhrp->range_list_pointer[index1]->gi &&
9099 hsp->query.end <= bhrp->range_list_pointer[index1]->ordinal_id) {
9100 do_not_do = TRUE;
9101 break;
9102 }
9103 }
9104 }
9105 */
9106 if (do_not_do == FALSE && (hsp_start_is_contained == FALSE || hsp_end_is_contained == FALSE ||
9107 hsp->score > hsp1->score)) {
9108 query = (Uint1Ptr) search->context[hsp->context].query->sequence;
9109 query_length = search->context[hsp->context].query->length;
9110
9111 gap_align->include_query = 0;
9112
9113
9114 if(search->pbp->is_ooframe) {
9115 gap_align->is_ooframe = TRUE;
9116 gap_align->query = subject;
9117
9118 if(hsp->query.frame > 0) {
9119 gap_align->subject = search->query_dnap[0]->sequence;
9120 gap_align->subject_length = search->query_dnap[0]->length;
9121 } else {
9122 gap_align->subject = search->query_dnap[1]->sequence;
9123 gap_align->subject_length = search->query_dnap[1]->length;
9124 }
9125
9126 gap_align->query_frame = hsp->subject.frame;
9127 gap_align->subject_frame = ContextToFrame(search, hsp->context);
9128 gap_align->query_length = subject_length;
9129 } else {
9130 gap_align->query_frame = ContextToFrame(search, hsp->context);
9131 gap_align->query = query;
9132
9133 gap_align->subject_frame = hsp->subject.frame;
9134 gap_align->subject = subject;
9135
9136 gap_align->query_length = query_length;
9137 gap_align->subject_length = subject_length;
9138 }
9139
9140 gap_align->translate1 = FALSE;
9141 gap_align->translate2 = FALSE;
9142 if (StringCmp(search->prog_name, "blastx") == 0) {
9143 gap_align->translate1 = TRUE;
9144 gap_align->translate2 = FALSE;
9145 }
9146
9147 start_shift = 0;
9148
9149 if (StringCmp(search->prog_name, "tblastn") == 0 ||
9150 StringCmp(search->prog_name, "psitblastn") == 0) {
9151 gap_align->translate1 = FALSE;
9152 gap_align->translate2 = TRUE;
9153 if (translated_subject == NULL) {
9154 translated_subject_orig = MemNew(8*sizeof(Uint1Ptr));
9155 translated_subject = translated_subject_orig + 3;
9156 translated_subject_length_orig = MemNew(8*sizeof(Int4));
9157 translated_subject_length = translated_subject_length_orig + 3;
9158 }
9159 if (partial_translation) {
9160 translated_subject[hsp->subject.frame] =
9161 MemFree(translated_subject[hsp->subject.frame]);
9162 /* NB: since SUBJECT_ADJUSTMENT is divisible by 3, the frame
9163 will remain the same.
9164 */
9165 start_shift =
9166 MAX(0, 3*hsp->subject.offset - SUBJECT_ADJUSTMENT);
9167 translation_length =
9168 MIN(3*hsp->subject.end + SUBJECT_ADJUSTMENT, subject_length)
9169 - start_shift;
9170 if (hsp->subject.frame > 0) {
9171 translated_subject[hsp->subject.frame] =
9172 GetTranslation(subject+start_shift, translation_length, hsp->subject.frame, &translated_subject_length[hsp->subject.frame], search->db_genetic_code);
9173 } else {
9174 translated_subject[hsp->subject.frame] =
9175 GetTranslation(rev_subject+start_shift, translation_length, hsp->subject.frame, &translated_subject_length[hsp->subject.frame], search->db_genetic_code);
9176 }
9177 /* Below, the start_shift will be used for the protein
9178 coordinates, so need to divide it by 3 */
9179 start_shift /= CODON_LENGTH;
9180 hsp->subject.offset -= start_shift;
9181 hsp->subject.gapped_start -= start_shift;
9182
9183 } else if (translated_subject[hsp->subject.frame] == NULL) {
9184 if (hsp->subject.frame > 0) {
9185 translated_subject[hsp->subject.frame] =
9186 GetTranslation(subject, subject_length, hsp->subject.frame, &translated_subject_length[hsp->subject.frame], search->db_genetic_code);
9187 } else {
9188 translated_subject[hsp->subject.frame] =
9189 GetTranslation(rev_subject, rev_subject_length, hsp->subject.frame, &translated_subject_length[hsp->subject.frame], search->db_genetic_code);
9190 }
9191 /* For RPS Blast filtering if needed */
9192 if(search->pbp->is_rps_blast && search->pbp->filter_string != NULL && StringICmp(search->pbp->filter_string, "F")) {
9193 rpsFilterSequenceByMask(search->mask, translated_subject[hsp->subject.frame], translated_subject_length[hsp->subject.frame], hsp->subject.frame, (hsp->subject.frame > 0) ? subject_length : rev_subject_length);
9194 }
9195 }
9196
9197 gap_align->subject = translated_subject[hsp->subject.frame] + 1;
9198 gap_align->subject_length = translated_subject_length[hsp->subject.frame];
9199 }
9200
9201 /* these should both only be zero for blastn. */
9202 if (!search->pbp->is_ooframe &&
9203 (((hsp->query.gapped_start == 0 && hsp->subject.gapped_start == 0) ||
9204 CheckStartForGappedAlignment(search, hsp, gap_align->query, gap_align->subject, search->sbp->matrix) == FALSE))) {
9205 max_offset = GetStartForGappedAlignment(search, hsp, gap_align->query, gap_align->subject, search->sbp->matrix);
9206 gap_align->q_start = max_offset;
9207 gap_align->s_start = (hsp->subject.offset - hsp->query.offset) + max_offset;
9208 hsp->query.gapped_start = gap_align->q_start;
9209 hsp->subject.gapped_start = gap_align->s_start;
9210 } else {
9211 if(search->pbp->is_ooframe) {
9212 /* Code above should be investigated for possible
9213 optimization for OOF */
9214 gap_align->q_start = hsp->subject.gapped_start;
9215 gap_align->s_start = hsp->query.gapped_start;
9216 gap_align->subject_start = 0;
9217 gap_align->query_start = 0;
9218 } else {
9219 gap_align->q_start = hsp->query.gapped_start;
9220 gap_align->s_start = hsp->subject.gapped_start;
9221 }
9222 }
9223
9224 if (search->prog_number == blast_type_blastn) {
9225 /* For blastn, use only part of a long subject sequence,
9226 because the placeholders for the gapped alignment
9227 information have only been allocated for at most a
9228 certain length */
9229 if (gap_align->s_start > max_start) {
9230 start_shift = (gap_align->s_start / max_start) * max_start;
9231 gap_align->subject = gap_align->subject + start_shift;
9232
9233 gap_align->s_start %= max_start;
9234 } else
9235 start_shift = 0;
9236
9237 gap_align->subject_length =
9238 MIN(gap_align->subject_length - start_shift,
9239 gap_align->s_start + hsp->subject.length + max_start);
9240 }
9241
9242 if (do_traceback) {
9243 if (!search->pbp->mb_params ||
9244 search->pbp->mb_params->use_dyn_prog) {
9245 PerformGappedAlignmentWithTraceback(gap_align);
9246 } else {
9247 PerformGreedyAlignmentWithTraceback(gap_align, search->abmp,
9248 search->sbp);
9249 }
9250 } else {
9251 PerformGappedAlignment(gap_align);
9252 }
9253
9254 if (gap_align->score >= min_score_to_keep) {
9255
9256 if(search->pbp->is_ooframe) {
9257 hsp->query.offset = gap_align->subject_start + start_shift;
9258 hsp->subject.offset = gap_align->query_start;
9259 /* The end is one further for BLAST than for the gapped align. */
9260 hsp->query.end = gap_align->subject_stop + 1 + start_shift;
9261 hsp->subject.end = gap_align->query_stop + 1;
9262 } else {
9263 hsp->query.offset = gap_align->query_start;
9264 hsp->subject.offset = gap_align->subject_start + start_shift;
9265 /* The end is one further for BLAST than for the gapped align. */
9266 hsp->query.end = gap_align->query_stop + 1;
9267 hsp->subject.end = gap_align->subject_stop + 1 + start_shift;
9268 }
9269
9270 if (gap_align->edit_block && start_shift > 0) {
9271 gap_align->edit_block->start2 += start_shift;
9272 gap_align->edit_block->length2 += start_shift;
9273 }
9274 hsp->query.length = hsp->query.end - hsp->query.offset;
9275 hsp->subject.length = hsp->subject.end - hsp->subject.offset;
9276 hsp->score = gap_align->score;
9277
9278 if (do_traceback) {
9279 hsp->gap_info = gap_align->edit_block;
9280 }
9281
9282 keep = TRUE;
9283 /* If greedy alignment was used for traceback, we still need
9284 to reevaluate the score with ambiguity information */
9285 if (search->pbp->mb_params &&
9286 !search->pbp->mb_params->use_dyn_prog &&
9287 ReevaluateScoreWithAmbiguities(search, subject, hsp)) {
9288 /* HSP became below the cutoff after reevaluation */
9289 keep = FALSE;
9290 }
9291
9292 if (keep && (search->prog_number == blast_type_blastp ||
9293 search->prog_number == blast_type_blastn)) {
9294 if (search->pbp->mb_params) {
9295 FloatHi searchsp_eff = (FloatHi) search->dblen_eff *
9296 (FloatHi) search->context[hsp->context].query->effective_length;
9297
9298 hsp->evalue = BlastKarlinStoE_simple(hsp->score,
9299 search->sbp->kbp_gap[hsp->context],
9300 searchsp_eff);
9301 } else {
9302 /* AM: Changed to support query concatenation. */
9303 if( !search->mult_queries )
9304 hsp->evalue = BlastKarlinStoE_simple(hsp->score,
9305 search->sbp->kbp_gap[search->first_context], search->searchsp_eff);
9306 else
9307 {
9308 /* AM: First determine which query to use, then use the
9309 corresponding SearchSpEff element in the call to
9310 BlastKarlinStoE_simple() */
9311 query_num = GetQueryNum( search->mult_queries,
9312 hsp->query.offset,
9313 hsp->query.end,
9314 hsp->query.frame );
9315 hsp->evalue = BlastKarlinStoE_simple( hsp->score,
9316 search->sbp->kbp_gap[search->first_context],
9317 search->mult_queries->SearchSpEff[query_num] );
9318 }
9319 }
9320 /*hsp->pvalue = BlastKarlinEtoP(hsp->evalue);*/
9321 if (hsp->evalue > search->pbp->cutoff_e) /* put in for comp. based stats. */
9322 keep = FALSE;
9323 }
9324
9325 if (keep) {
9326 if (search->pbp->is_ooframe) {
9327 OOFBlastHSPGetNumIdentical(gap_align->query,
9328 gap_align->subject-start_shift, hsp, NULL,
9329 &hsp->num_ident, &align_length);
9330 } else {
9331 search->subject->sequence_start =
9332 gap_align->subject - start_shift - 1;
9333 BlastHSPGetNumIdentical(search, hsp, NULL, &hsp->num_ident,
9334 &align_length);
9335 }
9336 if (search->pbp->mb_params &&
9337 search->pbp->mb_params->use_dyn_prog) {
9338 if (hsp->num_ident * 100 <
9339 align_length * search->pbp->mb_params->perc_identity) {
9340 keep = FALSE;
9341 }
9342 }
9343 search->subject->sequence_start = NULL;
9344
9345 if (search->pbp->scalingFactor != 0.0 && search->pbp->scalingFactor != 1.0)
9346 /* Scale down score for blastp and tblastn. */
9347 hsp->score = (hsp->score+(0.5*search->pbp->scalingFactor))/search->pbp->scalingFactor;
9348
9349 /* only one alignment considered for blast[np]. */
9350 /* This may be changed by LinkHsps for blastx or tblastn. */
9351 hsp->num = 1;
9352 if ((search->prog_number == blast_type_tblastn ||
9353 search->prog_number == blast_type_psitblastn) &&
9354 search->pbp->longest_intron > 0)
9355 hsp->evalue = BlastKarlinStoE_simple(hsp->score,
9356 search->sbp->kbp_gap[search->first_context], search->searchsp_eff);
9357 }
9358
9359 for (index2=0; index2<index && keep == TRUE; index2++) {
9360 hsp2 = hsp_array[index2];
9361 if (hsp2 == NULL)
9362 continue;
9363
9364 /* Check if both HSP's start or end on the same diagonal (and are on same strands). */
9365 if (((hsp->query.offset == hsp2->query.offset &&
9366 hsp->subject.offset == hsp2->subject.offset) ||
9367 (hsp->query.end == hsp2->query.end &&
9368 hsp->subject.end == hsp2->subject.end)) &&
9369 hsp->context == hsp2->context &&
9370 hsp->subject.frame == hsp2->subject.frame) {
9371 if (hsp2->score > hsp->score) {
9372 keep = FALSE;
9373 break;
9374 } else {
9375 new_hspcnt--;
9376 if (do_traceback) {
9377 seqalign_array[index2] =
9378 SeqAlignFree(seqalign_array[index2]);
9379 }
9380 hsp_array[index2] =
9381 BLAST_HSPFree(hsp_array[index2]);
9382 }
9383 }
9384 }
9385
9386 if (keep) {
9387 new_hspcnt++;
9388 } else {
9389 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
9390 }
9391 } else { /* Should be kept? */
9392 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
9393 }
9394 } else { /* Contained within another HSP, delete. */
9395 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
9396 }
9397 }
9398 if (search->pbp->scalingFactor != 0.0 && search->pbp->scalingFactor != 1.0)
9399 { /* Rescale Lambda. */
9400 search->sbp->kbp_gap[0]->Lambda *= search->pbp->scalingFactor;
9401 }
9402
9403 /* Now for OOF alignment we try to detect simular alignments */
9404
9405 HeapSort(hsp_array,hspcnt,sizeof(BLAST_HSPPtr), score_compare_hsps);
9406 BLASTCheckHSPInclusion(hsp_array, hspcnt, pbp->is_ooframe);
9407
9408 /* Make up fake hitlist, relink and rereap. */
9409
9410 if (StringCmp(search->prog_name, "blastx") == 0 ||
9411 StringCmp(search->prog_name, "tblastn") == 0 ||
9412 StringCmp(search->prog_name, "psitblastn") == 0) {
9413 hspcnt = HspArrayPurge(hsp_array, hspcnt, FALSE);
9414 tmp_hitlist = (BLAST_HitListPtr) MemNew(sizeof(BLAST_HitList));
9415 real_hitlist = search->current_hitlist;
9416
9417 search->current_hitlist = tmp_hitlist;
9418 tmp_hitlist->hsp_array = hsp_array;
9419 tmp_hitlist->hspcnt = hspcnt;
9420 tmp_hitlist->hspmax = hspcnt;
9421
9422 /* Use real subject length for all programs - it will be adjusted inside
9423 the functions that need it */
9424 search->subject->length = subject_length;
9425
9426 if (search->prog_number == blast_type_tblastn &&
9427 search->pbp->longest_intron > 0) {
9428 BlastSequenceAddSequence(search->subject, NULL, subject-1,
9429 subject_length, subject_length, 0);
9430 search->subject_id = ordinal_id;
9431 }
9432
9433 if (!search->pbp->do_sum_stats || search->pbp->longest_intron > 0)
9434 BlastGetNonSumStatsEvalue(search);
9435
9436 /* AM: Changed to support query concatenation. */
9437 if (search->pbp->do_sum_stats == TRUE)
9438 {
9439 if( search->mult_queries ) search->mult_queries->use_mq = FALSE;
9440
9441 BlastLinkHsps(search);
9442 }
9443
9444 if (search->prog_number == blast_type_tblastn &&
9445 search->pbp->longest_intron > 0)
9446 search->subject->sequence_start = search->subject->sequence = NULL;
9447
9448 BlastReapHitlistByEvalue(search);
9449
9450 hspcnt = search->current_hitlist->hspcnt;
9451 search->current_hitlist = real_hitlist;
9452 tmp_hitlist->lh_helper = MemFree(tmp_hitlist->lh_helper);
9453 MemFree(tmp_hitlist);
9454 }
9455
9456 new_hspcnt = HspArrayPurge(hsp_array, hspcnt, FALSE);
9457
9458 HeapSort(hsp_array,new_hspcnt,sizeof(BLAST_HSPPtr), score_compare_hsps);
9459
9460 /* Remove extra HSPs if there is a user proveded limit on the number
9461 of HSPs per database sequence */
9462 if (search->pbp->hsp_num_max > new_hspcnt) {
9463 for (index=new_hspcnt; index<search->pbp->hsp_num_max; ++index) {
9464 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
9465 }
9466 new_hspcnt = MIN(new_hspcnt, search->pbp->hsp_num_max);
9467 }
9468
9469 if (do_traceback) {
9470 for (index=0; index<new_hspcnt; index++) {
9471 hsp = hsp_array[index];
9472 hsp->gap_info->reverse = reverse;
9473 hsp->gap_info->original_length1 = search->context[hsp->context].query->original_length;
9474 hsp->gap_info->original_length2 = subject_length;
9475 if (search->pbp->mb_params) {
9476 query_id = search->qid_array[hsp->context/2];
9477 } else {
9478 query_id = search->query_id;
9479 }
9480 CopyHSPToResultHsp(search->sbp->kbp_gap[search->first_context],
9481 hsp, &result_hsp);
9482
9483 if (new_subject_seqid) {
9484 if (search->pbp->explode_seqids)
9485 seqid_tmp = gi_list;
9486 else
9487 seqid_tmp = new_subject_seqid;
9488 } else {
9489 seqid_tmp = subject_id;
9490 }
9491
9492 while (seqid_tmp) {
9493 if(search->pbp->is_ooframe) {
9494 seqalign = OOFGapXEditBlockToSeqAlign(hsp->gap_info, seqid_tmp, query_id, hsp->query.frame > 0 ? search->query_dnap[0]->length : search->query_dnap[1]->length);
9495 } else {
9496 seqalign = GapXEditBlockToSeqAlign(hsp->gap_info, seqid_tmp, query_id);
9497 }
9498
9499 seqalign->score = GetScoreSetFromBlastResultHsp(&result_hsp, gi_list);
9500
9501 if (seqalign_array[index] == NULL)
9502 seqalign_array[index] = seqalign;
9503 else {
9504 seqalign_var = seqalign_array[index];
9505 while (seqalign_var->next)
9506 seqalign_var = seqalign_var->next;
9507 seqalign_var->next = seqalign;
9508 }
9509 seqid_tmp = seqid_tmp->next;
9510 }
9511 }
9512
9513 *head = NULL;
9514 for (index=0; index<new_hspcnt; index++) {
9515 if (seqalign_array[index] != NULL) {
9516 if (*head == NULL) {
9517 *head = seqalign_array[index];
9518 } else {
9519 for (seqalign_var=*head; seqalign_var->next != NULL;) {
9520 seqalign_var = seqalign_var->next;
9521 }
9522 seqalign_var->next = seqalign_array[index];
9523 }
9524 }
9525 }
9526
9527 seqalign_array = MemFree(seqalign_array);
9528 } else {
9529 if (bhrp)
9530 BlastHitRangeLoad(search, hsp_array, new_hspcnt, bhrp);
9531 }
9532
9533 gi_list = SeqIdSetFree(gi_list);
9534 if (new_subject_seqid)
9535 new_subject_seqid = SeqIdSetFree(new_subject_seqid);
9536
9537 if ((StringCmp(search->prog_name, "tblastn") == 0 ||
9538 StringCmp(search->prog_name, "psitblastn") == 0)&&
9539 translated_subject_orig) {
9540 for (index=0; index<8; index++) {
9541 MemFree(translated_subject_orig[index]);
9542 }
9543 MemFree(translated_subject_orig);
9544 MemFree(translated_subject_length_orig);
9545 }
9546
9547 return new_hspcnt;
9548 }
9549
9550
9551 /*
9552 find the traceback for a gapped alignment. Do this by
9553 organizing the list of HSP's by sum group, then order
9554 these groups by score. Then attempt to perform the alignment
9555 by using the highest scoring HSP of every sum group, then the
9556 2nd highest scoring HSP, etc. until all the HSP's of a sum
9557 group have been examined. Then move onto the next sum group.
9558 */
9559 SeqAlignPtr LIBCALL
SumBlastGetGappedAlignmentTraceback(BlastSearchBlkPtr search,Int4 hit_number,Boolean reverse,Boolean ordinal_number,Uint1Ptr subject,Int4 subject_length)9560 SumBlastGetGappedAlignmentTraceback (BlastSearchBlkPtr search, Int4 hit_number, Boolean reverse, Boolean ordinal_number, Uint1Ptr subject, Int4 subject_length)
9561
9562 {
9563 SeqAlignPtr seqalign;
9564
9565 SumBlastGetGappedAlignmentEx(search, hit_number, reverse, ordinal_number, subject, subject_length, TRUE, &seqalign, NULL, 0);
9566
9567 return seqalign;
9568 }
9569
9570 Boolean LIBCALL
SumBlastGetGappedAlignmentEx(BlastSearchBlkPtr search,Int4 hit_number,Boolean reverse,Boolean ordinal_number,Uint1Ptr subject,Int4 subject_length,Boolean do_traceback,SeqAlignPtr PNTR seqalignP,BlastHitRangePtr bhrp,Int2 query_number)9571 SumBlastGetGappedAlignmentEx (BlastSearchBlkPtr search, Int4 hit_number, Boolean reverse, Boolean ordinal_number, Uint1Ptr subject, Int4 subject_length, Boolean do_traceback, SeqAlignPtr PNTR seqalignP, BlastHitRangePtr bhrp, Int2 query_number)
9572
9573 {
9574 BLAST_HSPPtr PNTR hsp_array;
9575 BLASTResultHitlistPtr result_hitlist;
9576 BLASTResultHspPtr result_hsp_array=NULL, hsp;
9577 Boolean not_done;
9578 Int4 hspcnt=0, new_hspcnt=0, hspset_cnt_old;
9579 Int4 index, index1, high_score=0, ordinal_id, next_start, start, stop;
9580 SeqAlignPtr seqalign=NULL;
9581 SeqIdPtr subject_id=NULL, sip, subject_id_var;
9582 Nlm_FloatHi current_evalue=DBL_MAX;
9583 ValNodePtr vnp, vnp_start;
9584 BLASTResultsStructPtr result_struct;
9585 Boolean is_megablast = (search->pbp->mb_params != NULL);
9586
9587 if (search == NULL)
9588 return FALSE;
9589
9590 if (is_megablast) {
9591 result_struct = search->mb_result_struct[query_number];
9592 } else {
9593 result_struct = search->result_struct;
9594 }
9595 result_hitlist = result_struct->results[hit_number];
9596 hspcnt = result_hitlist->hspcnt;
9597
9598 if (search->pbp->explode_seqids)
9599 { /* Obtain and connect all SeqId's if explode demanded. */
9600 vnp = NULL;
9601 if (is_megablast)
9602 BlastGetSubjectIdEx(search, hit_number,
9603 ordinal_number, &vnp, query_number);
9604 else
9605 BlastGetSubjectId(search, hit_number, ordinal_number, &vnp);
9606 vnp_start = vnp;
9607 while (vnp)
9608 {
9609 sip = GetTheSeqAlignID(vnp->data.ptrvalue);
9610 SeqIdFree(vnp->data.ptrvalue);
9611 if (subject_id == NULL)
9612 {
9613 subject_id = sip;
9614 }
9615 else
9616 {
9617 subject_id_var = subject_id;
9618 while (subject_id_var->next)
9619 subject_id_var = subject_id_var->next;
9620 subject_id_var->next = sip;
9621 }
9622 vnp = vnp->next;
9623 }
9624 vnp_start = vnp = ValNodeFree(vnp_start);
9625 }
9626 else
9627 {
9628 sip = BlastGetSubjectIdEx(search, hit_number, ordinal_number,
9629 NULL, query_number);
9630 subject_id = GetTheSeqAlignID(sip);
9631 sip = SeqIdSetFree(sip);
9632 }
9633 ordinal_id = result_hitlist->subject_id;
9634
9635 hsp_array = MemNew(hspcnt*sizeof(BLAST_HSPPtr));
9636 not_done = TRUE;
9637 start=0;
9638 next_start=0;
9639 while (not_done)
9640 {
9641 hsp = &(result_hitlist->hsp_array[start]);
9642 hspset_cnt_old = hsp->hspset_cnt;
9643 for (index=start; index<hspcnt; index++)
9644 {
9645 hsp = &(result_hitlist->hsp_array[index]);
9646 if(hspset_cnt_old != hsp->hspset_cnt)
9647 {
9648 hspset_cnt_old = hsp->hspset_cnt;
9649 stop = index;
9650 next_start = stop;
9651 break;
9652 }
9653 }
9654
9655 if (index == hspcnt)
9656 {
9657 stop = hspcnt;
9658 not_done = FALSE;
9659 }
9660
9661 index1=0;
9662 for (index=start; index<stop; index++)
9663 {
9664 hsp_array[index] = MemNew(sizeof(BLAST_HSP));
9665 CopyResultHspToHSP(&(result_hitlist->hsp_array[index]), hsp_array[index]);
9666 index1++;
9667 }
9668
9669 /* heap sort the last sum group */
9670 HeapSort(hsp_array+start,(stop-start),sizeof(BLAST_HSPPtr), score_compare_hsps);
9671 start = next_start;
9672 }
9673
9674 new_hspcnt = RealBlastGetGappedAlignmentTraceback(search, subject, subject_length, NULL, 0, subject_id, hsp_array, hspcnt, &seqalign, bhrp, search->pbp->cutoff_s, reverse, ordinal_id, do_traceback);
9675
9676 /* Save HSP's again, discarding those that have been NULLed out. */
9677 /* If no HSP's were valid, best_evalue is set to DBL_MAX. */
9678 index1 = 0;
9679 if (new_hspcnt > 0)
9680 {
9681 result_hsp_array = MemNew((new_hspcnt)*sizeof(BLASTResultHsp));
9682 index1 = 0;
9683 for (index=0; index<hspcnt; index++)
9684 {
9685 if (hsp_array[index] != NULL)
9686 {
9687 if (current_evalue > hsp_array[index]->evalue)
9688 current_evalue = hsp_array[index]->evalue;
9689 if (high_score < hsp_array[index]->score)
9690 high_score = hsp_array[index]->score;
9691 CopyHSPToResultHsp(search->sbp->kbp_gap[search->first_context], hsp_array[index], &(result_hsp_array[index1]));
9692 index1++;
9693 /* Do not free edit block, just the
9694 BLAST_HSP structure. */
9695 hsp_array[index] = MemFree(hsp_array[index]);
9696 }
9697 }
9698 }
9699 hsp_array = MemFree(hsp_array);
9700
9701 result_hitlist->hspcnt = index1;
9702 if (result_hitlist->hsp_array)
9703 MemFree(result_hitlist->hsp_array);
9704 result_hitlist->hsp_array = result_hsp_array;
9705 result_hitlist->best_evalue = current_evalue;
9706 result_hitlist->high_score = high_score;
9707
9708 subject_id = SeqIdSetFree(subject_id);
9709
9710 if (seqalignP)
9711 * seqalignP = seqalign;
9712
9713 return TRUE;
9714 }
9715
9716 /*
9717 Performs a gapped alignment on the HSP's in a hitlist.
9718 Discards those that do not meet the standard.
9719 */
9720
9721 SeqAlignPtr LIBCALL
BlastGetGapAlgnTbck(BlastSearchBlkPtr search,Int4 hit_number,Boolean reverse,Boolean ordinal_number,Uint1Ptr subject,Int4 subject_length,Uint1Ptr rev_subject,Int4 rev_subject_length)9722 BlastGetGapAlgnTbck (BlastSearchBlkPtr search, Int4 hit_number, Boolean reverse, Boolean ordinal_number, Uint1Ptr subject, Int4 subject_length, Uint1Ptr rev_subject, Int4 rev_subject_length)
9723
9724 {
9725 BLAST_HSPPtr PNTR hsp_array;
9726 BLASTResultHitlistPtr result_hitlist;
9727 BLASTResultHspPtr result_hsp_array=NULL;
9728 Int4 hspcnt=0, new_hspcnt=0;
9729 Int4 index, index1, high_score=0, ordinal_id;
9730 SeqAlignPtr seqalign, head, seqalign_var;
9731 SeqIdPtr subject_id=NULL, sip, subject_id_var;
9732 Nlm_FloatHi current_evalue=DBL_MAX;
9733 ValNodePtr vnp, vnp_start;
9734
9735 if (search == NULL)
9736 return NULL;
9737
9738 result_hitlist = search->result_struct->results[hit_number];
9739 hspcnt = result_hitlist->hspcnt;
9740 ordinal_id = result_hitlist->subject_id;
9741
9742 if (search->pbp->explode_seqids)
9743 { /* Obtain and connect all SeqId's if explode demanded. */
9744 vnp = NULL;
9745 BlastGetSubjectId(search, hit_number, ordinal_number, &vnp);
9746 vnp_start = vnp;
9747 while (vnp)
9748 {
9749 sip = GetTheSeqAlignID(vnp->data.ptrvalue);
9750 SeqIdFree(vnp->data.ptrvalue);
9751 if (subject_id == NULL)
9752 {
9753 subject_id = sip;
9754 }
9755 else
9756 {
9757 subject_id_var = subject_id;
9758 while (subject_id_var->next)
9759 subject_id_var = subject_id_var->next;
9760 subject_id_var->next = sip;
9761 }
9762 vnp = vnp->next;
9763 }
9764 vnp_start = vnp = ValNodeFree(vnp_start);
9765 }
9766 else
9767 {
9768 sip = BlastGetSubjectId(search, hit_number, ordinal_number, NULL);
9769 subject_id = GetTheSeqAlignID(sip);
9770 sip = SeqIdSetFree(sip);
9771 }
9772
9773 head = NULL;
9774
9775 hsp_array = MemNew(hspcnt*sizeof(BLAST_HSPPtr));
9776 for (index=0; index<hspcnt; index++)
9777 {
9778 hsp_array[index] = MemNew(sizeof(BLAST_HSP));
9779 CopyResultHspToHSP(&(result_hitlist->hsp_array[index]),
9780 hsp_array[index]);
9781 }
9782 HeapSort(hsp_array,hspcnt,sizeof(BLAST_HSPPtr), score_compare_hsps);
9783
9784 new_hspcnt = RealBlastGetGappedAlignmentTraceback(search, subject, subject_length, rev_subject, rev_subject_length, subject_id, hsp_array, hspcnt, &seqalign, NULL, 0, reverse, ordinal_id, TRUE);
9785 if (seqalign != NULL)
9786 {
9787 if (head == NULL)
9788 {
9789 head = seqalign;
9790 }
9791 else
9792 {
9793 for (seqalign_var=head; seqalign_var->next != NULL;)
9794 {
9795 seqalign_var = seqalign_var->next;
9796 }
9797 seqalign_var->next = seqalign;
9798 }
9799 }
9800
9801 /* Save HSP's again, discarding those that have been NULLed out. */
9802 result_hsp_array = MemNew((new_hspcnt)*sizeof(BLASTResultHsp));
9803 index1 = 0;
9804 for (index=0; index<hspcnt; index++)
9805 {
9806 if (hsp_array[index] != NULL)
9807 {
9808 if (current_evalue > hsp_array[index]->evalue)
9809 current_evalue = hsp_array[index]->evalue;
9810 if (high_score < hsp_array[index]->score)
9811 high_score = hsp_array[index]->score;
9812
9813 CopyHSPToResultHsp(search->sbp->kbp_gap[search->first_context], hsp_array[index], &(result_hsp_array[index1]));
9814 index1++;
9815 /* Do not free edit block, just the BLAST_HSP
9816 structure */
9817 hsp_array[index] = MemFree(hsp_array[index]);
9818 }
9819 }
9820 hsp_array = MemFree(hsp_array);
9821
9822 if (result_hitlist->hsp_array) {
9823 /* Delete any edit blocks from a previous traceback. */
9824 for (index=0; index< result_hitlist->hspcnt; ++index)
9825 GapXEditBlockDelete(result_hitlist->hsp_array[index].gap_info);
9826
9827 MemFree(result_hitlist->hsp_array);
9828 }
9829 result_hitlist->hspcnt = index1;
9830 result_hitlist->hsp_array = result_hsp_array;
9831 result_hitlist->best_evalue = current_evalue;
9832 result_hitlist->high_score = high_score;
9833
9834 subject_id = SeqIdSetFree(subject_id);
9835
9836 return head;
9837 }
9838
9839 /*
9840 Performs a gapped alignment on the HSP's in a hitlist.
9841 Discards those that do not meet the standard.
9842 */
9843
9844 Int2 LIBCALL
BlastPreliminaryGappedScore(BlastSearchBlkPtr search,Uint1Ptr subject,Int4 subject_length,Int2 frame)9845 BlastPreliminaryGappedScore (BlastSearchBlkPtr search, Uint1Ptr subject, Int4 subject_length, Int2 frame)
9846
9847 {
9848 BLAST_HitListPtr hitlist;
9849 BLAST_HSPPtr hsp;
9850 BLAST_HSPPtr PNTR hsp_array;
9851 GapAlignBlkPtr gap_align;
9852 Int2 status;
9853 Int4 index, max_offset = 0, query_length, min_score;
9854 BLAST_ParameterBlkPtr pbp;
9855
9856 if (search == NULL)
9857 return 1;
9858
9859 pbp = search->pbp;
9860
9861 if (search->gap_align == NULL)
9862 {
9863 search->gap_align = GapAlignBlkNew(1, 1);
9864 }
9865 gap_align = search->gap_align;
9866
9867 min_score = search->pbp->cutoff_s1;
9868
9869 status = 0;
9870 hitlist = search->current_hitlist;
9871 if (hitlist && hitlist->hspcnt > 0)
9872 {
9873 query_length = search->context[search->first_context].query->length;
9874
9875 hitlist->hspcnt_max = hitlist->hspcnt;
9876 hsp_array = hitlist->hsp_array;
9877 if (frame != 0)
9878 {
9879 for (index=0; index<hitlist->hspcnt; index++)
9880 {
9881 hsp = hsp_array[index];
9882 if (frame == hsp->subject.frame)
9883 break;
9884 }
9885 if (frame != hsp->subject.frame)
9886 return 0;
9887 }
9888 else
9889 { /* The first HSP has the highest score. */
9890 hsp = hsp_array[0];
9891 }
9892
9893 /* The first HSP has the highest score. */
9894 /*
9895 e_value = BlastKarlinStoE_simple(hsp->score, search->sbp->kbp_gap[search->first_context], search->searchsp_eff);
9896 */
9897 if (hsp->score >= min_score)
9898 {
9899 #ifdef BLAST_COLLECT_STATS
9900 search->prelim_gap_no_contest++;
9901 #endif
9902 hitlist->further_process = TRUE;
9903 return 1;
9904 }
9905 gap_align->is_ooframe = pbp->is_ooframe;
9906 gap_align->shift_pen = pbp->shift_pen;
9907 gap_align->discontinuous = pbp->discontinuous;
9908 gap_align->positionBased =
9909 (search->positionBased && search->sbp->posMatrix);
9910 gap_align->include_query = 0;
9911 gap_align->gap_open = pbp->gap_open;
9912 gap_align->gap_extend = pbp->gap_extend;
9913 gap_align->decline_align = pbp->decline_align;
9914 gap_align->x_parameter = pbp->gap_x_dropoff;
9915 gap_align->matrix = search->sbp->matrix;
9916 gap_align->posMatrix = search->sbp->posMatrix;
9917 for (index=0; index<hitlist->hspcnt; index++)
9918 {
9919 hsp = hsp_array[index];
9920 if (frame != 0)
9921 {
9922 if (frame != hsp->subject.frame)
9923 continue;
9924 }
9925
9926 if (hsp->score < search->pbp->gap_trigger)
9927 { /* Stop looking, we're below the cutoff. */
9928 status = 0;
9929 break;
9930 }
9931
9932 #ifdef BLAST_COLLECT_STATS
9933 search->prelim_gap_attempts++;
9934 #endif
9935 gap_align->score = 0;
9936
9937 if(!search->pbp->is_ooframe) {
9938 max_offset = GetStartForGappedAlignment(search, hsp, search->context[hsp->context].query->sequence, subject, search->sbp->matrix);
9939 }
9940
9941 if(search->pbp->is_ooframe) {
9942 gap_align->is_ooframe = TRUE;
9943 gap_align->query = subject;
9944
9945 if(hsp->query.frame > 0) {
9946 gap_align->subject = search->query_dnap[0]->sequence;
9947 gap_align->subject_length = search->query_dnap[0]->length;
9948 } else {
9949 gap_align->subject = search->query_dnap[1]->sequence;
9950 gap_align->subject_length = search->query_dnap[1]->length;
9951 }
9952
9953 gap_align->query_frame = hsp->subject.frame;
9954 gap_align->subject_frame = ContextToFrame(search, hsp->context);
9955
9956 gap_align->query_length = subject_length;
9957 gap_align->q_start = hsp->subject.offset;
9958 gap_align->s_start = hsp->query.offset;
9959 } else {
9960 gap_align->query = search->context[hsp->context].query->sequence;
9961 gap_align->subject = subject;
9962 gap_align->query_length = search->context[hsp->context].query->length;
9963 gap_align->subject_length = subject_length;
9964 gap_align->q_start = max_offset;
9965 gap_align->s_start = (hsp->subject.offset - hsp->query.offset) + max_offset;
9966 }
9967
9968 gap_align->include_query = 0;
9969
9970 /* Perform only if the query's required start corresponds to a point after the start of the subject. */
9971 if (gap_align->s_start >= 0)
9972 PerformGappedAlignment(gap_align);
9973 /*
9974 e_value = BlastKarlinStoE_simple(gap_align->score, search->sbp->kbp_gap[search->first_context], search->searchsp_eff);
9975 */
9976 if (gap_align->score >= min_score)
9977 { /* Found one, stop looking. */
9978 hitlist->further_process = TRUE;
9979 status = 1;
9980 #ifdef BLAST_COLLECT_STATS
9981 search->prelim_gap_passed++;
9982 #endif
9983 break;
9984 }
9985 }
9986 }
9987
9988 return status;
9989 }
9990
9991 /*
9992 Performs a gapped alignment on the HSP's in a hitlist.
9993 This is to be used with blastn assuming the database sequence
9994 will be unpacked on the fly.
9995 Discards those that do not meet the standard.
9996 */
9997
9998 Int2 LIBCALL
BlastNTPreliminaryGappedScore(BlastSearchBlkPtr search,Uint1Ptr subject,Int4 subject_length)9999 BlastNTPreliminaryGappedScore (BlastSearchBlkPtr search, Uint1Ptr subject, Int4 subject_length)
10000
10001 {
10002 BLAST_HitListPtr hitlist;
10003 BLAST_HSPPtr hsp;
10004 BLAST_HSPPtr PNTR hsp_array;
10005 GapAlignBlkPtr gap_align;
10006 Int2 status;
10007 Int4 index;
10008 Nlm_FloatHi e_value;
10009 BLAST_ParameterBlkPtr pbp;
10010 /* AM: To support query concatenation. */
10011 Int4 query_num;
10012
10013 if (search == NULL)
10014 return -1;
10015
10016 pbp = search->pbp;
10017
10018 if (search->gap_align == NULL)
10019 {
10020 search->gap_align = GapAlignBlkNew(1, 1);
10021 }
10022 gap_align = search->gap_align;
10023
10024 status = 0;
10025 hitlist = search->current_hitlist;
10026 if (hitlist && hitlist->hspcnt > 0)
10027 {
10028
10029 hitlist->hspcnt_max = hitlist->hspcnt;
10030 hsp_array = hitlist->hsp_array;
10031
10032 /* The first HSP has the highest score. */
10033 hsp = hsp_array[0];
10034
10035 /* AM: Changed to support query concatenation. */
10036 /* The first HSP has the highest score. */
10037 if( !search->mult_queries )
10038 e_value = BlastKarlinStoE_simple(hsp->score, search->sbp->kbp[search->first_context], search->searchsp_eff);
10039 else
10040 {
10041 /* AM: First determine which query to use, then use the
10042 corresponding SearchSpEff element in the call to
10043 BlastKarlinStoE_simple() */
10044 query_num = GetQueryNum( search->mult_queries,
10045 hsp->query.offset,
10046 hsp->query.end,
10047 hsp->query.frame );
10048 e_value = BlastKarlinStoE_simple( hsp->score,
10049 search->sbp->kbp[search->first_context],
10050 search->mult_queries->SearchSpEff[query_num] );
10051 }
10052
10053 if (e_value <= pbp->cutoff_e)
10054 {
10055 #ifdef BLAST_COLLECT_STATS
10056 search->prelim_gap_no_contest++;
10057 #endif
10058 hitlist->further_process = TRUE;
10059 return 1;
10060 }
10061
10062 gap_align->is_ooframe = pbp->is_ooframe;
10063 gap_align->shift_pen = pbp->shift_pen;
10064 gap_align->positionBased = search->positionBased;
10065 gap_align->discontinuous = pbp->discontinuous;
10066 gap_align->include_query = 0;
10067 gap_align->gap_open = pbp->gap_open;
10068 gap_align->gap_extend = pbp->gap_extend;
10069 gap_align->decline_align = pbp->decline_align;
10070 gap_align->x_parameter = pbp->gap_x_dropoff;
10071 gap_align->matrix = search->sbp->matrix;
10072 gap_align->posMatrix = search->sbp->posMatrix;
10073 for (index=0; index<hitlist->hspcnt; index++)
10074 {
10075 hsp = hsp_array[index];
10076
10077 if (hsp->score < search->pbp->gap_trigger)
10078 { /* Stop looking, we're below the cutoff. */
10079 status = 0;
10080 break;
10081 }
10082
10083 #ifdef BLAST_COLLECT_STATS
10084 search->prelim_gap_attempts++;
10085 #endif
10086 gap_align->score = 0;
10087 gap_align->query = search->context[hsp->context].query->sequence;
10088 gap_align->subject = subject;
10089 gap_align->query_length = search->context[hsp->context].query->length;
10090 gap_align->subject_length = subject_length;
10091 gap_align->include_query = 0;
10092 gap_align->q_start = hsp->query.gapped_start;
10093 gap_align->s_start = hsp->subject.gapped_start;
10094 /* Perform only if the query's required start corresponds to a point after the start of the subject. */
10095 if (gap_align->s_start >= 0) {
10096 if (!PerformNtGappedAlignment(gap_align))
10097 return -1;
10098 }
10099
10100 /* AM: Change to support query concatenation */
10101 if( !search->mult_queries )
10102 e_value = BlastKarlinStoE_simple(gap_align->score, search->sbp->kbp[search->first_context], search->searchsp_eff);
10103 else
10104 {
10105 query_num = GetQueryNum( search->mult_queries,
10106 hsp->query.offset,
10107 hsp->query.end,
10108 hsp->query.frame );
10109 e_value = BlastKarlinStoE_simple(gap_align->score,
10110 search->sbp->kbp[search->first_context],
10111 search->mult_queries->SearchSpEff[query_num]);
10112 }
10113
10114 if (e_value <= pbp->cutoff_e)
10115 { /* Found one, stop looking. */
10116 hitlist->further_process = TRUE;
10117 status = 1;
10118 #ifdef BLAST_COLLECT_STATS
10119 search->prelim_gap_passed++;
10120 #endif
10121 break;
10122 }
10123 }
10124 }
10125
10126 return status;
10127 }
10128
10129 /*
10130 Performs a gapped alignment on the HSP's in a hitlist.
10131 Discards those that do not meet the standard.
10132 Do this by obtaining the sequence from readdb and calling
10133 BlastGetGappedScore.
10134 */
10135
10136 Int2 LIBCALL
BlastGetGappedScoreWithReaddb(BlastSearchBlkPtr search,Int4 sequence_number)10137 BlastGetGappedScoreWithReaddb (BlastSearchBlkPtr search, Int4 sequence_number)
10138
10139 {
10140 BLAST_HitListPtr hitlist;
10141 Int2 retval;
10142 Int4 subject_length;
10143 Uint1Ptr subject;
10144
10145 if (search == NULL)
10146 return 1;
10147
10148 retval=0;
10149 hitlist = search->current_hitlist;
10150 if (hitlist && hitlist->hspcnt > 0)
10151 {
10152 if (hitlist->further_process == FALSE)
10153 {
10154 BlastHitListPurge(hitlist);
10155 return 0;
10156 }
10157 subject_length = readdb_get_sequence(search->rdfp, sequence_number, &subject);
10158 retval = BlastGetGappedScore(search, subject_length, subject, 0);
10159 }
10160
10161 return retval;
10162 }
10163
10164
10165 /*
10166 Performs a gapped alignment on the HSP's in a hitlist.
10167 Discards those that do not meet the standard.
10168 */
10169
10170 Int2 LIBCALL
BlastGetGappedScore(BlastSearchBlkPtr search,Int4 subject_length,Uint1Ptr subject,Int2 frame)10171 BlastGetGappedScore (BlastSearchBlkPtr search, Int4 subject_length, Uint1Ptr subject, Int2 frame)
10172
10173 {
10174 BLAST_HitListPtr hitlist;
10175 BLAST_HSPPtr PNTR hsp_array, PNTR hsp_array_new;
10176 BLAST_ParameterBlkPtr pbp;
10177 GapAlignBlkPtr gap_align;
10178 Int2 status=0;
10179 Int4 hsp_cnt=0, hspcnt_max;
10180 Int4 index, index1;
10181
10182 if (search == NULL)
10183 return 1;
10184
10185 pbp = search->pbp;
10186
10187
10188 if (search->gap_align == NULL)
10189 {
10190 search->gap_align = GapAlignBlkNew(1, 1);
10191 }
10192 gap_align = search->gap_align;
10193
10194 hitlist = search->current_hitlist;
10195 if (hitlist && hitlist->hspcnt > 0)
10196 {
10197 if (hitlist->further_process == FALSE)
10198 {
10199 BlastHitListPurge(hitlist);
10200 return 0;
10201 }
10202
10203
10204 hsp_array = hitlist->hsp_array;
10205 if (hitlist->hspcnt != hitlist->hspcnt_max)
10206 {
10207 /* Save HSP's again, discarding those that have been NULLed out. */
10208 hsp_array_new = MemNew((hitlist->hspmax)*sizeof(BLAST_HSPPtr));
10209 index1 = 0;
10210 for (index=0; index<hitlist->hspcnt_max; index++)
10211 {
10212 if (hsp_array[index] != NULL)
10213 {
10214 hsp_array_new[index1] = hsp_array[index];
10215 index1++;
10216 }
10217 }
10218 hsp_array = MemFree(hsp_array);
10219 hsp_array = hsp_array_new;
10220 hitlist->hsp_array = hsp_array_new;
10221 hitlist->hspcnt = index1;
10222 hitlist->hspcnt_max = index1;
10223 }
10224
10225 gap_align->is_ooframe = pbp->is_ooframe;
10226 gap_align->shift_pen = pbp->shift_pen;
10227 gap_align->discontinuous = pbp->discontinuous;
10228 gap_align->positionBased =
10229 (search->positionBased && search->sbp->posMatrix);
10230 gap_align->include_query = 0;
10231 gap_align->gap_open = pbp->gap_open;
10232 gap_align->gap_extend = pbp->gap_extend;
10233 gap_align->decline_align = pbp->decline_align;
10234 gap_align->x_parameter = pbp->gap_x_dropoff;
10235 gap_align->matrix = search->sbp->matrix;
10236 gap_align->posMatrix = search->sbp->posMatrix;
10237
10238 if (frame != 0)
10239 {
10240 hsp_array = hitlist->hsp_array;
10241 switch (frame) {
10242 case -3:
10243 HeapSort(hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr), frame_compare_hsp_m3);
10244 break;
10245 case -2:
10246 HeapSort(hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr), frame_compare_hsp_m2);
10247 break;
10248 case -1:
10249 HeapSort(hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr), frame_compare_hsp_m1);
10250 break;
10251 case 1:
10252 HeapSort(hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr), frame_compare_hsp_p1);
10253 break;
10254 case 2:
10255 HeapSort(hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr), frame_compare_hsp_p2);
10256 break;
10257 case 3:
10258 HeapSort(hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr), frame_compare_hsp_p3);
10259 break;
10260 default:
10261 break;
10262 }
10263
10264 for (index=0; index<hitlist->hspcnt; index++)
10265 {
10266 if (hsp_array[index]->subject.frame != frame)
10267 break;
10268 }
10269 HeapSort(hsp_array,index,sizeof(BLAST_HSPPtr), score_compare_hsps);
10270 }
10271 else
10272 {
10273 HeapSort(hsp_array,hitlist->hspcnt,sizeof(BLAST_HSPPtr), score_compare_hsps);
10274 }
10275 hitlist->hspcnt_max = hitlist->hspcnt;
10276 hsp_array = hitlist->hsp_array;
10277 hspcnt_max = hitlist->hspcnt;
10278 hsp_cnt = hitlist->hspcnt;
10279
10280 start_timer;
10281 hsp_array = BlastGappedScoreInternal(search, subject, subject_length, gap_align, hsp_array, &hsp_cnt, &hspcnt_max, hitlist->hspmax, frame);
10282 stop_timer("after BlastGappedScoreInternal");
10283 hitlist->hspcnt = hsp_cnt;
10284 hitlist->hspcnt_max = hspcnt_max;
10285 hitlist->hsp_array = hsp_array;
10286 }
10287
10288 return status;
10289 }
10290
10291 /*
10292 Performs a gapped alignment on the HSP's in a hitlist.
10293 Discards those that do not meet the standard.
10294 */
10295
10296 Int2 LIBCALL
BlastNTGetGappedScore(BlastSearchBlkPtr search,Int4 subject_length,Uint1Ptr subject)10297 BlastNTGetGappedScore (BlastSearchBlkPtr search, Int4 subject_length, Uint1Ptr subject)
10298
10299 {
10300 BLAST_HitListPtr hitlist;
10301 BLAST_HSPPtr PNTR hsp_array, PNTR hsp_array_new;
10302 BLAST_ParameterBlkPtr pbp;
10303 GapAlignBlkPtr gap_align;
10304 Int2 status=0;
10305 Int4 hsp_cnt=0, hspcnt_max;
10306 Int4 index, index1;
10307
10308 if (search == NULL)
10309 return -1;
10310
10311 pbp = search->pbp;
10312
10313
10314 if (search->gap_align == NULL)
10315 {
10316 search->gap_align = GapAlignBlkNew(1, 1);
10317 }
10318 gap_align = search->gap_align;
10319
10320 hitlist = search->current_hitlist;
10321 if (hitlist && hitlist->hspcnt > 0)
10322 {
10323 if (hitlist->further_process == FALSE)
10324 {
10325 BlastHitListPurge(hitlist);
10326 return 0;
10327 }
10328
10329 hsp_array = hitlist->hsp_array;
10330 if (hitlist->hspcnt != hitlist->hspcnt_max)
10331 {
10332 /* Save HSP's again, discarding those that have been NULLed out. */
10333 hsp_array_new = MemNew((hitlist->hspmax)*sizeof(BLAST_HSPPtr));
10334 index1 = 0;
10335 for (index=0; index<hitlist->hspcnt_max; index++)
10336 {
10337 if (hsp_array[index] != NULL)
10338 {
10339 hsp_array_new[index1] = hsp_array[index];
10340 index1++;
10341 }
10342 }
10343 hsp_array = MemFree(hsp_array);
10344 hsp_array = hsp_array_new;
10345 hitlist->hsp_array = hsp_array_new;
10346 hitlist->hspcnt = index1;
10347 hitlist->hspcnt_max = index1;
10348 }
10349
10350 gap_align->is_ooframe = pbp->is_ooframe;
10351 gap_align->shift_pen = pbp->shift_pen;
10352 gap_align->discontinuous = pbp->discontinuous;
10353 gap_align->positionBased = search->positionBased;
10354 gap_align->include_query = 0;
10355 gap_align->gap_open = pbp->gap_open;
10356 gap_align->gap_extend = pbp->gap_extend;
10357 gap_align->decline_align = pbp->decline_align;
10358 gap_align->x_parameter = pbp->gap_x_dropoff;
10359 gap_align->matrix = search->sbp->matrix;
10360 gap_align->posMatrix = search->sbp->posMatrix;
10361
10362 HeapSort(hsp_array,hitlist->hspcnt,sizeof(BLAST_HSPPtr), score_compare_hsps);
10363 hitlist->hspcnt_max = hitlist->hspcnt;
10364 hsp_array = hitlist->hsp_array;
10365 hspcnt_max = hitlist->hspcnt;
10366 hsp_cnt = hitlist->hspcnt;
10367
10368 if (!BlastNtGappedScoreInternal(search, subject, subject_length, gap_align, hsp_array, &hsp_cnt, &hspcnt_max, hitlist->hspmax))
10369 /* Gapped extension failed */
10370 return -1;
10371 hitlist->hspcnt = hsp_cnt;
10372 hitlist->hspcnt_max = hspcnt_max;
10373 hitlist->hsp_array = hsp_array;
10374 }
10375
10376 return status;
10377 }
10378
10379
10380 /******************************************************************
10381
10382 Purges (i.e., cleans) the HitList for reuse.
10383
10384 *******************************************************************/
10385
10386 Int2 LIBCALL
BlastHitListPurge(BLAST_HitListPtr hitlist)10387 BlastHitListPurge(BLAST_HitListPtr hitlist)
10388
10389 {
10390 BLAST_HSPPtr PNTR hsp_array;
10391 Int4 hspcnt_max, index;
10392
10393 if (hitlist == NULL)
10394 return 1;
10395
10396 hsp_array = hitlist->hsp_array;
10397
10398 if (hitlist->hspcnt > hitlist->hspcnt_max)
10399 hspcnt_max = hitlist->hspcnt;
10400 else
10401 hspcnt_max = hitlist->hspcnt_max;
10402
10403 for (index=0; index<hspcnt_max; index++) {
10404 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
10405 }
10406
10407 hitlist->hspcnt = 0;
10408 hitlist->hspcnt_max = 0;
10409 hitlist->further_process = FALSE;
10410
10411 return 0;
10412 }
10413
10414 /*
10415 Cleans out the NULLed out HSP's from the HSP array,
10416 moving the BLAST_HSPPtr's up to fill in the gaps.
10417
10418 returns the number of valid HSP's.
10419 */
10420
10421 Int4 LIBCALL
HspArrayPurge(BLAST_HSPPtr PNTR hsp_array,Int4 hspcnt,Boolean clear_num)10422 HspArrayPurge (BLAST_HSPPtr PNTR hsp_array, Int4 hspcnt, Boolean clear_num)
10423
10424 {
10425 Int4 index, index1;
10426
10427 if (hspcnt == 0 || hsp_array == NULL)
10428 return 0;
10429
10430 index1 = 0;
10431 for (index=0; index<hspcnt; index++)
10432 {
10433 if (hsp_array[index] != NULL)
10434 {
10435 hsp_array[index1] = hsp_array[index];
10436 if (clear_num)
10437 hsp_array[index1]->num = 0;
10438 index1++;
10439 }
10440 }
10441
10442 for (index=index1; index<hspcnt; index++)
10443 {
10444 hsp_array[index] = NULL;
10445 }
10446
10447 hspcnt = index1;
10448
10449 return index1;
10450 }
10451
10452
/*
    Rewrites the query coordinates of an HSP in place: offset, end and
    gapped_start are each multiplied by CODON_LENGTH and shifted by
    (|query.frame| - 1); query.length is then recomputed as
    end - offset + 1 from the new values.

    The length argument is not used by this routine.
*/
static void OOF_TranslateHspToDNAP(BLAST_HSPPtr hspp, Int4 length)
{
    /* Shift contributed by the reading frame (|frame| - 1). */
    Int4 frame_shift = abs(hspp->query.frame) - 1;

    hspp->query.offset = CODON_LENGTH*hspp->query.offset + frame_shift;
    hspp->query.end = CODON_LENGTH*hspp->query.end + frame_shift;
    hspp->query.gapped_start = CODON_LENGTH*hspp->query.gapped_start + frame_shift;

    /* Uses the offset and end values just written above. */
    hspp->query.length = hspp->query.end - hspp->query.offset + 1;
}
10468 /**************************************************************************
10469 *
10470 * Save the current HSP in the appropriate ranking.
10471 *
10472 **************************************************************************/
10473
10474 #define BLAST_HSP_ADD 100
10475
10476 void
BlastSaveCurrentHsp(BlastSearchBlkPtr search,BLAST_Score score,Int4 q_offset,Int4 s_offset,Int4 length,Int2 context)10477 BlastSaveCurrentHsp(BlastSearchBlkPtr search, BLAST_Score score, Int4 q_offset, Int4 s_offset, Int4 length, Int2 context)
10478
10479 {
10480 BLAST_HitListPtr current_hitlist;
10481 BLAST_HSPPtr PNTR hsp_array, new_hsp;
10482 Int4 hspcnt, hspmax, high_index, low_index;
10483
10484 current_hitlist = search->current_hitlist;
10485 hsp_array = current_hitlist->hsp_array;
10486 hspcnt = current_hitlist->hspcnt;
10487 hspmax = current_hitlist->hspmax;
10488
10489 /* Check if need to create a new list */
10490 if (hspmax == 0 && current_hitlist->do_not_reallocate == FALSE) {
10491 hsp_array = (BLAST_HSPPtr PNTR) Malloc(BLAST_HSP_ADD*sizeof(BLAST_HSPPtr));
10492 hspmax = current_hitlist->hspmax = BLAST_HSP_ADD;
10493 }
10494
10495 /* Check if list is already full, then reallocate. */
10496 if (hspcnt >= hspmax && current_hitlist->do_not_reallocate == FALSE)
10497 {
10498 hsp_array = (BLAST_HSPPtr PNTR) Realloc(hsp_array, current_hitlist->hspmax*2*sizeof(BLAST_HSPPtr));
10499 if (hsp_array == NULL)
10500 {
10501 ErrPostEx(SEV_WARNING, 0, 0, "UNABLE to reallocate in BlastSaveCurrentHsp for ordinal id %ld, continuing with fixed array of %ld HSP's", (long) search->subject_id, (long) hspmax);
10502 current_hitlist->do_not_reallocate = TRUE;
10503 }
10504 else
10505 {
10506 current_hitlist->hsp_array = hsp_array;
10507 current_hitlist->hspmax *= 2;
10508 hspmax = current_hitlist->hspmax;
10509 /* Prohibit future allocations. */
10510 if (search->pbp->hsp_num_max != 0 && current_hitlist->hspmax >= search->pbp->hsp_num_max)
10511 {
10512 ErrPostEx(SEV_WARNING, 0, 0, "Reached max %ld HSPs in BlastSaveCurrentHsp, continuing with this limit",
10513 (long) hspmax);
10514 current_hitlist->do_not_reallocate = TRUE;
10515 /* HSPs must be now sorted */
10516 HeapSort(hsp_array, hspcnt, sizeof(BLAST_HSPPtr), score_compare_hsps);
10517 }
10518 }
10519 }
10520
10521 new_hsp = (BLAST_HSPPtr) MemNew(sizeof(BLAST_HSP));
10522 new_hsp->score = score;
10523 new_hsp->query.offset = q_offset;
10524 new_hsp->subject.offset = s_offset;
10525 new_hsp->query.length = length;
10526 new_hsp->subject.length = length;
10527 new_hsp->query.end = q_offset + length;
10528 new_hsp->subject.end = s_offset + length;
10529 new_hsp->context = context;
10530 new_hsp->query.frame = ContextToFrame(search, context);
10531 new_hsp->subject.frame = search->subject->frame;
10532
10533 /* HACK */
10534 new_hsp->query.gapped_start = q_offset;
10535 new_hsp->subject.gapped_start = s_offset;
10536
10537 /* For out-of frame gapping - subject is protein
10538 and query is DNA translated into 3 frames
10539 so we have to adjust DNA sequence and
10540 coordinates */
10541
10542 if(search->pbp->is_ooframe) {
10543 OOF_TranslateHspToDNAP(new_hsp, new_hsp->query.frame > 0 ? search->query_dnap[0]->length : search->query_dnap[1]->length);
10544 }
10545
10546 if (!search->pbp->gapped_calculation &&
10547 search->prog_number != blast_type_blastn) {
10548 Int4 align_length;
10549 BlastHSPGetNumIdentical(search, new_hsp, NULL, &new_hsp->num_ident,
10550 &align_length);
10551 }
10552
10553 /* If we are saving ALL HSP's, simply save and sort later. */
10554 if (current_hitlist->do_not_reallocate == FALSE)
10555 {
10556 hsp_array[current_hitlist->hspcnt] = new_hsp;
10557 (current_hitlist->hspcnt)++;
10558 return;
10559 }
10560
10561 /* Use a binary search to insert the HSP. */
10562 low_index = 0;
10563 high_index = hspcnt;
10564 while(low_index < high_index) {
10565 Int4 next_index = (low_index + high_index)/2;
10566 if( score_compare_hsps(&new_hsp, &hsp_array[next_index]) > 0 ) {
10567 low_index = next_index + 1;
10568 } else {
10569 high_index = next_index;
10570 }
10571 }
10572
10573 if (hspcnt >= hspmax)
10574 {
10575 if (low_index >= hspcnt) {
10576 /* this HSP is less significant than others on a full list.*/
10577 new_hsp = BLAST_HSPFree(new_hsp);
10578 return;
10579 } else {
10580 /* Delete the last HPS on the list. */
10581 hspcnt = --current_hitlist->hspcnt;
10582 hsp_array[hspcnt] = BLAST_HSPFree(hsp_array[hspcnt]);
10583 }
10584 }
10585 /* Move existing elements out of the way */
10586 Nlm_MemMove(&hsp_array[low_index] + 1, &hsp_array[low_index],
10587 (hspcnt-low_index)*sizeof(hsp_array[0]));
10588 hspcnt = ++current_hitlist->hspcnt;
10589
10590 /* Insert the new HSP */
10591 hsp_array[low_index] = new_hsp;
10592 return;
10593 }
10594
10595 void
BlastSaveCurrentHspGapped(BlastSearchBlkPtr search,BLAST_Score score,Int4 q_offset,Int4 s_offset,Int4 q_length,Int4 s_length,Int2 context,GapXEditScriptPtr esp)10596 BlastSaveCurrentHspGapped(BlastSearchBlkPtr search, BLAST_Score score,
10597 Int4 q_offset, Int4 s_offset, Int4 q_length,
10598 Int4 s_length, Int2 context, GapXEditScriptPtr esp)
10599 {
10600 BlastNtSaveCurrentHspGapped(search, score, q_offset, s_offset, q_length,
10601 s_length, q_offset, s_offset, context, esp);
10602
10603 }
10604
10605 void
BlastNtSaveCurrentHspGapped(BlastSearchBlkPtr search,BLAST_Score score,Int4 q_offset,Int4 s_offset,Int4 q_length,Int4 s_length,Int4 q_gapped_start,Int4 s_gapped_start,Int2 context,GapXEditScriptPtr esp)10606 BlastNtSaveCurrentHspGapped(BlastSearchBlkPtr search, BLAST_Score score,
10607 Int4 q_offset, Int4 s_offset, Int4 q_length,
10608 Int4 s_length, Int4 q_gapped_start,
10609 Int4 s_gapped_start, Int2 context,
10610 GapXEditScriptPtr esp)
10611 {
10612 BLAST_HitListPtr current_hitlist;
10613 BLAST_HSPPtr PNTR hsp_array, new_hsp;
10614 BLAST_Score highscore, lowscore;
10615 Int4 hspcnt, hspmax, index, new_index, high_index, old_index, low_index;
10616 Int4 new_hspmax;
10617
10618 current_hitlist = search->current_hitlist;
10619 hsp_array = current_hitlist->hsp_array;
10620 hspcnt = current_hitlist->hspcnt;
10621 hspmax = current_hitlist->hspmax;
10622
10623 /* Check if list is already full, then reallocate. */
10624 if (hspcnt >= hspmax && current_hitlist->do_not_reallocate == FALSE)
10625 {
10626 new_hspmax = 2*current_hitlist->hspmax;
10627 if (search->pbp->hsp_num_max && search->last_context <= 1)
10628 /* The HSP limit can only be applied here in case of a
10629 single query sequence; even then, save twice as many HSPs
10630 so far, to accommodate for possible inclusion check
10631 failures and score changes because of ambiguities */
10632 new_hspmax = MIN(new_hspmax, 2*search->pbp->hsp_num_max);
10633 if (new_hspmax > current_hitlist->hspmax) {
10634 hsp_array = (BLAST_HSPPtr PNTR) Realloc(hsp_array, current_hitlist->hspmax*2*sizeof(BLAST_HSPPtr));
10635 if (hsp_array == NULL)
10636 {
10637 ErrPostEx(SEV_WARNING, 0, 0, "UNABLE to reallocate in BlastSaveCurrentHsp for ordinal id %ld, continuing with fixed array of %ld HSP's", (long) search->subject_id, (long) hspmax);
10638 current_hitlist->do_not_reallocate = TRUE;
10639 }
10640 else
10641 {
10642 current_hitlist->hsp_array = hsp_array;
10643 current_hitlist->hspmax = new_hspmax;
10644 hspmax = new_hspmax;
10645 }
10646 } else {
10647 /*ErrPostEx(SEV_WARNING, 0, 0,
10648 "Sequence %ld: reached max %ld HSPs",
10649 search->subject_id, (long) hspmax);*/
10650 current_hitlist->do_not_reallocate = TRUE;
10651 }
10652 if (current_hitlist->do_not_reallocate) {
10653 /* HSPs must be now sorted */
10654 HeapSort(hsp_array, hspcnt, sizeof(BLAST_HSPPtr),
10655 score_compare_hsps);
10656 }
10657 }
10658
10659 new_hsp = (BLAST_HSPPtr) MemNew(sizeof(BLAST_HSP));
10660 new_hsp->score = score;
10661 new_hsp->query.offset = q_offset;
10662 new_hsp->subject.offset = s_offset;
10663 new_hsp->query.length = q_length;
10664 new_hsp->subject.length = s_length;
10665 new_hsp->query.end = q_offset + q_length;
10666 new_hsp->subject.end = s_offset + s_length;
10667 new_hsp->context = context;
10668 new_hsp->query.frame = ContextToFrame(search, context);
10669 new_hsp->subject.frame = search->subject->frame;
10670
10671 new_hsp->query.gapped_start = q_gapped_start;
10672 new_hsp->subject.gapped_start = s_gapped_start;
10673
10674 if (esp)
10675 MegaBlastFillHspGapInfo(new_hsp, esp);
10676
10677 /* If we are saving ALL HSP's, simply save and sort later. */
10678 if (current_hitlist->do_not_reallocate == FALSE)
10679 {
10680 hsp_array[current_hitlist->hspcnt] = new_hsp;
10681 (current_hitlist->hspcnt)++;
10682 return;
10683 }
10684
10685 /* Use a binary search to insert the HSP. */
10686
10687 if (hspcnt != 0)
10688 {
10689 highscore = hsp_array[0]->score;
10690 lowscore = hsp_array[hspcnt-1]->score;
10691 }
10692 else
10693 {
10694 highscore = 0;
10695 lowscore = 0;
10696 }
10697
10698 if (score >= highscore)
10699 {
10700 new_index = 0;
10701 }
10702 else if (score <= lowscore)
10703 {
10704 new_index = hspcnt;
10705 }
10706 else
10707 {
10708 low_index = 0;
10709 high_index = hspcnt-1;
10710 new_index = (low_index+high_index)/2;
10711 old_index = new_index;
10712
10713 for (index=0; index<BLAST_SAVE_ITER_MAX; index++)
10714 {
10715 if (score > hsp_array[new_index]->score)
10716 {
10717 high_index = new_index;
10718 }
10719 else
10720 {
10721 low_index = new_index;
10722 }
10723 new_index = (low_index+high_index)/2;
10724 if (new_index == old_index)
10725 { /* Perform this check as new_index get rounded DOWN a
10726 bove.*/
10727 if (score < hsp_array[new_index]->score)
10728 {
10729 new_index++;
10730 }
10731 break;
10732 }
10733 old_index = new_index;
10734 }
10735 }
10736
10737 if (hspcnt >= hspmax)
10738 {
10739 if (new_index >= hspcnt)
10740 { /* this HSP is less significant than others on a full list.*/
10741 new_hsp = BLAST_HSPFree(new_hsp);
10742 return;
10743 }
10744 else
10745 { /* Delete the last HSP on the list. */
10746 hspcnt = --current_hitlist->hspcnt;
10747 hsp_array[hspcnt] = BLAST_HSPFree(hsp_array[hspcnt]);
10748 }
10749 }
10750 current_hitlist->hspcnt++;
10751 Nlm_MemMove((hsp_array+new_index+1), (hsp_array+new_index), (hspcnt-new_index)*sizeof(hsp_array[0]));
10752 hsp_array[new_index] = new_hsp;
10753
10754 return;
10755 }
10756
10757 void
BlastNtSaveCurrentHsp(BlastSearchBlkPtr search,BLAST_Score score,Int4 q_offset,Int4 s_offset,Int4 length,Int2 context,Int4 query_gap_start,Int4 subject_gap_start)10758 BlastNtSaveCurrentHsp(BlastSearchBlkPtr search, BLAST_Score score, Int4 q_offset, Int4 s_offset, Int4 length, Int2 context, Int4 query_gap_start, Int4 subject_gap_start)
10759
10760 {
10761 BLAST_HitListPtr current_hitlist;
10762 BLAST_HSPPtr PNTR hsp_array, PNTR hsp_array_new, new_hsp;
10763 BLAST_Score highscore, lowscore;
10764 Int4 hspcnt, hspmax, index, new_index, high_index, old_index, low_index;
10765
10766 current_hitlist = search->current_hitlist;
10767 hsp_array = current_hitlist->hsp_array;
10768 hspcnt = current_hitlist->hspcnt;
10769 hspmax = current_hitlist->hspmax;
10770
10771
10772 /* Check if list is already full, then reallocate. */
10773 if (hspcnt >= hspmax && current_hitlist->do_not_reallocate == FALSE)
10774 {
10775 hsp_array_new = (BLAST_HSPPtr PNTR) MemNew((current_hitlist->hspmax+BLAST_HSP_ADD)*sizeof(BLAST_HSPPtr));
10776 if (hsp_array_new == NULL)
10777 {
10778 ErrPostEx(SEV_WARNING, 0, 0, "UNABLE to reallocate in BlastSaveCurrentHsp for ordinal id %ld, continuing with fixed array of %ld HSP's", (long) search->subject_id, (long) hspmax);
10779 current_hitlist->do_not_reallocate = TRUE;
10780 }
10781 else
10782 {
10783 Nlm_MemCopy(hsp_array_new, hsp_array, current_hitlist->hspmax*sizeof(BLAST_HSPPtr));
10784 current_hitlist->hsp_array = MemFree(current_hitlist->hsp_array);
10785 current_hitlist->hsp_array = hsp_array_new;
10786 current_hitlist->hspmax += BLAST_HSP_ADD;
10787 hspmax = current_hitlist->hspmax;
10788 hsp_array = hsp_array_new;
10789 /* Prohibit future allocations. */
10790 if (search->pbp->hsp_num_max != 0 && current_hitlist->hspmax >= 2*search->pbp->hsp_num_max)
10791 {
10792 ErrPostEx(SEV_WARNING, 0, 0, "Reached max %ld HSPs in BlastSaveCurrentHsp, continuing with this limit",
10793 (long) hspmax);
10794 current_hitlist->do_not_reallocate = TRUE;
10795 }
10796 }
10797 if (current_hitlist->do_not_reallocate) {
10798 HeapSort(hsp_array, hspcnt, sizeof(BLAST_HSPPtr),
10799 score_compare_hsps);
10800 }
10801 }
10802
10803 new_hsp = (BLAST_HSPPtr) MemNew(sizeof(BLAST_HSP));
10804 new_hsp->score = score;
10805 new_hsp->query.offset = q_offset;
10806 new_hsp->subject.offset = s_offset;
10807 new_hsp->query.length = length;
10808 new_hsp->subject.length = length;
10809 new_hsp->query.end = q_offset + length;
10810 new_hsp->subject.end = s_offset + length;
10811 new_hsp->context = context;
10812 new_hsp->query.frame = ContextToFrame(search, context);
10813 new_hsp->subject.frame = search->subject->frame;
10814
10815 new_hsp->query.gapped_start = query_gap_start;
10816 new_hsp->subject.gapped_start = subject_gap_start;
10817
10818 /* If we are saving ALL HSP's, simply save and sort later. */
10819 if (current_hitlist->do_not_reallocate == FALSE)
10820 {
10821 hsp_array[current_hitlist->hspcnt] = new_hsp;
10822 (current_hitlist->hspcnt)++;
10823 return;
10824 }
10825
10826 /* Use a binary search to insert the HSP. */
10827
10828 if (hspcnt != 0)
10829 {
10830 highscore = hsp_array[0]->score;
10831 lowscore = hsp_array[hspcnt-1]->score;
10832 }
10833 else
10834 {
10835 highscore = 0;
10836 lowscore = 0;
10837 }
10838
10839 if (score >= highscore)
10840 {
10841 new_index = 0;
10842 }
10843 else if (score <= lowscore)
10844 {
10845 new_index = hspcnt;
10846 }
10847 else
10848 {
10849 low_index = 0;
10850 high_index = hspcnt-1;
10851 new_index = (low_index+high_index)/2;
10852 old_index = new_index;
10853
10854 for (index=0; index<BLAST_SAVE_ITER_MAX; index++)
10855 {
10856 if (score > hsp_array[new_index]->score)
10857 {
10858 high_index = new_index;
10859 }
10860 else
10861 {
10862 low_index = new_index;
10863 }
10864 new_index = (low_index+high_index)/2;
10865 if (new_index == old_index)
10866 { /* Perform this check as new_index get rounded DOWN a
10867 bove.*/
10868 if (score < hsp_array[new_index]->score)
10869 {
10870 new_index++;
10871 }
10872 break;
10873 }
10874 old_index = new_index;
10875 }
10876 }
10877
10878 if (hspcnt >= hspmax)
10879 {
10880 if (new_index >= hspcnt)
10881 { /* this HSP is less significant than others on a full list.*/
10882 new_hsp = MemFree(new_hsp);
10883 return;
10884 }
10885 else
10886 { /* Delete the last HPS on the list. */
10887 hspcnt = --current_hitlist->hspcnt;
10888 hsp_array[hspcnt] = BLAST_HSPFree(hsp_array[hspcnt]);
10889 }
10890 }
10891 current_hitlist->hspcnt++;
10892 Nlm_MemMove((hsp_array+new_index+1), (hsp_array+new_index), (hspcnt-new_index)*sizeof(hsp_array[0]));
10893 hsp_array[new_index] = new_hsp;
10894
10895 return;
10896 }
10897
10898 Uint1Ptr
GetSequenceWithDenseSeg(DenseSegPtr dsp,Boolean query,Int4Ptr start,Int4Ptr length)10899 GetSequenceWithDenseSeg(DenseSegPtr dsp, Boolean query, Int4Ptr start, Int4Ptr length)
10900
10901 {
10902 BioseqPtr bsp;
10903 Int4 index, offset;
10904 SeqIdPtr id;
10905 SeqPortPtr spp;
10906 Uint1Ptr buffer;
10907 Boolean startSet = FALSE;
10908
10909 if (dsp == NULL)
10910 return NULL;
10911
10912 if (query == TRUE)
10913 {
10914 offset = 0;
10915 id = dsp->ids;
10916 }
10917 else
10918 {
10919 offset = 1;
10920 id = dsp->ids->next;
10921 }
10922
10923 *length = 0;
10924 for (index=0; index<dsp->numseg; index++)
10925 {
10926 if (dsp->starts[offset+2*index] != -1) {
10927 *length += dsp->lens[index];
10928 if (!startSet) {
10929 *start = dsp->starts[offset + 2*index];
10930 startSet = TRUE;
10931 }
10932 }
10933 }
10934
10935 bsp = BioseqLockById(id);
10936 if (bsp == NULL) {
10937 Char buf[1024];
10938 StringCpy(buf, "Failed to retrieve sequence ");
10939 SeqIdWrite(id, &buf[StringLen(buf)], PRINTID_FASTA_LONG,
10940 sizeof(buf)-StringLen(buf)-1);
10941 ErrPostEx(SEV_WARNING, 0, 0, buf);
10942 return NULL;
10943 }
10944
10945 spp = SeqPortNew(bsp, *start, (*start)+(*length)-1, Seq_strand_unknown, Seq_code_ncbistdaa);
10946
10947 buffer = MemNew((*length)*sizeof(Uint1));
10948
10949 for (index=0; index<*length; index++)
10950 buffer[index] = SeqPortGetResidue(spp);
10951
10952 spp = SeqPortFree(spp);
10953 BioseqUnlock(bsp);
10954
10955 return buffer;
10956 }
10957
10958 /*
10959 Produces a 'fake' BioseqPtr, for use with BLAST when the
10960 ID of the original BioseqPtr cannot be trusted. Note that
10961 the ID of the original BioseqPtr is removed.
10962 */
10963
10964 BioseqPtr LIBCALL
BlastMakeFakeBioseq(BioseqPtr bsp,CharPtr name)10965 BlastMakeFakeBioseq(BioseqPtr bsp, CharPtr name)
10966
10967 {
10968 BioseqPtr fake_bsp;
10969 ObjectIdPtr obidp;
10970
10971 if (bsp == NULL)
10972 return NULL;
10973
10974 fake_bsp = BioseqNew();
10975 fake_bsp->descr = bsp->descr;
10976 fake_bsp->repr = bsp->repr;
10977 fake_bsp->mol = bsp->mol;
10978 fake_bsp->length = bsp->length;
10979 fake_bsp->strand = bsp->strand;
10980 fake_bsp->seq_data_type = bsp->seq_data_type;
10981 fake_bsp->seq_ext_type = bsp->seq_ext_type;
10982 fake_bsp->seq_data = bsp->seq_data;
10983 fake_bsp->seq_ext = bsp->seq_ext;
10984
10985 obidp = ObjectIdNew();
10986 if (name)
10987 obidp->str = StringSave(name);
10988 else
10989 obidp->str = StringSave("QUERY");
10990 ValNodeAddPointer(&(fake_bsp->id), SEQID_LOCAL, obidp);
10991
10992 SeqMgrAddToBioseqIndex (fake_bsp);
10993
10994 return fake_bsp;
10995 }
10996
10997 BioseqPtr LIBCALL
BlastDeleteFakeBioseq(BioseqPtr fake_bsp)10998 BlastDeleteFakeBioseq(BioseqPtr fake_bsp)
10999
11000 {
11001 if (fake_bsp == NULL)
11002 return NULL;
11003
11004 fake_bsp->descr = NULL;
11005 fake_bsp->length = 0;
11006 fake_bsp->seq_data = NULL;
11007 fake_bsp->seq_ext = NULL;
11008
11009 return BioseqFree(fake_bsp);
11010 }
11011
11012 /* Comparison function for sorting gi list */
Int4Compare(const void * i,const void * j)11013 static int Int4Compare(const void* i, const void* j)
11014 {
11015 if (*(Int4Ptr)i > *(Int4Ptr)j)
11016 return (1);
11017 if (*(Int4Ptr)i < *(Int4Ptr)j)
11018 return (-1);
11019 return (0);
11020 }
11021
11022 /* Remove hits from a SeqAlignPtr that are not from a gi list. The function
11023 * is optimized with an assumption that the incoming gi list is not sorted.
11024 * Since sorting of the gi list may be expensive, the hit gis are found
11025 * and sorted. Then for each gi in the (presumably large) incoming gi list,
11026 * a binary search is performed to check if it is present in the list of hit
11027 * gis. This procedure is linear in the gi list size.
11028 */
11029 SeqAlignPtr
BlastPruneSeqAlignByGiList(SeqAlignPtr seqalign,Int4Ptr gi_list,Int4 gi_list_total,Int4 hitlist_size)11030 BlastPruneSeqAlignByGiList(SeqAlignPtr seqalign, Int4Ptr gi_list,
11031 Int4 gi_list_total, Int4 hitlist_size)
11032 {
11033 SeqAlignPtr head = NULL, last_sap = NULL, next_sap, sap;
11034 SeqIdPtr sip;
11035 BioseqPtr bsp;
11036 Int4 gi = 0, index;
11037 Int4* hit_gis;
11038 Int4 num_hit_gis, gi_index;
11039 Boolean* good_gis;
11040 Boolean good_gi = FALSE;
11041
11042 if (!gi_list || gi_list_total <= 0)
11043 return NULL;
11044
11045 /* If the size of the gi list is small, sort it and use a different
11046 routine, which takes a sorted list argument. */
11047
11048 if (hitlist_size >= gi_list_total) {
11049 qsort((void*)gi_list, gi_list_total, sizeof(Int4), Int4Compare);
11050 return BlastPruneSeqAlignBySortedGiList(seqalign, gi_list,
11051 gi_list_total);
11052 }
11053
11054 hit_gis = (Int4*) MemNew(hitlist_size*sizeof(Int4));
11055
11056 gi = 0;
11057 index = 0;
11058 /* Find all subject gis in the Seq-align chain */
11059 for (sap = seqalign; sap; sap = sap->next) {
11060 sip = SeqAlignId(sap, 1);
11061 if (sip->choice != SEQID_GI) {
11062 bsp = BioseqLockById(sip);
11063 if (bsp) {
11064 sip = SeqIdFindBest(bsp->id, SEQID_GI);
11065 BioseqUnlock(bsp);
11066 }
11067 }
11068 if (sip->choice == SEQID_GI) {
11069 /* Save this gi if the previous value of gi
11070 is different from the current value. */
11071 if (gi != sip->data.intvalue) {
11072 gi = sip->data.intvalue;
11073 hit_gis[index] = gi;
11074 ++index;
11075 }
11076 }
11077 }
11078 num_hit_gis = index;
11079 qsort((void*)hit_gis, num_hit_gis, sizeof(Int4),
11080 Int4Compare);
11081 good_gis = (Boolean*) MemNew(num_hit_gis*sizeof(Boolean));
11082
11083 for (index = 0; index < gi_list_total; ++index) {
11084 gi_index = BinarySearchInt4(gi_list[index], hit_gis, num_hit_gis);
11085 if (hit_gis[gi_index] == gi_list[index])
11086 good_gis[gi_index] = TRUE;
11087 }
11088
11089 for (sap = seqalign; sap; sap = next_sap) {
11090 next_sap = sap->next;
11091 sip = SeqAlignId(sap, 1);
11092 if (sip->choice != SEQID_GI) {
11093 bsp = BioseqLockById(sip);
11094 if (bsp) {
11095 sip = SeqIdFindBest(bsp->id, SEQID_GI);
11096 BioseqUnlock(bsp);
11097 }
11098 }
11099 if (sip->choice == SEQID_GI) {
11100 /* Do the following check only if the previous value of gi
11101 is different from the current value. */
11102 if (gi != sip->data.intvalue) {
11103 gi = sip->data.intvalue;
11104 index = BinarySearchInt4(gi, hit_gis, num_hit_gis);
11105 good_gi = good_gis[index];
11106 }
11107 } else {
11108 good_gi = FALSE;
11109 }
11110 if (good_gi) {
11111 /* Advance the pointer to the last link in the pruned chain to
11112 the current Seq-align. */
11113 if (head == NULL)
11114 head = last_sap = sap;
11115 else {
11116 last_sap = sap;
11117 }
11118 } else {
11119 /* Link last Seq-align in the pruned chain to the next Seq-align
11120 in the original chain. */
11121 if (last_sap)
11122 last_sap->next = sap->next;
11123 sap->next = NULL;
11124 /* Free this Seq-align, since it's no longer needed. */
11125 sap = SeqAlignFree(sap);
11126 }
11127 }
11128
11129
11130 return head;
11131 }
11132
11133 /* Remove hits from a SeqAlignPtr that are not from a sorted gi list.
11134 * No check is made that incoming gi list is sorted. User must make
11135 * sure that it is that way. The pruning is done by a single pass over the
11136 * list of Seq-aligns, in which a binary search is performed for any new
11137 * subject gi to check if it is present in the gi list.
11138 */
11139 SeqAlignPtr
BlastPruneSeqAlignBySortedGiList(SeqAlignPtr seqalign,Int4Ptr gi_list,Int4 gi_list_total)11140 BlastPruneSeqAlignBySortedGiList(SeqAlignPtr seqalign, Int4Ptr gi_list,
11141 Int4 gi_list_total)
11142 {
11143 SeqAlignPtr head = NULL, last_sap = NULL, next_sap, sap;
11144 SeqIdPtr sip;
11145 BioseqPtr bsp;
11146 Int4 gi = 0;
11147 Boolean good_gi = FALSE;
11148
11149 if (!gi_list || gi_list_total <= 0)
11150 return NULL;
11151
11152 /* Find all subject gis in the Seq-align chain */
11153 for (sap = seqalign; sap; sap = next_sap) {
11154 next_sap = sap->next;
11155 sip = SeqAlignId(sap, 1);
11156 if (sip->choice != SEQID_GI) {
11157 bsp = BioseqLockById(sip);
11158 if (bsp) {
11159 sip = SeqIdFindBest(bsp->id, SEQID_GI);
11160 BioseqUnlock(bsp);
11161 }
11162 }
11163 if (sip->choice == SEQID_GI) {
11164 /* Do the following check only if the previous value of gi is
11165 different from the current value. Otherwise the "good_gi"
11166 variable is left with its previous value. */
11167 if (gi != sip->data.intvalue) {
11168 Int4 index;
11169 gi = sip->data.intvalue;
11170 index = BinarySearchInt4(gi, gi_list, gi_list_total);
11171 good_gi = (gi_list[index] == gi);
11172 }
11173 } else {
11174 good_gi = FALSE;
11175 }
11176
11177 if (good_gi) {
11178 /* Advance the pointer to the last link in the pruned chain to
11179 the current Seq-align. */
11180 if (head == NULL)
11181 head = last_sap = sap;
11182 else {
11183 last_sap = sap;
11184 }
11185 } else {
11186 /* Link last Seq-align in the pruned chain to the next Seq-align
11187 in the original chain. */
11188 if (last_sap)
11189 last_sap->next = sap->next;
11190 sap->next = NULL;
11191 /* Free this Seq-align, since it's no longer needed. */
11192 sap = SeqAlignFree(sap);
11193 }
11194 }
11195
11196 return head;
11197 }
11198
11199 /*
11200 Remove hits from a SeqAlignPtr that are not within an expect
11201 value range
11202 */
11203
11204 SeqAlignPtr
BlastPruneSeqAlignByEvalueRange(SeqAlignPtr seqalign,FloatHi expect_low,FloatHi expect_high)11205 BlastPruneSeqAlignByEvalueRange(SeqAlignPtr seqalign, FloatHi expect_low,
11206 FloatHi expect_high)
11207 {
11208 SeqAlignPtr head = NULL, last_sap = NULL, sap, next_sap;
11209 Int4 score, number;
11210 FloatHi evalue, bit_score;
11211 SeqIdPtr sip = NULL;
11212
11213 for (sap = seqalign; sap; sap = next_sap) {
11214 next_sap = sap->next;
11215 GetScoreAndEvalue(sap, &score, &bit_score, &evalue, &number);
11216 if (evalue >= expect_low && evalue <= expect_high) {
11217 /* Leave this Seq-align */
11218 if (head == NULL)
11219 head = last_sap = sap;
11220 else {
11221 last_sap = sap;
11222 }
11223
11224 if (sip && SeqIdComp(TxGetSubjectIdFromSeqAlign(sap), sip)
11225 == SIC_YES) {
11226 /* Add message about deleted high scoring hits */
11227 MakeBlastScore(&sap->score, "warning", expect_low, 0);
11228 }
11229 sip = NULL;
11230
11231 } else {
11232 if (evalue < expect_low && sip == NULL) {
11233 sip = TxGetSubjectIdFromSeqAlign(sap);
11234 }
11235 /* Remove this Seq-align: link last Seq-align in the pruned
11236 chain to the next Seq-align in the original chain. */
11237 if (last_sap)
11238 last_sap->next = sap->next;
11239 sap->next = NULL;
11240 /* Free this Seq-align, since it's no longer needed. */
11241 sap = SeqAlignFree(sap);
11242 }
11243
11244 }
11245 return head;
11246 }
11247
11248 /*
11249 Returns the program name for a given program number.
11250 The caller must delete the returned string.
11251 */
11252
11253 CharPtr LIBCALL
BlastGetProgramName(Uint1 number)11254 BlastGetProgramName(Uint1 number)
11255
11256 {
11257 CharPtr string=NULL;
11258
11259 switch (number) {
11260
11261 case blast_type_blastn:
11262 string = StringSave("blastn");
11263 break;
11264 case blast_type_blastp:
11265 string = StringSave("blastp");
11266 break;
11267 case blast_type_blastx:
11268 string = StringSave("blastx");
11269 break;
11270 case blast_type_tblastn:
11271 string = StringSave("tblastn");
11272 break;
11273 case blast_type_tblastx:
11274 string = StringSave("tblastx");
11275 break;
11276 case blast_type_psitblastn:
11277 string = StringSave("psitblastn");
11278 break;
11279 default:
11280 string = NULL;
11281 break;
11282 }
11283
11284 return string;
11285 }
11286
11287 /*
11288 Returns the program number for a string containing the
11289 program name.
11290 */
11291
11292 Uint1 LIBCALL
BlastGetProgramNumber(CharPtr blast_program)11293 BlastGetProgramNumber(CharPtr blast_program)
11294
11295 {
11296 if (blast_program == NULL)
11297 return blast_type_undefined;
11298
11299 if (StringICmp("blastn", blast_program) == 0)
11300 {
11301 return blast_type_blastn;
11302 }
11303 else if (StringICmp("blastp", blast_program) == 0)
11304 {
11305 return blast_type_blastp;
11306 }
11307 else if (StringICmp("blastx", blast_program) == 0)
11308 {
11309 return blast_type_blastx;
11310 }
11311 else if (StringICmp("tblastn", blast_program) == 0)
11312 {
11313 return blast_type_tblastn;
11314 }
11315 else if (StringICmp("psitblastn", blast_program) == 0)
11316 {
11317 return blast_type_psitblastn;
11318 }
11319 else if (StringICmp("tblastx", blast_program) == 0)
11320 {
11321 return blast_type_tblastx;
11322 }
11323
11324 return blast_type_undefined;
11325 }
11326
11327 /*
11328 returns information aobut the db and query types (protein or dna)
11329 as well as the 'align_type' that should be attached to the SeqAnnot
11330 for formatting.
11331
11332 If an invalid program is entered, then 0 is returned.
11333 */
11334
11335 Uint1 LIBCALL
BlastGetTypes(CharPtr blast_program,Boolean PNTR query_is_na,Boolean PNTR db_is_na)11336 BlastGetTypes(CharPtr blast_program, Boolean PNTR query_is_na, Boolean PNTR db_is_na)
11337
11338 {
11339 Uint1 align_type=0;
11340
11341 align_type = BlastGetProgramNumber(blast_program);
11342 if(align_type == blast_type_undefined) {
11343 ErrPostEx(SEV_ERROR, 0,0, "Program name undefined: \"%s\"",
11344 blast_program);
11345 return blast_type_undefined;
11346 }
11347
11348 if (align_type == blast_type_blastn)
11349 {
11350 *query_is_na = TRUE;
11351 *db_is_na = TRUE;
11352 }
11353 else if (align_type == blast_type_blastp)
11354 {
11355 *query_is_na = FALSE;
11356 *db_is_na = FALSE;
11357 }
11358 else if (align_type == blast_type_blastx)
11359 {
11360 *query_is_na = TRUE;
11361 *db_is_na = FALSE;
11362 }
11363 else if (align_type == blast_type_tblastn)
11364 {
11365 *query_is_na = FALSE;
11366 *db_is_na = TRUE;
11367 }
11368 else if (align_type == blast_type_psitblastn)
11369 {
11370 *query_is_na = FALSE;
11371 *db_is_na = TRUE;
11372 }
11373 else if (align_type == blast_type_tblastx)
11374 {
11375 *query_is_na = TRUE;
11376 *db_is_na = TRUE;
11377 }
11378
11379 return align_type;
11380 }
11381
11382
11383 /*
11384 Find the word hits for a nucl. query. No neighbors are found here.
11385 If no indices are saved, then return 1, indicating that the
11386 search should not be performed.
11387
11388 */
11389
11390 Int2
BlastNtFindWords(BlastSearchBlkPtr search,Int4 start,Int4 len,Int1 context_index)11391 BlastNtFindWords(BlastSearchBlkPtr search, Int4 start, Int4 len, Int1 context_index)
11392 {
11393 register Int4 offset, initial_wordsize, reduced_wordsize;
11394 Boolean found_ambig, saved_index=FALSE;
11395 BLAST_WordFinderPtr wfp;
11396 Int4 end, index, index_addition, lookup_index, stop;
11397 LookupTablePtr lookup;
11398 Uint1Ptr str;
11399 ValNodePtr vnp, vnp_start=NULL;
11400
11401
11402 if (search == NULL)
11403 {
11404 return -1;
11405 }
11406
11407 wfp = search->wfp;
11408 if (wfp == NULL)
11409 {
11410 return -2;
11411 }
11412
11413 lookup = wfp->lookup;
11414 if (lookup == NULL)
11415 {
11416 return -3;
11417 }
11418
11419 initial_wordsize = (lookup->wordsize)*(wfp->compression_ratio);
11420 reduced_wordsize = (lookup->reduced_wordsize);
11421
11422 vnp = search->context[context_index].location;
11423 if (vnp == NULL)
11424 {
11425 ValNodeAddInt(&vnp, 1, -1);
11426 vnp_start = vnp;
11427 ValNodeAddInt(&vnp, 0, len);
11428 }
11429
11430 while (vnp)
11431 {
11432 if (vnp->choice == 1)
11433 {
11434 start = vnp->data.intvalue + 1;
11435 vnp = vnp->next;
11436 if (vnp == NULL)
11437 end = len;
11438 }
11439 if (vnp && vnp->choice == 0)
11440 {
11441 end = vnp->data.intvalue - initial_wordsize;
11442 vnp = vnp->next;
11443 }
11444
11445 end = MIN(end, len-initial_wordsize);
11446
11447 str = (Uint1Ptr) search->context[context_index].query->sequence + start;
11448
11449 for (offset=start; offset<end; offset++, str++)
11450 {
11451 found_ambig= FALSE;
11452 lookup_index = 0;
11453 stop = reduced_wordsize;
11454 index_addition = 0;
11455 for (index=0; index<stop; index++)
11456 {
11457 if (*(str+index_addition) > 3 || *(str+index_addition+1) > 3 || *(str+index_addition+2) > 3 || *(str+index_addition+3) > 3)
11458 {
11459 found_ambig = TRUE;
11460 break;
11461 }
11462
11463 lookup_index += (*(str+index_addition) << 6);
11464 lookup_index += (*(str+1+index_addition) << 4);
11465 lookup_index += (*(str+2+index_addition) << 2);
11466 lookup_index += *(str+3+index_addition);
11467
11468 if (index != stop-1)
11469 { /* 8 bits/byte */
11470 lookup_index <<= 8;
11471 index_addition += 4;
11472 }
11473 }
11474
11475 if (found_ambig == FALSE)
11476 {
11477 lookup_add_index(lookup, (Int4) lookup_index, offset+(reduced_wordsize*(wfp->compression_ratio)), context_index);
11478 saved_index = TRUE;
11479 }
11480 }
11481 }
11482
11483 if (vnp_start)
11484 {
11485 vnp_start = ValNodeFree(vnp_start);
11486 }
11487
11488 if (saved_index == FALSE)
11489 return 1;
11490
11491 return 0;
11492 }
11493
/*
	This function finds the words.
	Return values:
		0: success, words saved
		1: no words saved, no error
		-1: error
		-2: error, the search has no word finder (search->wfp is NULL)
		-3: error, the word finder has no lookup table

*/
11502
11503
11504 Int2 LIBCALL
BlastFindWords(BlastSearchBlkPtr search,Int4 start,Int4 len,BLAST_Score threshold,Int1 context_index)11505 BlastFindWords(BlastSearchBlkPtr search, Int4 start, Int4 len, BLAST_Score threshold, Int1 context_index)
11506
11507 {
11508 register Uint1 last_char, last_char2;
11509 Uint1Ptr words, PNTR array;
11510 Uint1Ptr s_string_start, s_string;
11511 register Uint1Ptr str;
11512 BLAST_Score best_total, delta_score, diff, diff2, first_score;
11513 BLAST_Score second_score, start_score, start_score2, score;
11514 BLAST_ScoreBlkPtr sbp;
11515 register BLAST_ScorePtr PNTR matrix;
11516 BLAST_WordFinderPtr wfp;
11517 Boolean exact_match, saved_index=FALSE;
11518 LookupTablePtr lookup;
11519 register Int4 index1, index3, offset;
11520 register Int1 index2;
11521 Int4 num_of_cols, alphabet_size, wordsize;
11522 Int4 loop_increment, loop_increment2, stop;
11523 SeqCodeTablePtr sctp;
11524 ValNodePtr vnp, vnp_start;
11525
11526 sbp = search->sbp;
11527 matrix = sbp->matrix;
11528 str = (Uint1Ptr) search->context[context_index].query->sequence + start;
11529 wfp = search->wfp;
11530 if (wfp == NULL)
11531 return -2;
11532 lookup = wfp->lookup;
11533 if (lookup == NULL)
11534 return -3;
11535 wordsize = wfp->wordsize;
11536
11537
11538 sctp = SeqCodeTableFindObj(sbp->alphabet_code);
11539 alphabet_size=sctp->num;
11540 if (search->all_words == NULL)
11541 {
11542 search->all_words = BlastPopulateAllWordArrays(wordsize, alphabet_size);
11543 if (search->all_words == NULL)
11544 {
11545 return -1;
11546 }
11547 num_of_cols = search->all_words->num_of_cols;
11548 array = search->all_words->array;
11549 }
11550 else
11551 {
11552 num_of_cols = search->all_words->num_of_cols;
11553 array = search->all_words->array;
11554 }
11555
11556 /* Index a specific small set, such as one db sequence. */
11557 if (search->all_words->specific)
11558 {
11559 len -= (wordsize-1);
11560 for (offset=start; offset<len; offset++, str++)
11561 {
11562 for (index1=0; index1<num_of_cols; index1++)
11563 {
11564 Boolean ExactMatch = TRUE;
11565 words = array[index1];
11566 score = 0;
11567 for (index2=0; index2<wordsize; index2++)
11568 {
11569 score += matrix[*(str+index2)][*(words+index2)];
11570 if (*(str+index2) != *(words+index2))
11571 ExactMatch = FALSE;
11572 }
11573 /* If score is above threshold or an exact match gives a non-zero value. */
11574 if (score >= threshold || (ExactMatch && score > 0))
11575 {
11576 lookup_add(lookup, (CharPtr) words, offset+wordsize-1, context_index);
11577 saved_index = TRUE;
11578 }
11579 }
11580 }
11581
11582 if (saved_index)
11583 return 0;
11584 else
11585 return 1;
11586 }
11587
11588 s_string_start = s_string = MemNew((wordsize+2)*sizeof(Uint1));
11589
11590 if (s_string_start == NULL)
11591 return -1;
11592
11593 /* Amounts to advance loops if the same character is to be checked again. */
11594 loop_increment=(long) (Nlm_Powi((Nlm_FloatHi)alphabet_size,(wordsize-2)));
11595 loop_increment2=loop_increment/alphabet_size;
11596 /* Shorten len so up to the last complete word is checked. */
11597 len -= (wordsize-1);
11598
11599 vnp_start = NULL;
11600 vnp = search->context[context_index].location;
11601 if (vnp == NULL)
11602 {
11603 ValNodeAddInt(&vnp, 1, -1);
11604 ValNodeAddInt(&vnp, 0, len+wordsize);
11605 vnp_start = vnp;
11606 }
11607
11608 while (vnp)
11609 {
11610 if (vnp->choice == 1)
11611 {
11612 start = vnp->data.intvalue + 1;
11613 vnp = vnp->next;
11614 if (vnp == NULL)
11615 stop = len;
11616 }
11617 if (vnp && vnp->choice == 0)
11618 {
11619 stop = vnp->data.intvalue - (wordsize-1);
11620 vnp = vnp->next;
11621 }
11622
11623 stop = MIN(stop, len);
11624
11625 str = (Uint1Ptr) search->context[context_index].query->sequence + start;
11626
11627 for (offset=start; offset<stop; offset++, str++)
11628 {
11629 /* Put query into the lookup table, after checking that word would give
11630 a positive value. */
11631 /* These are the exact matches. */
11632 best_total=0;
11633 for (index1=0; index1<wordsize; index1++)
11634 {
11635 best_total += matrix[(Int4) *(str+index1)][(Int4) *(str+index1)];
11636 }
11637 if (best_total > 0)
11638 {
11639 lookup_add(lookup, (CharPtr) str, offset+wordsize-1, context_index);
11640 saved_index = TRUE;
11641 }
11642
11643 /* Check if a match with a non-identical word could give a score above T. */
11644 best_total=0;
11645 for (index1=0; index1<wordsize; index1++)
11646 {
11647 best_total += sbp->maxscore[str[index1]];
11648 }
11649
11650 if (best_total < threshold)
11651 { /* no chance of a match! */
11652 continue;
11653 }
11654
11655 delta_score = best_total-threshold;
11656
11657 /* pick a last_char that is at end of the array, could this be improved? */
11658 last_char=array[num_of_cols-1][wordsize-2];
11659 last_char2=array[num_of_cols-1][wordsize-2];
11660
11661 for (index1=0; index1<num_of_cols; index1++)
11662 {
11663 words = array[index1];
11664
11665 /*
11666 only do this check if the letter has changed from last time. See if
11667 the new letter, matched with the first letter of the word, changes the
11668 total possible score to below threshold. If so, move ahead to the next letter.
11669 This is repeated with the second letter in the word.
11670
11671 The order of the letters in the first and second columns of array is
11672 important here!
11673 */
11674 if (last_char != *words)
11675 {
11676 last_char = *words;
11677 first_score = matrix[(Int4) *str][(Int4) *words];
11678 diff = delta_score + first_score - sbp->maxscore[*str];
11679 if (diff < 0)
11680 {
11681 /* index1 should be advanced by loop_increment, decrement by one as the "for"
11682 loop above increments by one. */
11683 index1 += loop_increment;
11684 index1--;
11685 continue;
11686 }
11687 start_score = first_score;
11688 }
11689
11690 if (wordsize > 2 && last_char2 != *(words+1) && wordsize != 1)
11691 {
11692 last_char2 = *(words+1);
11693 second_score = matrix[(Int4) *(str+1)][(Int4) *(words+1)];
11694 diff2 = second_score - sbp->maxscore[*(str+1)];
11695 diff2 += diff;
11696 if (diff2 < 0)
11697 {
11698 /* index1 should be advanced by loop_increment2, decrement by one as the "for"
11699 loop above increments by one. */
11700 index1 += loop_increment2;
11701 index1--;
11702 continue;
11703 }
11704 start_score = second_score+first_score;
11705 }
11706
11707 start_score2 = start_score;
11708
11709 for (index2=2; index2<wordsize-1; index2++)
11710 {
11711 start_score2 += matrix[(Int4) *(str+index2)][*(words+index2)];
11712 }
11713
11714 for (index2=0; index2<alphabet_size; index2++)
11715 {
11716 score = start_score2;
11717 score += matrix[(Int4) *(str+wordsize-1)][index2];
11718
11719 if (score >= threshold)
11720 {
11721 exact_match=TRUE;
11722 for (index3=0; index3<wordsize-1; index3++)
11723 {
11724 if (*(str+index3) != *(words+index3))
11725 {
11726 exact_match=FALSE;
11727 break;
11728 }
11729 }
11730 if (*(str+wordsize-1) != index2)
11731 {
11732 exact_match=FALSE;
11733 }
11734
11735 /* Exact matches were done above, exclude here. Is this really needed? */
11736 /* Could exact matches just be done here? */
11737 if (exact_match == FALSE)
11738 {
11739 s_string = s_string_start;
11740 for (index3=0; index3<wordsize-1; index3++)
11741 {
11742 *s_string = *(words+index3);
11743 s_string++;
11744 }
11745 *s_string = index2;
11746 lookup_add(lookup, (CharPtr) s_string_start, offset+wordsize-1, context_index);
11747 saved_index = TRUE;
11748 }
11749 }
11750 }
11751 }
11752 }
11753 }
11754
11755 if (vnp_start)
11756 {
11757 vnp_start = ValNodeFree(vnp_start);
11758 }
11759
11760 s_string_start = MemFree(s_string_start);
11761
11762 if (saved_index == FALSE)
11763 return 1;
11764
11765 return 0;
11766 }
11767
11768 /*AAS position-based version of BlastFindWords*/
11769 Int2 LIBCALL
BlastNewFindWords_Old(BlastSearchBlkPtr search,Int4 start,Int4 len,BLAST_Score threshold,Int1 context_index)11770 BlastNewFindWords_Old(BlastSearchBlkPtr search, Int4 start, Int4 len, BLAST_Score threshold, Int1 context_index)
11771
11772 {
11773 register Uint1 last_char, last_char2;
11774 Uint1Ptr words, PNTR array;
11775 Uint1Ptr s_string_start, s_string;
11776 register Uint1Ptr str;
11777 BLAST_Score best_total, delta_score, diff, diff2, first_score;
11778 BLAST_Score second_score, start_score, start_score2, score;
11779 BLAST_ScoreBlkPtr sbp;
11780 BLAST_WordFinderPtr wfp;
11781 Boolean exact_match;
11782 LookupTablePtr lookup;
11783 register Int4 index1, index3, offset;
11784 register Int1 index2;
11785 Int4 num_of_cols, alphabet_size, wordsize;
11786 Int4 loop_increment, loop_increment2;
11787 SeqCodeTablePtr sctp;
11788
11789 sbp = search->sbp;
11790 str = (Uint1Ptr) search->context[context_index].query->sequence + start;
11791 wfp = search->wfp;
11792 lookup = wfp->lookup;
11793 wordsize = wfp->wordsize;
11794
11795 sctp = SeqCodeTableFindObj(sbp->alphabet_code);
11796 alphabet_size=sctp->num;
11797 if (search->all_words == NULL)
11798 {
11799 search->all_words = BlastPopulateAllWordArrays(wordsize, alphabet_size);
11800 if (search->all_words == NULL)
11801 {
11802 return -1;
11803 }
11804 num_of_cols = search->all_words->num_of_cols;
11805 array = search->all_words->array;
11806 }
11807 else
11808 {
11809 num_of_cols = search->all_words->num_of_cols;
11810 array = search->all_words->array;
11811 }
11812
11813 /* Index a specific small set, such as one db sequence. */
11814 if (search->all_words->specific)
11815 {
11816 len -= (wordsize-1);
11817 for (offset=start; offset<len; offset++, str++)
11818 {
11819 for (index1=0; index1<num_of_cols; index1++)
11820 {
11821 words = array[index1];
11822 score = 0;
11823 for (index2=0; index2<wordsize; index2++)
11824 {
11825 score += MtrxScorePosSearch(search->sbp,
11826 offset + index2,
11827 *(words+index2));
11828 }
11829 if (score >= threshold)
11830 {
11831 lookup_add(lookup, (CharPtr) words, offset+wordsize-1, context_index);
11832 }
11833 }
11834 }
11835 return 0;
11836 }
11837
11838 s_string_start = s_string = MemNew((wordsize+2)*sizeof(Uint1));
11839
11840 if (s_string_start == NULL)
11841 return 1;
11842
11843 /* Amounts to advance loops if the same character is to be checked again. */
11844 loop_increment=(long) (Nlm_Powi((Nlm_FloatHi)alphabet_size,(wordsize-2)));
11845 loop_increment2=loop_increment/alphabet_size;
11846 /* Shorten len so up to the last complete word is checked. */
11847 len -= (wordsize-1);
11848 for (offset=start; offset<len; offset++, str++)
11849 {
11850 /* Put query into the lookup table, after checking that word would give
11851 a positive value. */
11852 best_total=0;
11853 for (index1=0; index1<wordsize; index1++)
11854 {
11855 best_total += MtrxScorePosSearch(search->sbp,
11856 offset+index1,(Int4) *(str+index1));
11857 }
11858 if (best_total > 0)
11859 lookup_add(lookup, (CharPtr) str, offset+wordsize-1, context_index);
11860
11861 /* Check if a match with a non-identical word could give a score above T. */
11862 best_total=0;
11863 for (index1=0; index1<wordsize; index1++)
11864 {
11865 best_total += sbp->maxscore[str[index1]];
11866 }
11867
11868 if (best_total < threshold)
11869 { /* no chance of a match! */
11870 continue;
11871 }
11872
11873 delta_score = best_total-threshold;
11874
11875 /* pick a last_char that is at end of the array, could this be improved? */
11876 last_char=array[num_of_cols-1][wordsize-2];
11877 last_char2=array[num_of_cols-1][wordsize-2];
11878
11879 for (index1=0; index1<num_of_cols; index1++)
11880 {
11881 words = array[index1];
11882
11883 /*
11884 only do this check if the letter has changed from last time. See if
11885 the new letter, matched with the first letter of the word, changes the
11886 total possible score to below threshold. If so, move ahead to the next letter.
11887 This is repeated with the second letter in the word.
11888
11889 The order of the letters in the first and second columns of array is
11890 important here!
11891 */
11892 if (last_char != *words)
11893 {
11894 last_char = *words;
11895 first_score = MtrxScorePosSearch(search->sbp,
11896 offset,(Int4) *words);
11897 diff = delta_score + first_score - sbp->maxscore[*str];
11898 if (diff < 0)
11899 {
11900 /* index1 should be advanced by loop_increment, decrement by one as the "for"
11901 loop above increments by one. */
11902 index1 += loop_increment;
11903 index1--;
11904 continue;
11905 }
11906 start_score = first_score;
11907 }
11908
11909 if (last_char2 != *(words+1) && wordsize != 1)
11910 {
11911 last_char2 = *(words+1);
11912 second_score = MtrxScorePosSearch(search->sbp,
11913 offset+1,(Int4) *(words+1));
11914 diff2 = second_score - sbp->maxscore[*(str+1)];
11915 diff2 += diff;
11916 if (diff2 < 0)
11917 {
11918 /* index1 should be advanced by loop_increment2, decrement by one as the "for"
11919 loop above increments by one. */
11920 index1 += loop_increment2;
11921 index1--;
11922 continue;
11923 }
11924 start_score = second_score+first_score;
11925 }
11926
11927 start_score2 = start_score;
11928
11929 for (index2=2; index2<wordsize-1; index2++)
11930 {
11931 start_score2 += MtrxScorePosSearch(search->sbp,
11932 offset+index2,*(words+index2));
11933 }
11934
11935 for (index2=0; index2<alphabet_size; index2++)
11936 {
11937 score = start_score2;
11938 score += MtrxScorePosSearch(search->sbp,
11939 offset+wordsize-1,index2);
11940
11941 if (score >= threshold)
11942 {
11943 exact_match=TRUE;
11944 for (index3=0; index3<wordsize-1; index3++)
11945 {
11946 if (*(str+index3) != *(words+index3))
11947 {
11948 exact_match=FALSE;
11949 break;
11950 }
11951 }
11952 if (*(str+wordsize-1) != index2)
11953 {
11954 exact_match=FALSE;
11955 }
11956
11957 if (exact_match == FALSE)
11958 {
11959 s_string = s_string_start;
11960 for (index3=0; index3<wordsize-1; index3++)
11961 {
11962 *s_string = *(words+index3);
11963 s_string++;
11964 }
11965 *s_string = index2;
11966 lookup_add(lookup, (CharPtr) s_string_start, offset+wordsize-1, context_index);
11967
11968 }
11969 }
11970 }
11971 }
11972 }
11973
11974 s_string_start = MemFree(s_string_start);
11975 return 0;
11976 }
11977
11978 /* SSH position-based version of BlastFindWords
11979 Lookup structure should be allocated at this point*/
11980
BlastNewFindWordsEx(LookupTablePtr lookup,BLAST_ScorePtr PNTR posMatrix,Int4 start,Int4 len,BlastAllWordPtr all_words,BLAST_Score threshold,Int4 wordsize,Int1 context_index)11981 Int2 BlastNewFindWordsEx(LookupTablePtr lookup, BLAST_ScorePtr PNTR posMatrix,
11982 Int4 start, Int4 len, BlastAllWordPtr all_words,
11983 BLAST_Score threshold, Int4 wordsize,
11984 Int1 context_index)
11985 {
11986 register Uint1 last_char, last_char2;
11987 Uint1Ptr words, PNTR array;
11988 Uint1Ptr s_string_start, s_string;
11989 BLAST_Score best_total, delta_score, diff, diff2, first_score;
11990 BLAST_Score second_score, start_score, start_score2, score;
11991 register Int4 index1, index3, offset;
11992 register Int1 index2;
11993 Int4 num_of_cols, alphabet_size;
11994 Int4 loop_increment, loop_increment2;
11995 Boolean all_words_allocated = FALSE;
11996 BLAST_ScorePtr maxscore;
11997
11998 if(lookup == NULL || posMatrix == NULL)
11999 return -1;
12000
12001 alphabet_size = PSI_ALPHABET_SIZE; /* 26 */
12002
12003 if (all_words == NULL) {
12004 all_words = BlastPopulateAllWordArrays(wordsize, alphabet_size);
12005 if (all_words == NULL) {
12006 return -1;
12007 }
12008
12009 all_words_allocated = TRUE;
12010 num_of_cols = all_words->num_of_cols;
12011 array = all_words->array;
12012 } else {
12013 num_of_cols = all_words->num_of_cols;
12014 array = all_words->array;
12015 }
12016
12017 /* Index a specific small set, such as one db sequence.
12018 This is used when all_words actually a subset of "all_words" found
12019 in some sequence. Used for ex. in BlastTwoSequences */
12020
12021 if (all_words->specific) {
12022 len -= (wordsize-1);
12023 for (offset = start; offset < len; offset++) {
12024 for (index1 = 0; index1 < num_of_cols; index1++) {
12025 words = array[index1];
12026 score = 0;
12027 for (index2 = 0; index2 < wordsize; index2++)
12028 score += posMatrix[offset + index2][*(words+index2)];
12029 if (score >= threshold) {
12030 lookup_add(lookup, (CharPtr) words, offset + wordsize - 1,
12031 context_index);
12032 }
12033 }
12034 }
12035
12036 if(all_words_allocated)
12037 BlastAllWordDestruct(all_words);
12038
12039 return 0;
12040 }
12041 /* ----------- End of "specific" word finding ------------ */
12042
12043 /* maxscore matrix will be used position-specific -
12044 of length = (len - start) */
12045
12046 maxscore = BlastPSIMaxScoreGet(posMatrix, start, len);
12047
12048 s_string_start = s_string = MemNew((wordsize+2)*sizeof(Uint1));
12049
12050 if (s_string_start == NULL)
12051 return 1;
12052
12053 /* Amounts to advance loops if the same character is to be checked again. */
12054 loop_increment=(long) (Nlm_Powi((Nlm_FloatHi)alphabet_size,(wordsize-2)));
12055 loop_increment2=loop_increment/alphabet_size;
12056
12057 /* Shorten len so up to the last complete word is checked. */
12058 len -= (wordsize-1);
12059 for (offset = start; offset < len; offset++) {
12060
12061 /* Check if a match with a non-identical word could give a score above T. */
12062 best_total = 0;
12063 for (index1 = 0; index1 < wordsize; index1++) {
12064 best_total += maxscore[offset+index1];
12065 }
12066
12067 if (best_total < threshold) { /* no chance of a match! */
12068 continue;
12069 }
12070
12071 delta_score = best_total - threshold;
12072
12073 /* pick a last_char that is at end of the array, could this be improved? */
12074 last_char=array[num_of_cols-1][wordsize-2];
12075 last_char2=array[num_of_cols-1][wordsize-2];
12076
12077 for (index1 = 0; index1 < num_of_cols; index1++) {
12078 words = array[index1];
12079
12080 /*
12081 only do this check if the letter has changed from last time. See if
12082 the new letter, matched with the first letter of the word, changes the
12083 total possible score to below threshold. If so, move ahead to the next letter.
12084 This is repeated with the second letter in the word.
12085
12086 The order of the letters in the first and second columns of array is
12087 important here!
12088 */
12089 if (last_char != *words) {
12090 last_char = *words;
12091
12092 first_score = posMatrix[offset][*words];
12093 diff = delta_score + first_score - maxscore[offset];
12094
12095 if (diff < 0) {
12096 /* index1 should be advanced by loop_increment, decrement by one as the "for"
12097 loop above increments by one. */
12098 index1 += loop_increment;
12099 index1--;
12100 continue;
12101 }
12102 start_score = first_score;
12103 }
12104
12105 if (last_char2 != *(words+1) && wordsize != 1) {
12106 last_char2 = *(words+1);
12107 second_score = posMatrix[offset+1][*(words+1)];
12108
12109 diff2 = second_score - maxscore[offset+1];
12110 diff2 += diff;
12111 if (diff2 < 0) {
12112 /* index1 should be advanced by loop_increment2, decrement by one as the "for"
12113 loop above increments by one. */
12114 index1 += loop_increment2;
12115 index1--;
12116 continue;
12117 }
12118 start_score = second_score + first_score;
12119 }
12120
12121 start_score2 = start_score;
12122
12123 for (index2 = 2; index2 < wordsize - 1; index2++) {
12124 start_score2 += posMatrix[offset+index2][*(words+index2)];
12125 }
12126
12127 for (index2 = 0; index2 < alphabet_size; index2++) {
12128 score = start_score2;
12129 score += posMatrix[offset+wordsize-1][index2];
12130
12131 if (score >= threshold) {
12132 s_string = s_string_start;
12133 for (index3=0; index3 < wordsize-1; index3++) {
12134 *s_string = *(words+index3);
12135 s_string++;
12136 }
12137 *s_string = index2;
12138 lookup_add(lookup, (CharPtr) s_string_start,
12139 offset+wordsize-1, context_index);
12140 }
12141 }
12142 }
12143 }
12144
12145 s_string_start = MemFree(s_string_start);
12146
12147 if(all_words_allocated)
12148 BlastAllWordDestruct(all_words);
12149
12150 MemFree(maxscore);
12151
12152 return 0;
12153 }
12154 /* SSH position-based version of BlastFindWords*/
12155 Int2 LIBCALL
BlastNewFindWords(BlastSearchBlkPtr search,Int4 start,Int4 len,BLAST_Score threshold,Int1 context_index)12156 BlastNewFindWords(BlastSearchBlkPtr search, Int4 start, Int4 len, BLAST_Score threshold, Int1 context_index)
12157 {
12158 Int2 status;
12159
12160 status = BlastNewFindWordsEx(search->wfp->lookup, search->sbp->posMatrix,
12161 start, len, search->all_words, threshold,
12162 search->wfp->wordsize, context_index);
12163 return status;
12164 }
12165
12166 /*******************************************************************
12167
12168 This function allocates and populates an array containing every possible
12169 letter combination for an alphabet of size alphabet_size for the first
12170 wordsize-1 letters. The last letter of the word is done on the fly.
12171 The approach is best described with a table that demonstrates
12172 the strategy with a two-letter alphabet and a wordsize of three:
12173
12174 index 1 2
12175 col. 1 0
12176
12177 A A
12178 A B
12179 B A
12180 B B
12181 A A
12182 A B
12183 B A
12184 B B
12185
12186 col. 0: basic pattern repeated N**(W-1) times, where N is the size of the
12187 alphabet and W is the wordsize.
12188 col. 1: AABB repeated N**(W-2) times.
12189
12190 Each pattern is repeated N**(W-1-C) times, where "C" is the column number.
12191 The number of repeats of a given letter is N**C.
12192 The total number of rows in the array is then N**(W-1-C) * N**C * N = N**W,
12193 as we expect.
12194
12195 NOTE: The order of the columns is important, as it is used in
12196 BlastWFContextNeighborhoodAdd above. In particular it is useful to have
12197 all the letters grouped together.
12198 *********************************************************************/
12199
12200 BlastAllWordPtr
BlastPopulateAllWordArrays(Int4 wordsize,Int4 alphabet_size)12201 BlastPopulateAllWordArrays(Int4 wordsize, Int4 alphabet_size)
12202
12203 {
12204 BlastAllWordPtr all_words;
12205 Uint1Ptr *array_ptr, *array;
12206 Uint1Ptr array_storage;
12207 register Int4 index, index1, index3, num_of_cols, times, repeat_num;
12208 register Int1 index2;
12209 num_of_cols = (Int4) Nlm_Powi((Nlm_FloatHi)alphabet_size, wordsize-1);
12210 array = (Uint1Ptr *) MemNew(num_of_cols*sizeof(Uint1Ptr));
12211
12212 array_storage = (Uint1Ptr) MemNew(num_of_cols*(wordsize-1)*sizeof(Uint1));
12213 for (index=0; index<num_of_cols; index++)
12214 {
12215 array[index] = array_storage+(index*(wordsize-1));
12216 }
12217
12218 for (index=0; index<wordsize-1; index++)
12219 {
12220 array_ptr = array;
12221 repeat_num= (Int4) Nlm_Powi((Nlm_FloatHi)alphabet_size,(wordsize-2-index));
12222 times = (Int4) Nlm_Powi((Nlm_FloatHi)alphabet_size, index);
12223 for (index1=0; index1<times; index1++)
12224 {
12225 for (index2=0; index2<alphabet_size; index2++)
12226 {
12227 for (index3=0; index3<repeat_num; index3++)
12228 {
12229 (*array_ptr)[index] = index2;
12230 array_ptr++;
12231 }
12232 }
12233 }
12234 }
12235
12236 all_words = BlastAllWordNew(num_of_cols, wordsize, TRUE, FALSE);
12237 if (all_words)
12238 {
12239 all_words->array = array;
12240 all_words->array_storage = array_storage;
12241 }
12242
12243 return all_words;
12244 }
12245
12246 /**************************************************************************
12247 *
12248 * Get the "ScoreSet" ScorePtr from the BLAST data, which is provided
12249 * by "hsp". "score_set" should be NULL, a chain of scores is added.
12250 **************************************************************************/
12251
12252 ScorePtr LIBCALL
GetScoreSetFromBlastResultHsp(BLASTResultHspPtr hsp,SeqIdPtr gi_list)12253 GetScoreSetFromBlastResultHsp(BLASTResultHspPtr hsp, SeqIdPtr gi_list)
12254
12255 {
12256 ScorePtr score_set=NULL;
12257 Nlm_FloatHi prob;
12258 Int4 score;
12259 CharPtr scoretype;
12260
12261 score = hsp->score;
12262 if (score > 0)
12263 MakeBlastScore(&score_set, "score", 0.0, score);
12264
12265 score = hsp->number;
12266 scoretype = "sum_n";
12267
12268 if (score > 1)
12269 MakeBlastScore(&score_set, scoretype, 0.0, score);
12270
12271 prob = hsp->e_value;
12272 if (hsp->number <= 1)
12273 {
12274 scoretype = "e_value";
12275 }
12276 else
12277 {
12278 scoretype = "sum_e";
12279 }
12280 if (prob >= 0.)
12281 {
12282 if (prob < 1.0e-180)
12283 prob = 0.0;
12284 MakeBlastScore(&score_set, scoretype, prob, 0);
12285 }
12286
12287 prob = hsp->bit_score;
12288 if (prob >= 0.)
12289 MakeBlastScore(&score_set, "bit_score", prob, 0);
12290
12291 if (hsp->num_ident > 0)
12292 MakeBlastScore(&score_set, "num_ident", 0.0, hsp->num_ident);
12293
12294 if (hsp->number > 1 && hsp->ordering_method == BLAST_SMALL_GAPS)
12295 {
12296 MakeBlastScore(&score_set, "small_gap", 0.0, 1);
12297 } else if (hsp->ordering_method > 3) {
12298 /* In new tblastn this means splice junction was found */
12299 MakeBlastScore(&score_set, "splice_junction", 0.0, 1);
12300 }
12301
12302 while (gi_list)
12303 {
12304 MakeBlastScore(&score_set, "use_this_gi", 0.0, gi_list->data.intvalue);
12305 gi_list = gi_list->next;
12306 }
12307
12308 return score_set;
12309 }
12310
12311 /** Configure the database chunk size adaptively.
12312 * Note: Must be called from a single-threaded context
12313 * @param search the The search block to configure [inout]
12314 * @param num_seq The number of sequences in the database [in]
12315 */
ConfigureDbChunkSize(BlastSearchBlkPtr search,Int4 num_seq)12316 void ConfigureDbChunkSize(BlastSearchBlkPtr search, Int4 num_seq)
12317 {
12318 /* Emit a tick after how many sequences? */
12319 search->thr_info->db_incr = num_seq / BLAST_NTICKS;
12320
12321 /* Divide the search into chunks */
12322 search->thr_info->db_chunk_size = MAX(num_seq / 100,1);
12323
12324 /* Loadbalance more finely for multithreaded searches. */
12325 if (NlmThreadsAvailable() && search->pbp->process_num > 1)
12326 search->thr_info->db_chunk_size = MAX(num_seq/(100*(search->pbp->process_num
12327 )), 1);
12328
12329 return;
12330 }
12331