1
2 static char const rcsid[] = "$Id: blast.c,v 6.451 2008/01/25 21:15:22 bealer Exp $"; /* RCS/CVS Id keyword string: records this file's name, revision, date and last committer */
3
4 /* $Id: blast.c,v 6.451 2008/01/25 21:15:22 bealer Exp $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================*/
28
29 /*****************************************************************************
30
31 File name: blast.c
32
33 Author: Tom Madden
34
35 Contents: BLAST functions
36
37 Detailed Contents:
38
39 - Functions that allocate and deallocate structures used by BLAST.
40
41 - Functions that find the initial word hits for BLAST (both contiguous
42 and discontiguous).
43
44 - Functions that extend these initial word hits and decide if the
45 results HSP (High-Scoring Segment Pairs) are worth keeping.
46
47 - Functions that link together HSP's to a "hitlist".
48
49 - Functions that save the hitlist to a structure appropriate for
50 further manipulation.
51
52 ******************************************************************************
53 * $Revision: 6.451 $
54 *
55 * $Log: blast.c,v $
56 * Revision 6.451 2008/01/25 21:15:22 bealer
57 * - Fix synchronization issue with blastpgp -a4 -j4 when composition based
58 * statistics is used for databases with multiple volumes.
59 *
60 * Revision 6.450 2007/05/07 13:30:54 kans
61 * added casts for Seq-data.gap (SeqDataPtr, SeqGapPtr, ByteStorePtr)
62 *
63 * Revision 6.449 2007/03/13 20:38:39 madden
64 * - In BLASTCalculateSearchSpace, use floating point multiplication to
65 * compute the floating point value searchsp.
66 *
67 * - In BLASTSetUpSearchInternalByLoc, don't cast
68 * DROPOFF_NUMBER_OF_BITS to an integer when assigning the floating
69 * point options dropoff_1st_pass and dropoff_2nd_pass.
70 *
71 * - In BLASTSetUpSearchInternalByLoc, use floating point division to
72 * compute the floating point value avglen.
73 *
74 * - In blast_set_parameters, change the type of the function arguments
75 * dropoff_number_of_bits_1st_pass and
76 * dropoff_number_of_bits_2nd_pass to Nlm_FloatHi.
77 *
78 * - In blast_set_parameters, cast a value in the computation of
79 * cutoff_s_first to type BLAST_Score only after dividing by Lambda,
80 * instead of before performing the division.
81 * [from Mike Gertz]
82 *
83 * Revision 6.448 2007/03/05 14:51:22 camacho
84 * - In BLASTPerformFinalSearch, merge the hitlists for PSITBLASTN, as is
85 * done for TBLASTN.
86 * - In xsum_compare_hsps, break ties by calling score_compare_hsps.
87 *
88 * Revision 6.447 2006/09/21 13:42:36 madden
89 * BlastProcessGiLists returns a boolean to specify that an attempt was made to process a list of GIs. If no matches were found this can be reported back to the user
90 *
91 * Revision 6.446 2006/06/01 15:48:38 papadopo
92 * in blastMergeFilterLocs, add the capability to merge mixed-type seqlocs; these appear in e.g. megablast with both low-complexity and repeat filtering
93 *
94 * Revision 6.445 2005/10/06 12:52:23 madden
95 * Changes to support correct gapped stats for blastn
96 *
97 * Revision 6.444 2005/09/29 17:40:08 coulouri
98 * from mike gertz:
99 * In the do_gapped_blast_search routine, in the case where query
100 * concatenation is used, call BlastLinkHsps only when
101 * search->pbp->do_sum_stats is true.
102 *
103 * Revision 6.443 2005/09/26 15:02:58 morgulis
104 * Fixing some memory leaks when using query concatenation in blastn and tblastn.
105 *
106 * Revision 6.442 2005/08/31 20:32:31 coulouri
107 * From Mike Gertz:
108 * - Added the function BlastSingleQueryResultSize to implement the
109 * policy for adjusting the hitlist size for preliminary alignments
110 * to a single query.
111 * - In BLASTSetUpSearchWithReadDbInternalMult replaced existing code
112 * for adjusting the hitlist size for a single query by a call to
113 * BlastSingleQueryResultSize.
114 * - In BLASTSetUpSearchEx, replaced existing code for adjusting the
115 * hitlist size by a call to BlastSingleQueryResultSize. This
116 * changes the behavior of the routine slightly, in that the hitlist
117 * size is (correctly) no longer increased for ungapped alignments.
118 *
119 * Revision 6.441 2005/07/28 14:57:09 coulouri
120 * remove dead code
121 *
122 * Revision 6.440 2005/07/27 15:51:54 coulouri
123 * remove unused queue_callback
124 *
125 * Revision 6.439 2005/05/19 11:11:59 coulouri
126 * Changes from morgulis to address rt ticket 15091715:
127 * null hsp_array in blastall tblastn query concatenation causes segfault
128 *
129 * Revision 6.438 2005/05/10 18:51:15 dondosha
130 * Removed unused functions and variables; moved sorting of HSPs by score after new_link_hsps inside this function
131 *
132 * Revision 6.437 2005/05/10 16:15:23 dondosha
133 * Back-porting changes in uneven gap HSP linking from algo/blast code: from Mike Gertz
134 *
135 * Revision 6.436 2005/05/06 11:49:22 coulouri
136 * remove unnecessary evalue check that results in instability of number_of_seqs_better_E; addresses rt ticket 15075332
137 *
138 * Revision 6.435 2005/05/02 16:03:14 coulouri
139 * refactor code to set db_chunk_size
140 *
141 * Revision 6.434 2005/04/25 14:16:36 coulouri
142 * set db_chunk_size adaptively
143 *
144 * Revision 6.433 2005/01/24 21:17:36 camacho
145 * 1. Changed implementation of RPSBlastResultHspScoreCmp to have the same
146 * tie-breakers as score_compare_hsps
147 * 2. Renamed RPSBlastResultHspScoreCmp to BLASTResultHspScoreCmp
148 *
149 * Revision 6.432 2005/01/21 19:41:04 camacho
150 * Initialize variables
151 *
152 * Revision 6.431 2005/01/10 18:52:28 coulouri
153 * fixes from morgulis to allow concatenation of >255 queries in [t]blastn
154 *
155 * Revision 6.430 2004/12/29 13:26:28 madden
156 * One hit extension fixes so that:
157 * 1.) it is no longer iterative; now a left extension is performed and then a right extension.
158 * 2.) the left extension now stops when the score has dropped by xdrop, the right when the score goes to zero.
159 * 3.) fix one hit stopping criteria so that it is like two hit criteria.
160 * .
161 *
162 * Revision 6.429 2004/12/20 15:22:16 camacho
163 * Calculate kbp_ideal values rather than loading them from pre-computed values
164 *
165 * Revision 6.428 2004/12/14 14:07:54 madden
166 * Fix typo in if statement
167 *
168 * Revision 6.427 2004/11/30 16:33:16 dondosha
169 * Do not subtract starting offset in AdjustOffsetsInMaskLoc, because this is done in other functions after lower case mask is merged with filter mask
170 *
171 * Revision 6.426 2004/11/23 21:21:15 coulouri
172 * remove dead code, eliminate compiler warnings
173 *
174 * Revision 6.425 2004/11/22 15:43:24 dondosha
175 * Call AdjustOffsetsInMaskLoc for the options query_lcase_mask field, not parameters, to avoid leaving pointer to freed memory
176 *
177 * Revision 6.424 2004/11/19 13:22:05 madden
178 * Remove no_check_score completely (from Mike Gertz)
179 *
180 * Revision 6.423 2004/11/04 17:23:11 madden
181 * Fix for tblastn searches, do not mix HSPs from separate frames
182 *
183 * Revision 6.422 2004/11/04 15:51:55 bealer
184 * - bl2seq should use dblen as average length if database is not available.
185 *
186 * Revision 6.421 2004/11/01 14:07:06 madden
187 * - In CalculateSecondCutoffScore use the number of starting points,
188 * rather than the maximum size of the gap, when calculating the
189 * cutoffs.
190 *
191 * Recently, the meaning of search->pbp->gap_size was changed.
192 * Previously, it represented the maximum number of permitted
193 * starting points; now it represents the maximum permitted gap.
194 * The CalculateSecondCutoffScore was not updated to reflect the new
195 * meaning. (The algo/blast/code was appropriately updated.)
196 *
197 * - Remove the BlastReapHitlistByScore routine, and a call to the
198 * routine in BLASTPerformFinalSearch.
199 *
200 * Revision 6.420 2004/10/25 18:30:21 papadopo
201 * From Michael Gertz:
202 * 1. Change BlastNtWordExtend to only terminate an ungapped alignment
203 * if the running score fails the X-drop criterion, *not* if the score
204 * becomes zero
205 * 2. Change BlastNtWordExtend to call BlastSaveCurrentHsp only for an
206 * ungapped alignment, since it would choose an incorrect start point
207 * for a gapped alignment
208 *
209 * Revision 6.419 2004/10/18 13:01:54 madden
210 * Changes from Mike Gertz:
211 * - In xsum_compare_hsps change the comparison tests so that nil
212 * HSPs are less than any non-nil HSP. Previously, this
213 * function would return 0 if either HSP was nil, which would
214 * result in sort routines terminating before the non-nil HSPs
215 * in the list were fully sorted.
216 *
217 * - In rev_compare_hsps_cfj, reversed the order of the
218 * comparison on query.frame to make the sort order consistent
219 * with the sort used in algo/blast/core/link_hsps.c.
220 *
221 * Revision 6.418 2004/10/07 13:07:06 madden
222 * Cast int to FloatHi to prevent wrap-around
223 *
224 * Revision 6.417 2004/09/30 12:10:19 madden
225 * Add function BlastReapHitlistByScore, use on gapped tblastn and blastx HSPs that do not achieve a high enough score for continued processing
226 *
227 * Revision 6.416 2004/09/28 15:59:40 papadopo
228 * Items 1 and 2 of version 6.414 were mistakenly left out
229 *
230 * Revision 6.415 2004/09/28 15:52:16 papadopo
231 * From Michael Gertz:
232 * 1. Undo previous fix to ungapped PSSM wordfinder (not necessary)
233 * 2. Modify square-matrix ungapped wordfinder to avoid occasional
234 * incorrect choice of start offset for right extensions
235 * 3. Call BlastLinkHsps if and only if search->pbp->do_sum_stats is
236 * true; previously used the program number to decide
237 * 4. For ungapped blastx and tblastn, if longest_intron is not set
238 * (i.e. = 0) or (longest_intron - 2)/3 is nonpositive, call
239 * link_hsps. Otherwise call new_link_hsps.
240 * 5. For gapped blastx, tblastn or psitblastn, if longest_intron is
241 * not set (i.e. = 0), set it to 122. Then call new_link_hsps if
242 * (longest_intron - 2)/3 is positive. Otherwise turn off sum statistics.
243 * 6. In BlastLinkHsps, enabled the use of new_link_hsps for psitblastn.
244 * 7. Caused all routines for calculating the significance of multiple
245 * distinct alignments (BlastSmallGapSumE, BlastLargeGapSumE and
246 * BlastUnevenGapSumE) to use
247 *
248 * sum_{i in linked_set} (\lambda_i s_i - \ln K_i)
249 *
250 * as the weighted sum score. This change affects e-values in
251 * blastx and tblastx.
252 * 8. When computing normalized sum scores, use the ungapped values of
253 * (lambda, K) for ungapped alignments.
254 * 9. In SumHSPEvalue, for blastx, the subject_length must be divided by 3.
255 * 10. Pass the effective database size into BlastSmallGapSumE,
256 * BlastLargeGapSumE and BlastUnevenGapSumE. The routines use this
257 * value in a simplified formula to compute the e-value of singleton sets.
258 * 11. Sort HSPs in new_link_hsps by normalized score, rather than score;
259 * for blastx, this places HSPs in the correct order of significance.
260 * 12. In new_link_hsps, set xsum field of every HSP to the appropriate
261 * value for a singleton set before doing any linking.
262 * 13. In both link_hsps and new_link_hsps use normalized sum score,
263 * rather than raw sum score, everywhere when choosing linked sets
264 * 14. Delete code in new_link_hsps for finding splice junctions.
265 * 15. Delete some unused variables in link_hsps.
266 *
267 * Revision 6.412 2004/09/22 16:44:48 dondosha
268 * Assign frames in ungapped blastn before any attempt to link HSPs, not only before second linking
269 *
270 * Revision 6.411 2004/09/21 16:28:23 dondosha
271 * Make sure first change in previous revision is applied only to blastn
272 *
273 * Revision 6.410 2004/09/21 13:58:46 dondosha
274 * 1. Assign HSP contexts and subject frames before linking HSPs after
275 * reevaluation with ambiguities for ungapped blastn - necessary to distinguish
276 * HSPs from different strands;
277 * 2. Use ideal Karlin-Altschul parameters for RPS tblastn instead of those for a
278 * fake protein.
279 *
280 * Revision 6.409 2004/09/15 18:33:23 papadopo
281 * From Michael Gertz: modify two-hit ungapped code to compute the correct end offset even if an extension to the right does not happen
282 *
283 * Revision 6.408 2004/08/27 16:11:18 dondosha
284 * Changes in new_link_hsps from Mike Gertz: adjust singleton sets e-values by gap decay divisor; use effective db length for sum e-value calculations
285 *
286 * Revision 6.407 2004/08/16 19:37:26 dondosha
287 * Enabled uneven gap HSP linking for blastx
288 *
289 * Revision 6.406 2004/05/21 13:53:37 dondosha
290 * Fix in BLASTMergeHitLists
291 *
292 * Revision 6.405 2004/04/28 14:37:06 madden
293 * Changes from Mike Gertz
294 * - modified the link_hsps routine to apply the gap_prob parameter to
295 * the result of BlastSmallGapSumE and BlastLargeGapSumE.
296 * - further modified link_hsps to use BlastGapDecayDivisor to weight
297 * tests based on multiple collections of HSPs.
298 * - removed all reference to gap_prob from the new_link_hsps.
299 * - further modified new_link_hsps to use BlastGapDecayDivisor to weight
300 * tests based on multiple collections of HSPs.
301 *
302 * Revision 6.404 2004/04/20 14:55:47 morgulis
303 * 1. Fixed query offsets in results when -B option is used.
304 * 2. Fixes for lower case masking handling with -B option.
305 *
306 * Revision 6.403 2004/04/13 21:03:30 madden
307 * Use ignore_gilist Boolean to determine whether gilist lookup should occur
308 *
309 * Revision 6.402 2004/03/31 17:58:51 papadopo
310 * Mike Gertz' changes for length adjustment calculations
311 *
312 * Revision 6.401 2004/03/22 15:35:39 dondosha
313 * 1. Do not allow cutoff score for saving HSPs to be smaller than gap trigger;
314 * 2. When merging hitlists with a restriction on number of HSPs, keep best
315 * scoring ones.
316 *
317 * Revision 6.400 2004/02/26 15:52:29 papadopo
318 * Mike Gertz' modifications to unify handling of gapped Karlin blocks between protein and nucleotide searches
319 *
320 * Revision 6.399 2004/02/24 14:07:00 camacho
321 * Use approximate sequence length calculation for entrez-limited
322 * nucleotide blast databases.
323 *
324 * Revision 6.398 2004/02/03 17:54:16 dondosha
325 * Correction to revision 6.391 in function BlastGetDbChunk
326 *
327 * Revision 6.397 2004/01/06 22:37:10 dondosha
328 * Use BLAST_HSPfree function
329 *
330 * Revision 6.396 2003/12/29 15:42:46 coulouri
331 * tblastn query concatenation fixes from morgulis
332 *
333 * Revision 6.395 2003/12/12 16:01:23 madden
334 * Change to signature of BlastCutoffs, remove BlastCutoffs_simple
335 *
336 * Revision 6.394 2003/12/10 17:05:27 dondosha
337 * Added function ReevaluateScoreWithAmbiguities to reevaluate score for one HSP; use it after greedy traceback
338 *
339 * Revision 6.393 2003/11/19 18:09:13 dondosha
340 * Use consistent rounding in length adjustment calculation
341 *
342 * Revision 6.392 2003/11/10 20:15:29 dondosha
343 * Bug fix in BLASTMergeHsps
344 *
345 * Revision 6.391 2003/10/23 17:46:17 dondosha
346 * Fix in BlastGetDbChunk for looking up ordinal ids within a range
347 *
348 * Revision 6.390 2003/08/08 16:36:21 dondosha
349 * 1. Treat final_db_seq as 1 beyond the final sequence; 0 is an exception, meaning end of database.
350 * 2. Added more meaningful error message when query length is less than wordsize.
351 *
352 * Revision 6.389 2003/05/30 17:20:10 coulouri
353 * add rcsid
354 *
355 * Revision 6.388 2003/05/14 20:35:58 camacho
356 * Allow searching empty databases
357 *
358 * Revision 6.387 2003/05/13 16:02:53 coulouri
359 * make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
360 *
361 * Revision 6.386 2003/05/12 12:23:43 camacho
362 * Sanity check for number of sequences & db length
363 *
364 * Revision 6.385 2003/04/23 15:15:36 camacho
365 * Moved reading of gi list to readdb
366 *
367 * Revision 6.384 2003/03/24 19:42:13 madden
368 * Changes to support query concatenation for blastn and tblastn
369 *
370 * Revision 6.383 2003/03/14 22:33:44 dondosha
371 * Do not increase preliminary hitlist size for ungapped search
372 *
373 * Revision 6.382 2003/03/06 19:10:42 madden
374 * Allow search->pbp->process_num to be > 1 if MT enabled
375 *
376 * Revision 6.381 2003/03/05 21:30:24 dondosha
377 * Fix in BlastMakeCopyQueryDNAP for single-strand OOF search
378 *
379 * Revision 6.380 2002/12/24 14:12:03 dondosha
380 * Removed accidental duplicate lines
381 *
382 * Revision 6.379 2002/12/10 23:13:22 bealer
383 * Fix do_the_blast_run and BlastGetDbChunk to calculate beginning and ending
384 * sequence numbers correctly.
385 * Fix BlastGetDbChunk to use precise start and end points, not nearest
386 * multiples of 32.
387 * Fix do_the_blast_run and BlastGetDbChunk to handle mixed oidlist / real db
388 * multiple database scenarios.
389 *
390 * Revision 6.378 2002/12/04 22:39:51 bealer
391 * Undo previous set of changes.
392 *
393 * Revision 6.377 2002/11/25 19:53:34 bealer
394 * Remove extraneous commented code.
395 *
396 * Revision 6.376 2002/11/25 19:50:26 bealer
397 * Prevent extra work by BlastGetDbChunk when OID lists are used.
398 *
399 * Revision 6.375 2002/11/13 18:03:10 dondosha
400 * Correction in BlastReevaluateWithAmbiguities
401 *
402 * Revision 6.374 2002/11/08 14:58:43 kans
403 * first argument to NlmReadMFILE must be cast as Uint1Ptr - Mac compiler picked up this inconsistency with the prototype
404 *
405 * Revision 6.373 2002/11/07 21:06:15 camacho
406 * Made GetGisFromFile work even without mmap
407 *
408 * Revision 6.372 2002/11/04 22:55:56 dondosha
409 * For blastn, calculate number of identities in BlastReevaluateWithAmbiguities
410 *
411 * Revision 6.371 2002/10/28 21:44:03 madden
412 * Added comments about gap-free extensions
413 *
414 * Revision 6.370 2002/09/18 20:23:19 camacho
415 * Added BLASTCalculateSearchSpace
416 *
417 * Revision 6.369 2002/09/11 20:46:25 camacho
418 * Removed deprecated BlastSeqIdListPtr code
419 *
420 * Revision 6.368 2002/08/30 18:56:02 dondosha
421 * Made BlastMakeTempProteinBioseq and HackSeqLocId public: needed for Cn3D
422 *
423 * Revision 6.367 2002/08/30 15:42:48 dondosha
424 * In blastn, use ewp structure only for the first context
425 *
426 * Revision 6.366 2002/08/22 13:39:45 camacho
427 * Close the header and sequence files only if allocated
428 *
429 * Revision 6.365 2002/08/07 21:37:47 camacho
430 * Do not remove the search block prematurely in do_gapped_blast_search
431 *
432 * Revision 6.364 2002/08/06 17:33:50 madden
433 * Fix return value problem
434 *
435 * Revision 6.363 2002/07/19 17:55:47 dondosha
436 * 1.Return 0 status from BLASTPerformFinalSearch when database sequence has 0 length;
437 * 2. Do not destroy search block too early.
438 *
439 * Revision 6.362 2002/07/15 18:53:27 camacho
440 * Small fix to previous commit
441 *
442 * Revision 6.361 2002/07/14 17:18:13 camacho
443 * Fixed small memory leak in do_blast_search/do_gapped_blast_search
444 *
445 * Revision 6.360 2002/07/12 18:02:55 dondosha
446 * Do not call AdjustOffsetsInMaskLoc if no lower case mask
447 *
448 * Revision 6.359 2002/07/12 16:06:26 dondosha
449 * Adjust offsets and remove unneeded lower case mask locations when query is a subsequence
450 *
451 * Revision 6.358 2002/06/27 13:01:26 kans
452 * BlastGetVirtualOIDList is LIBCALL
453 *
454 * Revision 6.357 2002/06/26 00:56:28 camacho
455 *
456 * 1. Fixed bug when searching a mixture of real and mask databases.
457 * 2. Clean up of code that calculates the number of sequences and database
458 * length.
459 *
460 * Revision 6.356 2002/06/25 16:43:45 dondosha
461 * Get out from all search loops if bad status returned, meaning process ran out of memory
462 *
463 * Revision 6.355 2002/06/25 13:11:22 madden
464 * Fix UMR for status in do_gapped_blast_search
465 *
466 * Revision 6.354 2002/06/21 21:49:10 camacho
467 * Removed references to thr_info->blast_seqid_list in BlastGetDbChunk
468 *
469 * Revision 6.353 2002/06/12 15:43:09 dondosha
470 * Potential uninitialized variable bug fixed
471 *
472 * Revision 6.352 2002/06/12 15:33:25 dondosha
473 * Corrected integer types of the variable holding return status in 2 functions
474 *
475 * Revision 6.351 2002/06/11 20:40:04 dondosha
476 * Correction to previous change
477 *
478 * Revision 6.350 2002/06/11 14:44:45 dondosha
479 * Return status from some functions instead of search block pointer
480 *
481 * Revision 6.349 2002/06/05 15:30:34 coulouri
482 * Move signal handling to blastsrv.c
483 *
484 * Revision 6.348 2002/05/20 22:49:10 dondosha
485 * Fix for the Mega BLAST case when database sequence is split, and an HSP is accidentally extended across the boundary to a completely masked query
486 *
487 * Revision 6.347 2002/05/15 19:51:01 dondosha
488 * Do a sanity check for the final db sequence parameter
489 *
490 * Revision 6.346 2002/04/23 16:01:27 madden
491 * Fix for ungapped search of arbitrary matrix
492 *
493 * Revision 6.345 2002/04/23 15:40:10 madden
494 * Fix for effective length change and ungapped blast
495 *
496 * Revision 6.344 2002/04/19 21:22:30 madden
497 * Added protection for matrices that are only empty strings
498 *
499 * Revision 6.343 2002/04/18 12:07:05 madden
500 * Check for Selenocysteine in Bioseq, replace with X
501 *
502 * Revision 6.342 2002/04/17 17:30:15 madden
503 * Call getAlphaBeta only for gapped alignments
504 *
505 * Revision 6.341 2002/04/16 15:42:15 madden
506 * Save mask1 for lookup table hashing only (change for neighboring)
507 *
508 * Revision 6.340 2002/04/04 21:19:15 dondosha
509 * Corrections for megablast with non-greedy extensions
510 *
511 * Revision 6.339 2002/03/26 21:20:50 dondosha
512 * 1. Make hitlist size larger for preliminary gapped alignment
513 * 2. Pass readdb structure to megablast set up if it is already initialized
514 *
515 * Revision 6.338 2002/03/26 16:46:40 madden
516 * Move calculation of effective lengths to BlastCalculateEffectiveLengths
517 *
518 * Revision 6.337 2002/03/06 18:34:31 dondosha
519 * Pass the filtered locations back from the megablast engine to use in formatting
520 *
521 * Revision 6.336 2002/02/27 22:39:00 dondosha
522 * Fixed bug in splitting long database sequences for translated searches
523 *
524 * Revision 6.335 2002/02/27 17:43:20 dondosha
525 * Made effective database length option work properly
526 *
527 * Revision 6.334 2002/02/26 22:25:20 dondosha
528 * Return error as soon as it is found that matrix name is not supported
529 *
530 * Revision 6.333 2002/02/26 17:37:40 dondosha
531 * Fixed bug in BlastNtWordFinder for word sizes > 12
532 *
533 * Revision 6.332 2002/02/26 15:03:13 dondosha
534 * Accidental newline in sprintf removed
535 *
536 * Revision 6.331 2002/02/25 23:26:57 dondosha
537 * Changed error to warning if no letters to be indexed just on one context
538 *
539 * Revision 6.330 2002/01/04 22:01:33 coulouri
540 * Fixed BlastSetLimits() to work under linux
541 *
542 * Revision 6.329 2002/01/04 20:16:12 dondosha
543 * Correction for single strand blastx with OOF gapping
544 *
545 * Revision 6.328 2001/12/28 20:38:40 dondosha
546 * Moved Mega BLAST related parameters into a separate structure
547 *
548 * Revision 6.327 2001/12/17 17:31:35 madden
549 * Fix memory leaks
550 *
551 * Revision 6.326 2001/12/14 21:04:31 madden
552 * Reinit start to zero for every frame
553 *
554 * Revision 6.325 2001/12/10 23:04:19 dondosha
555 * Corrected how number of db sequences is set when gi list exists
556 *
557 * Revision 6.324 2001/11/23 21:11:16 dondosha
558 * Correction to previous change
559 *
560 * Revision 6.323 2001/11/23 19:57:55 dondosha
561 * Correction for bl2seq related to recent changes in megablast
562 *
563 * Revision 6.322 2001/11/14 23:39:31 dondosha
564 * Switched return value for BlastNtWordUngappedExtend
565 *
566 * Revision 6.321 2001/11/13 18:17:26 dondosha
567 * Added BlastNtWordUngappedExtend for use in Mega BLAST
568 *
569 * Revision 6.320 2001/09/21 14:42:08 dondosha
570 * Correction of previous fix in BlastReapPartialHitlistByEvalue for non-megablast programs
571 *
572 * Revision 6.319 2001/09/20 14:39:15 madden
573 * Fix for non-blastn programs in BlastReapPartialHitlistByEvalue
574 *
575 * Revision 6.318 2001/09/17 16:33:39 dondosha
576 * Bug fix in BlastReapPartialHitlistByEvalue
577 *
578 * Revision 6.317 2001/09/11 14:28:30 madden
579 * Added timed_out Boolean to SearchBlk
580 *
581 * Revision 6.316 2001/09/07 14:46:43 dondosha
582 * Roll back removal of threshold_first from functions and structures
583 *
584 * Revision 6.315 2001/09/06 20:24:33 dondosha
585 * Removed threshold_first
586 *
587 * Revision 6.314 2001/08/10 14:55:55 madden
588 * Add pv_array for multiple hits blastn
589 *
590 * Revision 6.313 2001/07/24 19:50:32 dondosha
591 * Do not create a star_proc thread if there is no tick_proc
592 *
593 * Revision 6.312 2001/07/20 18:52:25 dondosha
594 * Removed unused code
595 *
596 * Revision 6.311 2001/07/18 19:24:17 madden
597 * Set options->dbseq_num if use_real_db TRUE
598 *
599 * Revision 6.310 2001/07/09 14:17:23 madden
600 * Fix PC-lint complaints from R. Williams
601 *
602 * Revision 6.309 2001/07/09 13:12:02 madden
603 * Removed unused variables
604 *
605 * Revision 6.308 2001/07/06 15:22:42 madden
606 * Correction for BLASTN
607 *
608 * Revision 6.307 2001/06/29 18:07:20 madden
609 * Fix problem with scalingFactor
610 *
611 * Revision 6.306 2001/06/28 13:42:09 madden
612 * Fixes to prevent overflow on number of hits reporting
613 *
614 * Revision 6.305 2001/06/27 17:46:33 madden
615 * Add mutex to protect number_of_pos_hits, found by H. Gabb at KAI
616 *
617 * Revision 6.304 2001/06/26 20:37:04 madden
618 * Fixes for realdb_done problem found by H. Gabb at KAI
619 *
620 * Revision 6.303 2001/06/25 16:03:12 madden
621 * Correctly set gapped_start for blastn
622 *
623 * Revision 6.302 2001/06/21 21:29:07 dondosha
624 * Fixed memory leaks: destroy all error returns, free private_slp
625 *
626 * Revision 6.301 2001/06/15 16:38:45 dondosha
627 * Correction to previous changes
628 *
629 * Revision 6.300 2001/06/14 22:09:14 dondosha
630 * Rearranged code for gi lists and oid masks processing to get rid of duplication
631 *
632 * Revision 6.299 2001/06/13 21:45:08 dondosha
633 * Search of multiple databases with gi files implemented
634 *
635 * Revision 6.298 2001/06/12 19:48:55 madden
636 * Introduce total_hsp_limit, check before making SeqAlign
637 *
638 * Revision 6.297 2001/05/25 19:34:17 vakatov
639 * Nested comment typo fixed
640 *
641 * Revision 6.296 2001/05/04 15:59:46 dondosha
642 * Function BlastFillQueryOffsets now has an extra argument for megablast use
643 *
644 * Revision 6.295 2001/05/03 21:48:28 dondosha
645 * Handle some cases when memory allocation fails
646 *
647 * Revision 6.294 2001/04/23 17:09:18 madden
648 * Use StringSave for gifile variable
649 *
650 * Revision 6.293 2001/04/16 16:37:01 madden
651 * Restore old length correction behavior for blastn
652 *
653 * Revision 6.292 2001/04/13 20:46:42 madden
654 * Changed edge effect correction in BLASTSetUpSearchInternalByLoc to use new method with alpha and beta parameters from Altschul, Bundschuh, Olsen, Hwa, Nucleic Acids Research 29(2001), 351-361.
655 *
656 * Revision 6.291 2001/04/11 20:56:06 madden
657 * Added scalingFactor for rpsblast
658 *
659 * Revision 6.290 2001/04/04 20:31:16 dondosha
660 * Bug fix for blastx with a subsequence query
661 *
662 * Revision 6.289 2001/04/02 15:55:27 dondosha
663 * Check HSP frames when merging hitlists from split subject sequence
664 *
665 * Revision 6.288 2001/03/30 23:53:45 dondosha
666 * Correction in splitting long database sequences for tblastn
667 *
668 * Revision 6.287 2001/03/19 18:53:45 madden
669 * Added call to BlastSeqLocFillDoubleIntEx, changed call to BlastSeqLocFillDoubleIntRev
670 *
671 * Revision 6.286 2001/03/14 14:54:35 madden
672 * fix problem with partial translating query
673 *
674 * Revision 6.285 2001/03/12 21:38:59 dondosha
675 * Bug fix in database sequence splitting change
676 *
677 * Revision 6.284 2001/03/08 22:05:47 dondosha
678 * Split very long database sequences in all BLAST programs
679 *
680 * Revision 6.283 2001/03/07 14:09:17 madden
681 * Set multiple_hits depending on option block
682 *
683 * Revision 6.282 2001/03/06 22:02:32 dondosha
684 * Rolled back accidental change in BlastReevaluateWithAmbiguities
685 *
686 * Revision 6.281 2001/03/01 15:41:33 dondosha
687 * Added protection from infinite loop in new_link_hsps
688 *
689 * Revision 6.280 2001/01/24 21:55:53 dondosha
690 * Correction to previous change
691 *
692 * Revision 6.279 2001/01/24 20:51:49 dondosha
693 * Enabled splitting of the second sequence for 2 sequences with megablast
694 *
695 * Revision 6.278 2001/01/19 17:23:16 madden
696 * Optimization for 2-hit blastn
697 *
698 * Revision 6.277 2001/01/16 14:03:53 madden
699 * Enable gapped check for blastn immediately after finding hits
700 *
701 * Revision 6.276 2001/01/09 20:10:37 shavirin
702 * Added sorting of all hits in result_struct for every element in
703 * results. Added function RPSResultHspScoreCmp.
704 *
705 * Revision 6.275 2001/01/08 20:21:40 dondosha
706 * Adjust subject offset in the gap edit blocks if database sequence was split in megablast search
707 *
708 * Revision 6.274 2001/01/03 21:45:29 dondosha
709 * Fixed a memory leak - some edit blocks not freed in megablast
710 *
711 * Revision 6.273 2001/01/02 22:29:45 dondosha
712 * Assign virtual oidlist to the first non-whole database rdfp in the linked list
713 *
714 * Revision 6.272 2000/12/28 18:22:29 madden
715 * Fixes to BlastNtWordFinder_mh
716 *
717 * Revision 6.271 2000/12/27 16:51:17 dondosha
718 * When splitting database sequence for megablast, keep only significant HSPs from partial hitlists
719 *
720 * Revision 6.270 2000/12/26 17:50:46 dondosha
721 * Fixed bug in BLASTMergeHsps function for merging HSPs after splitting of a database sequence
722 *
723 * Revision 6.269 2000/12/21 17:37:24 dondosha
724 * Fixed bug with minus-strand blastn search
725 *
726 * Revision 6.268 2000/12/20 15:44:01 madden
727 * Better error message if query is shorter than wordsize
728 *
729 * Revision 6.267 2000/12/18 20:38:55 shavirin
730 * Removed include <time.h> before <ncbi.h>.
731 *
732 * Revision 6.266 2000/12/07 17:45:13 dondosha
733 * Use actual subject sequence length in GreedyAlignMemAlloc for 2 Sequences engine
734 *
735 * Revision 6.265 2000/12/04 18:51:23 madden
736 * Fix memory leaks
737 *
738 * Revision 6.264 2000/11/29 16:58:16 dondosha
739 * Small fix to previous revision
740 *
741 * Revision 6.263 2000/11/29 16:29:31 dondosha
742 * For megablast, allow splitting of long subject sequences and merging hitlists
743 *
744 * Revision 6.262 2000/11/17 17:51:59 dondosha
745 * Removed is_megablast argument from BLASTSetUpSearchWithReadDbInternalEx since it is part of options
746 *
747 * Revision 6.261 2000/11/13 20:38:48 madden
748 * Fix for zero length db sequence in ungapped blast
749 *
750 * Revision 6.260 2000/11/09 14:59:38 dondosha
751 * Longest intron length in options set in nucleotide coordinates
752 *
753 * Revision 6.259 2000/11/08 22:21:32 dondosha
754 * Enabled new tblastn by adding a longest_intron option
755 *
756 * Revision 6.258 2000/11/07 16:30:24 madden
757 * Introduce intermediate score (before linking of HSPs) for blastx and tblastn
758 *
759 * Revision 6.257 2000/11/03 20:13:55 dondosha
760 * Do not call readdb_get_sequence_ex from new_link_hsps for two sequences BLAST
761 *
762 * Revision 6.256 2000/11/01 16:25:58 madden
763 * Changes from Futamura for psitblastn
764 *
765 * Revision 6.255 2000/11/01 00:05:18 vakatov
766 * Added missing "LIBCALL"
767 *
768 * Revision 6.254 2000/10/31 16:30:56 shavirin
769 * Function BLASTSetUpSearchInternalByLoc became external.
770 *
771 * Revision 6.253 2000/10/30 16:51:04 shavirin
772 * Changed function with creation of temporary bioseqs for SEG filtering.
773 *
774 * Revision 6.252 2000/10/26 18:45:58 dondosha
775 * Check if gi list file is provided from the db alias
776 *
777 * Revision 6.251 2000/10/24 19:05:45 dondosha
778 * Moved function UniqueLocalId to sequtil.c
779 *
780 * Revision 6.250 2000/10/06 21:36:02 dondosha
781 * Do not multiply window size by 3 for subject in new_link_hsps
782 *
783 * Revision 6.249 2000/10/06 16:36:57 shavirin
784 * Correctly closed file with gi list in the function GetGisFromFile().
785 *
786 * Revision 6.248 2000/10/05 19:54:50 dondosha
787 * For Mega BLAST, call MegaBlastSaveCurrentHitlist instead of BlastSaveCurrentHitlist
788 *
789 * Revision 6.247 2000/09/28 15:05:59 dondosha
790 * Added splice junction search; corrected sum evalue calculation
791 *
792 * Revision 6.246 2000/09/28 14:27:52 madden
793 * Correct use of search space for linked hsps
794 *
795 * Revision 6.245 2000/09/18 16:04:38 madden
796 * No call to BlastFindWords if rpsblast
797 *
798 * Revision 6.244 2000/09/14 14:58:20 dondosha
799 * Further improvements with new tblastn (still not in the executable)
800 *
801 * Revision 6.243 2000/09/12 16:11:31 dondosha
802 * Changed window size, plus some bug fixes for new_link_hsps
803 *
804 * Revision 6.242 2000/09/01 18:25:10 dondosha
805 * Pass start and length to BlastFindWords, not start and end
806 *
807 * Revision 6.241 2000/09/01 13:47:39 shavirin
808 * Fixed error and typecast warnings from Windows NT compilation.
809 *
810 * Revision 6.240 2000/08/31 18:37:22 shavirin
811 * Added check for NULL in BlastMakeCopyQueryDNAP().
812 *
813 * Revision 6.239 2000/08/31 17:06:20 shavirin
814 * Added few OOF related functions to copy and delete query_dnap.
815 *
816 * Revision 6.238 2000/08/31 15:59:12 dondosha
817 * No need to call ReadDBFreeSharedInfo from do_the_blast_run
818 *
819 * Revision 6.237 2000/08/29 19:36:37 madden
820 * Do not lookup gis if gilist_already_calculated is set
821 *
822 * Revision 6.236 2000/08/29 18:09:34 dondosha
823 * Adjust the reverse strand offsets for non-megablast blastn in BlastSaveCurrentHitlist
824 *
825 * Revision 6.235 2000/08/25 22:41:49 dondosha
826 * Do reevaluation of score with ambiguities for megablast
827 *
828 * Revision 6.234 2000/08/23 18:48:44 madden
829 * Use BlastKarlinBlkGappedCalcEx in place of BlastKarlinBlkGappedCalc
830 *
831 * Revision 6.233 2000/08/18 20:12:28 dondosha
832 * Do not use search->query_id in megablast, use only qid_array
833 *
834 * Revision 6.232 2000/08/07 16:59:49 dondosha
835 * Correct construction of path for gi list file
836 *
837 * Revision 6.231 2000/08/03 17:50:37 dondosha
838 * Check HSPs for going beyond ends of query in megablast
839 *
840 * Revision 6.230 2000/08/02 15:26:09 dondosha
841 * For megablast compute search space depending on query when getting evalue
842 *
843 * Revision 6.229 2000/07/25 16:52:49 shavirin
844 * Corrected function BlastCreateQueryDNAP().
845 *
846 * Revision 6.228 2000/07/24 16:12:05 hurwitz
847 * made definition of BLASTSetUpSearchWithReadDbInternalEx match the one in blastpri.h
848 *
849 * Revision 6.227 2000/07/21 21:26:43 dondosha
850 * Added BLASTSetUpSearchWithReadDbInternalEx with Boolean argument is_megablast
851 *
852 * Revision 6.226 2000/07/18 22:32:38 shavirin
853 * Adjusted space allocated for DNA-P query sequence
854 *
855 * Revision 6.225 2000/07/17 14:17:10 shavirin
856 * Added new function BlastCreateQueryDNAP() and OOF_TranslateToDNAP() and
857 * support for Out of frame gap algorithm.
858 *
859 * Revision 6.224 2000/07/12 13:36:29 shavirin
860 * Removed last NULL parameter from MegaBlastSetUpSearchInternalByLoc().
861 *
862 * Revision 6.223 2000/07/11 17:16:19 shavirin
863 * Added new parameter is_ooframe for Out-Of-Frame gapping algorithm.
864 *
865 * Revision 6.222 2000/06/22 22:28:07 dondosha
866 * Only look at HSPs up to hspcnt_max in BlastSaveCurrentHitlist - this allows not to use MemNew when initializing hsp_array
867 *
868 * Revision 6.221 2000/06/22 14:08:20 madden
869 * Fix bug in BlastWordExtend_prelim if word-hit is at end of sequence
870 *
871 * Revision 6.220 2000/06/08 20:34:10 madden
872 * add explode_seqids option to show all ids in a defline
873 *
874 * Revision 6.219 2000/05/25 21:03:56 dondosha
875 * In BlastSaveCurrentHitlist assign hspcnt for result hitlist correctly
876 *
877 * Revision 6.218 2000/05/24 19:48:06 dondosha
878 * Moved initialization of qid_array in megablast to search set-up
879 *
880 * Revision 6.217 2000/05/19 19:36:18 madden
881 * Fix for longer words in BlastNtWordFinder, do not call BlastNTPreliminaryGappedScore
882 *
883 * Revision 6.216 2000/05/17 17:13:36 dondosha
884 * Removed some unused variables
885 *
886 * Revision 6.215 2000/05/16 19:59:24 madden
887 * Do not set ignore_small_gaps to TRUE
888 *
889 * Revision 6.214 2000/05/12 19:42:29 dondosha
890 * Use array instead of linked list of query ids in megablast
891 *
892 * Revision 6.213 2000/05/12 18:53:25 shavirin
893 * Fixed memory leak with OIDList.
894 *
895 * Revision 6.212 2000/05/11 18:02:23 shavirin
896 * Minor change for using gi_list together with oid-databases.
897 *
898 * Revision 6.211 2000/05/09 19:42:49 shavirin
899 * Fixed in BlastGetDbChunk() no-mutex regular database case.
900 *
901 * Revision 6.210 2000/05/03 17:08:26 shavirin
902 * Fixed minor bug in the function BLASTSetUpSearchWithReadDbInternal().
903 *
904 * Revision 6.209 2000/05/01 21:24:54 dondosha
905 * Changed greedy_gapped_align to MegaBlastGreedyAlign
906 *
907 * Revision 6.208 2000/04/28 17:51:49 shavirin
908 * Replaced define RPS_BLAST with checking parameter is_rps_blast.
909 *
910 * Revision 6.207 2000/04/25 19:05:13 dondosha
911 * Before search assign db_chunk_last to first_db_seq
912 *
913 * Revision 6.206 2000/04/24 16:43:51 dondosha
914 * Call BlastReapHitlistByEvalue in MegaBlast if hitlists are saved
915 *
916 * Revision 6.205 2000/04/20 15:12:32 dondosha
917 * Bug fix for minus-strand only search - do not try to concatenate second strand to first
918 *
919 * Revision 6.204 2000/04/11 12:47:08 madden
920 * Proper casting to Int8
921 *
922 * Revision 6.203 2000/04/10 20:01:24 dondosha
923 * Fill both strands mask locations in a one location list for blastn
924 *
925 * Revision 6.202 2000/04/10 17:16:44 madden
926 * Make search_sp Int8 to prevent overflow
927 *
928 * Revision 6.201 2000/04/10 15:24:24 dondosha
929 * Enabled use of MegaBlast for BlastTwoSequences
930 *
931 * Revision 6.200 2000/04/07 20:19:45 dondosha
932 * Do not call BlastReapHitlistByEvalue for megablast
933 *
934 * Revision 6.199 2000/04/07 16:43:25 dondosha
935 * Assign dbseq_num to min of actual db size and gilist size
936 *
937 * Revision 6.198 2000/04/07 13:11:56 shavirin
938 * Checked for queue_callback != NULL.
939 *
940 * Revision 6.197 2000/04/06 13:13:33 shavirin
941 * Changed sequence to post semaphore info for internal queueing.
942 *
943 * Revision 6.196 2000/04/04 20:48:21 dondosha
944 * Fixed a memory leak in saving hitlists for MegaBlast
945 *
946 * Revision 6.195 2000/04/04 16:16:59 dondosha
947 * Fixed some memory leaks in MegaBlast traceback
948 *
949 * Revision 6.194 2000/04/03 21:21:44 dondosha
950 * Assign is_neighboring parameter from option
951 *
952 * Revision 6.193 2000/03/31 21:14:24 dondosha
953 * Changed some names related to MegaBlast
954 *
955 * Revision 6.192 2000/03/31 16:50:51 dondosha
956 * Sort hsps and remove redundant when saving hitlist in MegaBlast
957 *
958 * Revision 6.191 2000/03/30 21:45:04 madden
959 * Add call to BLASTResultHitlistFreeEx
960 *
961 * Revision 6.190 2000/03/29 22:19:43 dondosha
962 * BlastSaveCurrentHitlist adjusts query offsets for blastn; creates seqaligns for MegaBlast
963 *
964 * Revision 6.189 2000/03/27 16:46:22 madden
965 * Moved call to BlastFillQueryOffsets to BLASTSetUpSearchInternalByLoc
966 *
967 * Revision 6.188 2000/03/23 20:51:15 dondosha
968 * Set dbseq_num to gi_list_total if search space is not recalculated and gi_list exists
969 *
970 * Revision 6.187 2000/03/22 18:08:59 dondosha
971 * Free rdfp->shared_info in single threaded case the same way as in multithreaded after the search
972 *
973 * Revision 6.186 2000/03/14 21:01:16 dondosha
974 * Call BlastTickProc even when gi_list is set
975 *
976 * Revision 6.185 2000/03/13 21:11:35 dondosha
977 * Check options parameters use_real_db_size and sort_gi_list when dealing with gi_list
978 *
979 * Revision 6.184 2000/03/03 18:02:05 shavirin
980 * Added support for lower-case character filtering in "blastx", "tblastx"
981 * and translated RPS Blast.
982 *
983 * Revision 6.183 2000/03/03 17:41:09 egorov
984 * fix memory leak with oidlist
985 *
986 * Revision 6.182 2000/03/02 21:24:16 shavirin
987 * Checked for SEQLOC_PACKED_INT in blastMergeFilterLocs()
988 *
989 * Revision 6.181 2000/03/02 18:30:46 dondosha
990 * Minor bug fix in BlastSaveCurrentHsp for blastn
991 *
992 * Revision 6.180 2000/03/02 17:11:01 dondosha
993 * Fixed bug with one strand search option for blastn
994 *
995 * Revision 6.179 2000/03/01 21:40:53 shavirin
996 * Added code to filter lower-case character regions (except blastx and tblastx)
997 *
998 * Revision 6.178 2000/02/29 18:17:24 shavirin
999 * Variable query_dna_mask changed to query_lcase_mask.
1000 *
1001 * Revision 6.177 2000/02/29 18:09:36 dondosha
1002 * Call BlastFillQueryOffsets in BLASTSetUpSearchEx
1003 *
1004 * Revision 6.176 2000/02/23 20:56:51 dondosha
1005 * Returning strand concatenation for blastn with bug fixes
1006 *
1007 * Revision 6.175 2000/02/18 15:30:36 shavirin
1008 * Added parameter query_dna_mask into options and parameters.
1009 *
1010 * Revision 6.174 2000/02/17 21:23:08 shavirin
1011 * Added parameter is_rps_blast.
1012 *
1013 * Revision 6.173 2000/02/17 19:02:08 shavirin
1014 * Removed all references to obsolete theCacheSize variable.
1015 *
1016 * Revision 6.172 2000/02/17 18:29:02 shavirin
1017 * Added function DefineToFrame().
1018 *
1019 * Revision 6.171 2000/02/16 21:47:45 shavirin
1020 * Fixed memory leaks in the function BlastReapHitlistByEvalue ().
1021 *
1022 * Revision 6.170 2000/02/15 21:02:00 shavirin
1023 * Added support to filter DNA sequence in translated RPS Blast.
1024 *
1025 * Revision 6.169 2000/02/15 19:17:29 shavirin
1026 * Added filter_string to Parameters block.
1027 *
1028 * Revision 6.168 2000/02/14 16:15:40 madden
1029 * Revert to 6.166
1030 *
1031 * Revision 6.167 2000/02/11 20:41:46 dondosha
1032 * Search on two query strands concatenated in blastn
1033 *
1034 * Revision 6.166 2000/02/09 19:40:00 madden
1035 * Fix purify problems in link_hsps
1036 *
1037 * Revision 6.165 2000/02/09 19:35:36 madden
1038 * Changed GetGisFromFile to also read binary gilists
1039 *
1040 * Revision 6.164 2000/02/03 21:34:07 dondosha
1041 * Fixed bug in setting extra_bytes_needed
1042 *
1043 * Revision 6.163 2000/02/02 20:01:57 madden
1044 * Added LIBCALLBACK to a callback
1045 *
1046 * Revision 6.162 2000/02/02 18:21:31 madden
1047 * Joerg optimizations for link_hsps
1048 *
1049 * Revision 6.161 2000/02/02 16:56:23 dondosha
1050 * Do not call BlastSaveCurrentHitlist if handle_results callback set
1051 *
1052 * Revision 6.160 2000/02/02 15:05:42 dondosha
1053 * Removed call to ReapHitlistByContext, erroneously included in previous version
1054 *
1055 * Revision 6.159 2000/02/01 22:37:05 dondosha
1056 * Call the new routine BlastReapHitlistByContext only when greedy alignment option is set
1057 *
1058 * Revision 6.158 2000/02/01 21:47:04 dondosha
1059 * Added greedy basic gapped alignment option
1060 *
1061 * Revision 6.157 2000/01/14 15:17:13 madden
1062 * Set no_check_score in pbp
1063 *
1064 * Revision 6.156 2000/01/13 18:10:41 madden
1065 * Fix problem with incorrect stat values for blastn and missing hits
1066 *
1067 * Revision 6.155 2000/01/13 14:27:04 madden
1068 * Fixed other problem in BlastWordFinder_contig()
1069 *
1070 * Revision 6.154 2000/01/12 18:52:23 shavirin
1071 * Fixed lookup_pos in BlastWordFinder_contig().
1072 *
1073 * Revision 6.153 2000/01/11 18:36:25 shavirin
1074 * Added functions that handle the dynamic lookup table.
1075 *
1076 * Revision 6.152 2000/01/11 15:32:46 dondosha
1077 * Fixed memory leaks in opening shared header and sequence file memory maps
1078 *
1079 * Revision 6.151 2000/01/04 22:52:25 madden
1080 * Restored code for using real db size
1081 *
1082 * Revision 6.150 1999/12/31 14:23:18 egorov
1083 * Add support for using mixture of real and mask database with gi-list files:
1084 * 1. Change logic of creating rdfp list.
1085 * 2. BlastGetDbChunk gets real databases first, then masks.
1086 * 3. Proper calculation of database sizes using alias files.
1087 * 4. Change to CommonIndex to support using of mask databases.
1088 * 5. Use correct gis in formated output (BlastGetAllowedGis()).
1089 * 6. Other small changes
1090 *
1091 * Revision 6.149 1999/12/29 19:03:59 shavirin
1092 * Relative pointers in BlastWordFinder_mh_contig() updated to 8 byte pointers
1093 *
1094 * Revision 6.148 1999/12/29 18:57:03 shavirin
1095 * Added possibility to use relative pointers in BlastWordFinder_mh_contig().
1096 *
1097 * Revision 6.147 1999/12/22 21:55:38 dondosha
1098 * Close header and sequence files when search is done
1099 *
1100 * Revision 6.146 1999/12/21 20:05:48 egorov
1101 * Change logic of generating mask file when we have a gi-list file,
1102 * real database and mask database. In fact, this is a big bug fix.
1103 *
1104 * Revision 6.145 1999/12/16 19:17:34 egorov
1105 * Code cleanup
1106 *
1107 * Revision 6.144 1999/12/02 14:39:35 egorov
1108 * When both mask and gi_list are specified, do not overwrite calculated
1109 * number of sequences and database length with values from alias file.
1110 *
1111 * Revision 6.143 1999/11/30 19:00:49 madden
1112 * Added Nlm_SwapUint4 calls for the ordinal ID list
1113 *
1114 * Revision 6.142 1999/11/26 22:26:13 madden
1115 * Added BlastNT functions for nucl. extensions
1116 *
1117 * Revision 6.141 1999/11/24 21:43:35 madden
1118 * Added Nlm_SwapUint4 call to make database masks work with both big and small endian systems
1119 *
1120 * Revision 6.140 1999/11/12 20:57:39 shavirin
1121 * Added parameter use_best_align into BLAST_ParameterBlkPtr
1122 *
1123 * Revision 6.139 1999/10/27 21:33:00 madden
1124 * Use housekeeping threads only for larger sequences
1125 *
1126 * Revision 6.138 1999/10/26 20:45:18 madden
1127 * Add use_real_db_size option
1128 *
1129 * Revision 6.137 1999/10/19 17:41:20 madden
1130 * Ensure that ThreadJoin is called on every thread created
1131 *
1132 * Revision 6.136 1999/10/14 17:57:44 madden
1133 * Fix for database size set by user, remove unused variables
1134 *
1135 * Revision 6.135 1999/10/12 19:34:08 madden
1136 * Call MutexDestroy on callback_mutex
1137 *
1138 * Revision 6.134 1999/10/08 17:39:57 egorov
1139 * Store input gi list to pick up correct definition for redundant sequences
1140 *
1141 * Revision 6.133 1999/10/05 18:16:06 shavirin
1142 * Functions tick_proc and get_db_chunk were renamed and became public.
1143 *
1144 * Revision 6.132 1999/10/05 17:42:53 shavirin
1145 * Removed global variables from blast.c
1146 *
1147 * Revision 6.131 1999/10/01 21:07:12 shavirin
1148 * Changed definition and adjusted function get_db_list().
1149 *
1150 * Revision 6.130 1999/09/28 20:14:32 madden
1151 * Joerg changes to minimize cache misses
1152 *
1153 * Revision 6.129 1999/09/22 21:54:08 egorov
1154 * remove debug info
1155 *
1156 * Revision 6.128 1999/09/22 21:03:55 egorov
1157 * Add mask DB stuff
1158 *
1159 * Revision 6.127 1999/09/16 16:54:23 madden
1160 * Changes to BlastNtWordFinder for long words
1161 *
1162 * Revision 6.126 1999/09/16 14:16:54 madden
1163 * Changed call to lookup_find_init
1164 *
1165 * Revision 6.125 1999/08/27 18:07:32 shavirin
1166 * Passed parameter decline_align from top to the engine.
1167 *
1168 * Revision 6.124 1999/08/26 14:55:15 madden
1169 * Fixed Int8 problem
1170 *
1171 * Revision 6.123 1999/08/25 13:11:16 madden
1172 * Roll back to rev 6.121
1173 *
1174 * Revision 6.121 1999/08/20 19:47:24 madden
1175 * Changed call to BlastSearchBlkNew(Extra), removed use of version array
1176 *
1177 * Revision 6.120 1999/08/06 18:46:13 madden
1178 * Fixed spelling of incompatible
1179 *
1180 * Revision 6.119 1999/06/07 18:28:20 beloslyu
1181 * NetBSD port
1182 *
1183 * Revision 6.118 1999/05/27 17:33:04 madden
1184 * Fixed Int2 (should have been Int4) problem
1185 *
1186 * Revision 6.117 1999/04/28 13:30:03 madden
1187 * Use BlastConstructErrorMessage for error messages
1188 *
1189 * Revision 6.116 1999/04/23 16:45:53 madden
1190 * call BQ_IncSemaphore as callback
1191 *
1192 * Revision 6.115 1999/04/22 16:45:29 shavirin
1193 * Added load-balancing function.
1194 *
1195 * Revision 6.114 1999/04/13 16:39:14 madden
1196 * Fixed problem if first context not plus strand
1197 *
1198 * Revision 6.113 1999/04/07 20:43:33 egorov
1199 * Fix a bug when ordinal_id == 0 was not allowed
1200 *
1201 * Revision 6.112 1999/04/01 21:42:45 madden
1202 * Fix memory leaks when gi list is used
1203 *
1204 * Revision 6.111 1999/03/23 21:38:19 madden
1205 * Add Join to BlastStopAwakeThread
1206 *
1207 * Revision 6.110 1999/03/19 17:03:29 egorov
1208 * Initialize global variable
1209 *
1210 * Revision 6.109 1999/03/16 15:52:25 vakatov
1211 * Got rid of extra comments-within-comments in the CVS Log section
1212 *
1213 * Revision 6.108 1999/03/16 02:49:31 beloslyu
1214 * typo fixed
1215 *
1216 * Revision 6.107 1999/03/15 22:06:01 madden
1217 * Changed cpu limit message
1218 *
1219 * Revision 6.106 1999/03/12 15:03:43 egorov
1220 * Add proper Int4-long type casting
1221 *
1222 * Revision 6.105 1999/03/04 14:18:08 egorov
1223 * Do correct filter masking when query is seqloc
1224 * Only the BlastMaskTheResidues() function is changed.
1225 *
1226 * Revision 6.104 1999/02/26 22:23:06 madden
1227 * Fixed bug when only one HSP allowed per area
1228 *
1229 * Revision 6.103 1999/02/25 17:40:48 madden
1230 * Check that proper sequence type is used in setup function
1231 *
1232 * Revision 6.102 1999/02/17 13:23:00 madden
1233 * Added hsp_num_max
1234 *
1235 * Revision 6.101 1999/02/11 13:52:59 madden
1236 * fixed memory leak
1237 *
1238 * Revision 6.100 1999/01/28 17:19:50 madden
1239 * Call BlastSeqLocFilterEx on reverse strand if plus strand NULL
1240 *
1241 * Revision 6.99 1999/01/28 16:04:25 madden
1242 * HspArrayPurge change, HeapSort of HSPs, efficiency in blastn wordfinder
1243 *
1244 * Revision 6.98 1999/01/26 17:55:50 madden
1245 * start set to last_db_seq
1246 *
1247 * Revision 6.97 1999/01/19 13:32:33 madden
1248 * Fix for final db sequence to search
1249 *
1250 * Revision 6.96 1998/12/31 18:17:02 madden
1251 * Added strand option
1252 *
1253 * Revision 6.95 1998/12/31 15:36:05 victorov
1254 * filtering internals is now based on SeqLoc instead of Bioseq
1255 *
1256 * Revision 6.94 1998/12/29 17:44:43 madden
1257 * Add BlastGetNonSumStatsEvalue, optimizations for NtWordFinder
1258 *
1259 * Revision 6.93 1998/12/18 16:19:57 madden
1260 * Make BLASTSetUpSearchWithReadDbInternal public, add BlastSearchBlkNewExtra
1261 *
1262 * Revision 6.92 1998/12/17 22:29:47 victorov
1263 * the way gifile is found has changed: now we look first in the
1264 * current directory then $BLASTDB and then in ncbirc
1265 *
1266 * Revision 6.91 1998/12/15 14:11:27 madden
1267 * Change to permit an arbitrary number of HSPs
1268 *
1269 * Revision 6.90 1998/11/27 15:44:58 madden
1270 * Ensure that gap_x_dropoff_final is at least as large as gap_x_dropoff.
1271 *
1272 * Revision 6.89 1998/11/23 13:36:07 madden
1273 * Check for non-NULL tick_callback before acquiring mutex
1274 *
1275 * Revision 6.88 1998/11/19 14:03:24 madden
1276 * Added comments, minor efficiency
1277 *
1278 * Revision 6.87 1998/10/13 20:37:51 madden
1279 * Use IS_residue after call to SeqPortGetResidue
1280 *
1281 * Revision 6.86 1998/09/24 15:26:34 egorov
1282 * Fix lint complaints
1283 *
1284 * Revision 6.85 1998/09/22 16:28:03 madden
1285 * Added call to lookup_position_aux_destruct
1286 *
1287 * Revision 6.84 1998/09/14 15:11:12 egorov
1288 * Add support for Int8 length databases; remove unused variables
1289 *
1290 * Revision 6.83 1998/09/04 14:45:39 madden
1291 * Moved code from blast.c to blastool.c
1292 *
1293 * Revision 6.82 1998/08/29 20:06:46 madden
1294 * Do not find words for pattern search
1295 *
1296 * Revision 6.81 1998/08/26 19:20:26 madden
1297 * Added SignalIgnore
1298 *
1299 * Revision 6.80 1998/08/13 20:00:20 egorov
1300 * Add check if gilist file exists on server
1301 *
1302 * Revision 6.79 1998/08/11 13:27:22 madden
1303 * Fix to small function for culling
1304 *
1305 * Revision 6.78 1998/08/05 13:08:16 madden
1306 * Removed obsolete global_rdfp
1307 *
1308 * Revision 6.77 1998/07/30 19:00:24 madden
1309 * Change to allow search of subset of database
1310 *
1311 * Revision 6.76 1998/07/28 21:17:45 madden
1312 * added do_not_reevaluate and mask_at_hash
1313 *
1314 * Revision 6.75 1998/07/25 14:26:39 madden
1315 * Added comments
1316 *
1317 * Revision 6.74 1998/07/22 20:31:25 madden
1318 * Added comments
1319 *
1320 * Revision 6.73 1998/07/22 12:16:23 madden
1321 * Added handle_results
1322 *
1323 * Revision 6.72 1998/07/21 20:58:01 madden
1324 * Changes to allow masking at hash only
1325 *
1326 * Revision 6.71 1998/07/17 15:39:53 madden
1327 * Changes for Effective search space.
1328 *
1329 * Revision 6.70 1998/07/14 20:14:37 egorov
1330 * Allow to specify gilist and gifile from client side
1331 *
1332 * Revision 6.69 1998/07/09 14:39:04 madden
1333 * Fix memory leak
1334 *
1335 * Revision 6.68 1998/07/02 21:00:36 egorov
1336 * Remove memory leak in threaded version
1337 *
1338 * Revision 6.67 1998/06/25 13:14:48 madden
1339 * check for NULL pointer in BlastPossibleDeleteWholeHeap
1340 *
1341 * Revision 6.66 1998/06/12 16:07:40 madden
1342 * Fixed typo
1343 *
1344 * Revision 6.65 1998/06/12 15:52:52 madden
1345 * Fixed warnings
1346 *
1347 * Revision 6.64 1998/06/02 21:21:18 madden
1348 * Changes for DNA matrices
1349 *
1350 * Revision 6.63 1998/06/02 13:10:14 madden
1351 * Fixed increment problem in for loop
1352 *
1353 * Revision 6.62 1998/05/28 19:58:48 madden
1354 * Zhengs new culling code
1355 *
1356 * Revision 6.61 1998/05/22 20:19:51 madden
1357 * Changes to fix multi-db search bug
1358 *
1359 * Revision 6.60 1998/05/17 16:28:39 madden
1360 * Allow changes to filter options and cc filtering.
1361 *
1362 * Revision 6.59 1998/05/05 14:05:32 madden
1363 * Added functions BlastStartAwakeThread and BlastStopAwakeThread
1364 *
1365 * Revision 6.58 1998/04/24 21:51:12 madden
1366 * Check return value on BlastScoreBlkFill
1367 *
1368 * Revision 6.57 1998/04/24 19:26:47 madden
1369 * Allocate ideal Karlin-Blk
1370 *
1371 * Revision 6.56 1998/04/15 20:23:47 madden
1372 * offset arg removed from BlastMaskTheResidues
1373 *
1374 * Revision 6.55 1998/04/01 22:46:55 madden
1375 * Set query_invalid flag when there is no valid sequence
1376 *
1377 * Revision 6.54 1998/03/27 01:39:08 madden
1378 * Check for non-zero subject length in link_hsps
1379 *
1380 * Revision 6.53 1998/03/25 22:26:46 madden
1381 * Use NlmThreadCreateEx
1382 *
1383 * Revision 6.52 1998/03/24 15:38:20 madden
1384 * Use BlastDoubleInt4Ptr to keep track of gis and ordinal_ids
1385 *
1386 * Revision 6.51 1998/03/19 22:16:18 madden
1387 * Changes to allow blasting by gi list
1388 *
1389 * Revision 6.50 1998/03/18 14:14:05 madden
1390 * Support random access by gi list
1391 *
1392 * Revision 6.49 1998/03/14 18:29:16 madden
1393 * Added BlastSeqIdListPtr
1394 *
1395 * Revision 6.48 1998/03/09 22:14:39 madden
1396 * Set seqid_list to NULL for child threads
1397 *
1398 * Revision 6.47 1998/02/27 14:34:26 madden
1399 * Added missing return value
1400 *
1401 * Revision 6.46 1998/02/26 22:35:00 madden
1402 * Added return value to link_hsp
1403 *
1404 * Revision 6.45 1998/02/26 19:08:07 madden
1405 * Removed BlastNtFindWords BlastPopulateAllWordArrays BlastFindWords and BlastNewFindWords
1406 *
1407 * Revision 6.44 1998/02/26 16:56:02 madden
1408 * Fix for flyblast type searches
1409 *
1410 * Revision 6.43 1998/02/24 22:46:00 madden
1411 * Added option to shutdown culling
1412 *
1413 * Revision 6.42 1998/02/19 22:57:20 madden
1414 * Correctly set multiple_hits flag in BlastSetUpSearchEx
1415 *
1416 * Revision 6.41 1998/02/02 21:42:17 madden
1417 * link_hsps returns first BLAST_HSPPtr in list
1418 *
1419 * Revision 6.40 1998/01/31 21:33:49 madden
1420 * Fix to ensure hits are ranked properly
1421 *
1422 * Revision 6.39 1998/01/27 20:33:19 madden
1423 * Adjustments for query and db lengths
1424 *
1425 * Revision 6.38 1998/01/23 22:01:49 madden
1426 * Effective query length fixes for short sequences
1427 *
1428 * Revision 6.37 1998/01/15 19:30:31 madden
1429 * Protection against crashes for short sequences
1430 *
1431 * Revision 6.36 1998/01/09 22:30:06 madden
1432 * Fix for range-dependent BLAST with short sequences
1433 *
1434 * Revision 6.35 1998/01/07 23:04:25 madden
1435 * Added mutex for callbacks
1436 *
1437 * Revision 6.34 1998/01/06 18:25:24 madden
1438 * Save query_slp
1439 *
1440 * Revision 6.33 1998/01/05 22:37:34 madden
1441 * Check that options->multiple_hits_only is set before using multiple_hits
1442 *
1443 * Revision 6.32 1998/01/05 21:14:51 madden
1444 * Added protection against NULL LookupTablePtr and BLAST_WordFinderPtr
1445 *
1446 * Revision 6.31 1998/01/05 16:46:46 madden
1447 * One or both strands can be searched, as opposed to only both, changes to number of contexts
1448 *
1449 * Revision 6.30 1997/12/31 19:46:40 madden
1450 * Optimization of database scanning loop
1451 *
1452 * Revision 6.29 1997/12/31 17:50:42 madden
1453 * Added function BlastNtWordFinder_mh
1454 *
1455 * Revision 6.28 1997/12/29 16:15:01 madden
1456 * Optimizations for BlastNtWordFinder
1457 *
1458 * Revision 6.27 1997/12/24 19:42:57 madden
1459 * Fix for cell dependent blast
1460 *
1461 * Revision 6.26 1997/12/23 19:13:36 madden
1462 * Removed flags parameter from NlmThreadCreate
1463 *
1464 * Revision 6.25 1997/12/23 18:11:51 madden
1465 * Changes for range-dependent blast
1466 *
1467 * Revision 6.24 1997/12/17 19:25:36 madden
1468 * replace THR_BOUND with THREAD_BOUND
1469 *
1470 * Revision 6.23 1997/12/11 22:19:49 madden
1471 * Removed unused variables and function
1472 *
1473 * Revision 6.22 1997/12/10 22:40:28 madden
1474 * Floats used in call to blast_set_parameters, use of defines rather than strings
1475 *
1476 * Revision 6.21 1997/12/08 21:56:25 madden
1477 * Check for queries without valid sequences
1478 *
1479 * Revision 6.20 1997/12/04 21:49:05 madden
1480 * Check for NULL returned by BioseqLockById
1481 *
1482 * Revision 6.19 1997/11/07 21:38:40 madden
1483 * Check for virtual Bioseqs
1484 *
1485 * Revision 6.18 1997/10/30 15:40:55 madden
1486 * Casts and fixes for DEC alpha
1487 *
1488 * Revision 6.17 1997/10/24 19:09:14 madden
1489 * Removed BlastSetReadDB and BlastGetReadDB_ID, changed to ReadDBGetDb and ReadDBGetDbId
1490 *
1491 * Revision 6.16 1997/10/21 19:49:53 madden
1492 * Fix for no valid query sequence and hitlist_max of 1
1493 *
1494 * Revision 6.15 1997/10/06 17:57:49 madden
1495 * DB chunk size now done properly
1496 *
1497 * Revision 6.14 1997/09/29 17:19:30 madden
1498 * Checks for two threads using the same resource
1499 *
1500 * Revision 6.13 1997/09/25 13:44:56 madden
1501 * tblastn fix for multiple db searches
1502 *
1503 * Revision 6.12 1997/09/24 22:36:29 madden
1504 * Fixes for MT multidb searches
1505 *
1506 * Revision 6.11 1997/09/22 18:24:25 madden
1507 * Added ifdef for OS_UNIX_LINUX
1508 *
1509 * Revision 6.10 1997/09/22 17:36:18 madden
1510 * MACROS for position-specific matrices from Andy Neuwald
1511 *
1512 * Revision 6.9 1997/09/16 18:47:44 madden
1513 * ifdef for OS_UNIX_SUN
1514 *
1515 * Revision 6.8 1997/09/16 16:31:22 madden
1516 * More changes for multiple db runs
1517 *
1518 * Revision 6.7 1997/09/15 22:07:19 madden
1519 * Replacing ifdef RLIMIT_CPU with ifdef OS_UNIX
1520 *
1521 * Revision 6.6 1997/09/12 19:56:53 madden
1522 * Fix for multi-threaded runs
1523 *
1524 * Revision 6.5 1997/09/11 18:49:20 madden
1525 * Changes to enable searches against multiple databases.
1526 *
1527 * Revision 6.4 1997/09/10 23:10:53 kans
1528 * added ifdef RLIMIT_CPU for signal and headers
1529 *
1530 * Revision 6.3 1997/09/10 21:27:52 madden
1531 * Changes to set CPU limits
1532 *
1533 * Revision 6.2 1997/09/03 19:06:02 madden
1534 * Bug fix for effective HSP longer than query
1535 *
1536 * Revision 6.1 1997/08/27 14:46:43 madden
1537 * Changes to enable multiple DB searches
1538 *
1539 * Revision 6.0 1997/08/25 18:52:19 madden
1540 * Revision changed to 6.0
1541 *
1542 * Revision 1.227 1997/08/19 18:19:16 madden
1543 * Cast arg of log to Nlm_FloatHi
1544 *
1545 * Revision 1.226 1997/08/12 20:50:28 madden
1546 * Fixed case where two HSPs start at same query offset
1547 *
1548 * Revision 1.225 1997/07/29 17:07:01 madden
1549 * Fix for possible collision of two star threads
1550 *
1551 * Revision 1.224 1997/07/25 15:39:27 madden
1552 * Set correct query ID for filtering
1553 *
1554 * Revision 1.223 1997/07/24 21:08:31 madden
1555 * Take frame into account in sorting of hits for linking
1556 *
1557 * Revision 1.222 1997/07/22 17:17:23 madden
1558 * Added index callback
1559 *
1560 * Revision 1.221 1997/07/17 20:27:51 madden
1561 * Set choice to indicate frame when masking seqLoc is saved
1562 *
1563 * Revision 1.220 1997/07/16 20:35:11 madden
1564 * Call to BlastConvertProteinSeqLoc
1565 *
1566 * Revision 1.219 1997/07/16 18:51:55 madden
1567 * call to BioseqSeg, added static function BlastMakeTempProteinBioseq
1568 *
1569 * Revision 1.218 1997/07/15 20:37:05 madden
1570 * Calls to SeqLocSeg and BioseqSeg
1571 *
1572 * Revision 1.217 1997/07/14 20:11:03 madden
1573 * Removed unused variables
1574 *
1575 * Revision 1.216 1997/07/14 15:30:46 madden
1576 * Changed call to BlastKarlinBlkGappedCalc
1577 *
1578 * Revision 1.215 1997/07/11 19:28:23 madden
1579 * Added function BLASTSetUpSearchByLocWithReadDb
1580 *
1581 * Revision 1.214 1997/07/01 17:50:52 madden
1582 * used gapped Karlin-Altschul parameters when needed in LinkHsp
1583 *
1584 * Revision 1.213 1997/06/27 22:18:31 madden
1585 * MT fix for more threads than db seqs.
1586 *
1587 * Revision 1.212 1997/06/24 13:51:20 madden
1588 * Fixed SeqLoc leak
1589 *
1590 * Revision 1.211 1997/05/27 20:19:17 madden
1591 * Use of SeqLocDust rather than BioseqDust
1592 *
1593 * Revision 1.210 1997/05/22 21:24:46 madden
1594 * Added support for final gapX dropoff value
1595 *
1596 * Revision 1.209 1997/05/20 17:49:55 madden
1597 * Added functions BLASTSetUpSearchByLoc and BLASTSetUpSearchInternalByLoc
1598 *
1599 * Revision 1.208 1997/05/07 20:59:13 madden
1600 * Call to SeqId2OrdinalId replaces call to readdb_gi2seq
1601 *
1602 * Revision 1.207 1997/05/07 13:45:08 madden
1603 * Set mutex lock for ambiguity reevaluation, added use_large_gaps flag
1604 *
1605 * Revision 1.206 1997/05/01 21:08:26 madden
1606 * use ordinal index to rank results when they are statist. equivalent
1607 *
1608 * Revision 1.205 1997/05/01 15:53:07 madden
1609 * Addition of extra KarlinBlk's for psi-blast
1610 *
1611 * Revision 1.204 1997/04/25 13:57:43 madden
1612 * Fixed floating point exception by checking for zero query length value.
1613 *
1614 * Revision 1.203 1997/04/23 21:56:07 madden
1615 * Changes in BlastGetGappedAlignmentTraceback for in-frame gapping tblastn.
1616 *
1617 * Revision 1.202 1997/04/22 14:00:14 madden
1618 * Removed unused variables.
1619 *
1620 * Revision 1.201 1997/04/22 13:04:19 madden
1621 * Changes for in-frame blastx gapping.
1622 *
1623 * Revision 1.200 1997/04/17 22:07:48 madden
1624 * Changes to allow in-frame gapped tblastn.
1625 *
1626 * Revision 1.199 1997/04/09 20:01:53 madden
1627 * Added global_seqid's to allow only certain sequences in a db to be searched.
1628 *
1629 * Revision 1.198 1997/04/07 18:17:09 madden
1630 * Changed length_adjustment calculation.
1631 *
1632 * Revision 1.197 1997/04/04 15:30:37 madden
1633 * Removed extra fprint statement.
1634 *
1635 * Revision 1.196 1997/04/03 19:48:13 madden
1636 * Changes to use effective database length instead of the length of each
1637 * sequence in statistical calculations.
1638 *
1639 * Revision 1.195 1997/03/27 22:30:51 madden
1640 * Used gapped Karlin-Altschul parameters to calculate trigger for gapping.
1641 *
1642 * Revision 1.194 1997/03/20 22:09:52 madden
1643 * Used SeqIdFindBest to find GI in query.
1644 *
1645 * Revision 1.193 1997/03/20 19:57:40 madden
1646 * Changes to support segmented Bioseq queries.
1647 *
1648 * Revision 1.192 1997/03/14 22:06:11 madden
1649 * fixed MT bug in BlastReevaluateWithAmbiguities.
1650 *
1651 * Revision 1.191 1997/03/08 16:52:16 madden
1652 * Check in Reevaluate function to see if sequence is worth checking,
1653 * Added discontinuous option to ParameterBlk.
1654 *
1655 * Revision 1.190 1997/03/07 21:58:36 madden
1656 * Added Boolean gapped argument to BLASTOptionNew.
1657 *
1658 * Revision 1.189 1997/03/07 21:11:22 madden
1659 * Added in check for blastn on gapped calculations.
1660 *
1661 * Revision 1.188 1997/03/05 14:29:46 madden
1662 * Moved BlastSaveCurrentHsp to blastutl.c.
1663 *
1664 * Revision 1.187 1997/03/04 21:34:59 madden
1665 * Added in HspArrayPurge.
1666 *
1667 * Revision 1.186 1997/03/04 20:08:19 madden
1668 * Moved gapped alignment code from blast.c to blastutl.c
1669 *
1670 * Revision 1.185 1997/03/03 22:39:45 madden
1671 * Moved code from blast.c to blastutl.c.
1672 *
1673 * Revision 1.184 1997/03/03 21:47:22 madden
1674 * Moved functions from blast.c to blastutl.c for 16-bit windows.
1675 *
1676 * Revision 1.183 1997/03/03 20:58:09 madden
1677 * Fixed call to BlastGetGappedAlignmentTraceback; purged hitlist
1678 * for very short database sequences.
1679 *
1680 * Revision 1.182 1997/03/01 18:25:33 madden
1681 * reverse flag added to BlastGetGappedAlignmentTraceback functions.
1682 *
1683 * Revision 1.181 1997/02/24 16:40:38 madden
1684 * Change to GapXEditBlockToSeqAlign to use first SeqIdPtr, duplicate.
1685 *
1686 * Revision 1.180 1997/02/24 15:09:38 madden
1687 * Fixed bug where NULL pointer was dereferenced.
1688 *
1689 * Revision 1.179 1997/02/24 13:10:27 madden
1690 * Added function BlastGappedScoreInternal.
1691 *
1692 * Revision 1.178 1997/02/23 16:44:47 madden
1693 * GapAlignBlk became GapAlignBlkPtr and GapAlignBlkNew called.
1694 *
1695 * Revision 1.177 1997/02/20 21:50:24 madden
1696 * Added frame and translation information to GapAlignBlk, assigned it.
1697 *
1698 * Revision 1.176 1997/02/20 18:38:34 madden
1699 * Allowed theoretical database length to be set.
1700 *
1701 * Revision 1.175 1997/02/19 22:29:32 madden
1702 * Changes to handle multiple contexts in BlastGetGappedScore.
1703 *
1704 * Revision 1.174 1997/02/19 14:17:03 madden
1705 * GappedScore routines now work on all contexts.
1706 *
1707 * Revision 1.173 1997/02/17 17:39:54 madden
1708 * Changes to RealBlastGetGappedAlignmentTraceback for gapped blastn.
1709 *
1710 * Revision 1.172 1997/02/13 21:04:15 madden
1711 * fixed UMR.
1712 *
1713 * Revision 1.171 1997/02/12 22:19:08 madden
1714 * Added functions BlastNewWordExtend, BlastNewWordExtend_prelim, and
1715 * BlastNewFindWords for use in position based blast.
1716 *
1717 * Revision 1.170 1997/02/11 19:29:34 madden
1718 * Addition of BlastGetGappedScoreWithReaddb, removed dependence of
1719 * BlastGetGappedScore on readdb.
1720 *
1721 * Revision 1.169 1997/02/10 20:27:01 madden
1722 * Changed some CharPtr's into Uint1Ptr's.
1723 *
1724 * Revision 1.168 1997/02/10 20:14:23 madden
1725 * replaced doubles by Nlm_FloatHi's.
1726 *
1727 * Revision 1.167 1997/02/10 20:02:58 madden
1728 * Changed BlastSearchBlkNew to allow a set of words to be passed in.
1729 *
1730 * Revision 1.166 1997/02/10 15:24:59 madden
1731 * Set posMatrix element in gap_align structure.
1732 *
1733 * Revision 1.165 1997/02/07 22:43:03 madden
1734 * Moved BLAST_WordFinderNew and Destruct from blast.c to blastutl.c, made
1735 * non-static.
1736 *
1737 * Revision 1.164 1997/02/07 22:32:40 madden
1738 * Moved BlastGetSubjectId to blastutl.c, changed calling convention of
1739 * BlastGetSubjectId.
1740 *
1741 * Revision 1.163 1997/02/06 15:36:14 madden
1742 * Reuse 1st threshold if necessary.
1743 *
1744 * Revision 1.162 1997/02/06 14:27:15 madden
1745 * Addition of BlastAllWord structure.
1746 *
1747 * Revision 1.161 1997/02/05 19:54:59 madden
1748 * Changes for blastn gapped alignments.
1749 *
1750 * Revision 1.160 1997/02/04 22:12:59 madden
1751 * Added function RealBlastGetGappedAlignmentTraceback.
1752 *
1753 * Revision 1.159 1997/02/04 20:11:42 madden
1754 * Moved functions to blastutl.c
1755 *
1756 * Revision 1.158 1997/02/04 16:22:32 madden
1757 * Changes to enable gapped alignments on the reverse strand.
1758 *
1759 * Revision 1.157 1997/02/03 19:24:01 madden
1760 * Added function CheckGappedAlignmentsForOverlap.
1761 *
1762 * Revision 1.156 1997/02/03 17:19:03 madden
1763 * Increased number of bits for second pass if context factor > 1.
1764 *
1765 * Revision 1.155 1997/02/03 13:02:12 madden
1766 * Corrected SeqAlign offsets for minus strands.
1767 *
1768 * Revision 1.154 1997/01/31 22:42:51 madden
1769 * changed default thresholds and added strands to construction of SeqAligns.
1770 *
1771 * Revision 1.153 1997/01/31 22:13:02 madden
1772 * Adjusted bit score by logK.
1773 *
1774 * Revision 1.152 1997/01/31 14:45:27 madden
1775 * Added check for threshold value to ValidateOptions.
1776 *
1777 * Revision 1.151 1997/01/30 19:12:19 madden
1778 * Fixed memory leak.
1779 *
1780 * Revision 1.150 1997/01/28 22:38:56 madden
1781 * Added function BLASTOptionValidate.
1782 *
1783 * Revision 1.149 1997/01/28 21:50:05 madden
1784 * Adjustments to CopyResultHspToHSP.
1785 *
1786 * Revision 1.148 1997/01/24 16:51:44 madden
1787 * Fixed memory leak.
1788 *
1789 * Revision 1.147 1997/01/24 15:13:02 madden
1790 * Changes to accommodate gapped blastn.
1791 *
1792 * Revision 1.146 1997/01/22 17:45:08 madden
1793 * Added search to GetStartForGappedAlignment.
1794 *
1795 * Revision 1.145 1997/01/17 17:41:44 madden
1796 * Added flags for position based BLAST.
1797 *
1798 * Revision 1.144 1997/01/14 17:22:30 madden
1799 * Changes for MT, especially for small databases.
1800 *
1801 * Revision 1.143 1997/01/13 22:13:41 madden
1802 * set further_process to FALSE as needed.
1803 *
1804 * Revision 1.142 1997/01/13 20:06:36 madden
1805 * Added index_addition to strings before checking for ambiguities.
1806 *
1807 * Revision 1.141 1997/01/13 15:37:05 madden
1808 * Changed prototypes for star_callback and tick_callback.
1809 *
1810 * Revision 1.140 1997/01/11 18:58:29 madden
1811 * Removed defunct PerformBlastSearch... functions.
1812 *
1813 * Revision 1.139 1997/01/11 18:39:48 madden
1814 * Simplified ranged blast model.
1815 *
1816 * Revision 1.138 1997/01/11 18:22:10 madden
1817 * Changes to allow S2 to be set.
1818 *
1819 * Revision 1.137 1997/01/11 16:41:42 madden
1820 * Fix to tick_proc for MT runs.
1821 *
1822 * Revision 1.136 1997/01/09 17:44:35 madden
1823 * Added "bit_score" to BLASTResultHsp.
1824 *
1825 * Revision 1.135 1997/01/09 13:33:43 madden
1826 * Fixed NlmThreadCompare typo.
1827 *
1828 * Revision 1.134 1997/01/08 23:05:37 madden
1829 * Added call to TNlmThreadCompare.
1830 *
1831 * Revision 1.133 1997/01/07 20:40:29 madden
1832 * Added reverse Boolean to GetSeqAlignForResultHitList.
1833 *
1834 * Revision 1.132 1997/01/06 22:40:55 madden
1835 * Added function BlastGetSubjectId.
1836 *
1837 * Revision 1.131 1997/01/06 19:31:49 madden
1838 * Removed subject and query ID from GapAlignBlk.
1839 *
1840 * Revision 1.130 1997/01/06 17:22:59 madden
1841 * Used GapXEditScriptToSeqAlign to find SeqAlign.
1842 *
1843 * Revision 1.129 1997/01/04 20:41:11 madden
1844 * Shorter sequence is always the query in BlastTwoSequences.
1845 *
1846 * Revision 1.128 1997/01/03 20:29:32 madden
1847 * Corrected count of significant sequences.
1848 *
1849 * Revision 1.127 1997/01/03 19:03:35 madden
1850 * Fixed incorrect KarlinBlkPtr use.
1851 *
1852 * Revision 1.126 1997/01/03 17:26:50 madden
1853 * Fixed stats recordation.
1854 *
1855 * Revision 1.125 1996/12/30 21:45:28 madden
1856 * Added "strict" Boolean to CheckForRequiredRegion.
1857 *
1858 * Revision 1.124 1996/12/30 17:14:06 madden
1859 * Fixes for changes for "require a portion of the query sequence".
1860 *
1861 * Revision 1.123 1996/12/30 15:44:25 madden
1862 * Added capability to require a portion of the query sequence.
1863 *
1864 * Revision 1.122 1996/12/27 20:44:10 madden
1865 * Changes to require that part of the query be included.
1866 *
1867 * Revision 1.121 1996/12/23 22:02:05 madden
1868 * Changes to allow two sequences to be compared.
1869 *
1870 * Revision 1.120 1996/12/23 15:57:21 madden
1871 * Removed extra call to BlastPreliminaryGappedScore.
1872 *
1873 *
1874 * Revision 1.119 1996/12/23 14:04:44 madden
1875 * Added gap_trigger.
1876 *
1877 * Revision 1.118 1996/12/20 21:11:40 madden
1878 * Changes to allow multiple hits runs only.
1879 *
1880 * Revision 1.117 1996/12/20 15:31:05 madden
1881 * Removed defunct function.
1882 *
1883 * Revision 1.116 1996/12/20 14:22:48 madden
1884 * Added discontinuous Boolean to GetSeqAlignForResultHitList.
1885 *
1886 * Revision 1.115 1996/12/18 14:33:13 madden
1887 * Checked for high score when E-values are equivalent.
1888 *
1889 * Revision 1.114 1996/12/17 18:28:10 madden
1890 * Changed score used to gap HSP's.
1891 *
1892 * Revision 1.113 1996/12/17 17:28:27 madden
1893 * Removed sleep function for non-UNIX platforms.
1894 *
1895 * Revision 1.112 1996/12/17 17:27:03 madden
1896 * Count number of attempted gappings.
1897 *
1898 * Revision 1.111 1996/12/17 13:47:57 madden
1899 * Added star_proc.
1900 *
1901 * Revision 1.110 1996/12/16 19:24:38 madden
1902 * Correction to initial wordsize for blastn.
1903 *
1904 * Revision 1.109 1996/12/16 18:24:21 madden
1905 * Corrected shift in BlastNtFindWords.
1906 *
1907 * Revision 1.108 1996/12/16 15:29:12 madden
1908 * Changed gapalign.h to gapxdrop.h
1909 *
1910 * Revision 1.107 1996/12/16 14:35:48 madden
1911 * Replaced BLAST_GAPPED_OPTION ifdef with gapped_calculation Boolean.
1912 *
1913 * Revision 1.106 1996/12/13 22:00:23 madden
1914 * Corrected starting point for gapped extension with traceback.
1915 *
1916 * Revision 1.105 1996/12/13 18:13:56 madden
1917 * Added tick callback functions
1918 *
1919 * Revision 1.104 1996/12/13 15:09:31 madden
1920 * Changes to parameters used for gapped extensions.
1921 *
1922 * Revision 1.103 1996/12/12 16:44:35 madden
1923 * Removed unused variables.
1924 *
1925 * Revision 1.102 1996/12/12 16:34:58 madden
1926 * GapAlignBlk replaces arguments in PerformGappedAlignment etc.
1927 *
1928 * Revision 1.101 1996/12/12 14:04:03 madden
1929 * Fixes for check on whether HSP is already contained by gapped alignment.
1930 *
1931 * Revision 1.100 1996/12/10 19:20:15 madden
1932 * Changed minimal HSP score for gapped alignments.
1933 *
1934 * Revision 1.99 1996/12/10 17:30:59 madden
1935 * Changed statistics for gapped blastp
1936 *
1937 * Revision 1.98 1996/12/09 23:24:05 madden
1938 * Added parameters to control which sequences get a gapped alignment.
1939 *
1940 * Revision 1.97 1996/12/09 20:45:47 madden
1941 * Adjustments to calculation of gapped HSP's.
1942 *
1943 * Revision 1.96 1996/12/08 15:19:59 madden
1944 * Added functions to enable gapped alignments.
1945 *
1946 * Revision 1.95 1996/11/27 22:46:08 madden
1947 * Removed includes that are no longer used.
1948 *
1949 * Revision 1.94 1996/11/27 22:25:09 madden
1950 * Corrected collection of statistics for MT runs.
1951 *
1952 * Revision 1.93 1996/11/27 21:52:30 madden
1953 * Added function FilterWithSeg.
1954 *
1955 * Revision 1.92 1996/11/26 19:53:46 madden
1956 * Checked for return value on BlastScoreBlkMatFill.
1957 *
1958 * Revision 1.91 1996/11/25 20:13:47 madden
1959 * Changed how NlmMutexInit is called.
1960 *
1961 * Revision 1.90 1996/11/25 19:51:41 madden
1962 * Fix for tblastx stats.
1963 *
1964 * Revision 1.89 1996/11/25 18:58:24 madden
1965 * Adjustments for translated database.
1966 *
1967 * Revision 1.88 1996/11/22 19:04:58 madden
1968 * Removed ifdef for OLD_BIT_ORDER; changed default values.
1969 *
1970 * Revision 1.87 1996/11/22 15:28:03 madden
1971 * Fixed problem of last query residue examined on a diagonal.
1972 *
1973 * Revision 1.86 1996/11/21 18:08:38 madden
1974 * Changed order of if-else statements in get_db_chunk for
1975 * possible improvement of parallelization.
1976 *
1977 * Revision 1.85 1996/11/20 23:15:50 madden
1978 * Changes to acquisition of Mutex in BlastSaveCurrentHitlist to
1979 * improve parallelization.
1980 *
1981 * Revision 1.84 1996/11/19 22:23:52 madden
1982 * Changed link_hsps to link HSP's faster.
1983 *
1984 * Revision 1.83 1996/11/18 19:32:09 madden
1985 * Removed unused variables found by CodeWarrior.
1986 *
1987 * Revision 1.82 1996/11/18 18:07:57 madden
1988 * Duplicated translation_buffer (for tblast[nx]).
1989 *
1990 * Revision 1.81 1996/11/18 17:28:13 madden
1991 * Duplicated translation information in BlastSearchBlkDuplicate and
1992 * also number of contexts.
1993 *
1994 * Revision 1.80 1996/11/18 15:45:40 madden
1995 * FilterDNA function to perform dusting added (by Sergei Shavirin).
1996 *
1997 * Revision 1.79 1996/11/15 17:54:54 madden
1998 * Added support for alternate genetic codes for blastx, tblast[nx].
1999 *
2000 * Revision 1.78 1996/11/14 16:37:58 madden
2001 * Put average lengths in defines.
2002 *
2003 * Revision 1.77 1996/11/14 16:21:55 madden
2004 * changed CharPtr to Uint1Ptr in GetTranslation.
2005 *
2006 * Revision 1.76 1996/11/13 22:35:18 madden
2007 * Added tblast[nx] capability to BlastReevaluateWithAmbiguities.
2008 *
2009 * Revision 1.75 1996/11/12 19:56:35 madden
2010 * Small gaps not considered for blastn.
2011 *
2012 * Revision 1.74 1996/11/12 16:21:17 madden
2013 * Added in context_factor.
2014 *
2015 * Revision 1.73 1996/11/12 13:46:15 madden
2016 * Removed defunct SetUpBlastSearch type functions.
2017 *
2018 * Revision 1.72 1996/11/11 17:44:21 madden
2019 * Fixed check for overlap in search.
2020 *
2021 * Revision 1.71 1996/11/09 21:02:59 madden
2022 * Fixes for blastn extensions.
2023 *
2024 * Revision 1.70 1996/11/08 21:45:03 madden
2025 * Fix for blastn extensions.
2026 *
2027 * Revision 1.69 1996/11/07 22:31:15 madden
2028 * Added function BlastReevaluateWithAmbiguities for nucl. db's.
2029 *
2030 * Revision 1.68 1996/11/07 17:31:26 madden
2031 * Fixed over-incrementing of index in link_hsps.
2032 *
2033 * Revision 1.67 1996/11/06 22:10:01 madden
2034 * Further optimization of BlastTranslateUnambiguousSequence.
2035 *
2036 * Revision 1.66 1996/11/05 23:19:08 madden
2037 * Rewrote BlastTranslateUnambiguousSequence so it's faster.
2038 *
2039 * Revision 1.65 1996/11/04 19:27:13 madden
2040 * Deallocated search->translation_buffer if allocated.
2041 *
2042 * Revision 1.64 1996/11/04 16:59:43 madden
2043 * Added function GetPrivatTranslationTable to optimize translation
2044 * of database.
2045 *
2046 * Revision 1.63 1996/11/01 21:06:49 madden
2047 * Corrected the (nucl.) database for the translated length for tblast[nx].
2048 *
2049 * Revision 1.62 1996/10/31 16:27:20 shavirin
2050 * Multiple changes due to reversal of residues in BLAST database
2051 * for nucleotide sequences from (4321) to (1234)
2052 * New dumper now required to create BLAST databases.
2053 *
2054 * Revision 1.61 1996/10/28 22:15:24 madden
2055 * Added check in BlastNtWordFinder that subject sequence is longer
2056 * than min. word size.
2057 *
2058 * Revision 1.60 1996/10/04 20:12:26 madden
2059 * Fixed memory leaks found by purify.
2060 *
2061 * Revision 1.59 1996/10/03 20:49:29 madden
2062 * Calculate standard Karlin parameters for blastx and tblastx,
2063 * Use proper Karlin parameters in linking of HSP's.
2064 *
2065 * Revision 1.58 1996/10/02 19:59:44 madden
2066 * Fixed translation of query in blastx, calculated different karlin parameters
2067 * for each frame.
2068 *
2069 * Revision 1.57 1996/10/01 21:24:02 madden
2070 * e2 value now depends on program, correct cutoffs for blastn.
2071 *
2072 * Revision 1.56 1996/10/01 18:49:06 madden
2073 * Properly placed counters for number of hits, extensions.
2074 *
2075 * Revision 1.55 1996/09/30 21:56:12 madden
2076 * Replaced query alphabet of ncbi2na with blastna alphabet.
2077 *
2078 * Revision 1.54 1996/09/26 21:48:29 madden
2079 * Set small/large gaps in SeqAlign.
2080 *
2081 * Revision 1.53 1996/09/26 20:18:43 madden
2082 * Addition of ExperimentalLocalBlastSearch function, fixes to SeqIdPtr's.
2083 *
2084 * Revision 1.52 1996/09/25 19:05:24 madden
2085 * Fixes to nucl. extension functions.
2086 *
2087 * Revision 1.51 1996/09/25 14:31:06 madden
2088 * Removed functions and statements for discontiguous word hits.
2089 *
2090 * Revision 1.50 1996/09/24 22:13:06 madden
2091 * BlastNtWordExtend now extends properly to end of query or subject.
2092 *
2093 * Revision 1.49 1996/09/24 18:39:51 madden
2094 * Changes to extend into the remainder of nucl. sequences (for blastn) and
2095 * to perform minus strand extensions.
2096 *
2097 * Revision 1.48 1996/09/20 21:58:14 madden
2098 * Changed CharPtr's to Uint1Ptr, got remainder length out of top order bits.
2099 *
2100 * Revision 1.47 1996/09/19 13:46:29 madden
2101 * Removed unused variables.
2102 *
2103 * Revision 1.46 1996/09/19 13:16:20 madden
2104 * Adjusted subject offset by READDB_COMPRESSION_RATIO for calc. of diagonal.
2105 *
2106 * Revision 1.45 1996/09/18 21:25:30 madden
2107 * Fixed bug in WordFinder for nucleotides.
2108 *
2109 * Revision 1.44 1996/09/18 13:39:24 madden
2110 * fixed offsets for SeqAligns on minus strands.
2111 *
2112 * Revision 1.43 1996/09/17 12:27:04 madden
2113 * Changes to perform correct extensions in blastn.
2114 *
2115 * Revision 1.42 1996/09/16 19:41:14 sad
2116 * Changed BlastTimeFillStructure() to use new functions from ncbitime.
2117 * That removes platform-dependent code from this function.
2118 *
2119 * Revision 1.41 1996/09/13 20:01:52 madden
2120 * put in READDB_UNPACK macros.
2121 *
2122 * Revision 1.40 1996/09/12 21:11:55 madden
2123 * Added extension functions for blastn
2124 *
2125 * Revision 1.39 1996/09/11 22:21:06 madden
2126 * Changes for blastn.
2127 *
2128 * Revision 1.38 1996/09/11 20:36:41 shavirin
2129 * Removed a few Windows NT compiler warnings
2130 *
2131 * Revision 1.35 1996/09/11 19:14:09 madden
2132 * Added BLAST_OptionsBlkPtr structure and use thereof.
2133 *
2134 * Revision 1.34 1996/09/10 19:40:35 madden
2135 * Added functions to perform blastn comparison.
2136 *
2137 * Revision 1.33 1996/09/05 19:39:52 madden
2138 * Added "word_width" to position already covered on diagonal.
2139 *
2140 * Revision 1.32 1996/09/05 19:26:16 madden
2141 * Combined masking and shifting, removed some checks if prelim.
2142 *
2143 * Revision 1.31 1996/09/05 14:12:19 madden
2144 * New (faster) type of extension.
2145 *
2146 * Revision 1.30 1996/09/03 16:27:21 madden
2147 * Added efficiency in scanning of database.
2148 *
2149 * Revision 1.29 1996/08/30 19:27:37 madden
2150 * Fix for one-pass blast, memory-mapped file was being freed.
2151 *
2152 * Revision 1.28 1996/08/30 18:23:50 madden
2153 * A few efficiencies and a correction for one-pass blast.
2154 *
2155 * Revision 1.27 1996/08/30 15:17:57 madden
2156 * Minor efficiency in BlastReapHitlistByEvalue.
2157 *
2158 * Revision 1.25 1996/08/28 20:07:36 madden
2159 * Fix for UMR when the (nucl) sequence is exactly div. by four.
2160 *
2161 * Revision 1.24 1996/08/28 17:11:07 madden
2162 * Fixes for the translation of (nucl.) database sequences.
2163 *
2164 * Revision 1.23 1996/08/27 21:51:44 madden
2165 * Changes for tblastx
2166 *
2167 * Revision 1.22 1996/08/27 17:47:37 madden
2168 * current_hitlist purged on second pass for tblastn.
2169 *
2170 * Revision 1.21 1996/08/26 17:20:20 shavirin
2171 * Added support for WIN32 in function BlastTimeFillStructure()
2172 *
2173 * Revision 1.20 1996/08/23 18:50:23 madden
2174 * Adjusted some of the NT warning fixes to give correct results.
2175 *
2176 * Revision 1.19 1996/08/23 16:52:07 madden
2177 * Changed Int1 to Int4 in SetUpBlastSearchInternal.
2178 *
2179 * Revision 1.18 1996/08/23 16:39:02 madden
2180 * Fixed problem with SaveCurrentHsp.
2181 *
2182 * Revision 1.17 1996/08/23 15:29:44 shavirin
2183 * Fixed a lot of NT compiler warnings about type mismatch
2184 *
2185 * Revision 1.16 1996/08/21 21:37:01 madden
2186 * Added casts to silence compiler warnings.
2187 *
2188 * Revision 1.15 1996/08/21 21:24:56 madden
2189 * Changes for tblastn.
2190 *
2191 * Revision 1.14 1996/08/21 12:55:54 madden
2192 * Changed "purge" frame.
2193 *
2194 * Revision 1.13 1996/08/15 17:07:57 madden
2195 * Added efficiencies in loop that scans database.
2196 *
2197 * Revision 1.12 1996/08/14 20:01:30 madden
2198 * Efficiencies suggested by Zheng Zhang.
2199 *
2200 * Revision 1.11 1996/08/14 18:15:31 madden
2201 * Query frame moved from context to BlastSeqBlk.
2202 *
2203 * Revision 1.10 1996/08/14 17:19:29 madden
2204 * Correctly set frame for subject.
2205 *
2206 * Revision 1.9 1996/08/14 15:20:37 madden
2207 * Added Blast prefix to TranslateUnambiguousSequence function name.
2208 *
2209 * Revision 1.8 1996/08/14 14:30:42 madden
2210 * Cleaned up problem with UMR in TranslateUnambiguousSequence.
2211 *
2212 * Revision 1.7 1996/08/13 22:04:36 madden
2213 * Fixed TranslateUnambiguousSequence to properly read a nucl. db.
2214 *
2215 * Revision 1.6 1996/08/13 15:26:29 madden
2216 * Changes for tblastn.
2217 *
2218 * Revision 1.5 1996/08/09 22:11:12 madden
2219 * Added original_sequence to BlastSequenceAddSequence.
2220 *
2221 * Revision 1.4 1996/08/08 21:39:00 madden
2222 * Added some functions for tblastn.
2223 *
2224 * Revision 1.3 1996/08/07 14:23:45 madden
2225 * Added functions to produce SeqAlign from BLAST results.
2226 *
2227 * Revision 1.2 1996/08/06 16:07:31 madden
2228 * Removed unused functions Bsp2BLAST0Request.
2229 *
2230 * Revision 1.1 1996/08/05 19:45:46 madden
2231 * Initial revision
2232 *
2233 * Revision 1.118 1996/08/05 13:56:44 madden
2234 * Check if threads are available with NlmThreadsAvailable.
2235 *
2236 * Revision 1.117 1996/08/02 14:20:06 madden
2237 * Changes in call to readdb.
2238 *
2239 * Revision 1.116 1996/07/31 13:46:23 madden
2240 * Each thread gets own copy of ewp_params in SearchBlk.
2241 *
2242 * Revision 1.115 1996/07/31 13:09:17 madden
2243 * Changes for threaded blast.
2244 *
2245 * Revision 1.114 1996/07/25 20:45:20 madden
2246 * Change to calling convention of PerformBlastSearchWithReadDb.
2247 *
2248 * Revision 1.113 1996/07/25 12:55:20 madden
2249 * readdb_get_sequence call changed to allow for systems w/o mmap.
2250 *
2251 * Revision 1.112 1996/07/24 13:16:28 madden
2252 * Removed commented out fprintf.
2253 *
2254 * Revision 1.111 1996/07/24 12:00:07 madden
2255 * Changes for blastx.
2256 *
2257 * Revision 1.110 1996/07/18 22:00:02 madden
2258 * Changes for multiple contexts.
2259 *
2260 * Revision 1.109 1996/07/18 13:35:51 madden
2261 * Addition of the BLASTContextStructPtr.
2262 *
2263 * Revision 1.108 1996/07/16 15:01:02 madden
2264 * Cleaned up link_hsp function.
2265 *
2266 * Revision 1.107 1996/07/16 14:37:42 madden
2267 * Changes to link_hsp's so another array is not needed for the HSP's.
2268 *
2269 * Revision 1.106 1996/07/11 16:03:58 madden
2270 * SaveCurrentHitlist keeps track of which set an HSP belongs to.
2271 *
2272 * Revision 1.105 1996/07/05 17:16:34 madden
2273 * Optimized loop in contiguous word finder.
2274 *
2275 * Revision 1.104 1996/07/03 14:26:05 madden
2276 * Added test extension function.
2277 *
2278 * Revision 1.103 1996/07/02 14:32:53 madden
2279 * Added hspcnt_max.
2280 *
2281 * Revision 1.102 1996/07/02 12:04:15 madden
2282 * HSP's saved on array, rather than linked list.
2283 *
2284 * Revision 1.101 1996/07/01 15:30:06 madden
2285 * Don't NULL out hit if extension to left does not succeed.
2286 *
2287 * Revision 1.100 1996/06/27 18:41:39 madden
2288 * Changes to cutoff score to start second pass.
2289 *
2290 * Revision 1.99 1996/06/26 19:38:12 madden
2291 * Don't continue extension on 1st pass if the first (left) extension
2292 * doesn't reach to the first hit.
2293 *
2294 * Revision 1.98 1996/06/26 15:53:54 madden
2295 * Second dropoff score parameter added.
2296 *
2297 * Revision 1.97 1996/06/26 14:30:25 madden
2298 * Removed unused variables.
2299 *
2300 * Revision 1.96 1996/06/26 14:09:16 madden
2301 * Added comments and indents to loops.
2302 *
2303 * Revision 1.95 1996/06/26 13:29:50 madden
2304 * Changes to reduce the amount of memory and time of BlastFindWords.
2305 *
2306 * Revision 1.94 1996/06/24 20:26:46 madden
2307 * Dropoff ("X") set to first or second dropoff parameter.
2308 *
2309 * Revision 1.93 1996/06/24 17:57:09 madden
2310 * Added wordFinders to test dropoff scores.
2311 *
2312 * Revision 1.92 1996/06/20 16:51:17 madden
2313 * Removed unused parameters.
2314 *
2315 * Revision 1.91 1996/06/20 16:15:57 madden
2316 * Replaced int's with Int4's.
2317 *
2318 * Revision 1.90 1996/06/19 14:18:33 madden
2319 * Addition of SetUpBlastSearchInternal function.
2320 *
2321 * Revision 1.89 1996/06/17 19:02:13 madden
2322 * Removed unused MP code.
2323 *
2324 * Revision 1.88 1996/06/17 18:23:31 madden
2325 * Removed unused functions.
2326 *
2327 * Revision 1.87 1996/06/14 17:58:13 madden
2328 * Changes to avoid nulling out arrays for every sequence.
2329 *
2330 * Revision 1.86 1996/06/13 21:16:33 madden
2331 * database length removed from BLAST_ExtendWordNew.
2332 *
2333 * Revision 1.85 1996/06/13 21:04:17 madden
2334 * Added efficiencies to word finders.
2335 *
2336 * Revision 1.84 1996/06/11 18:13:54 madden
2337 * Removed unused variables.
2338 *
2339 * Revision 1.83 1996/06/11 17:58:31 madden
2340 * Changes to allow shorter arrays for multiple hits type blast.
2341 *
2342 * Revision 1.82 1996/06/10 16:52:16 madden
2343 * Use bit-shifting and masking instead of dividing and remainder.
2344 *
2345 * Revision 1.81 1996/06/10 13:44:07 madden
2346 * Changes to reduce the size of the "already visited" array.
2347 *
2348 * Revision 1.80 1996/06/06 17:54:09 madden
2349 * number_of_bits added to SetUpBlastSearch and SetUpBlastSearchWithReadDb.
2350 *
2351 * Revision 1.79 1996/06/06 14:09:22 madden
2352 * Removed defunct function BlastNWSThreshold, blast_set_parameters became
2353 * static.
2354 *
2355 * Revision 1.78 1996/06/06 13:54:51 madden
2356 * Removed defunct function BLAST_ParameterBlkFill
2357 *
2358 * Revision 1.77 1996/06/06 13:23:17 madden
2359 * CalculateSecondCutoffs only called for second pass.
2360 *
2361 * Revision 1.76 1996/06/04 15:32:53 madden
2362 * Changed counting of first and second pass hits.
2363 *
2364 * Revision 1.75 1996/06/04 13:50:28 madden
2365 * Purge HitList, rather than deleting it.
2366 *
2367 * Revision 1.74 1996/05/29 17:21:07 madden
2368 * Removed defunct BlastFixEandPValues function, replaced one call
2369 * to BlastSequenceAddSequence.
2370 *
2371 * Revision 1.73 1996/05/29 12:43:25 madden
2372 * Function BlastTimeFillStructure added to keep track of time.
2373 *
2374 * Revision 1.72 1996/05/28 14:12:53 madden
2375 * Added code to collect statistics.
2376 *
2377 * Revision 1.71 1996/05/23 21:55:04 madden
2378 * Removed unused variable initlen
2379 *
2380 * Revision 1.70 1996/05/22 20:19:22 madden
2381 * Removed unused variables, fixed codecenter nits.
2382 *
2383 * Revision 1.68 1996/05/20 21:17:49 madden
2384 * Changed (incorrect) NULL's to zero's.
2385 *
2386 * Revision 1.67 1996/05/16 19:50:15 madden
2387 * Added documentation block.
2388 *
2389 * Revision 1.66 1996/05/16 13:28:24 madden
2390 * Both 1st and 2nd pass can separately be contiguous or discontiguous.
2391 *
2392 * Revision 1.64 1996/05/14 19:51:37 madden
2393 * Added some register variables.
2394 *
2395 * Revision 1.63 1996/05/14 18:56:53 madden
2396 * Unrolled some loops in extension function.
2397 *
2398 * Revision 1.62 1996/05/14 16:15:59 madden
2399 * Fixes to SaveCurrentHitlist
2400 *
2401 * Revision 1.61 1996/05/10 18:19:20 madden
2402 * Made lookup_pos a register variable.
2403 *
2404 * Revision 1.59 1996/05/09 13:14:56 madden
2405 * Consolidated CalculateEffectiveLengths and BlastReapHSPsByEvalue into other
2406 * functions.
2407 *
2408 * Revision 1.58 1996/05/03 19:54:24 madden
2409 * Removed defunct seqalign functions, optimized BlastWordFinder functions.
2410 *
2411 * Revision 1.57 1996/05/01 14:57:37 madden
2412 * Added BlastResults structures.
2413 *
2414 * Revision 1.56 1996/04/24 19:46:34 madden
2415 * Removed q_rightmost and q_leftmost from the extend function.
2416 *
2417 * Revision 1.55 1996/04/24 18:01:11 madden
2418 * Used call to readdb_get_max_length for first call to BLAST_ExtendWordNew.
2419 *
2420 * Revision 1.54 1996/04/24 16:16:58 madden
2421 * Changed LinkHsp's not to reallocate the hsp array every time.
2422 *
2423 * Revision 1.53 1996/04/24 12:51:15 madden
2424 * deleted function BlastSequenceAddSequenceIdToSequenceBlk.
2425 *
2426 * Revision 1.52 1996/04/22 21:39:31 madden
2427 * New calls to readdb_get_sequence.
2428 *
2429 * Revision 1.51 1996/04/18 13:39:33 madden
2430 * demodularized lookup of initial hits.
2431 *
2432 * Revision 1.50 1996/04/16 15:32:47 madden
2433 * economies added to new extension functions, non-scoring identical
2434 * words not added to lookup tables.
2435 *
2436 * Revision 1.48 1996/04/11 14:29:33 madden
2437 * function BlastWordExtend completely rewritten.
2438 *
2439 * Revision 1.47 1996/04/04 20:46:22 madden
2440 * Optimized extension function; made "lookup_find" a FnPtr.
2441 *
2442 * Revision 1.46 1996/04/03 19:13:04 madden
2443 * added functions PerformBlastSearchWithReadDb and Perform2PassBlastSearchWithReadDb.
2444 *
2445 * Revision 1.45 1996/03/29 21:26:01 madden
2446 * "hitlist" now kept on SeqAlign rather than HitList.
2447 *
2448 * Revision 1.44 1996/03/29 14:08:18 madden
2449 * SetUpBlastSearchWithReadDb added.
2450 *
2451 * Revision 1.43 1996/03/28 18:45:45 madden
2452 * sequence now added to hitlist after significance has been established.
2453 *
2454 * Revision 1.42 1996/03/27 23:51:11 madden
2455 * added function AddDescriptorsToHitlistWithReadDb.
2456 *
2457 * Revision 1.41 1996/03/27 23:19:24 madden
2458 * Added PerformBlastSearchWithReadDb and Perform2PassBlastSearchWithReadDb,
2459 * changed parameters for PerformBlastSearch and Perform2PassBlastSearch.
2460 *
2461 * Revision 1.40 1996/03/27 19:51:09 madden
2462 * current hits now saved on "current_hitlist", not saved to main
2463 * hitlist until significance decided upon.
2464 *
2465 * Revision 1.39 1996/03/26 19:36:15 madden
2466 * Changes to read databases formatted with formatdb.
2467 *
2468 * Revision 1.38 1996/03/25 16:34:19 madden
2469 * Changes to mimic old statistics.
2470 *
2471 * Revision 1.37 1996/03/20 14:28:57 madden
2472 * Changed cutoff values.
2473 *
2474 * Revision 1.36 1996/03/11 13:52:52 madden
2475 * Ignore gaps when the sequences are too short.
2476 *
2477 * Revision 1.35 1996/02/28 21:36:54 madden
2478 * changes for discontiguous words.
2479 *
2480 * Revision 1.34 1996/02/15 23:31:19 madden
2481 * Trimmed ends of HSP's in comparison with gap.
2482 *
2483 * Revision 1.33 1996/02/15 23:19:43 madden
2484 * Changed call to BlastScoreBlkFill
2485 *
2486 * Revision 1.32 1996/02/15 15:22:52 madden
2487 * Trimming of sequence ends for linking.
2488 *
2489 * Revision 1.31 1996/02/13 14:05:57 madden
2490 * changes to ensure that closer to optimal HSP's are found.
2491 *
2492 * Revision 1.30 1996/02/09 13:50:09 madden
2493 * Added BlastReapHSPsByEvalue; changes to allow both one and two pass runs.
2494 *
2495 * Revision 1.29 1996/02/06 22:50:56 madden
2496 * Changes for two-pass runs.
2497 *
2498 * Revision 1.28 1996/02/05 18:46:09 madden
2499 * Added support for two threshold values.
2500 *
2501 * Revision 1.27 1996/02/02 19:24:53 madden
2502 * Added wfp_first and wfp_second for first and second pass.
2503 *
2504 * Revision 1.26 1996/01/31 17:33:54 madden
2505 * Added function BlastReapHitlistByEvalue.
2506 *
2507 * Revision 1.25 1996/01/29 21:11:38 madden
2508 * Changes for MultipleHits BLAST.
2509 *
2510 * Revision 1.24 1996/01/23 16:30:52 madden
2511 * e_cutoff changed from BLAST_Score to double in SetUpBlastSearch.
2512 *
2513 * Revision 1.23 1996/01/22 22:31:01 madden
2514 * Fixed BlastFindWords to increment index1 correctly.
2515 *
2516 * Revision 1.22 1996/01/22 22:05:05 madden
2517 * Set initial e2 to 0.5.
2518 *
2519 * Revision 1.20 1996/01/17 16:59:56 madden
2520 * Added gap arguments to SetUpBlastSearch.
2521 *
2522 * Revision 1.19 1996/01/17 13:45:03 madden
2523 * Added function BlastFixEandPValues.
2524 *
2525 * Revision 1.18 1996/01/16 15:28:05 madden
2526 * Set i_am_multitasking flag.
2527 *
2528 * Revision 1.16 1996/01/10 17:50:21 madden
2529 * sort hitlist by pvalue.
2530 *
2531 * Revision 1.15 1996/01/08 23:23:22 madden
2532 * Fixed neighborhood bug, added some MP stuff
2533 *
2534 * Revision 1.14 1996/01/06 18:56:52 madden
2535 * Removed obsolete code, fixed purify nit.
2536 *
2537 * Revision 1.13 1996/01/06 17:50:20 madden
2538 * Fixed HeapSort functions for linking of HSP's.
2539 *
2540 * Revision 1.12 1996/01/06 17:18:42 madden
2541 * Fixed setting of "next" pointers when the HSP is part of a linked set.
2542 *
2543 * Revision 1.11 1996/01/06 16:29:38 madden
2544 * NULL'ed out some "link" pointers.
2545 *
2546 * Revision 1.10 1996/01/05 22:54:18 madden
2547 * Fixed HeapSort calls in linking routines.
2548 *
2549 * Revision 1.9 1996/01/05 15:51:14 madden
2550 * Added Stephen Altschul's link_hsps.
2551 *
2552 * Revision 1.8 1995/12/30 19:21:01 madden
2553 * Added PerformBlastSearch.
2554 *
2555 * Revision 1.7 1995/12/30 18:38:51 madden
2556 * Added function SetUpBlastSearch.
2557 *
2558 * Revision 1.6 1995/12/28 21:22:19 madden
2559 * Deallocated leaking memory.
2560 *
2561 * Revision 1.5 1995/12/26 23:03:22 madden
2562 * Added in functions to automatically set some parameters.
2563 *
2564 * Revision 1.4 1995/12/26 20:27:11 madden
2565 * simplified hit extension routine.
2566 *
2567 * Revision 1.3 1995/12/21 23:09:57 madden
2568 * BLAST_Score functions moved to blastkar.c
2569 *
2570 * */
2571
2572 #include <ncbi.h>
2573 #include <blastpri.h>
2574 #include <lookup.h>
2575 #include <objcode.h>
2576 #include <objseq.h>
2577 #include <sequtil.h>
2578 #include <tofasta.h>
2579 #include <seqport.h>
2580 #include <readdb.h>
2581 #include <ncbithr.h>
2582 #include <gapxdrop.h>
2583 #include <dust.h>
2584
2585 #include <mbalign.h>
2586 #include <mblast.h>
2587
2588 /*
2589 The last database sequence a tick (progress indicator) was issued for
2590 and the increment (i.e., number of db sequences completed) after which
2591 the next tick should be emitted.
2592 */
2593 /* Int4 last_db_seq=0, db_incr=0; */
2594
2595 /*
2596 Set to TRUE if the process has timed out.
2597 */
/* File-scope flag with external linkage, declared volatile.  Within this
   part of the file it is only read: do_gapped_blast_search tests it after
   every database sequence and leaves its loops once it is TRUE.  The code
   that sets it is not visible here. */
2598 volatile Boolean time_out_boolean;
2599
2600 /*
2601 SeqId lists if only a certain number of the database sequences will be
2602 used for the search.
2603 */
2604 /* SeqIdPtr global_seqid_list=NULL, global_seqid_ptr; */
2605
2606 /*
2607 GI List to be used if database will be searched by GI.
2608 current is the current element in the array being worked on.
2609 global_gi_being_used specifies that it will be used.
2610 */
2611
2612 /* Int4 global_gi_current=0;
2613 Boolean global_gi_being_used=FALSE; */
2614
2615 /* Function to emit progress messages, set by user. */
2616 /* int (LIBCALLBACK *tick_callback)PROTO((Int4 done, Int4 positives)); */
2617
2618 /* int (LIBCALLBACK *star_callback)PROTO((Int4 done, Int4 positives));
2619 int (LIBCALLBACK *index_callback)PROTO((Int4 done, Int4 positives)); */
2620
2621 /* tells star_proc to check that a star should be emitted. */
2622 /* TNlmThread awake_thr=NULL;
2623 Boolean awake; */
2624
2625 /* tells index_proc to check that a message should be emitted. */
2626 /* TNlmThread index_thr=NULL;
2627 Boolean awake_index; */
2628
2629 /* period of sending out a star/message. */
2630 /* #define PERIOD 60 */
2631
2632 /* Use by star_proc to determine whether to emit a star. */
2633 /* time_t last_tick=0; */
2634
2635 /* How many positive hits were found (set by ReapHitlist, read by tick_proc
2636 and star_proc). */
2637 /* Int4 number_of_pos_hits=0; */
2638
2639 /* Mutex for assignment of db seqs to search. */
2640 /* TNlmMutex db_mutex=NULL; */
2641
2642 /* Mutex for insertion of results into list. */
2643 /* TNlmMutex results_mutex = NULL; */
2644 /* Mutex for the callbacks (star_proc, tick_proc, index_proc). */
2645 /* TNlmMutex callback_mutex=NULL; */
2646
2647 /* The last db sequence to be assigned. Used only in get_db_chunk after
2648 the acquisition of the "db_mutex" (above). */
2649 /* Int4 db_chunk_last=0; */
2650
2651 /* the last sequence in the database to be compared against. */
2652 /* Int4 final_db_seq; */
2653
2654 /* Default size of the chunks that are assigned in the function get_db_chunk. */
2655 /* Actually db_chunk_size is used, which is smaller if the db is smaller. */
2656
/* Forward declarations of file-local (static) routines that are defined
   further down in this file.  PROTO(()) is the toolkit's prototype wrapper
   macro.  The descriptions below are inferred from the names only; see each
   definition for the actual contract. */

/* Word-extension driver (presumably dispatches to the extenders below). */
2657 static Int4 BlastExtendWordSearch PROTO((BlastSearchBlkPtr search, Boolean multiple_hits));
2658
/* Word extenders: all four share one signature (query/subject offsets,
   word width, diagonal information, an output flag and the context). */
2659 static Int2 BlastWordExtend PROTO((BlastSearchBlkPtr search, Int4 q_off, Int4 s_off, Int4 word_width, BLAST_Diag diag, BLAST_Diag real_diag, Boolean PNTR succeed_to_right, Int2 context));
2660
2661 /*AAS*/
2662 static Int2 BlastNewWordExtend PROTO((BlastSearchBlkPtr search, Int4 q_off, Int4 s_off, Int4 word_width, BLAST_Diag diag, BLAST_Diag real_diag, Boolean PNTR succeed_to_right, Int2 context));
2663
2664 static Int2 BlastWordExtend_prelim PROTO((BlastSearchBlkPtr search, Int4 q_off, Int4 s_off, Int4 word_width, BLAST_Diag diag, BLAST_Diag real_diag, Boolean PNTR succeed_to_right, Int2 context));
2665
2666 /*AAS*/
2667 static Int2 BlastNewWordExtend_prelim PROTO((BlastSearchBlkPtr search, Int4 q_off, Int4 s_off, Int4 word_width, BLAST_Diag diag, BLAST_Diag real_diag, Boolean PNTR succeed_to_right, Int2 context));
2668
2669
/* Word finders; the "_mh" suffix presumably stands for the multiple-hit
   variants and "_contig" for contiguous words -- TODO confirm. */
2670 static Int4 BlastWordFinder PROTO((BlastSearchBlkPtr search));
2671 static Int4 BlastWordFinder_mh PROTO((BlastSearchBlkPtr search));
2672 static Int4 BlastWordFinder_contig PROTO((BlastSearchBlkPtr search, LookupTablePtr lookup));
2673 static Int4 BlastWordFinder_mh_contig PROTO((BlastSearchBlkPtr search, LookupTablePtr lookup));
2674
/* HSP linking over a hitlist; returns an HSP pointer. */
2675 static BLAST_HSPPtr link_hsps PROTO((BlastSearchBlkPtr search, BLAST_HitListPtr hitlist, BLAST_HSPPtr PNTR hsp_array));
2676
/* Word finders taking an explicit lookup table ("Nt" presumably means
   nucleotide -- TODO confirm). */
2677 static Int4 BlastNtWordFinder PROTO((BlastSearchBlkPtr search, LookupTablePtr lookup));
2678 static Int4 BlastNtWordFinder_mh PROTO((BlastSearchBlkPtr search, LookupTablePtr lookup));
2679
2680 /* DEBUGGING stuff */
2681 #ifdef BLAST_TIMER
/* Only compiled when BLAST_TIMER is defined at build time.  Starts at 0;
   no reader or writer of this variable is visible in this part of the
   file. */
2682 clock_t last_clock = 0;
2683 #endif
2684 /* end DEBUGGING stuff */
2685
2686
2687 /*
2688 The function that decides whether or not a tick should be
2689 emitted. This is performed through the callback function
2690 ("tick_callback") that is set in "do_the_blast_run". This
2691 function is called from "do_blast_search" for single processing
2692 machines and "get_db_chunk" for MT machines, after the db_mutex
2693 has been obtained in "get_db_chunk".
2694 */
2695
BlastTickProc(Int4 sequence_number,BlastThrInfoPtr thr_info)2696 void BlastTickProc(Int4 sequence_number, BlastThrInfoPtr thr_info)
2697
2698 {
2699 if(thr_info->tick_callback &&
2700 (sequence_number > (thr_info->last_db_seq + thr_info->db_incr))) {
2701 NlmMutexLockEx(&thr_info->callback_mutex);
2702 thr_info->last_db_seq += thr_info->db_incr;
2703 thr_info->tick_callback(sequence_number, thr_info->number_of_pos_hits);
2704 thr_info->last_tick = Nlm_GetSecs();
2705 NlmMutexUnlock(thr_info->callback_mutex);
2706 }
2707 return;
2708 }
2709
2710 /*
2711 Sends out a message every PERIOD (i.e., 60 secs.) for the index.
2712
2713 THis function runs as a separate thread and only runs on a threaded
2714 platform.
2715 */
2716 VoidPtr
index_proc(VoidPtr dummy)2717 index_proc(VoidPtr dummy)
2718
2719 {
2720
2721 /* Sleep only works on UNIX. An ifdef is used until
2722 a portable solution can be found. */
2723 #ifdef OS_UNIX
2724
2725 Int2 index;
2726 BlastThrInfoPtr thr_info = (BlastThrInfoPtr) dummy;
2727
2728 while (thr_info->awake_index) {
2729 for (index=0; index < STAR_MSG_PERIOD; index++) {
2730 sleep(1);
2731 if (thr_info->awake_index == FALSE)
2732 break;
2733 }
2734
2735 if (thr_info->awake_index && thr_info->index_callback) {
2736 NlmMutexLockEx(&thr_info->callback_mutex);
2737 thr_info->last_tick = Nlm_GetSecs();
2738 thr_info->index_callback(0, 0);
2739 NlmMutexUnlock(thr_info->callback_mutex);
2740 }
2741 }
2742 #endif
2743 return dummy;
2744 }
2745
2746 /*
2747 Sends out a message every PERIOD (i.e., 60 secs.) and sends out a
2748 "star" if a tick has not been sent out in the last PERIOD.
2749
2750 THis function runs as a separate thread and only runs on a threaded
2751 platform.
2752 */
2753 static VoidPtr
star_proc(VoidPtr dummy)2754 star_proc(VoidPtr dummy)
2755
2756 {
2757 /* Sleep only works on UNIX. An ifdef is used until
2758 a portable solution can be found. */
2759 #ifdef OS_UNIX
2760
2761 time_t now;
2762 Int2 index;
2763 BlastThrInfoPtr thr_info = (BlastThrInfoPtr) dummy;
2764
2765 now = Nlm_GetSecs();
2766 while (thr_info->awake) {
2767 if (now - thr_info->last_tick < STAR_MSG_PERIOD / 2) {
2768 for (index = 0; index < STAR_MSG_PERIOD; index++) {
2769 sleep(1);
2770 if (thr_info->awake == FALSE)
2771 break;
2772 }
2773 }
2774 if (thr_info->awake) {
2775 NlmMutexLockEx(&thr_info->callback_mutex);
2776 now = Nlm_GetSecs();
2777 if (now-thr_info->last_tick > STAR_MSG_PERIOD) {
2778 if (thr_info->star_callback) {
2779 thr_info->star_callback(thr_info->db_chunk_last,
2780 thr_info->number_of_pos_hits);
2781 thr_info->last_tick = now;
2782 }
2783 }
2784 NlmMutexUnlock(thr_info->callback_mutex);
2785 }
2786 }
2787 #endif
2788 return dummy;
2789 }
2790
2791 /*
2792 Make a temporary protein BioseqPtr to use with seg.
2793 */
2794 BioseqPtr
BlastMakeTempProteinBioseq(Uint1Ptr sequence,Int4 length,Uint1 alphabet)2795 BlastMakeTempProteinBioseq (Uint1Ptr sequence, Int4 length, Uint1 alphabet)
2796
2797 {
2798 BioseqPtr bsp;
2799 Int4 byte_store_length;
2800 Nlm_ByteStorePtr byte_store;
2801 ObjectIdPtr oip;
2802
2803 if (sequence == NULL || length == 0)
2804 return NULL;
2805
2806 byte_store = Nlm_BSNew(length);
2807
2808 byte_store_length = Nlm_BSWrite(byte_store, (VoidPtr) sequence, length);
2809 if (length != byte_store_length) {
2810 Nlm_BSDelete(byte_store, length);
2811 return NULL;
2812 }
2813
2814 bsp = BioseqNew();
2815 bsp->seq_data = (SeqDataPtr) byte_store;
2816 bsp->length = length;
2817 bsp->seq_data_type = alphabet;
2818 bsp->mol = Seq_mol_aa;
2819 bsp->repr = Seq_repr_raw;
2820
2821 oip = UniqueLocalId();
2822 ValNodeAddPointer(&(bsp->id), SEQID_LOCAL, oip);
2823 SeqMgrAddToBioseqIndex(bsp);
2824
2825 return bsp;
2826 }
2827
2828
2829 #define LINK_HSP_OVERLAP 9
2830 #define MY_EPS 1.0e-9
2831 /*
2832 Calculates cutoff scores and returns them.
2833 Equations provided by Stephen Altschul.
2834
2835 BlastSearchBlkPtr search: provides info to perform calculation.
2836 Int4 subject_length: length of the DB sequence.
2837 Boolean PNTR ignore_small_gaps: If TRUE, test only for large gaps.
2838 BLAST_Score PNTR cutoff_s_second: S2 score for second pass.
2839 BLAST_Score PNTR cutoff_big_gap: Cutoff score for big gaps.
2840
2841 */
2842 static void
CalculateSecondCutoffScore(BlastSearchBlkPtr search,Int4 subject_length,Boolean PNTR ignore_small_gaps,BLAST_Score PNTR cutoff_s_second,BLAST_Score PNTR cutoff_big_gap)2843 CalculateSecondCutoffScore(BlastSearchBlkPtr search, Int4 subject_length, Boolean PNTR ignore_small_gaps, BLAST_Score PNTR cutoff_s_second, BLAST_Score PNTR cutoff_big_gap)
2844
2845 {
2846 const Int4 overlap_size = LINK_HSP_OVERLAP;
2847 Nlm_FloatHi gap_prob, gap_decay_rate, x_variable, y_variable;
2848 BLAST_KarlinBlkPtr kbp;
2849 Int4 expected_length, window_size, query_length;
2850 Int8 search_sp;
2851
2852 /* Do this for the first context, should this be changed?? */
2853 kbp = search->sbp->kbp[search->first_context];
2854 window_size = search->pbp->gap_size + overlap_size + 1;
2855 gap_prob = search->pbp->gap_prob;
2856 gap_decay_rate = search->pbp->gap_decay_rate;
2857 query_length = search->context[search->first_context].query->length;
2858
2859 if (search->pbp->old_stats == FALSE)
2860 {
2861 /* Subtract off the expected score. */
2862 expected_length = Nint(log(kbp->K*((Nlm_FloatHi) query_length)*((Nlm_FloatHi) subject_length))/(kbp->H));
2863 query_length = query_length - expected_length;
2864 subject_length = subject_length - expected_length;
2865 query_length = MAX(query_length, 1);
2866 subject_length = MAX(subject_length, 1);
2867
2868 if (search->dblen > subject_length)
2869 y_variable = log((Nlm_FloatHi) (search->dblen)/(Nlm_FloatHi) subject_length)*(kbp->K)/(gap_decay_rate);
2870 else
2871 y_variable = log((Nlm_FloatHi) (subject_length + expected_length)/(Nlm_FloatHi) subject_length)*(kbp->K)/(gap_decay_rate);
2872 search_sp = ((Int8) query_length)* ((Int8) subject_length);
2873 x_variable = 0.25*y_variable*((FloatHi) search_sp);
2874
2875 /* To use "small" gaps the query and subject must be "large" compared to
2876 the gap size. If small gaps may be used, then the cutoff values must be
2877 adjusted for the "bayesian" possibility that both large and small gaps are
2878 being checked for. */
2879
2880 if (search_sp > 8*window_size*window_size)
2881 {
2882 x_variable /= (1.0 - gap_prob + MY_EPS);
2883 *cutoff_big_gap = (BLAST_Score) floor((log(x_variable)/kbp->Lambda)) + 1;
2884 x_variable = y_variable*(window_size*window_size);
2885 x_variable /= (gap_prob + MY_EPS);
2886 *cutoff_s_second= (BLAST_Score) floor((log(x_variable)/kbp->Lambda)) + 1;
2887 /* Don't allow this cutoff to be too small */
2888 *cutoff_s_second = MAX(*cutoff_s_second, search->pbp->gap_trigger);
2889 *ignore_small_gaps = FALSE;
2890 }
2891 else
2892 {
2893 *cutoff_big_gap = (BLAST_Score) floor((log(x_variable)/kbp->Lambda)) + 1;
2894 *cutoff_s_second = *cutoff_big_gap;
2895 *ignore_small_gaps = TRUE;
2896 }
2897 *cutoff_big_gap *= search->pbp->scalingFactor;
2898 *cutoff_s_second *= search->pbp->scalingFactor;
2899 }
2900 else
2901 {
2902 /* USE the old statistics, for comparison to the OLD BLAST. */
2903 *cutoff_big_gap = search->pbp->cutoff_s_second;
2904 *cutoff_s_second = *cutoff_big_gap;
2905 *ignore_small_gaps = TRUE;
2906 }
2907 }
2908
2909 /*
2910 Rounds down score to next even value if appropriate.
2911 */
2912
2913 static Int2
s_RoundDownOddScores(BLAST_ScoreBlkPtr sbp,BLAST_HitListPtr hitlist)2914 s_RoundDownOddScores(BLAST_ScoreBlkPtr sbp, BLAST_HitListPtr hitlist)
2915 {
2916 BLAST_HSPPtr PNTR hsp_array;
2917 Int4 hsp_cnt;
2918 Int4 index;
2919
2920 if (sbp->round_down == FALSE || hitlist->hspcnt == 0)
2921 return 0;
2922
2923 hsp_cnt = hitlist->hspcnt;
2924 hsp_array = hitlist->hsp_array;
2925 for (index=0; index<hsp_cnt; index++)
2926 {
2927 hsp_array[index]->score -= (hsp_array[index]->score &1);
2928 }
2929 return 0;
2930 }
2931
2932 /*
2933 This function reevaluates the HSP's from a blast run, checking that
2934 ambiguity characters, ignored until now, don't change the score or
2935 extent of the HSP's.
2936
2937 Only works for blastn right now.
2938 */
2939
2940 static Int2
BlastReevaluateWithAmbiguities(BlastSearchBlkPtr search,Int4 sequence_number)2941 BlastReevaluateWithAmbiguities (BlastSearchBlkPtr search, Int4 sequence_number)
2942
2943 {
2944 BioseqPtr bsp;
2945 register BLAST_Score sum, score;
2946 register BLAST_ScorePtr PNTR matrix;
2947 BLAST_HitListPtr current_hitlist;
2948 BLAST_HSPPtr PNTR hsp_array;
2949 Int4 context, hspcnt, hspcnt_max, index, index1, status;
2950 Int4 length, longest_hsp_length, start, stop;
2951 Nlm_FloatHi current_evalue=DBL_MAX;
2952 SeqPortPtr spp=NULL;
2953 Uint1Ptr nt_seq, nt_seq_start, subject, subject_start, query, old_query_s, old_query_f, new_query_s, new_query_f=NULL;
2954 Uint1Ptr query_start, query_end, subject_real_start=NULL;
2955 Int4 num_ident;
2956
2957 /* Only nucl. db's. */
2958 if (search->prog_number == blast_type_blastp || search->prog_number == blast_type_blastx)
2959 return 0;
2960
2961 /* Gapped alignments will be reevaluated anyway.*/
2962 if (search->pbp->gapped_calculation == TRUE || search->pbp->do_not_reevaluate == TRUE)
2963 return 0;
2964
2965 /* No hits to reevaluate. */
2966 if (search->current_hitlist == NULL || search->current_hitlist->hspcnt == 0)
2967 return 0;
2968
2969 /* Check if there are ambiguites at all, return 0 if there are none. */
2970 if(search->prog_number != blast_type_blastn &&
2971 readdb_ambchar_present(search->rdfp, sequence_number) == FALSE) {
2972
2973 return 0;
2974 }
2975 current_hitlist = search->current_hitlist;
2976 hspcnt = current_hitlist->hspcnt;
2977 hspcnt_max = current_hitlist->hspcnt_max;
2978 hsp_array = current_hitlist->hsp_array;
2979 matrix = search->sbp->matrix;
2980
2981 /* Look for longest HSP. */
2982 longest_hsp_length = 0;
2983 for (index=0; index<hspcnt_max; index++)
2984 {
2985 if (hsp_array[index] == NULL)
2986 continue;
2987
2988 if (hsp_array[index]->subject.length > longest_hsp_length)
2989 longest_hsp_length = hsp_array[index]->subject.length;
2990
2991 if (current_evalue > hsp_array[index]->evalue)
2992 current_evalue = hsp_array[index]->evalue;
2993 }
2994
2995 if (StringCmp(search->prog_name, "blastn") != 0)
2996 {
2997 longest_hsp_length *= CODON_LENGTH;
2998 }
2999
3000 if (longest_hsp_length > 0)
3001 {
3002 nt_seq_start = MemNew(longest_hsp_length*sizeof(Uint1));
3003 if (nt_seq_start == NULL)
3004 return 0;
3005 }
3006 else
3007 {
3008 return longest_hsp_length;
3009 }
3010
3011 if (search->thr_info->ambiguities_mutex)
3012 NlmMutexLock(search->thr_info->ambiguities_mutex);
3013
3014 bsp = readdb_get_bioseq(search->rdfp, sequence_number);
3015
3016 for (index=0; index<hspcnt_max; index++)
3017 {
3018 if (hsp_array[index] == NULL)
3019 continue;
3020
3021 context = hsp_array[index]->context;
3022
3023 if (StringCmp(search->prog_name, "blastn") == 0)
3024 {
3025 start = hsp_array[index]->subject.offset;
3026 stop = hsp_array[index]->subject.end - 1;
3027 length = hsp_array[index]->subject.length;
3028 }
3029 else
3030 { /* Convert for translated alphabet. */
3031 if (hsp_array[index]->subject.frame > 0)
3032 {
3033 start = hsp_array[index]->subject.frame - 1 + CODON_LENGTH*(hsp_array[index]->subject.offset);
3034 stop = start + CODON_LENGTH*(hsp_array[index]->subject.length) - 1;
3035 length = CODON_LENGTH*(hsp_array[index]->subject.length);
3036 }
3037 else
3038 {
3039 start = bsp->length - CODON_LENGTH*(hsp_array[index]->subject.offset + hsp_array[index]->subject.length) + hsp_array[index]->subject.frame + 1;
3040 stop = bsp->length - CODON_LENGTH*(hsp_array[index]->subject.offset) + hsp_array[index]->subject.frame;
3041 length = CODON_LENGTH*(hsp_array[index]->subject.length);
3042 }
3043 }
3044
3045 if (hsp_array[index]->subject.frame > 0)
3046 {
3047 spp = SeqPortNew(bsp, start, stop, Seq_strand_plus, Seq_code_ncbi4na);
3048 SeqPortSet_do_virtual(spp, TRUE);
3049
3050 }
3051 else
3052 { /* Offsets correct here?? */
3053 spp = SeqPortNew(bsp, start, stop, Seq_strand_minus, Seq_code_ncbi4na);
3054 SeqPortSet_do_virtual(spp, TRUE);
3055 }
3056
3057 if (StringCmp(search->prog_name, "blastn") == 0)
3058 {
3059 nt_seq = nt_seq_start;
3060 while (length > 0)
3061 {
3062 *nt_seq = ncbi4na_to_blastna[SeqPortGetResidue(spp)];
3063 nt_seq++;
3064 length--;
3065 }
3066 subject_start = nt_seq_start;
3067 }
3068 else
3069 {
3070 nt_seq = nt_seq_start;
3071 while (length > 0)
3072 {
3073 *nt_seq = SeqPortGetResidue(spp);
3074 nt_seq++;
3075 length--;
3076 }
3077 /* Set frame to one so we start at beginning of nt seq. */
3078 subject_real_start = GetTranslation(nt_seq_start, CODON_LENGTH*(hsp_array[index]->subject.length), 1, &length, search->db_genetic_code);
3079 /* The first Residue is a NULLB */
3080 subject_start = subject_real_start+1;
3081 }
3082 spp = SeqPortFree(spp);
3083
3084 query_start = (Uint1Ptr) search->context[context].query->sequence;
3085 query_end = query_start + search->context[context].query->length;
3086
3087 score = 0;
3088 sum = 0;
3089 num_ident = 0;
3090 subject = subject_start;
3091 old_query_s = query_start + hsp_array[index]->query.offset;
3092 old_query_f = query_start + hsp_array[index]->query.end;
3093 /* Assume, for now, that the real HSP starts where it does now. */
3094 new_query_s = old_query_s;
3095 for (query=old_query_s; query<old_query_f; query++, subject++)
3096 {
3097 if (*query == *subject)
3098 ++num_ident;
3099
3100 if ((sum += matrix[*query][*subject]) < 0)
3101 {
3102 if (score > 0)
3103 {
3104 if (score >= search->pbp->cutoff_s2)
3105 {
3106 break;
3107 }
3108 }
3109 score = sum = 0;
3110 num_ident = 0;
3111 new_query_s = new_query_f = query;
3112 }
3113 else if (sum > score)
3114 { /* Start of scoring regime. */
3115 if (score == 0)
3116 new_query_s = query;
3117 score = sum;
3118 new_query_f = query+1;
3119 }
3120 }
3121
3122 if (score >= search->pbp->cutoff_s2)
3123 { /* Adjust the information here. */
3124 hsp_array[index]->score = score;
3125 hsp_array[index]->query.offset = new_query_s - query_start;
3126 hsp_array[index]->query.end = new_query_f - query_start;
3127 hsp_array[index]->query.length = hsp_array[index]->query.end - hsp_array[index]->query.offset;
3128 hsp_array[index]->subject.offset = hsp_array[index]->subject.offset + new_query_s - old_query_s;
3129 hsp_array[index]->subject.end = hsp_array[index]->subject.end + new_query_f - old_query_f;
3130 hsp_array[index]->subject.length = hsp_array[index]->subject.end - hsp_array[index]->subject.offset;
3131 hsp_array[index]->num_ident = num_ident;
3132 hsp_array[index]->linked_set = FALSE;
3133 hsp_array[index]->start_of_chain = FALSE;
3134 Nlm_MemSet((VoidPtr) &(hsp_array[index]->hsp_link), 0, sizeof(BLAST_HSP_LINK));
3135 /* Need to NULL out more in HSP? */
3136 }
3137 else
3138 { /* Delete if this is now below the cutoff score. */
3139 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
3140 }
3141
3142 if (StringCmp(search->prog_name, "blastn") != 0)
3143 {
3144 subject_real_start = MemFree(subject_real_start);
3145 }
3146 }
3147
3148 bsp = BioseqFree(bsp);
3149 if (search->thr_info->ambiguities_mutex)
3150 NlmMutexUnlock(search->thr_info->ambiguities_mutex);
3151 nt_seq_start = MemFree(nt_seq_start);
3152
3153 /* Save HSP's again, discarding those that have been NULLed out. */
3154 index1 = HspArrayPurge(hsp_array, hspcnt_max, TRUE);
3155 current_hitlist->hspcnt = index1;
3156 current_hitlist->hspcnt_max = index1;
3157
3158 s_RoundDownOddScores(search->sbp, search->current_hitlist);
3159 /* Relink the HSP's, ReReap the Hits. */
3160 if (!search->pbp->mb_params && search->pbp->do_sum_stats == TRUE) {
3161 status = BlastLinkHsps(search);
3162 } else {
3163 status = BlastGetNonSumStatsEvalue(search);
3164 }
3165 status = BlastReapHitlistByEvalue(search);
3166
3167 return status;
3168 }
3169
3170 /* Auxiliary function to retrieve the virtual oidlist attached to the
3171 * rdfp_chain. Returns a pointer to the OIDList, called should *NOT* modify
3172 * this copy. Assumes that this function is called after BlastProcessGiLists
3173 * has been called (while setting up the search) */
BlastGetVirtualOIDList(ReadDBFILEPtr rdfp_chain)3174 OIDListPtr LIBCALL BlastGetVirtualOIDList(ReadDBFILEPtr rdfp_chain)
3175 {
3176 OIDListPtr virtual_oidlist = NULL;
3177
3178 while (rdfp_chain) {
3179 if (virtual_oidlist = rdfp_chain->oidlist) {
3180 break;
3181 }
3182 rdfp_chain = rdfp_chain->next;
3183 }
3184 return virtual_oidlist;
3185 }
3186
3187 /*
3188 Function to assign chunks of the database to a thread.
3189 The "start" and "stop" points are returned by the arguments.
3190 Note that this is a half-closed interval (stop is not searched).
3191
3192 The Int4 "db_chunk_last" (a global variable) keeps track of the last
3193 database number assigned and is only changed if the db_mutex has been acquired.
3194
3195 The Boolean done specifies that the search has already been
3196 completed.
3197 */
3198
BlastGetDbChunk(ReadDBFILEPtr rdfp,Int4Ptr start,Int4Ptr stop,Int4Ptr id_list,Int4Ptr id_list_number,BlastThrInfoPtr thr_info)3199 Boolean BlastGetDbChunk(ReadDBFILEPtr rdfp, Int4Ptr start, Int4Ptr stop,
3200 Int4Ptr id_list, Int4Ptr id_list_number,
3201 BlastThrInfoPtr thr_info)
3202
3203 {
3204 Boolean done=FALSE;
3205 OIDListPtr virtual_oidlist = NULL;
3206 *id_list_number = 0;
3207
3208 NlmMutexLockEx(&thr_info->db_mutex);
3209 if (thr_info->realdb_done) {
3210 if (virtual_oidlist = BlastGetVirtualOIDList(rdfp)) {
3211 /* Virtual database. Create id_list using mask file */
3212 Int4 gi_end = 0;
3213
3214 thr_info->final_db_seq = MIN(thr_info->final_db_seq, virtual_oidlist->total);
3215
3216 gi_end = thr_info->final_db_seq;
3217
3218 if (thr_info->gi_current < gi_end) {
3219 Int4 oidindex = 0;
3220 Int4 gi_start = thr_info->gi_current;
3221 Int4 bit_start = gi_start % MASK_WORD_SIZE;
3222 Int4 gi;
3223
3224 for(gi = gi_start; (gi < gi_end) && (oidindex < thr_info->db_chunk_size);) {
3225 Int4 bit_end = ((gi_end - gi + bit_start) < MASK_WORD_SIZE) ? (gi_end - gi + bit_start) : MASK_WORD_SIZE;
3226 Int4 bit;
3227
3228 Uint4 mask_index = gi / MASK_WORD_SIZE;
3229 Uint4 mask_word = Nlm_SwapUint4(virtual_oidlist->list[mask_index]);
3230
3231 if ( mask_word ) {
3232 for(bit = bit_start; bit<bit_end; bit++) {
3233 Uint4 bitshift = (MASK_WORD_SIZE-1)-bit;
3234
3235 if ((mask_word >> bitshift) & 1) {
3236 id_list[ oidindex++ ] = (gi - bit_start) + bit;
3237 }
3238 }
3239 }
3240
3241 gi += bit_end - bit_start;
3242 bit_start = 0;
3243 }
3244
3245 thr_info->gi_current = gi;
3246 *id_list_number = oidindex;
3247 BlastTickProc(thr_info->gi_current/32, thr_info);
3248 } else {
3249 done = TRUE;
3250 }
3251
3252 } else {
3253 done = TRUE;
3254 }
3255 } else {
3256 int real_readdb_entries;
3257 int total_readdb_entries;
3258 int final_real_seq;
3259
3260 real_readdb_entries = readdb_get_num_entries_total_real(rdfp);
3261 total_readdb_entries = readdb_get_num_entries_total(rdfp);
3262 final_real_seq = MIN( real_readdb_entries, thr_info->final_db_seq );
3263
3264 /* we have real database with start/stop specified */
3265 if (thr_info->db_mutex) {
3266 /* Emit a tick if needed. */
3267 BlastTickProc(thr_info->db_chunk_last, thr_info);
3268 *start = thr_info->db_chunk_last;
3269 if (thr_info->db_chunk_last < final_real_seq) {
3270 *stop = MIN((thr_info->db_chunk_last +
3271 thr_info->db_chunk_size), final_real_seq);
3272 } else {/* Already finished. */
3273 *stop = thr_info->db_chunk_last;
3274
3275 /* Change parameters for oidlist processing. */
3276 thr_info->realdb_done = TRUE;
3277 }
3278 thr_info->db_chunk_last = *stop;
3279 } else {
3280 if (*stop != final_real_seq) {
3281 done = FALSE;
3282 *start = thr_info->last_db_seq;
3283 *stop = final_real_seq;
3284 } else {
3285 thr_info->realdb_done = TRUE;
3286
3287 if (total_readdb_entries == real_readdb_entries) {
3288 done = TRUE;
3289 } else {
3290 thr_info->gi_current = final_real_seq;
3291 }
3292 }
3293 }
3294 }
3295
3296 NlmMutexUnlock(thr_info->db_mutex);
3297 return done;
3298 }
3299
3300 static VoidPtr
do_gapped_blast_search(VoidPtr ptr)3301 do_gapped_blast_search(VoidPtr ptr)
3302
3303 {
3304 BlastSearchBlkPtr search;
3305 Int2 status=0;
3306 Int4 index, index1, start=0, stop=0, id_list_length;
3307 Int4Ptr id_list=NULL;
3308 Uint4 i; /* AM: Support for query concatenation. */
3309
3310 search = (BlastSearchBlkPtr) ptr;
3311 if (search->thr_info->blast_gi_list || BlastGetVirtualOIDList(search->rdfp))
3312 { /* FIXME: magic constant? */
3313 id_list = MemNew((search->thr_info->db_chunk_size+33)*sizeof(Int4));
3314 }
3315
3316 while (BlastGetDbChunk(search->rdfp, &start, &stop, id_list,
3317 &id_list_length, search->thr_info) != TRUE)
3318 {
3319 if (id_list && id_list_length)
3320 {
3321 for (index=0; index<id_list_length; index++)
3322 {
3323 index1 = id_list[index];
3324 if ((status =
3325 BLASTPerformSearchWithReadDb(search, index1)) != 0)
3326 break;
3327
3328 if (search->pbp->do_sum_stats) {
3329 status = BlastLinkHsps(search);
3330 }
3331 status = BlastReapHitlistByEvalue(search);
3332 if (search->handle_results)
3333 search->handle_results((VoidPtr) search);
3334 else
3335 BlastSaveCurrentHitlist(search);
3336 /* Emit a tick if needed and we're not MT. */
3337 if (search->thr_info->db_mutex == NULL)
3338 BlastTickProc(index1, search->thr_info);
3339 if (time_out_boolean == TRUE)
3340 break;
3341 }
3342 } else if (!search->thr_info->realdb_done) {
3343 for (index=start; index<stop; index++)
3344 {
3345 if ((status = BLASTPerformSearchWithReadDb(search, index)) != 0)
3346 break;
3347
3348 /* AM: Support for query concatenation. */
3349 if( !search->mult_queries )
3350 {
3351 if (search->pbp->do_sum_stats) {
3352 status = BlastLinkHsps(search);
3353 }
3354 status = BlastReapHitlistByEvalue(search);
3355 if (search->handle_results)
3356 search->handle_results((VoidPtr) search);
3357 else
3358 BlastSaveCurrentHitlist(search);
3359 }
3360 else /* AM: Support for query concatenation. */
3361 {
3362 InitHitLists( search );
3363 search->mult_queries->use_mq = TRUE;
3364 search->mult_queries->delete_current_hitlist = FALSE;
3365
3366 for (i = 0; i < search->mult_queries->NumQueries; ++i) {
3367 search->mult_queries->current_query = i;
3368
3369 if (search->pbp->do_sum_stats) {
3370 status = BlastLinkHsps(search);
3371 }
3372 status = BlastReapHitlistByEvalue(search);
3373
3374 if (search->handle_results)
3375 search->handle_results( (VoidPtr)search );
3376 else
3377 BlastSaveCurrentHitlist(search);
3378 }
3379
3380 if( search->mult_queries->delete_current_hitlist )
3381 {
3382 search->current_hitlist
3383 = BlastHitListDestruct( search->current_hitlist );
3384 }
3385
3386 search->mult_queries->use_mq = FALSE;
3387 BlastHitListPurge( search->current_hitlist );
3388 }
3389
3390 /* Emit a tick if needed and we're not MT. */
3391 if (search->thr_info->db_mutex == NULL)
3392 BlastTickProc(index, search->thr_info);
3393 if (time_out_boolean == TRUE)
3394 break;
3395 }
3396 }
3397 /* Get out if "stop" was the last seq. */
3398 if (time_out_boolean || status)
3399 break;
3400 }
3401
3402 if (id_list)
3403 id_list = MemFree(id_list);
3404
3405 return (VoidPtr) search;
3406 }
3407
/*
 * Database-scan worker used by do_the_blast_run() when the search is not a
 * gapped non-blastn search.  It is either called directly or handed to
 * NlmThreadCreateEx() as a thread entry point, hence the VoidPtr signature.
 *
 * ptr:     a BlastSearchBlkPtr passed as an opaque pointer.
 * returns: the same search block, cast back to VoidPtr.
 *
 * Work is pulled in chunks from BlastGetDbChunk().  Each chunk is either a
 * list of ordinal ids (id_list / id_list_length) or a half-open range
 * [start, stop); which one is used depends on thr_info->realdb_done.
 */
static VoidPtr
do_blast_search(VoidPtr ptr)

{
    BlastSearchBlkPtr search;
    Int2 status = 0;
    Int4 index, index1, start=0, stop=0, id_list_length;
    Int4Ptr id_list=NULL;
    Uint4 i; /* AM: Query multiplexing. */

    search = (BlastSearchBlkPtr) ptr;
    /* An id buffer is only needed when a gi list or a virtual OID list
       restricts the search. */
    if (search->thr_info->blast_gi_list || BlastGetVirtualOIDList(search->rdfp))
    { /* FIXME: magic constant? */
        id_list = MemNew((search->thr_info->db_chunk_size+33)
                         *sizeof(Int4));
    }

    /* Keep fetching chunks until BlastGetDbChunk() returns TRUE
       (presumably "nothing left to search" -- confirm against its
       definition). */
    while (BlastGetDbChunk(search->rdfp, &start, &stop, id_list,
                           &id_list_length, search->thr_info) != TRUE) {
        if (search->thr_info->realdb_done && id_list) {
            /* Chunk is an explicit list of ordinal ids. */
            for (index=0; index<id_list_length; index++) {
                index1 = id_list[index];
                if ((status = BLASTPerformSearchWithReadDb(search, index1))
                    != 0)
                    break;
                s_RoundDownOddScores(search->sbp, search->current_hitlist);
                if (!search->pbp->mb_params) {
                    /* NOTE(review): each call below overwrites status, so
                       only the last call made decides whether the outer
                       loop stops. */
                    if (search->pbp->do_sum_stats == TRUE)
                        status = BlastLinkHsps(search);
                    else
                        status = BlastGetNonSumStatsEvalue(search);
                    status = BlastReapHitlistByEvalue(search);
                    if (!search->handle_results)
                        status = BlastReevaluateWithAmbiguities(search, index1);
                } else {
                    MegaBlastReevaluateWithAmbiguities(search);
                }

                /* A caller-supplied callback takes precedence over the
                   built-in hitlist savers. */
                if (search->handle_results)
                    search->handle_results((VoidPtr) search);
                else if (!search->pbp->mb_params)
                    BlastSaveCurrentHitlist(search);
                else
                    MegaBlastSaveCurrentHitlist(search);
                if (search->pbp->mb_params)
                    /* Free the ncbi4na-encoded sequence */
                    search->subject->sequence_start = (Uint1Ptr)
                        MemFree(search->subject->sequence_start);
                /* Emit a tick if needed and we're not MT. */
                if (search->thr_info->db_mutex == NULL)
                    BlastTickProc(index1, search->thr_info);
                if (time_out_boolean == TRUE)
                    break;
            }
        } else if (!search->thr_info->realdb_done) {
            /* Chunk is the contiguous ordinal-id range [start, stop). */
            for (index=start; index<stop; index++) {
                if ((status = BLASTPerformSearchWithReadDb(search, index))
                    != 0)
                    break;
                s_RoundDownOddScores(search->sbp, search->current_hitlist);
                if (!search->pbp->mb_params) {
                    /* Same status-overwrite pattern as in the id-list
                       branch above. */
                    if (search->pbp->do_sum_stats == TRUE)
                        status = BlastLinkHsps(search);
                    else
                        status = BlastGetNonSumStatsEvalue(search);
                    status = BlastReapHitlistByEvalue(search);
                    if (!search->handle_results)
                        status = BlastReevaluateWithAmbiguities(search, index);
                } else {
                    MegaBlastReevaluateWithAmbiguities(search);
                }
                if (search->handle_results)
                    search->handle_results((VoidPtr) search);
                else if (!search->pbp->mb_params)
                { /* AM: Query multiplexing. */
                    /* NOTE(review): only this range branch has a
                       mult_queries path; the id-list branch above always
                       calls BlastSaveCurrentHitlist() once. */
                    if( !search->mult_queries )
                        BlastSaveCurrentHitlist(search);
                    else
                    {
                        InitHitLists( search );
                        search->mult_queries->use_mq = TRUE;
                        search->mult_queries->delete_current_hitlist = FALSE;

                        /* Save the current hitlist once per concatenated
                           query, selecting the query via current_query. */
                        for( i = 0; i < search->mult_queries->NumQueries; ++i )
                        {
                            search->mult_queries->current_query = i;
                            BlastSaveCurrentHitlist(search);
                        }

                        /* The flag is cleared above, so it can only be set
                           again by the save calls in the loop. */
                        if( search->mult_queries->delete_current_hitlist )
                        {
                            search->current_hitlist
                                = BlastHitListDestruct( search->current_hitlist );
                        }

                        search->mult_queries->use_mq = FALSE;
                        BlastHitListPurge( search->current_hitlist );
                    }
                }
                else
                    MegaBlastSaveCurrentHitlist(search);

                if (search->pbp->mb_params)
                    /* Free the ncbi4na-encoded sequence */
                    search->subject->sequence_start = (Uint1Ptr)
                        MemFree(search->subject->sequence_start);
                /* Emit a tick if needed and we're not MT. */
                if (search->thr_info->db_mutex == NULL)
                    BlastTickProc(index, search->thr_info);
                if (time_out_boolean == TRUE)
                    break;
            }
        }

        /* Get out if "stop" was the last seq. */
        /* Also leaves the chunk loop on a time-out or a non-zero status. */
        if (time_out_boolean || status)
            break;
    }

    if (id_list)
        id_list = MemFree(id_list);

    return (VoidPtr) search;
}
3532
/*
 * Run the database scan for an already configured search block.
 *
 * Sets up the sequence range and chunking state in search->thr_info, then
 * either starts pbp->process_num worker threads (when threads are available
 * and more than one is requested) or runs the worker inline.  The worker is
 * do_gapped_blast_search() for gapped searches of any program other than
 * "blastn", and do_blast_search() otherwise.
 *
 * On return, search->timed_out is set and an error message is appended to
 * search->error_return if time_out_boolean was raised.  A NULL search is
 * ignored.
 */
void LIBCALL
do_the_blast_run(BlastSearchBlkPtr search)

{
    BlastSearchBlkPtr PNTR array;
    Char buffer[256];
    Int2 index;
    TNlmThread PNTR thread_array;
    VoidPtr status=NULL;
    int num_entries_total;
    int num_entries_total_real;
    int start_seq;
    int end_seq;
    Int4 i; /* AM: query concatenation */

    if (search == NULL)
        return;

    num_entries_total = readdb_get_num_entries_total (search->rdfp);
    num_entries_total_real = readdb_get_num_entries_total_real(search->rdfp);

    /* Set 'done with read db' according to whether real databases are present */

    if (num_entries_total_real) {
        search->thr_info->realdb_done = FALSE;
    } else {
        search->thr_info->realdb_done = TRUE;
    }

    /* Make sure first, last sequence indices are in-range (0, NUM-1) */

    /* NOTE: search->pbp->final_db_seq is 1 beyond the last sequence ordinal
       id, except when it's <=0, which means search to the last sequence in
       the database. */
    /* search->thr_info versions are not. */

    if (search->pbp->final_db_seq > 0) {
        end_seq = MIN(search->pbp->final_db_seq, num_entries_total);
    } else {
        end_seq = num_entries_total;
    }

    /* Clamp the first sequence into [0, end_seq]. */
    start_seq = MAX(0, MIN(search->pbp->first_db_seq, end_seq));

    /* Set BlastGetDbChunk()'s pointers and counters */

    search->thr_info->last_db_seq =
        search->thr_info->gi_current =
        search->thr_info->db_chunk_last = start_seq;

    search->thr_info->final_db_seq = end_seq;

    ConfigureDbChunkSize(search, search->dbseq_num);

    if (NlmThreadsAvailable() && search->pbp->process_num > 1) {
        /* The worker routines treat a non-NULL db_mutex as "running
           multi-threaded" (they skip the tick callback in that case). */
        NlmMutexInit(&search->thr_info->db_mutex);
        NlmMutexInit(&search->thr_info->results_mutex);
        NlmMutexInit(&search->thr_info->ambiguities_mutex);

        /* Slot 0 reuses the caller's block; the other workers get
           duplicates. */
        array = (BlastSearchBlkPtr PNTR) MemNew((search->pbp->process_num)*sizeof(BlastSearchBlkPtr));
        array[0] = search;
        for (index=1; index<search->pbp->process_num; index++) {
            array[index] = BlastSearchBlkDuplicate(search);
            if (array[index] == NULL) {
                /* Could not duplicate: run with the workers created so far. */
                search->pbp->process_num = index;
                ErrPostEx(SEV_WARNING, 0, 0, "Number of threads reduced to %d", index);
                break;
            }
        }

        thread_array = (TNlmThread PNTR) MemNew((search->pbp->process_num)*sizeof(TNlmThread));
        for (index=0; index<search->pbp->process_num; index++) {
            if (search->pbp->gapped_calculation && StringCmp(search->prog_name, "blastn") != 0)
                thread_array[index] = NlmThreadCreateEx(do_gapped_blast_search, (VoidPtr) array[index], THREAD_RUN|THREAD_BOUND, eTP_Default, NULL, NULL);
            else
                thread_array[index] = NlmThreadCreateEx(do_blast_search, (VoidPtr) array[index], THREAD_RUN|THREAD_BOUND, eTP_Default, NULL, NULL);

            /* NOTE(review): a failed creation is only logged; the entry is
               still passed to NlmThreadJoin() below. */
            if (NlmThreadCompare(thread_array[index], NULL_thread)) {
                ErrPostEx(SEV_ERROR, 0, 0, "Unable to open thread.");
            }
        }

        /* Join every worker before touching the per-worker blocks. */
        for (index=0; index<search->pbp->process_num; index++) {
            NlmThreadJoin(thread_array[index], &status);
        }

        /* Fold the duplicates (slots 1..n-1) back into the caller's block
           and release them. */
        for (index=1; index<search->pbp->process_num; index++) {
#ifdef BLAST_COLLECT_STATS
            search->first_pass_hits += array[index]->first_pass_hits;
            search->second_pass_hits += array[index]->second_pass_hits;
            search->second_pass_trys += array[index]->second_pass_trys;
            search->first_pass_extends += array[index]->first_pass_extends;
            search->second_pass_extends += array[index]->second_pass_extends;
            search->first_pass_good_extends += array[index]->first_pass_good_extends;
            search->second_pass_good_extends += array[index]->second_pass_good_extends;
            search->number_of_seqs_better_E += array[index]->number_of_seqs_better_E;
            search->prelim_gap_no_contest += array[index]->prelim_gap_no_contest;
            search->prelim_gap_passed += array[index]->prelim_gap_passed;
            search->prelim_gap_attempts += array[index]->prelim_gap_attempts;
            search->real_gap_number_of_hsps += array[index]->real_gap_number_of_hsps;
#endif

            if( array[index]->mult_queries ) { /* AM: query concatenation: free resources */
                if( array[index]->mult_queries->HitListArray )
                    for( i = 0; i < array[index]->mult_queries->NumQueries; ++i )
                        BlastHitListDestruct( array[index]->mult_queries->HitListArray[i] );

                MemFree( array[index]->mult_queries->HitListArray );
                MemFree( array[index]->mult_queries );
            }
            /* Not copied at thread start. */
            array[index] = BlastSearchBlkDestruct(array[index]);
        }
        array = MemFree(array);

        thread_array = MemFree(thread_array);

        /* Reset the mutex handles to NULL once destroyed. */
        NlmMutexDestroy(search->thr_info->db_mutex);
        search->thr_info->db_mutex = NULL;
        NlmMutexDestroy(search->thr_info->results_mutex);
        search->thr_info->results_mutex = NULL;
        NlmMutexDestroy(search->thr_info->ambiguities_mutex);
        search->thr_info->ambiguities_mutex = NULL;
    } else {
        /* Single-threaded: run the worker inline on the caller's block. */
        if (search->pbp->gapped_calculation && StringCmp(search->prog_name, "blastn") != 0)
            do_gapped_blast_search((VoidPtr) search);
        else
            do_blast_search((VoidPtr) search);
    }
    if (search->rdfp->parameters & READDB_CONTENTS_ALLOCATED)
        search->rdfp = ReadDBCloseMHdrAndSeqFiles(search->rdfp);
    if (time_out_boolean) {
        sprintf(buffer, "CPU limit exceeded");
        BlastConstructErrorMessage("Blast", buffer, 2, &(search->error_return));
        search->timed_out = TRUE;
    }

    return;
}
3672
3673 Uint1
FrameToDefine(Int2 frame)3674 FrameToDefine(Int2 frame)
3675
3676 {
3677 Uint1 retval;
3678
3679 switch (frame) {
3680 case -1:
3681 retval = SEQLOC_MASKING_MINUS1;
3682 break;
3683 case -2:
3684 retval = SEQLOC_MASKING_MINUS2;
3685 break;
3686 case -3:
3687 retval = SEQLOC_MASKING_MINUS3;
3688 break;
3689 case 1:
3690 retval = SEQLOC_MASKING_PLUS1;
3691 break;
3692 case 2:
3693 retval = SEQLOC_MASKING_PLUS2;
3694 break;
3695 case 3:
3696 retval = SEQLOC_MASKING_PLUS3;
3697 break;
3698 default:
3699 retval = SEQLOC_MASKING_NOTSET;
3700 break;
3701 }
3702
3703 return retval;
3704 }
3705 Int2
DefineToFrame(Uint1 define)3706 DefineToFrame(Uint1 define)
3707
3708 {
3709 Int2 frame;
3710
3711 switch (define) {
3712 case SEQLOC_MASKING_MINUS1:
3713 frame = -1;
3714 break;
3715 case SEQLOC_MASKING_MINUS2:
3716 frame = -2;
3717 break;
3718 case SEQLOC_MASKING_MINUS3:
3719 frame = -3;
3720 break;
3721 case SEQLOC_MASKING_PLUS1:
3722 frame = 1;
3723 break;
3724 case SEQLOC_MASKING_PLUS2:
3725 frame = 2;
3726 break;
3727 case SEQLOC_MASKING_PLUS3:
3728 frame = 3;
3729 break;
3730 case SEQLOC_MASKING_NOTSET:
3731 default:
3732 frame = 0;
3733 break;
3734 }
3735
3736 return frame;
3737 }
3738
3739 CharPtr
BlastConstructFilterString(Int4 filter_value)3740 BlastConstructFilterString(Int4 filter_value)
3741
3742 {
3743 Char buffer[32];
3744 CharPtr ptr;
3745
3746 ptr = buffer;
3747
3748 if (filter_value == FILTER_NONE)
3749 return NULL;
3750
3751 if (filter_value & FILTER_DUST)
3752 {
3753 *ptr = 'D'; ptr++;
3754 *ptr = ';'; ptr++;
3755 }
3756
3757 if (filter_value & FILTER_SEG)
3758 {
3759 *ptr = 'S'; ptr++;
3760 *ptr = ';'; ptr++;
3761 }
3762
3763 *ptr = NULLB;
3764
3765 return StringSave(buffer);
3766 }
3767
3768 void
HackSeqLocId(SeqLocPtr slp,SeqIdPtr id)3769 HackSeqLocId(SeqLocPtr slp, SeqIdPtr id)
3770 {
3771 if (slp == NULL) {
3772 return;
3773 }
3774 switch (slp->choice) {
3775 case SEQLOC_BOND:
3776 case SEQLOC_FEAT:
3777 /* unsupported */
3778 /* assert(0); */
3779 break;
3780 case SEQLOC_NULL:
3781 case SEQLOC_EMPTY:
3782 break;
3783 case SEQLOC_WHOLE:
3784 SeqIdSetFree((SeqIdPtr)slp->data.ptrvalue);
3785 slp->data.ptrvalue = SeqIdDup(id);
3786 break;
3787 case SEQLOC_EQUIV:
3788 case SEQLOC_MIX:
3789 case SEQLOC_PACKED_INT:
3790 slp = (SeqLocPtr)slp->data.ptrvalue;
3791 for (; slp != NULL; slp = slp->next) {
3792 HackSeqLocId(slp, id);
3793 }
3794 break;
3795 case SEQLOC_INT:
3796 SeqIdSetFree(((SeqIntPtr)slp->data.ptrvalue)->id);
3797 ((SeqIntPtr)slp->data.ptrvalue)->id = SeqIdDup(id);
3798 break;
3799 case SEQLOC_PNT:
3800 SeqIdSetFree(((SeqPntPtr)slp->data.ptrvalue)->id);
3801 ((SeqPntPtr)slp->data.ptrvalue)->id = SeqIdDup(id);
3802 break;
3803 case SEQLOC_PACKED_PNT:
3804 SeqIdSetFree(((PackSeqPntPtr)slp->data.ptrvalue)->id);
3805 ((PackSeqPntPtr)slp->data.ptrvalue)->id = SeqIdDup(id);
3806 break;
3807 /* default:
3808 assert(0); */
3809 }
3810 }
3811 /* This function duplicates a SEQLOC_PACKED_INT or a SEQLOC_INT type of SeqLoc */
blastDuplicateSeqLocInt(SeqLocPtr slp_head)3812 static SeqLocPtr blastDuplicateSeqLocInt(SeqLocPtr slp_head)
3813 {
3814 SeqLocPtr dup_slp, slp, dup_head = NULL;
3815 SeqIntPtr sqip;
3816
3817 if(slp_head == NULL)
3818 return NULL;
3819
3820 /* First seqLoc in lower level */
3821
3822 if (slp_head->choice == SEQLOC_PACKED_INT) {
3823 slp = slp_head->data.ptrvalue;
3824 dup_head = ValNodeNew(NULL);
3825 dup_head->choice = slp_head->choice;
3826 } else if (slp_head->choice == SEQLOC_INT) {
3827 slp = slp_head;
3828 } else {
3829 return NULL;
3830 }
3831 sqip = slp->data.ptrvalue;
3832
3833 /* Top level SeqLoc */
3834
3835 dup_slp = (VoidPtr) SeqLocIntNew(sqip->from, sqip->to, sqip->strand, sqip->id);
3836 if (dup_head)
3837 dup_head->data.ptrvalue = dup_slp;
3838 else
3839 dup_head = dup_slp;
3840
3841 /* Loop over all SeqIntPtr s in this SeqLoc */
3842 for(slp = slp->next; slp != NULL; slp = slp->next) {
3843 sqip = slp->data.ptrvalue;
3844 dup_slp->next = (VoidPtr) SeqLocIntNew(sqip->from, sqip->to, sqip->strand, sqip->id);
3845 dup_slp = dup_slp->next;
3846 }
3847
3848 return dup_head;
3849 }
3850 /* This function use PACKED INT as mask */
BLASTUpdateSeqIdInSeqInt(SeqLocPtr mask,SeqIdPtr sip)3851 void BLASTUpdateSeqIdInSeqInt(SeqLocPtr mask, SeqIdPtr sip)
3852 {
3853 SeqLocPtr slp;
3854 SeqIntPtr sintp;
3855
3856 if(mask == NULL)
3857 return;
3858
3859 for(slp = mask->data.ptrvalue; slp != NULL; slp = slp->next) {
3860 if(slp->choice != SEQLOC_INT)
3861 continue;
3862 sintp = (SeqIntPtr)slp->data.ptrvalue;
3863 SeqIdSetFree(sintp->id);
3864 sintp->id = SeqIdDup(sip);
3865 }
3866 return;
3867 }
3868
3869 /* Adjust offsets in the mask locations list; discard locations outside of
3870 the range */
3871 static SeqLocPtr
AdjustOffsetsInMaskLoc(SeqLocPtr mask_loc,Int4 start,Int4 end)3872 AdjustOffsetsInMaskLoc(SeqLocPtr mask_loc, Int4 start, Int4 end)
3873 {
3874 SeqLocPtr slp, last_slp = NULL, next_slp, head = NULL;
3875 SeqIntPtr loc;
3876
3877 if (!mask_loc)
3878 return NULL;
3879
3880 if (mask_loc->choice == SEQLOC_PACKED_INT)
3881 slp = (SeqLocPtr) mask_loc->data.ptrvalue;
3882 else if (mask_loc->choice == SEQLOC_INT)
3883 slp = mask_loc;
3884 else /* Should be impossible */
3885 return NULL;
3886
3887 while (slp) {
3888 if (slp->choice == SEQLOC_INT) {
3889 loc = (SeqIntPtr) slp->data.ptrvalue;
3890 loc->from = MAX(loc->from, start);
3891 loc->to = MIN(loc->to, end);
3892 if (loc->from >= loc->to) {
3893 /* This mask location does not intersect the interval.
3894 Remove it. */
3895 next_slp = slp->next;
3896 SeqLocFree(slp);
3897 slp = next_slp;
3898 } else {
3899 if (last_slp) {
3900 last_slp->next = slp;
3901 } else {
3902 head = slp;
3903 }
3904 last_slp = slp;
3905 slp = slp->next;
3906 }
3907 } else {
3908 next_slp = slp->next;
3909 SeqLocFree(slp);
3910 slp = next_slp;
3911 }
3912 }
3913 if (last_slp)
3914 last_slp->next = NULL;
3915
3916 if (mask_loc->choice == SEQLOC_PACKED_INT) {
3917 mask_loc->data.ptrvalue = head;
3918 /* If there are no locations left, free the packed-int and
3919 return NULL. */
3920 if (!head)
3921 mask_loc = ValNodeFree(mask_loc);
3922 return mask_loc;
3923 } else {
3924 return head;
3925 }
3926 }
3927
3928 /* This function use PACKED INT as slp2 */
blastMergeFilterLocs(SeqLocPtr slp1,SeqLocPtr slp2,Boolean translate,Int2 frame,Int4 length)3929 SeqLocPtr blastMergeFilterLocs(SeqLocPtr slp1, SeqLocPtr slp2, Boolean translate,
3930 Int2 frame, Int4 length)
3931 {
3932
3933 SeqLocPtr slp, dup_slp, dup_head;
3934
3935 if(slp1 == NULL && slp2 == NULL)
3936 return NULL;
3937
3938 if(slp2 == NULL)
3939 return slp1;
3940
3941 if (slp2->choice == SEQLOC_PACKED_INT || slp2->choice == SEQLOC_INT) {
3942 dup_slp = blastDuplicateSeqLocInt(slp2);
3943 }
3944 else if (slp2->choice == SEQLOC_MIX) {
3945 /* for mixed seqlocs, recursively flatten all the internal
3946 seqloc components into a single seqloc_int */
3947 SeqLocPtr list_slp = slp2;
3948 dup_slp = NULL;
3949 while (list_slp != NULL) {
3950 if (list_slp->choice == SEQLOC_MIX) {
3951 dup_slp = blastMergeFilterLocs(dup_slp, list_slp->data.ptrvalue,
3952 FALSE, frame, length);
3953 }
3954 else {
3955 dup_slp = blastMergeFilterLocs(dup_slp, list_slp,
3956 FALSE, frame, length);
3957 }
3958 list_slp = list_slp->next;
3959 }
3960 }
3961 else {
3962 ErrPostEx(SEV_FATAL, 1, 0, "Duplication of SeqLoc failed\n");
3963 }
3964
3965 /* Request to translate means, that slp2 is DNA SeqLoc, that should be
3966 translated into protein SeqLoc corresponding to the specific frame */
3967
3968 if(translate) {
3969 BlastConvertDNASeqLoc(dup_slp, frame, length);
3970 }
3971
3972 if(slp1 == NULL) {
3973 return dup_slp;
3974 }
3975
3976 /* OK We have 2 not NULL filters - merging... */
3977
3978 if(slp1->choice == SEQLOC_PACKED_INT)
3979 slp = (SeqLocPtr) slp1->data.ptrvalue;
3980 else
3981 slp = slp1;
3982
3983 if (dup_slp->choice == SEQLOC_PACKED_INT) {
3984 dup_head = dup_slp;
3985 dup_slp = (SeqLocPtr) dup_slp->data.ptrvalue;
3986 MemFree(dup_head);
3987 }
3988
3989 if(slp == NULL) {
3990 ErrPostEx(SEV_WARNING, 0, 0, "Invalid filter detected");
3991 slp1->data.ptrvalue = dup_slp;
3992 }
3993 else
3994 {
3995 while(slp->next != NULL)
3996 slp = slp->next;
3997
3998 slp->next = dup_slp;
3999 }
4000
4001 return slp1;
4002 }
4003
4004 /* This function is used to filter one frame of the translated DNA
4005 sequence */
rpsBlastFilterSequence(BlastSearchBlkPtr search,Int4 frame,Uint1Ptr sequence,Int4 prot_length,Int4 dna_length)4006 static void rpsBlastFilterSequence(BlastSearchBlkPtr search, Int4 frame,
4007 Uint1Ptr sequence, Int4 prot_length,
4008 Int4 dna_length)
4009 {
4010 BioseqPtr bsp_temp;
4011 Boolean mask_at_hash = FALSE;
4012 SeqLocPtr filter_slp = NULL;
4013
4014 if(search->pbp->query_lcase_mask == NULL) {
4015 if(search->pbp->filter_string == NULL || !StringICmp(search->pbp->filter_string, "F"))
4016 return; /* No filtering */
4017 }
4018
4019 bsp_temp = BlastMakeTempProteinBioseq(sequence+1, prot_length,
4020 Seq_code_ncbistdaa);
4021
4022 filter_slp = BlastBioseqFilterEx(bsp_temp, search->pbp->filter_string,
4023 &mask_at_hash);
4024 HackSeqLocId(filter_slp, search->subject_info->sip);
4025
4026 if(search->pbp->query_lcase_mask != NULL) {
4027 filter_slp = blastMergeFilterLocs(filter_slp, search->pbp->query_lcase_mask, TRUE, frame, dna_length);
4028 }
4029
4030 /* SeqMgrDeleteFromBioseqIndex(bsp_temp); */
4031
4032 /* bsp_temp->id = SeqIdSetFree(bsp_temp->id); */
4033 bsp_temp = BioseqFree(bsp_temp);
4034
4035 BlastMaskTheResidues(sequence+1, prot_length, 21, filter_slp, FALSE, 0);
4036
4037 /* Conversion to ProteinSeqLoc will be done after original SeqLoc will
4038 be used once again on the Gapped extention stage */
4039
4040 /* BlastConvertProteinSeqLoc(filter_slp, frame, dna_length); */
4041
4042 if (filter_slp)
4043 ValNodeAddPointer(&(search->mask), FrameToDefine(frame), filter_slp);
4044
4045 return;
4046 }
4047 BlastSequenceBlkPtr PNTR LIBCALL
BlastMakeCopyQueryDNAP(BlastSequenceBlkPtr PNTR bsbpp_in)4048 BlastMakeCopyQueryDNAP(BlastSequenceBlkPtr PNTR bsbpp_in)
4049 {
4050 BlastSequenceBlkPtr PNTR bsbpp;
4051 Int4 buff_size, m;
4052
4053 if(bsbpp_in == NULL)
4054 return NULL;
4055
4056 bsbpp = MemNew(sizeof(BlastSequenceBlkPtr)*2);
4057 for(m = 0; m < 2; m++) {
4058 if (bsbpp_in[m]) {
4059 bsbpp[m] = (BlastSequenceBlkPtr) MemNew(sizeof(BlastSequenceBlk));
4060
4061 buff_size = bsbpp_in[m]->length+3*CODON_LENGTH;
4062 bsbpp[m]->sequence_start = MemNew(buff_size);
4063
4064 MemCpy(bsbpp[m]->sequence_start,
4065 bsbpp_in[m]->sequence_start, buff_size);
4066
4067 bsbpp[m]->sequence = bsbpp_in[m]->sequence;
4068
4069 bsbpp[m]->length = bsbpp_in[m]->length;
4070 bsbpp[m]->original_length = bsbpp_in[m]->original_length;
4071 bsbpp[m]->effective_length = bsbpp_in[m]->effective_length;
4072 }
4073 }
4074
4075 return bsbpp;
4076 }
4077
BlastFreeQueryDNAP(BlastSequenceBlkPtr PNTR bsbpp)4078 void LIBCALL BlastFreeQueryDNAP(BlastSequenceBlkPtr PNTR bsbpp)
4079 {
4080 Int4 m;
4081
4082 for(m = 0; m < 2; m++) {
4083 BlastSequenceBlkDestruct(bsbpp[m]);
4084 }
4085
4086 MemFree(bsbpp);
4087
4088 return;
4089 }
4090
4091 BlastSequenceBlkPtr PNTR LIBCALL
BlastCreateQueryDNAP(BlastSearchBlkPtr search,Int4 length)4092 BlastCreateQueryDNAP(BlastSearchBlkPtr search, Int4 length)
4093 {
4094
4095 BlastSequenceBlkPtr PNTR bsbpp;
4096 Uint1Ptr dnap;
4097 Int4 i, j, k, m;
4098 Int4 shift;
4099 BLASTContextStructPtr context = search->context;
4100 Uint1 strand_option;
4101
4102 if(context == NULL)
4103 return NULL;
4104
4105 strand_option = search->last_context / CODON_LENGTH;
4106
4107 bsbpp = MemNew(sizeof(BlastSequenceBlkPtr)*2);
4108
4109 for(m = search->first_context/CODON_LENGTH;
4110 m <= search->last_context/CODON_LENGTH; m++) {
4111
4112 bsbpp[m] = (BlastSequenceBlkPtr) MemNew(sizeof(BlastSequenceBlk));
4113
4114 dnap = MemNew(length+3*CODON_LENGTH);
4115 /* dnap = MemNew(length + 1); */
4116
4117 dnap[0]=dnap[1]=dnap[2] = NULLB;
4118
4119 shift = m*CODON_LENGTH;
4120 for (i = 0, j = 0; i < length+1;) {
4121 for(k = shift; k < shift + CODON_LENGTH; k++) {
4122 dnap[i] = context[k].query->sequence_start[j];
4123 i++;
4124 }
4125 j++;
4126 }
4127 BlastSequenceAddSequence(bsbpp[m], dnap+3, dnap, length, length, 0);
4128 }
4129
4130 return bsbpp;
4131 }
4132
4133
BLASTCalculateSearchSpace(BLAST_OptionsBlkPtr options,Int4 nseq,Int8 dblen,Int4 qlen)4134 FloatHi LIBCALL BLASTCalculateSearchSpace(BLAST_OptionsBlkPtr options,
4135 Int4 nseq, Int8 dblen, Int4 qlen)
4136 {
4137 Int4 length_adjustment, qlen_eff;
4138 Int8 dblen_eff;
4139 BLAST_KarlinBlkPtr kbp;
4140 FloatHi searchsp;
4141
4142 if (options == NULL)
4143 return 0;
4144
4145 kbp = BlastKarlinBlkCreate();
4146 BlastKarlinBlkGappedCalcEx(kbp, options->gap_open, options->gap_extend,
4147 options->decline_align, options->matrix, NULL);
4148
4149 if (options->gapped_calculation ) {
4150 Nlm_FloatHi alpha, beta; /*alpha and beta for new scoring system */
4151 if (StringCmp(options->program_name, "blastn") != 0)
4152 getAlphaBeta(options->matrix,&alpha,&beta,options->gapped_calculation,
4153 options->gap_open, options->gap_extend);
4154 else
4155 BlastKarlinGetNuclAlphaBeta(options->reward, options->penalty, options->gap_open,
4156 options->gap_extend, kbp, options->gapped_calculation, &alpha, &beta);
4157
4158 BlastComputeLengthAdjustment(kbp->K,
4159 kbp->logK, alpha/kbp->Lambda, beta,
4160 qlen,
4161 dblen, nseq,
4162 &length_adjustment );
4163 } else {
4164 BlastComputeLengthAdjustment(kbp->K, kbp->logK, 1/kbp->H, 0.0,
4165 qlen,
4166 dblen, nseq,
4167 &length_adjustment );
4168 }
4169
4170 kbp = BlastKarlinBlkDestruct(kbp);
4171
4172 qlen_eff = qlen - length_adjustment;
4173 dblen_eff = dblen - nseq*length_adjustment;
4174 searchsp = ((Nlm_FloatHi) qlen_eff) * ((Nlm_FloatHi) dblen_eff);
4175
4176 return searchsp;
4177 }
4178
4179 #define DROPOFF_NUMBER_OF_BITS 10.0
4180 #define INDEX_THR_MIN_SIZE 20000
4181 #define DEFAULT_LONGEST_INTRON 122
4182
BLASTSetUpSearchInternalByLoc(BlastSearchBlkPtr search,SeqLocPtr query_slp,BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)(Int4 done,Int4 positives))4183 Int2 LIBCALL BLASTSetUpSearchInternalByLoc (BlastSearchBlkPtr search, SeqLocPtr query_slp, BioseqPtr query_bsp, CharPtr prog_name, Int4 qlen, BLAST_OptionsBlkPtr options, int (LIBCALLBACK *callback)(Int4 done, Int4 positives))
4184
4185 {
4186 BioseqPtr bsp_temp, bsp;
4187 Boolean mask_at_hash=FALSE, private_slp_delete;
4188 Boolean query_is_na, db_is_na;
4189 Char buffer[128];
4190 Int2 retval = 0, status, last_index;
4191 Int4 effective_query_length, query_length, full_query_length,
4192 index, length, length_adjustment=0;
4193 Int4 max_length, block_width;
4194 Nlm_FloatHi avglen;
4195 ReadDBFILEPtr rdfp;
4196 SeqIdPtr query_id;
4197 SeqPortPtr spp=NULL, spp_reverse=NULL;
4198 SeqLocPtr filter_slp=NULL, private_slp=NULL, private_slp_rev=NULL, private_slp_double=NULL;
4199 GeneticCodePtr gcp;
4200 Uint1 residue, strand;
4201 Uint1Ptr sequence;
4202 Uint1Ptr query_seq, query_seq_start, query_seq_rev, query_seq_start_rev;
4203 ValNodePtr vnp;
4204 Int4 query_loc_start;
4205
4206 /* AM: Temporaries to compute effective lengths of individual queries. */
4207 IntArray lengths_eff=NULL;
4208 IntArray length_adj_tmp=NULL;
4209 Int4 le_iter, length_tmp;
4210 Int4 i;
4211 BLAST_ScoreBlkPtr sbptmp = NULL; /* AM: query concatenation */
4212
4213 /* AM: To support individual masking in the case of query multiplexing. */
4214 SeqLocPtr *concat_filter_slp=NULL, *concat_private_slp=NULL, *concat_private_slp_rev=NULL,
4215 * indiv_filter_slp=NULL, *indiv_private_slp=NULL, *indiv_private_slp_rev=NULL;
4216 SeqLocPtr ConcatLCaseMask;
4217 Boolean * indiv_mask_at_hash=NULL;
4218 QueriesPtr mult_queries = NULL;
4219
4220 if (options == NULL)
4221 {
4222 ErrPostEx(SEV_FATAL, 1, 0, "BLAST_OptionsBlkPtr is NULL\n");
4223 return 1;
4224 }
4225
4226 if (query_slp == NULL && query_bsp == NULL)
4227 {
4228 ErrPostEx(SEV_FATAL, 1, 0, "Query is NULL\n");
4229 return 1;
4230 }
4231
4232 /* AM: Support for query multiplexing. */
4233 mult_queries = search->mult_queries;
4234
4235 if( mult_queries )
4236 {
4237 concat_filter_slp
4238 = (SeqLocPtr *)MemNew( mult_queries->NumQueries*sizeof( SeqLocPtr ) );
4239 indiv_filter_slp
4240 = (SeqLocPtr *)MemNew( mult_queries->NumQueries*sizeof( SeqLocPtr ) );
4241 concat_private_slp
4242 = (SeqLocPtr *)MemNew( mult_queries->NumQueries*sizeof( SeqLocPtr ) );
4243 concat_private_slp_rev
4244 = (SeqLocPtr *)MemNew( mult_queries->NumQueries*sizeof( SeqLocPtr ) );
4245 indiv_private_slp
4246 = (SeqLocPtr *)MemNew( mult_queries->NumQueries*sizeof( SeqLocPtr ) );
4247 indiv_private_slp_rev
4248 = (SeqLocPtr *)MemNew( mult_queries->NumQueries*sizeof( SeqLocPtr ) );
4249 indiv_mask_at_hash
4250 = (Boolean *)MemNew( mult_queries->NumQueries*sizeof( Boolean ) );
4251 }
4252
4253 query_seq = NULL; /* Gets rid of warning. */
4254 query_seq_rev = NULL; /* Gets rid of warning. */
4255 query_seq_start = NULL; /* Gets rid of warning. */
4256 query_seq_start_rev = NULL; /* Gets rid of warning. */
4257
4258 /* These parameters are used by translated RPS Blast */
4259 search->pbp->filter_string = StringSave(options->filter_string);
4260 search->pbp->is_rps_blast = options->is_rps_blast;
4261
4262 /* Restrict lower case mask to the query interval, if it is
4263 not a whole Bioseq. */
4264 if (query_slp) {
4265 options->query_lcase_mask =
4266 AdjustOffsetsInMaskLoc(options->query_lcase_mask,
4267 SeqLocStart(query_slp),
4268 SeqLocStop(query_slp));
4269 }
4270 search->pbp->query_lcase_mask = options->query_lcase_mask;
4271 search->pbp->is_ooframe = options->is_ooframe;
4272 search->pbp->shift_pen = options->shift_pen;
4273
4274 if (query_slp)
4275 {
4276 query_loc_start = SeqLocStart(query_slp);
4277 strand = SeqLocStrand(query_slp);
4278 if (strand == Seq_strand_unknown || strand == Seq_strand_plus || strand == Seq_strand_both)
4279 {
4280 private_slp = SeqLocIntNew(query_loc_start, SeqLocStop(query_slp), Seq_strand_plus, SeqLocId(query_slp));
4281
4282 /* AM: Support for query multiplexing. */
4283 if( mult_queries )
4284 for( i = 0; i < mult_queries->NumQueries; ++i )
4285 {
4286 indiv_private_slp[i]
4287 = SeqLocIntNew( 0,
4288 mult_queries->QueryEnds[i] - mult_queries->QueryStarts[i],
4289 Seq_strand_plus,
4290 mult_queries->FakeBsps[i]->id );
4291 concat_private_slp[i]
4292 = SeqLocIntNew( mult_queries->QueryStarts[i],
4293 mult_queries->QueryEnds[i],
4294 Seq_strand_plus,
4295 SeqLocId( query_slp ) );
4296 }
4297 }
4298 if (strand == Seq_strand_minus || strand == Seq_strand_both)
4299 {
4300 private_slp_rev = SeqLocIntNew(query_loc_start, SeqLocStop(query_slp), Seq_strand_minus, SeqLocId(query_slp));
4301
4302 /* AM: Support for query multiplexing. */
4303 if( mult_queries )
4304 for( i = 0; i < mult_queries->NumQueries; ++i )
4305 {
4306 indiv_private_slp_rev[i]
4307 = SeqLocIntNew( 0,
4308 mult_queries->QueryEnds[i] - mult_queries->QueryStarts[i],
4309 Seq_strand_minus,
4310 mult_queries->FakeBsps[i]->id );
4311 concat_private_slp_rev[i]
4312 = SeqLocIntNew( mult_queries->QueryStarts[i],
4313 mult_queries->QueryEnds[i],
4314 Seq_strand_minus,
4315 SeqLocId( query_slp ) );
4316 }
4317 }
4318 private_slp_delete = TRUE;
4319 if (search->prog_number==blast_type_blastn)
4320 search = BlastFillQueryOffsets(search, query_slp, 1);
4321
4322 }
4323 else
4324 {
4325 private_slp = SeqLocIntNew(0, query_bsp->length-1 , Seq_strand_plus, SeqIdFindBest(query_bsp->id, SEQID_GI));
4326 private_slp_rev = SeqLocIntNew(0, query_bsp->length-1 , Seq_strand_minus, SeqIdFindBest(query_bsp->id, SEQID_GI));
4327 private_slp_delete = FALSE;
4328
4329 private_slp_double = SeqLocIntNew(0, query_bsp->length-1 , Seq_strand_both, SeqIdFindBest(query_bsp->id, SEQID_GI));
4330 if (search->prog_number==blast_type_blastn)
4331 search = BlastFillQueryOffsets(search,
4332 private_slp_double, 1);
4333 SeqLocFree(private_slp_double);
4334 }
4335
4336 query_length = 0;
4337 if (private_slp)
4338 query_length = SeqLocLen(private_slp);
4339 else if (private_slp_rev)
4340 query_length = SeqLocLen(private_slp_rev);
4341 if (query_length == 0)
4342 {
4343 sprintf(buffer, "No valid query sequence");
4344 BlastConstructErrorMessage("Blast", buffer, 2,
4345 &(search->error_return));
4346 retval = 1;
4347 goto BlastSetUpReturn;
4348 }
4349
4350 bsp = NULL;
4351 if (private_slp)
4352 bsp = BioseqLockById(SeqLocId(private_slp));
4353 else if (private_slp_rev)
4354 bsp = BioseqLockById(SeqLocId(private_slp_rev));
4355
4356 if (bsp == NULL) {
4357 ErrPostEx(SEV_WARNING, 0, 0, "No valid query sequence, BioseqLockById returned NULL\n");
4358 retval = 1;
4359 goto BlastSetUpReturn;
4360 }
4361 full_query_length = bsp->length;
4362
4363 BlastGetTypes(prog_name, &query_is_na, &db_is_na);
4364 if (query_is_na != ISA_na(bsp->mol)) {
4365 ErrPostEx(SEV_WARNING, 0, 0, "Query molecule is incompatible with %s program", prog_name);
4366 BioseqUnlock(bsp);
4367 retval = 1;
4368 goto BlastSetUpReturn;
4369 }
4370 if (bsp && bsp->repr == Seq_repr_virtual) {
4371 BioseqUnlock(bsp);
4372 ErrPostEx(SEV_WARNING, 0, 0, "Virtual sequence detected\n");
4373 retval = 1;
4374 goto BlastSetUpReturn;
4375 }
4376 BioseqUnlock(bsp);
4377
4378 if (query_slp)
4379 {
4380 search->query_slp = query_slp;
4381 }
4382 else
4383 {
4384 search->query_slp = private_slp;
4385 search->allocated += BLAST_SEARCH_ALLOC_QUERY_SLP;
4386 }
4387
4388
4389 search->translation_buffer = NULL;
4390 search->translation_buffer_size = 0;
4391
4392 /*
4393 Get genetic codes (should be determined from BLAST_OptionsBlkPtr.
4394 Only needed for blastx, tblast[nx]
4395 */
4396 if (StringCmp(prog_name, "blastp") != 0 && StringCmp(prog_name, "blastn") != 0)
4397 {
4398
4399 if (StringCmp(prog_name, "tblastx") == 0
4400 || StringCmp(prog_name, "tblastn") == 0
4401 ||StringCmp(prog_name, "psitblastn") == 0)
4402
4403 {
4404 gcp = GeneticCodeFind(options->db_genetic_code, NULL);
4405 for (vnp = (ValNodePtr)gcp->data.ptrvalue; vnp != NULL; vnp = vnp->next)
4406 {
4407 if (vnp->choice == 3) /* ncbieaa */
4408 {
4409 search->db_genetic_code = (CharPtr)vnp->data.ptrvalue;
4410 break;
4411 }
4412 }
4413 search->translation_table = GetPrivatTranslationTable(search->db_genetic_code, FALSE);
4414 search->translation_table_rc = GetPrivatTranslationTable(search->db_genetic_code, TRUE);
4415 max_length = 0;
4416 rdfp = search->rdfp;
4417 while (rdfp)
4418 {
4419 max_length = MAX(max_length, readdb_get_maxlen(rdfp));
4420 rdfp = rdfp->next;
4421 }
4422 search->translation_buffer = MemNew((3+(max_length/3))*sizeof(Uint1));
4423 search->translation_buffer_size = 1+(max_length/3);
4424 search->allocated += BLAST_SEARCH_ALLOC_TRANS_INFO;
4425 }
4426
4427 if (StringCmp(prog_name, "blastx") == 0 || StringCmp(prog_name, "tblastx") == 0)
4428 {
4429 gcp = GeneticCodeFind(options->genetic_code, NULL);
4430 for (vnp = (ValNodePtr)gcp->data.ptrvalue; vnp != NULL; vnp = vnp->next)
4431 {
4432 if (vnp->choice == 3) /* ncbieaa */
4433 {
4434 search->genetic_code = (CharPtr)vnp->data.ptrvalue;
4435 break;
4436 }
4437 }
4438 }
4439 }
4440
4441 if (options->filter && !options->filter_string)
4442 options->filter_string = BlastConstructFilterString(options->filter);
4443
4444 /* If the query is translated do this below. */
4445 if (StringCmp(prog_name, "blastx") &&
4446 StringCmp(prog_name, "tblastx")) {
4447 /* Futamura */
4448 if(!(search->pbp->is_rps_blast &&
4449 (!StringCmp(prog_name, "tblastn")||
4450 !StringCmp(prog_name, "psitblastn")))) {
4451 /* AM: Query multiplexing. */
4452 if( !mult_queries )
4453 {
4454 if (private_slp)
4455 filter_slp = BlastSeqLocFilterEx(private_slp, options->filter_string, &mask_at_hash);
4456 else if (private_slp_rev)
4457 filter_slp = BlastSeqLocFilterEx(private_slp_rev, options->filter_string, &mask_at_hash);
4458
4459 /* If lower case characters were detected in the input
4460 their locations will be masked out */
4461
4462 if(search->pbp->query_lcase_mask != NULL) {
4463 filter_slp = blastMergeFilterLocs(filter_slp, search->pbp->query_lcase_mask, FALSE, 0, 0);
4464 }
4465 }
4466 else
4467 for( i = 0; i < mult_queries->NumQueries; ++i )
4468 {
4469 if( indiv_private_slp[i] )
4470 {
4471 indiv_filter_slp[i]
4472 = BlastSeqLocFilterEx( indiv_private_slp[i],
4473 options->filter_string,
4474 indiv_mask_at_hash + i );
4475 concat_filter_slp[i]
4476 = BlastSeqLocFilterEx( concat_private_slp[i],
4477 options->filter_string,
4478 indiv_mask_at_hash + i );
4479 }
4480 else if( indiv_private_slp_rev[i] )
4481 {
4482 indiv_filter_slp[i]
4483 = BlastSeqLocFilterEx( indiv_private_slp_rev[i],
4484 options->filter_string,
4485 indiv_mask_at_hash + i );
4486 concat_filter_slp[i]
4487 = BlastSeqLocFilterEx( concat_private_slp_rev[i],
4488 options->filter_string,
4489 indiv_mask_at_hash + i );
4490 }
4491
4492 if( mult_queries->LCaseMasks && mult_queries->LCaseMasks[i] )
4493 {
4494 indiv_filter_slp[i] = blastMergeFilterLocs( indiv_filter_slp[i],
4495 (SeqLocPtr)mult_queries->LCaseMasks[i]->data.ptrvalue,
4496 FALSE, 0, 0 );
4497 ConcatLCaseMask = ConcatSeqLoc( mult_queries, mult_queries->LCaseMasks[i],
4498 SeqLocId( query_slp ), i );
4499 concat_filter_slp[i] = blastMergeFilterLocs( concat_filter_slp[i],
4500 (SeqLocPtr)ConcatLCaseMask->data.ptrvalue,
4501 FALSE, 0, 0 );
4502 }
4503 }
4504 }
4505 }
4506
4507 if( mult_queries ) { /* AM: query concatenation: free resources */
4508 for( i = 0; i < mult_queries->NumQueries; ++i ) {
4509 SeqLocFree( indiv_private_slp_rev[i] );
4510 SeqLocFree( indiv_private_slp[i] );
4511 SeqLocFree( concat_private_slp_rev[i] );
4512 SeqLocFree( concat_private_slp[i] );
4513 }
4514
4515 MemFree( indiv_private_slp_rev );
4516 MemFree( indiv_private_slp );
4517 MemFree( concat_private_slp_rev );
4518 MemFree( concat_private_slp );
4519 }
4520
4521 /*
4522 Dusting of query sequence. Only needed for blastn, optional
4523 */
4524
4525 if(StringCmp(prog_name, "blastn") == 0) {
4526 /* AM: Changed to support query multiplexing. */
4527 if( !mult_queries )
4528 if (filter_slp && !mask_at_hash)
4529 ValNodeAddPointer(&(search->mask), SEQLOC_MASKING_NOTSET, filter_slp);
4530 else
4531 ValNodeAddPointer(&(search->mask1), SEQLOC_MASKING_NOTSET, filter_slp);
4532 else
4533 for( i = 0; i < mult_queries->NumQueries; ++i )
4534 if( !indiv_mask_at_hash[i] )
4535 ValNodeAddPointer( &(search->mask), SEQLOC_MASKING_NOTSET, indiv_filter_slp[i] );
4536 else
4537 ValNodeAddPointer( &(search->mask1), SEQLOC_MASKING_NOTSET, indiv_filter_slp[i] );
4538 }
4539
4540
4541 if (StringCmp(prog_name, "blastp") == 0
4542 || StringCmp(prog_name, "tblastn") == 0
4543 || StringCmp(prog_name, "psitblastn") == 0)
4544 {
4545 spp = SeqPortNewByLoc(private_slp, Seq_code_ncbistdaa);
4546 SeqPortSet_do_virtual(spp, TRUE);
4547
4548 /* AM: Changed to support query multiplexing. */
4549 if( !mult_queries )
4550 if (filter_slp && !mask_at_hash)
4551 ValNodeAddPointer(&(search->mask), SEQLOC_MASKING_NOTSET, filter_slp);
4552 else
4553 ValNodeAddPointer(&(search->mask1), SEQLOC_MASKING_NOTSET, filter_slp);
4554 else
4555 for( i = 0; i < mult_queries->NumQueries; ++i )
4556 if( !indiv_mask_at_hash[i] )
4557 ValNodeAddPointer( &(search->mask), SEQLOC_MASKING_NOTSET, indiv_filter_slp[i] );
4558 else
4559 ValNodeAddPointer( &(search->mask1), SEQLOC_MASKING_NOTSET, indiv_filter_slp[i] );
4560 }
4561 else if (StringCmp(prog_name, "blastx") == 0 || StringCmp(prog_name, "tblastx") == 0 || StringCmp(prog_name, "blastn") == 0)
4562 {
4563 if (private_slp)
4564 {
4565 spp = SeqPortNewByLoc(private_slp, Seq_code_ncbi4na);
4566 SeqPortSet_do_virtual(spp, TRUE);
4567 }
4568 if (private_slp_rev)
4569 {
4570 spp_reverse = SeqPortNewByLoc(private_slp_rev, Seq_code_ncbi4na);
4571 SeqPortSet_do_virtual(spp_reverse, TRUE);
4572 }
4573 }
4574 else
4575 {
4576 ErrPostEx(SEV_FATAL, 1, 0, "Only blastn, blastp, blastx, tblastn tblastx is allowed\n");
4577 retval = 1;
4578 goto BlastSetUpReturn;
4579 }
4580
4581 /* AM: query concatenation: free resources */
4582 MemFree( indiv_mask_at_hash );
4583 MemFree( indiv_filter_slp );
4584
4585 if (spp)
4586 {
4587 query_seq_start = (Uint1Ptr) MemNew(2*((query_length)+2)*sizeof(Char));
4588 query_seq_start[0] = NULLB;
4589 query_seq = query_seq_start+1;
4590 index=0;
4591 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
4592 {
4593
4594 if (IS_residue(residue))
4595 {
4596 if (residue == 24) /* 24 is Selenocysteine. */
4597 {
4598 residue = 21; /* change Selenocysteine to X. */
4599 sprintf(buffer, "Selenocysteine (U) at position %ld replaced by X",
4600 (long) index+1);
4601 BlastConstructErrorMessage("Blast", buffer, 1, &(search->error_return));
4602 }
4603 query_seq[index] = residue;
4604 index++;
4605 }
4606 }
4607 query_seq[index] = NULLB;
4608 spp = SeqPortFree(spp);
4609 if (StringCmp(prog_name, "blastn") == 0)
4610 {
4611 if (filter_slp)
4612 {
4613 if (mask_at_hash)
4614 search->context[0].location =
4615 BlastSeqLocFillDoubleIntEx(filter_slp, full_query_length, query_length, FALSE, SeqLocStart(private_slp));
4616 else
4617 BlastMaskTheResidues(query_seq, full_query_length, 15, filter_slp, FALSE, SeqLocStart(private_slp));
4618 }
4619
4620 /* AM: query multiplexing. */
4621 if( mult_queries )
4622 for( i = 0; i < mult_queries->NumQueries; ++i )
4623 if( concat_filter_slp[i] )
4624 BlastMaskTheResidues( query_seq,
4625 full_query_length,
4626 15, concat_filter_slp[i], FALSE,
4627 SeqLocStart( private_slp ) );
4628
4629 for (index=0; index<=query_length+1; index++)
4630 query_seq_start[index] = ncbi4na_to_blastna[query_seq_start[index]];
4631 }
4632 }
4633
4634 if (spp_reverse)
4635 {
4636 query_seq_start_rev = (Uint1Ptr) MemNew(((query_length)+2)*sizeof(Char));
4637 query_seq_start_rev[0] = NULLB;
4638 query_seq_rev = query_seq_start_rev+1;
4639 index=0;
4640 while ((residue=SeqPortGetResidue(spp_reverse)) != SEQPORT_EOF)
4641 {
4642 if (IS_residue(residue))
4643 {
4644 if (residue == 24) /* 24 is Selenocysteine. */
4645 {
4646 residue = 21; /* change Selenocysteine to X. */
4647 sprintf(buffer, "Selenocysteine (U) at position %ld replaced by X",
4648 (long) index+1);
4649 BlastConstructErrorMessage("Blast", buffer, 1, &(search->error_return));
4650 }
4651 query_seq_rev[index] = residue;
4652 index++;
4653 }
4654 }
4655 query_seq_rev[index] = NULLB;
4656 spp_reverse = SeqPortFree(spp_reverse);
4657 if (StringCmp(prog_name, "blastn") == 0)
4658 {
4659 if (filter_slp)
4660 {
4661 if (mask_at_hash)
4662 search->context[0].location =
4663 BlastSeqLocFillDoubleIntRev(search->context[0].location, filter_slp, query_length, full_query_length, full_query_length - SeqLocStop(private_slp_rev) - 1);
4664 else
4665 BlastMaskTheResidues(query_seq_rev, full_query_length, 15, filter_slp, TRUE, full_query_length - SeqLocStop(private_slp_rev) - 1);
4666 }
4667
4668 /* AM: query multiplexing. */
4669 if( mult_queries )
4670 for( i = 0; i < mult_queries->NumQueries; ++i )
4671 if( concat_filter_slp[i] )
4672 BlastMaskTheResidues( query_seq_rev,
4673 full_query_length,
4674 15, concat_filter_slp[i], TRUE,
4675 full_query_length
4676 - SeqLocStop( private_slp_rev ) - 1 );
4677
4678 for (index=0; index<=query_length+1; index++)
4679 query_seq_start_rev[index] =
4680 ncbi4na_to_blastna[query_seq_start_rev[index]];
4681 if (query_seq_start)
4682 MemCpy(query_seq_start+query_length+1,
4683 query_seq_start_rev,query_length+2);
4684 }
4685 }
4686
4687 /*
4688 Set the context_factor, which specifies how many different
4689 ways the query or db is examined (e.g., blastn looks at both
4690 stands of query, context_factor is 2).
4691 */
4692 if (StringCmp(prog_name, "blastp") == 0)
4693 {
4694 search->context_factor = 1;
4695 length = query_length;
4696 }
4697 else if (StringCmp(prog_name, "blastn") == 0)
4698 { /* two strands concatenated in one sequence */
4699 search->context_factor = 1;
4700 length = query_length;
4701 }
4702 else if (StringCmp(prog_name, "blastx") == 0)
4703 { /* query translated in six frames. */
4704 search->context_factor = search->last_context-search->first_context+1;
4705 length = query_length/3;
4706 }
4707 else if ( (StringCmp(prog_name, "tblastn") == 0)
4708 || (StringCmp(prog_name, "psitblastn") == 0))
4709 { /* db translated in six frames. */
4710 search->context_factor = 6;
4711 length = query_length;
4712 }
4713 else if (StringCmp(prog_name, "tblastx") == 0)
4714 { /* db and query each translated in six frames. */
4715 search->context_factor = 6*CODON_LENGTH*(search->last_context-search->first_context+1);
4716 length = query_length/3;
4717 }
4718 else
4719 {
4720 sprintf(buffer, "%s is not a valid program name", prog_name);
4721 BlastConstructErrorMessage("BLASTSetUpSearch", buffer, 2, &(search->error_return));
4722 retval = 1;
4723 goto BlastSetUpReturn;
4724 }
4725
4726 if (private_slp)
4727 query_id = SeqIdFindBest(SeqLocId(private_slp), SEQID_GI);
4728 else
4729 query_id = SeqIdFindBest(SeqLocId(private_slp_rev), SEQID_GI);
4730
4731 search->query_id = SeqIdDup(query_id);
4732
4733 /* Store the query sequence, or the translation thereof. */
4734 if (StringCmp(prog_name, "blastp") == 0
4735 || StringCmp(prog_name, "tblastn") == 0
4736 || StringCmp(prog_name, "psitblastn") == 0)
4737 { /* One blastp context for now. */
4738 if (filter_slp)
4739 {
4740 if (mask_at_hash)
4741 search->context[0].location =
4742 BlastSeqLocFillDoubleInt(filter_slp, query_length, FALSE);
4743 else
4744 BlastMaskTheResidues(query_seq, full_query_length, 21, filter_slp, FALSE, SeqLocStart(private_slp));
4745 }
4746
4747 /* AM: query multiplexing. */
4748 if( mult_queries )
4749 for( i = 0; i < mult_queries->NumQueries; ++i )
4750 if( concat_filter_slp[i] )
4751 BlastMaskTheResidues( query_seq, full_query_length,
4752 21, concat_filter_slp[i], FALSE,
4753 SeqLocStart( private_slp ) );
4754
4755 BlastSequenceAddSequence(search->context[0].query, NULL, query_seq_start, query_length, query_length, 0);
4756 }
4757 else if (StringCmp(prog_name, "blastx") == 0 || StringCmp(prog_name, "tblastx") == 0)
4758 {
4759
4760 for (index=search->first_context; index<=search->last_context; index++)
4761 {
4762 if (search->context[index].query->frame > 0)
4763 {
4764 sequence = GetTranslation(query_seq, query_length, search->context[index].query->frame, &length, search->genetic_code);
4765 }
4766 else
4767 {
4768 sequence = GetTranslation(query_seq_rev, query_length, search->context[index].query->frame, &length, search->genetic_code);
4769 }
4770 if (options->filter_string && length > 0)
4771 {
4772 bsp_temp = BlastMakeTempProteinBioseq(sequence+1, length, Seq_code_ncbistdaa);
4773
4774 filter_slp = BlastBioseqFilterEx(bsp_temp, options->filter_string, &mask_at_hash);
4775 HackSeqLocId(filter_slp, search->query_id);
4776
4777 /* If FASTA filtering is set - updating this SeqLoc */
4778 if(search->pbp->query_lcase_mask != NULL) {
4779 filter_slp = blastMergeFilterLocs(filter_slp, search->pbp->query_lcase_mask, TRUE, search->context[index].query->frame, query_length);
4780 }
4781
4782 /* SeqMgrDeleteFromBioseqIndex(bsp_temp); */
4783
4784 /* bsp_temp->id = SeqIdSetFree(bsp_temp->id); */
4785
4786 bsp_temp = BioseqFree(bsp_temp);
4787 if (mask_at_hash)
4788 {
4789 search->context[index].location =
4790 BlastSeqLocFillDoubleInt(filter_slp, query_length, FALSE);
4791 }
4792 else
4793 {
4794 BlastMaskTheResidues(sequence+1, length, 21, filter_slp, FALSE, 0);
4795 BlastConvertProteinSeqLoc(filter_slp, search->context[index].query->frame, query_length);
4796 }
4797 if (filter_slp && !mask_at_hash)
4798 ValNodeAddPointer(&(search->mask), FrameToDefine(search->context[index].query->frame), filter_slp);
4799 else
4800 ValNodeAddPointer(&(search->mask1), FrameToDefine(search->context[index].query->frame), filter_slp);
4801 }
4802 BlastSequenceAddSequence(search->context[index].query, NULL, sequence, length, query_length, 0);
4803 }
4804 query_seq_start = MemFree(query_seq_start);
4805 query_seq_start_rev = MemFree(query_seq_start_rev);
4806
4807 if(search->pbp->is_ooframe) {
4808 search->query_dnap = BlastCreateQueryDNAP(search, query_length);
4809 }
4810 } else if (StringCmp(prog_name, "blastn") == 0) {
4811 if (search->last_context - search->first_context > 0) {
4812 /* Both strands are searched */
4813 BlastSequenceAddSequence(search->context[search->first_context].query, NULL, query_seq_start, 2*query_length+2, 2*query_length+2, 0);
4814 BlastSequenceAddSequence(search->context[search->last_context].query, NULL,
4815 query_seq_start_rev, query_length,
4816 query_length, 0);
4817 } else if (search->first_context==0)
4818 /* Only first strand is searched */
4819 BlastSequenceAddSequence(search->context[search->first_context].query, NULL, query_seq_start, query_length+1, query_length+1, 0);
4820 else {/* Only second strand is searched */
4821 BlastSequenceAddSequence(search->context[search->first_context].query, NULL,
4822 query_seq_start_rev, query_length+1,
4823 query_length+1, 0);
4824 }
4825 }
4826
4827 if( mult_queries ) { /* AM: query concatenation: free resources */
4828 for( i = 0; i < mult_queries->NumQueries; ++i )
4829 SeqLocFree( concat_filter_slp[i] );
4830
4831 MemFree( concat_filter_slp );
4832 }
4833
4834 if (mask_at_hash)
4835 { /* No longer needed. */
4836 /*
4837 filter_slp = SeqLocSetFree(filter_slp);
4838 */
4839 }
4840
4841 /* Set the ambiguous residue before the ScoreBlk is filled. */
4842 if (StringCmp(prog_name, "blastn") != 0)
4843 {
4844 search->sbp->read_in_matrix = TRUE;
4845 BlastScoreSetAmbigRes(search->sbp, 'X');
4846 }
4847 else
4848 {
4849 if(options->matrix!=NULL && *(options->matrix) != NULLB) {
4850 search->sbp->read_in_matrix = TRUE;
4851 } else {
4852 search->sbp->read_in_matrix = FALSE;
4853 }
4854 BlastScoreSetAmbigRes(search->sbp, 'N');
4855 }
4856
4857
4858 search->sbp->penalty = options->penalty;
4859 search->sbp->reward = options->reward;
4860
4861 /* option is to use alignments chosen by user in PSM computation API (used in WWW PSI-Blast); */
4862 search->pbp->use_best_align = options->use_best_align;
4863
4864
4865 /* Should culling be used at all? */
4866 search->pbp->perform_culling = options->perform_culling;
4867 search->pbp->hsp_range_max = options->hsp_range_max;
4868 /* This assures that search->pbp->max_pieces is at least one wide. */
4869 block_width = MIN(query_length, options->block_width);
4870 if (block_width > 0)
4871 search->pbp->max_pieces = query_length/block_width;
4872
4873 search->sbp->query_length = query_length;
4874
4875 search->result_struct = BLASTResultsStructNew(search->result_size,
4876 search->pbp->max_pieces,
4877 search->pbp->hsp_range_max);
4878
4879 if (options->matrix != NULL)
4880 status = BlastScoreBlkMatFill(search->sbp, options->matrix);
4881 else
4882 status = BlastScoreBlkMatFill(search->sbp, "BLOSUM62");
4883
4884 if (status != 0)
4885 {
4886 ErrPostEx(SEV_WARNING, 0, 0, "BlastScoreBlkMatFill returned non-zero status");
4887 retval = 1;
4888 goto BlastSetUpReturn;
4889 }
4890
4891 /* This is used right below. */
4892 search->pbp->gapped_calculation = options->gapped_calculation;
4893 search->pbp->do_not_reevaluate = options->do_not_reevaluate;
4894
4895 /* Set up sum statistics */
4896 search->pbp->do_sum_stats = options->do_sum_stats;
4897 if(search->prog_number == blast_type_blastx ||
4898 search->prog_number == blast_type_tblastn ||
4899 search->prog_number == blast_type_psitblastn)
4900 {
4901 /* The program may use new_link_hsps to evaluate sum
4902 statistics. */
4903 Int4 max_protein_gap; /* the largest gap permitted in the
4904 * translated sequence */
4905
4906 max_protein_gap = (options->longest_intron - 2)/3;
4907 if(search->pbp->gapped_calculation) {
4908 if(options->longest_intron == 0) {
4909 /* a zero value of longest_intron
4910 * invokes the default behavior, which for gapped
4911 * calculation is to set longest_intron to a
4912 * predefined value. */
4913 search->pbp->longest_intron = (DEFAULT_LONGEST_INTRON - 2) / 3;
4914 } else if(max_protein_gap <= 0) {
4915 /* A nonpositive value of max_protein_gap turns linking off */
4916 search->pbp->do_sum_stats = FALSE;
4917 search->pbp->longest_intron = 0;
4918 } else { /* the value of max_protein_gap is positive */
4919 search->pbp->longest_intron = max_protein_gap;
4920 }
4921 } else { /* This is an ungapped calculation. */
4922 /* For ungapped calculations, we preserve the old behavior
4923 * of the longest_intron parameter to maintain
4924 * backward-compatibility with older versions of BLAST. */
4925 search->pbp->longest_intron = MAX(max_protein_gap, 0);
4926 }
4927 }
4928 search->pbp->first_db_seq = options->first_db_seq;
4929 search->pbp->final_db_seq = options->final_db_seq;
4930
4931 retval = 0;
4932 for (index=search->first_context; index<=search->last_context; index++)
4933 {
4934 /* AM: Changed to support query multiplexing. */
4935 if (search->prog_number != blast_type_blastn ||
4936 index>search->first_context ||
4937 search->last_context==search->first_context)
4938 {
4939 if( search->prog_number == blast_type_tblastn
4940 && search->mult_queries )
4941 {
4942 for( i = 0; i < search->mult_queries->NumQueries; ++i )
4943 {
4944 sbptmp = BLAST_ScoreBlkNew(
4945 Seq_code_ncbistdaa, search->last_context + 1 );
4946 sbptmp->read_in_matrix = TRUE;
4947 BlastScoreSetAmbigRes( sbptmp, 'X' );
4948 sbptmp->penalty = options->penalty;
4949 sbptmp->reward = options->reward;
4950 sbptmp->query_length = query_length;
4951
4952 if (options->matrix != NULL)
4953 status = BlastScoreBlkMatFill(sbptmp, options->matrix);
4954 else
4955 status = BlastScoreBlkMatFill(sbptmp, "BLOSUM62");
4956
4957 status = BlastScoreBlkFill(
4958 sbptmp,
4959 ((CharPtr)search->context[index].query->sequence)
4960 + search->mult_queries->QueryStarts[i],
4961 search->mult_queries->QueryEnds[i]
4962 - search->mult_queries->QueryStarts[i] + 1,
4963 index );
4964
4965 if( status ) break;
4966
4967 search->mult_queries->lambda_array[i]
4968 = sbptmp->kbp_std[search->first_context]->Lambda;
4969
4970 if( i )
4971 {
4972 if( search->mult_queries->LambdaMin
4973 > sbptmp->kbp_std[search->first_context]->Lambda )
4974 search->mult_queries->LambdaMin
4975 = sbptmp->kbp_std[search->first_context]->Lambda;
4976
4977 if( search->mult_queries->LambdaMax
4978 < sbptmp->kbp_std[search->first_context]->Lambda )
4979 search->mult_queries->LambdaMax
4980 = sbptmp->kbp_std[search->first_context]->Lambda;
4981
4982 if( search->mult_queries->LogKMin
4983 > sbptmp->kbp_std[search->first_context]->logK )
4984 search->mult_queries->LogKMin
4985 = sbptmp->kbp_std[search->first_context]->logK;
4986
4987 if( search->mult_queries->LogKMax
4988 < sbptmp->kbp_std[search->first_context]->logK )
4989 search->mult_queries->LogKMax
4990 = sbptmp->kbp_std[search->first_context]->logK;
4991 }
4992 else
4993 {
4994 search->mult_queries->LambdaMin
4995 = search->mult_queries->LambdaMax
4996 = sbptmp->kbp_std[search->first_context]->Lambda;
4997 search->mult_queries->LogKMin
4998 = search->mult_queries->LogKMax
4999 = sbptmp->kbp_std[search->first_context]->logK;
5000 }
5001
5002 sbptmp = BLAST_ScoreBlkDestruct( sbptmp );
5003 }
5004 }
5005
5006 status
5007 = BlastScoreBlkFill(search->sbp, (CharPtr)
5008 search->context[index].query->sequence,
5009 search->context[index].query->length,
5010 index);
5011 }
5012 else
5013 {
5014 status
5015 = BlastScoreBlkFill(search->sbp, (CharPtr)
5016 search->context[index].query->sequence,
5017 search->context[index+1].query->length,
5018 index);
5019 }
5020
5021 if (status != 0)
5022 {
5023 sprintf(buffer, "Unable to calculate Karlin-Altschul params, check query sequence");
5024 BlastConstructErrorMessage("BLASTSetUpSearch", buffer, 2, &(search->error_return));
5025 retval = 1;
5026 }
5027 if (search->pbp->gapped_calculation)
5028 {
5029 if (StringCmp(search->prog_name, "blastn") != 0)
5030 {
5031 search->sbp->kbp_gap_std[index] = BlastKarlinBlkCreate();
5032 status = BlastKarlinBlkGappedCalcEx(search->sbp->kbp_gap_std[index], options->gap_open, options->gap_extend, options->decline_align, search->sbp->name, &(search->error_return));
5033 if (status != 0)
5034 {
5035 retval = 1;
5036 }
5037 search->sbp->kbp_gap_psi[index] = BlastKarlinBlkCreate();
5038 status = BlastKarlinBlkGappedCalcEx(search->sbp->kbp_gap_psi[index], options->gap_open, options->gap_extend, options->decline_align, search->sbp->name, &(search->error_return));
5039 if (status != 0)
5040 {
5041 retval = 1;
5042 }
5043 }
5044 else
5045 {
5046 search->sbp->kbp_gap_std[index] = BlastKarlinBlkCreate();
5047 status = BlastKarlinBlkNuclGappedCalc(search->sbp->kbp_gap_std[index], options->gap_open, options->gap_extend, options->reward, options->penalty, search->sbp->kbp_std[index], &(search->sbp->round_down), &(search->error_return));
5048 if (status != 0)
5049 retval = 1;
5050 }
5051 }
5052 }
5053
5054 search->sbp->kbp_gap = search->sbp->kbp_gap_std;
5055 search->sbp->kbp = search->sbp->kbp_std;
5056 if (search->pbp->gapped_calculation && StringCmp(prog_name, "blastn") != 0)
5057 {
5058 Int4 array_size = BlastKarlinGetMatrixValues(search->sbp->name,
5059 NULL, NULL, NULL, NULL,
5060 NULL, NULL);
5061 if ( !(array_size > 0)) {
5062 /* This can only happen in case of unsupported matrix! */
5063 sprintf(buffer,
5064 "matrix %s is not supported\n",
5065 search->sbp->name);
5066 BlastConstructErrorMessage("BLASTSetUpSearch", buffer, 2,
5067 &search->error_return);
5068 retval = 1;
5069 }
5070 if (search->sbp->kbp_ideal == NULL)
5071 search->sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(search->sbp);
5072 }
5073
5074 /* Adjust the Karlin parameters. */
5075 if (StringCmp(prog_name, "blastx") == 0 ||
5076 StringCmp(prog_name, "tblastx") == 0 ||
5077 (search->pbp->is_rps_blast && !StringCmp(prog_name, "tblastn")))
5078 {
5079 /* Make sure ideal values are used for RPS tblastn, because the previously
5080 obtained values are for the fake protein. */
5081 if (search->pbp->is_rps_blast && !StringCmp(prog_name, "tblastn"))
5082 search->sbp->kbp[0]->Lambda = search->sbp->kbp_ideal->Lambda;
5083 BlastKarlinBlkStandardCalc(search->sbp, search->first_context, search->last_context);
5084 }
5085
5086 /* If retval was set non-zero above (by the routines calculating Karlin-Altschul params),
5087 return here before these values are used.
5088 */
5089 if (retval)
5090 goto BlastSetUpReturn;
5091
5092 if (options->gapped_calculation) {
5093
5094 BLAST_KarlinBlkPtr kbp_gap =
5095 search->sbp->kbp_gap_std[search->first_context];
5096 Nlm_FloatHi alpha, beta; /*alpha and beta for the scoring system */
5097 if (StringCmp(options->program_name, "blastn") != 0)
5098 getAlphaBeta(options->matrix,&alpha,&beta,options->gapped_calculation,
5099 options->gap_open, options->gap_extend);
5100 else
5101 BlastKarlinGetNuclAlphaBeta(options->reward, options->penalty, options->gap_open,
5102 options->gap_extend, kbp_gap, options->gapped_calculation, &alpha, &beta);
5103
5104 BlastComputeLengthAdjustment(kbp_gap->K,
5105 kbp_gap->logK,
5106 alpha/kbp_gap->Lambda, beta,
5107 length,
5108 search->dblen, search->dbseq_num,
5109 &length_adjustment );
5110
5111 effective_query_length = length - length_adjustment;
5112
5113 /* AM: If concatenating queries, then compute effective lengths of
5114 individual queries. */
5115 if( search->mult_queries )
5116 {
5117 search->mult_queries->TotalLength = length;
5118 lengths_eff =
5119 (IntArray) MemNew( sizeof( Int4 )*
5120 search->mult_queries->NumQueries );
5121 length_adj_tmp =
5122 (IntArray)MemNew( sizeof( Int4 )*
5123 search->mult_queries->NumQueries );
5124
5125 for( le_iter = 0;
5126 le_iter < search->mult_queries->NumQueries;
5127 ++le_iter ) {
5128 length_tmp = search->mult_queries->QueryEnds[le_iter]
5129 - search->mult_queries->QueryStarts[le_iter]
5130 + 1;
5131 length_adj_tmp[le_iter] = 0;
5132
5133 BlastComputeLengthAdjustment(kbp_gap->K,
5134 kbp_gap->logK,
5135 alpha/kbp_gap->Lambda,
5136 beta,
5137 length_tmp,
5138 search->dblen, search->dbseq_num,
5139 &length_adj_tmp[le_iter] );
5140
5141 lengths_eff[le_iter] = length_tmp - length_adj_tmp[le_iter];
5142
5143 search->mult_queries->EffLengths[le_iter] =
5144 lengths_eff[le_iter];
5145 search->mult_queries->Adjustments[le_iter] =
5146 length_adj_tmp[le_iter];
5147
5148 if( search->mult_queries->MinLen > length_tmp )
5149 search->mult_queries->MinLen = length_tmp;
5150
5151 if( search->mult_queries->MinLenEff > lengths_eff[le_iter] )
5152 search->mult_queries->MinLenEff = lengths_eff[le_iter];
5153 }
5154 }
5155 }
5156 else /* this is an ungapped alignment */
5157 {
5158 BLAST_KarlinBlkPtr kbp = search->sbp->kbp[search->first_context];
5159
5160 BlastComputeLengthAdjustment( kbp->K, kbp->logK, 1/kbp->H, 0.0,
5161 length,
5162 search->dblen, search->dbseq_num,
5163 &length_adjustment );
5164
5165 effective_query_length = length - length_adjustment;
5166
5167 /* AM: If concatenating queries, then compute effective lengths of
5168 individual queries. */
5169 if( search->mult_queries ) {
5170 search->mult_queries->TotalLength = length;
5171 lengths_eff =
5172 (IntArray)MemNew( sizeof( Int4 )*
5173 search->mult_queries->NumQueries );
5174 length_adj_tmp =
5175 (IntArray)MemNew( sizeof( Int4 )*
5176 search->mult_queries->NumQueries );
5177
5178 for( le_iter = 0;
5179 le_iter < search->mult_queries->NumQueries;
5180 ++le_iter ) {
5181 length_tmp = search->mult_queries->QueryEnds[le_iter]
5182 - search->mult_queries->QueryStarts[le_iter]
5183 + 1;
5184 length_adj_tmp[le_iter] = 0;
5185
5186 BlastComputeLengthAdjustment( kbp->K, kbp->logK,
5187 1/kbp->H, 0.0,
5188 length_tmp,
5189 search->dblen, search->dbseq_num,
5190 &(length_adj_tmp[le_iter]) );
5191
5192 lengths_eff[le_iter] = length_tmp - length_adj_tmp[le_iter];
5193 search->mult_queries->EffLengths[le_iter] =
5194 lengths_eff[le_iter];
5195 search->mult_queries->Adjustments[le_iter] =
5196 length_adj_tmp[le_iter];
5197
5198 if( search->mult_queries->MinLen > length_tmp )
5199 search->mult_queries->MinLen = length_tmp;
5200
5201 if( search->mult_queries->MinLenEff > lengths_eff[le_iter] )
5202 search->mult_queries->MinLenEff = lengths_eff[le_iter];
5203 }
5204 }
5205 }
5206
5207 search->length_adjustment = MAX(length_adjustment, 0);
5208
5209 if (!search->dblen_eff) {
5210 search->dblen_eff =
5211 search->dblen - search->dbseq_num*search->length_adjustment;
5212 /* AM: If concatenating queries find effective db lengths for each query. */
5213 if( search->mult_queries )
5214 {
5215 for( le_iter = 0; le_iter < search->mult_queries->NumQueries;
5216 ++le_iter )
5217 {
5218 if( search->prog_number == blast_type_blastn )
5219 search->mult_queries->DbLenEff[le_iter]
5220 = MAX( 1, search->dblen
5221 - search->dbseq_num*length_adj_tmp[le_iter] );
5222 else
5223 search->mult_queries->DbLenEff[le_iter]
5224 = MAX( search->dbseq_num,
5225 search->dblen
5226 - search->dbseq_num*length_adj_tmp[le_iter] );
5227 }
5228
5229 MemFree( length_adj_tmp );
5230 }
5231 }
5232
5233 for (index=search->first_context; index<=search->last_context; index++)
5234 {
5235 search->context[index].query->effective_length = effective_query_length;
5236 }
5237
5238 /* AM: Setting up effective search spaces for individual queries. */
5239 if (search->searchsp_eff == 0)
5240 {
5241 search->searchsp_eff = ((Nlm_FloatHi) search->dblen_eff)*((Nlm_FloatHi) effective_query_length);
5242
5243 if( search->mult_queries )
5244 for( le_iter = 0; le_iter < search->mult_queries->NumQueries; ++le_iter )
5245 {
5246 search->mult_queries->SearchSpEff[le_iter]
5247 = ((Nlm_FloatHi)search->mult_queries->DbLenEff[le_iter])
5248 * ((Nlm_FloatHi)lengths_eff[le_iter]);
5249
5250 if( lengths_eff[le_iter] == search->mult_queries->MinLenEff )
5251 search->mult_queries->MinSearchSpEff
5252 = search->mult_queries->SearchSpEff[le_iter];
5253 }
5254 }
5255 else if( search->mult_queries )
5256 for( le_iter = 0; le_iter < search->mult_queries->NumQueries; ++le_iter )
5257 search->mult_queries->SearchSpEff[le_iter] = search->searchsp_eff;
5258
5259 /* The default is that cutoff_s was not set and is zero. */
5260 if (options->cutoff_s == 0)
5261 {
5262 search->pbp->cutoff_e = options->expect_value;
5263 search->pbp->cutoff_e_set = TRUE;
5264 search->pbp->cutoff_s = options->cutoff_s;
5265 search->pbp->cutoff_s_set = FALSE;
5266 }
5267 else
5268 {
5269 search->pbp->cutoff_e = options->expect_value;
5270 search->pbp->cutoff_e_set = FALSE;
5271 search->pbp->cutoff_s = options->cutoff_s;
5272 search->pbp->cutoff_s_set = TRUE;
5273 }
5274
5275 MemFree( lengths_eff ); /* AM: query concatenation: free resources */
5276
5277 /* For now e2 is set to 0.5 and cutoff_e2_set is FALSE. This is then
5278 changed to the proper values in blast_set_parameters. In the final version
5279 of this program (where more blast programs and command-line options are
5280 available) this needs to be set higher up. */
5281 if (options->cutoff_s2 == 0)
5282 {
5283 search->pbp->cutoff_e2 = options->e2;
5284 search->pbp->cutoff_e2_set = TRUE;
5285 search->pbp->cutoff_s2 = options->cutoff_s2;
5286 search->pbp->cutoff_s2_set = FALSE;
5287 }
5288 else
5289 {
5290 search->pbp->cutoff_e2 = options->e2;
5291 search->pbp->cutoff_e2_set = FALSE;
5292 search->pbp->cutoff_s2 = options->cutoff_s2;
5293 search->pbp->cutoff_s2_set = TRUE;
5294 }
5295
5296 search->pbp->discontinuous = options->discontinuous;
5297
5298
5299 /* For postion based blast. */
5300 search->pbp->ethresh = options->ethresh;
5301 search->pbp->maxNumPasses = options->maxNumPasses;
5302 search->pbp->pseudoCountConst = options->pseudoCountConst;
5303
5304 if (NlmThreadsAvailable()) /* ONly allow more than one cpu if MT compiled. */
5305 search->pbp->process_num = options->number_of_cpus;
5306 else
5307 search->pbp->process_num = 1;
5308
5309 search->pbp->cpu_limit = options->cpu_limit;
5310 search->pbp->gap_decay_rate = options->gap_decay_rate;
5311 search->pbp->gap_size = options->gap_size;
5312 search->pbp->gap_prob = options->gap_prob;
5313 search->pbp->old_stats = options->old_stats;
5314 search->pbp->use_large_gaps = options->use_large_gaps;
5315 search->pbp->number_of_bits = options->number_of_bits;
5316 search->pbp->two_pass_method = options->two_pass_method;
5317 search->pbp->multiple_hits_only = options->multiple_hits_only;
5318 search->pbp->gap_open = options->gap_open;
5319 search->pbp->gap_extend = options->gap_extend;
5320 search->pbp->decline_align = options->decline_align;
5321 search->pbp->total_hsp_limit = options->total_hsp_limit;
5322
5323 search->pbp->hsp_num_max = options->hsp_num_max;
5324 /* CHANGE HERE??? */
5325 if (search->pbp->gapped_calculation && StringCmp(search->prog_name, "blastn"))
5326 {
5327 /*
5328 search->pbp->cutoff_s2_set = TRUE;
5329 */
5330 if (StringCmp(search->prog_name, "blastn") != 0)
5331 {
5332 search->pbp->gap_x_dropoff = (BLAST_Score) (options->gap_x_dropoff*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
5333 search->pbp->gap_x_dropoff_final = (BLAST_Score) (options->gap_x_dropoff_final*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
5334
5335 /* AM: Change to support query multiplexing. */
5336 if( StringCmp( search->prog_name, "tblastn" ) == 0
5337 && search->mult_queries )
5338 {
5339 search->pbp->gap_trigger
5340 = (BLAST_Score)( ( options->gap_trigger*NCBIMATH_LN2
5341 + search->mult_queries->LogKMin )
5342 /search->mult_queries->LambdaMax );
5343 }
5344 else
5345 search->pbp->gap_trigger = (BLAST_Score) ((options->gap_trigger*NCBIMATH_LN2+search->sbp->kbp[search->first_context]->logK)/ search->sbp->kbp[search->first_context]->Lambda);
5346 }
5347 else
5348 {
5349 search->pbp->gap_x_dropoff = (BLAST_Score) (options->gap_x_dropoff*NCBIMATH_LN2 / search->sbp->kbp[search->first_context]->Lambda);
5350 search->pbp->gap_x_dropoff_final = (BLAST_Score) (options->gap_x_dropoff_final*NCBIMATH_LN2 / search->sbp->kbp[search->first_context]->Lambda);
5351 search->pbp->gap_trigger = (BLAST_Score) ((options->gap_trigger*NCBIMATH_LN2+search->sbp->kbp[search->first_context]->logK)/ search->sbp->kbp[search->first_context]->Lambda);
5352 }
5353 /* The trigger value sets the s2 cutoff. */
5354 search->pbp->cutoff_s2 = (Int4) search->pbp->gap_trigger;
5355 }
5356 else
5357 {
5358 search->pbp->gap_x_dropoff = (BLAST_Score) (options->gap_x_dropoff*NCBIMATH_LN2 / search->sbp->kbp[search->first_context]->Lambda);
5359 search->pbp->gap_x_dropoff_final = (BLAST_Score) (options->gap_x_dropoff_final*NCBIMATH_LN2 / search->sbp->kbp[search->first_context]->Lambda);
5360 search->pbp->gap_trigger = (BLAST_Score) ((options->gap_trigger*NCBIMATH_LN2+search->sbp->kbp[search->first_context]->logK)/ search->sbp->kbp[search->first_context]->Lambda);
5361 /* Set S and S2 equal if not sum stats. */
5362 if (search->pbp->do_sum_stats == FALSE)
5363 search->pbp->cutoff_s2 = search->pbp->cutoff_s;
5364 }
5365 /* Ensures that gap_x_dropoff_final is at least as large as gap_x_dropoff. */
5366 search->pbp->gap_x_dropoff_final = MAX(search->pbp->gap_x_dropoff_final, search->pbp->gap_x_dropoff);
5367
5368 /* "threshold" (first and second) must be set manually for two-pass right now.*/
5369 search->pbp->threshold_set = TRUE;
5370 search->pbp->threshold_second = options->threshold_second;
5371
5372 search->pbp->window_size = options->window_size;
5373 search->pbp->window_size_set = TRUE;
5374
5375 search->whole_query = TRUE;
5376 if (options->required_start != 0 || options->required_end != -1)
5377 {
5378 search->whole_query = FALSE;
5379 search->required_start = options->required_start;
5380 if (options->required_end != -1)
5381 search->required_end = options->required_end;
5382 else
5383 search->required_end = query_length;
5384 }
5385
5386 if (qlen <= 0)
5387 qlen = query_length;
5388
5389 /* Use DROPOFF_NUMBER_OF_BITS as the default if it's set to zero. */
5390 if (options->dropoff_1st_pass == 0)
5391 options->dropoff_1st_pass = DROPOFF_NUMBER_OF_BITS;
5392
5393 if (options->dropoff_2nd_pass == 0)
5394 options->dropoff_2nd_pass = DROPOFF_NUMBER_OF_BITS;
5395
5396 if (StringCmp(search->prog_name, "blastn") != 0)
5397 {
5398 avglen = BLAST_AA_AVGLEN;
5399 }
5400 else
5401 {
5402 avglen = BLAST_NT_AVGLEN;
5403 /* Use only one type of gap for blastn */
5404 search->pbp->ignore_small_gaps = FALSE;
5405 }
5406
5407 if (search->rdfp)
5408 {
5409 Int4 total_number;
5410 Int8 total_length;
5411
5412 readdb_get_totals(search->rdfp, &total_length, &total_number);
5413 if (total_number > 0)
5414 avglen = ((Nlm_FloatHi) total_length)/total_number;
5415 }
5416 else if (search->dblen > 0 && search->dbseq_num == 1)
5417 {
5418 avglen = search->dblen;
5419 }
5420
5421 if (blast_set_parameters(search, options->dropoff_1st_pass, options->dropoff_2nd_pass, avglen, search->searchsp_eff, options->window_size) != 0) {
5422 retval = 1;
5423 goto BlastSetUpReturn;
5424 }
5425 if (options->scalingFactor == 0.0)
5426 options->scalingFactor = 1.0;
5427
5428 if (options->scalingFactor != 0.0 && options->scalingFactor != 1.0)
5429 {
5430 search->pbp->gap_open *= options->scalingFactor;
5431 search->pbp->gap_extend *= options->scalingFactor;
5432 search->pbp->dropoff_1st_pass *= options->scalingFactor;
5433 search->pbp->dropoff_2nd_pass *= options->scalingFactor;
5434 search->pbp->gap_x_dropoff *= options->scalingFactor;
5435 search->pbp->gap_x_dropoff_final *= options->scalingFactor;
5436 search->pbp->decline_align *= options->scalingFactor;
5437 search->pbp->gap_trigger *= options->scalingFactor;
5438 search->pbp->cutoff_s *= options->scalingFactor;
5439 search->pbp->cutoff_s1 *= options->scalingFactor;
5440 search->pbp->cutoff_s2 *= options->scalingFactor;
5441 search->pbp->cutoff_s2_max *= options->scalingFactor;
5442 search->pbp->cutoff_s_first *= options->scalingFactor;
5443 search->pbp->cutoff_s_second *= options->scalingFactor;
5444 }
5445 search->pbp->scalingFactor = options->scalingFactor;
5446 if (options->is_megablast_search)
5447 search->pbp->mb_params = MegaBlastParameterBlkNew(options);
5448 search->pbp->explode_seqids = options->explode_seqids;
5449
5450 if (search->pbp->multiple_hits_only)
5451 {
5452 if (search->context[search->first_context].query->length < 2*options->wordsize)
5453 {
5454 BlastConstructErrorMessage("Blast",
5455 "Query must be at least twice wordsize for two hit mode", 2, &(search->error_return));
5456 retval = 1;
5457 goto BlastSetUpReturn;
5458 }
5459 }
5460 else
5461 {
5462 if (search->context[search->first_context].query->length < options->wordsize)
5463 {
5464 Char tmp_buffer[128];
5465 sprintf(tmp_buffer,
5466 "Query length %d is less than wordsize %d",
5467 search->context[search->first_context].query->length,
5468 options->wordsize);
5469 BlastConstructErrorMessage("Blast", buffer, 2,
5470 &(search->error_return));
5471 BlastConstructErrorMessage("Blast",
5472 tmp_buffer, 2, &(search->error_return));
5473 retval = 1;
5474 goto BlastSetUpReturn;
5475 }
5476 }
5477
5478 search->thr_info->awake_index = FALSE;
5479 if (NlmThreadsAvailable() && (search->context_factor*query_length) > INDEX_THR_MIN_SIZE) {
5480 search->thr_info->awake_index = TRUE;
5481 search->thr_info->last_tick = Nlm_GetSecs();
5482 search->thr_info->index_thr =
5483 NlmThreadCreate(index_proc, search->thr_info);
5484 search->thr_info->index_callback = callback;
5485 }
5486
5487 /* Only do this if this is not a pattern search. */
5488 if (options->isPatternSearch == FALSE && search->pbp->is_rps_blast == FALSE)
5489 {
5490 if (StrCmp(search->prog_name, "blastn"))
5491 last_index = search->last_context;
5492 else
5493 last_index = search->first_context;
5494 for (index=search->first_context; index<=last_index; index++)
5495 {
5496 if (options->threshold_second > 0)
5497 {
5498 search->wfp = search->wfp_first;
5499 if (!(search->positionBased)) /*AAS*/
5500 status = BlastFindWords(search, 0, search->context[index].query->length, options->threshold_second, (Uint1) index);
5501 else
5502 status = BlastNewFindWords(search, 0, search->context[index].query->length, options->threshold_second, (Uint1) index);
5503 if (status < 0) {
5504 search->thr_info->awake_index = FALSE;
5505 ErrPostEx(SEV_WARNING, 0, 0,
5506 "BlastFindWords returned non-zero status");
5507 retval = 1;
5508 goto BlastSetUpReturn;
5509 }
5510 }
5511 search->wfp = search->wfp_second;
5512 if (StringCmp(prog_name, "blastn") != 0)
5513 {
5514 if (search->allocated & BLAST_SEARCH_ALLOC_WFP_SECOND)
5515 {
5516 if (!(search->positionBased))
5517 status = BlastFindWords(search, 0, search->context[index].query->length, options->threshold_second, (Uint1) index);
5518 else
5519 status = BlastNewFindWords(search, 0, search->context[index].query->length, options->threshold_second, (Uint1) index);
5520 }
5521 }
5522 else
5523 {
5524 status = BlastNtFindWords(search, 0, search->context[index].query->length,
5525 (Uint1) index);
5526 }
5527
5528 search->context[index].location = ValNodeFree(search->context[index].location);
5529
5530 if (status > 0)
5531 {
5532 search->thr_info->awake_index = FALSE;
5533 sprintf(buffer, "No valid letters to be indexed on context %d", index);
5534 /* This is just a warning */
5535 BlastConstructErrorMessage("Blast", buffer, 1,
5536 &(search->error_return));
5537 }
5538 else if (status < 0)
5539 {
5540 search->thr_info->awake_index = FALSE;
5541 sprintf(buffer, "Error finding words");
5542 BlastConstructErrorMessage("Blast", buffer, 2, &(search->error_return));
5543 retval = 1;
5544 goto BlastSetUpReturn;
5545 }
5546 }
5547 if (StrCmp(search->prog_name, "blastn"))
5548 lookup_position_aux_destruct(search->wfp->lookup);
5549 else
5550 mb_lookup_position_aux_destruct(search->wfp->lookup);
5551 }
5552
5553
5554 /*
5555 Turn off the index thread by setting this flag. Don't wait for a join, as the
5556 search will take much longer than the one second for this to die.
5557 */
5558 search->thr_info->awake_index = FALSE;
5559 BlastSetUpReturn:
5560 if (private_slp && private_slp_delete)
5561 private_slp = SeqLocFree(private_slp);
5562 if (private_slp_rev)
5563 private_slp_rev = SeqLocFree(private_slp_rev);
5564
5565 return retval;
5566 }
5567
5568 Boolean
BlastGetFirstAndLastContext(CharPtr prog_name,SeqLocPtr query_slp,Int2Ptr first_context,Int2Ptr last_context,Uint1 strand_options)5569 BlastGetFirstAndLastContext(CharPtr prog_name, SeqLocPtr query_slp, Int2Ptr first_context, Int2Ptr last_context, Uint1 strand_options)
5570 {
5571 Uint1 strand;
5572
5573 if (query_slp == NULL)
5574 { /* Query was a BioseqPtr, Check strand_options. */
5575 strand = Seq_strand_both;
5576 }
5577 else
5578 {
5579 strand = SeqLocStrand(query_slp);
5580 }
5581
5582 /*
5583 Check the strand_options and use that if top or bottom is specified.
5584 otherwise use what's specified above.
5585 */
5586 if (strand_options == BLAST_TOP_STRAND)
5587 strand = Seq_strand_plus;
5588 else if (strand_options == BLAST_BOTTOM_STRAND)
5589 strand = Seq_strand_minus;
5590
5591 if (StringCmp(prog_name, "blastp") == 0
5592 || StringCmp(prog_name, "tblastn") == 0
5593 || StringCmp(prog_name, "psitblastn") == 0)
5594 {
5595 *first_context = 0;
5596 *last_context = 0;
5597 }
5598 else if (StringCmp(prog_name, "blastx") == 0 || StringCmp(prog_name, "tblastx") == 0)
5599 {
5600 if (strand == Seq_strand_unknown || strand == Seq_strand_plus || strand == Seq_strand_both)
5601 *first_context = 0;
5602 else
5603 *first_context = 3;
5604
5605 if (strand == Seq_strand_minus || strand == Seq_strand_both)
5606 *last_context = 5;
5607 else
5608 *last_context = 2;
5609 }
5610 else if (StringCmp(prog_name, "blastn") == 0)
5611 {
5612 if (strand == Seq_strand_unknown || strand == Seq_strand_plus || strand == Seq_strand_both)
5613 *first_context = 0;
5614 else
5615 *first_context = 1;
5616
5617 if (strand == Seq_strand_minus || strand == Seq_strand_both)
5618 *last_context = 1;
5619 else
5620 *last_context = 0;
5621 }
5622 return TRUE;
5623 }
5624
5625 BlastDoubleInt4Ptr
GetGisFromFile(CharPtr gifile,Int4Ptr gi_list_size)5626 GetGisFromFile (CharPtr gifile, Int4Ptr gi_list_size)
5627 {
5628 BlastDoubleInt4Ptr retval = NULL;
5629 Int4ListPtr gilist = NULL;
5630 register Int4 i;
5631
5632 if ( !(gilist = Int4ListReadFromFile(gifile)))
5633 return NULL;
5634
5635 retval = (BlastDoubleInt4Ptr) MemNew(sizeof(BlastDoubleInt4)*gilist->count);
5636 if (!retval)
5637 return retval;
5638
5639 if (gi_list_size)
5640 *gi_list_size = gilist->count;
5641
5642 for (i = 0; i < gilist->count; i++)
5643 retval[i].gi = gilist->i[i];
5644
5645 gilist = Int4ListFree(gilist);
5646
5647 return retval;
5648 }
5649
5650 BlastSearchBlkPtr LIBCALL
BLASTSetUpSearchWithReadDbInternalEx(SeqLocPtr query_slp,BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,CharPtr dbname,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,ReadDBFILEPtr rdfp)5651 BLASTSetUpSearchWithReadDbInternalEx (SeqLocPtr query_slp, BioseqPtr query_bsp,
5652 CharPtr prog_name, Int4 qlen, CharPtr
5653 dbname, BLAST_OptionsBlkPtr options, int
5654 (LIBCALLBACK *callback)PROTO((Int4 done,
5655 Int4
5656 positives)),
5657 SeqIdPtr seqid_list, BlastDoubleInt4Ptr
5658 gi_list, Int4 gi_list_total, ReadDBFILEPtr
5659 rdfp)
5660 {
5661 if (options->is_megablast_search)
5662 return MegaBlastSetUpSearchWithReadDbInternal(query_slp, query_bsp,
5663 prog_name, 0,
5664 dbname, options, callback,
5665 seqid_list, gi_list,
5666 gi_list_total, rdfp);
5667 else
5668 return BLASTSetUpSearchWithReadDbInternal(query_slp, query_bsp,
5669 prog_name, qlen,
5670 dbname, options, callback,
5671 seqid_list, gi_list,
5672 gi_list_total, rdfp);
5673 }
5674
5675
5676 BlastSearchBlkPtr
BLASTSetUpSearchWithReadDbInternal(SeqLocPtr query_slp,BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,CharPtr dbname,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,ReadDBFILEPtr rdfp)5677 BLASTSetUpSearchWithReadDbInternal (SeqLocPtr query_slp, BioseqPtr query_bsp, CharPtr prog_name, Int4 qlen, CharPtr dbname, BLAST_OptionsBlkPtr options, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total, ReadDBFILEPtr rdfp)
5678 {
5679 return BLASTSetUpSearchWithReadDbInternalMult(query_slp, query_bsp, prog_name, qlen, dbname, options, callback, seqid_list, gi_list, gi_list_total, rdfp, NULL);
5680 }
5681
5682
5683 /**
5684 * Calculate the hitlist size for preliminary alignments with a single
5685 * query, i.e. all but the final alignment with traceback. This size
5686 * is generally somewhat larger than the final hitlist size because:
5687 * - the final alignment is the most sensitive, and may improve the
5688 * score of alignments that would not otherwise be reported; and
5689 * - when composition-based statitics is used, many hits may be
5690 * dropped in the final phase
5691 */
5692 Int4
BlastSingleQueryResultSize(BLAST_OptionsBlkPtr options)5693 BlastSingleQueryResultSize(BLAST_OptionsBlkPtr options)
5694 {
5695 Int4 result_size = /* size to be returned */
5696 options->hitlist_size;
5697
5698 if (options->tweak_parameters) {
5699 /* Composition based statistics are being used. */
5700 result_size *= 2;
5701 }
5702 if ((options->is_megablast_search && options->no_traceback) ||
5703 (!options->is_megablast_search && options->gapped_calculation)) {
5704 /* This search uses preliminary alignments before the final
5705 * gapped calculation with traceback; increase the results
5706 * size. */
5707 result_size = MIN(2*result_size, result_size + 50);
5708 }
5709 return result_size;
5710 }
5711
5712
5713 BlastSearchBlkPtr
BLASTSetUpSearchWithReadDbInternalMult(SeqLocPtr query_slp,BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,CharPtr dbname,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,ReadDBFILEPtr rdfp,QueriesPtr mult_queries)5714 BLASTSetUpSearchWithReadDbInternalMult (SeqLocPtr query_slp, BioseqPtr query_bsp, CharPtr prog_name, Int4 qlen, CharPtr dbname, BLAST_OptionsBlkPtr options, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total, ReadDBFILEPtr rdfp, QueriesPtr mult_queries)
5715 /* --KM added mult_queries param */
5716
5717 {
5718
5719 BlastSearchBlkPtr search = NULL;
5720 Boolean multiple_hits, options_alloc=FALSE;
5721 Int2 status, first_context = 0, last_context = 0;
5722 Int8 dblen = 0;
5723 Int4 query_length = 0;
5724 Nlm_FloatHi searchsp_eff = 0;
5725 Int4 hitlist_size = 0;
5726 Int4 i = 0; /* AM: Query multiplexing. */
5727
5728 /* Allocate default options if none are allocated yet. */
5729 if (options == NULL) {
5730 options = BLASTOptionNew(prog_name, FALSE);
5731 options_alloc = TRUE;
5732 }
5733
5734 multiple_hits = options->multiple_hits_only;
5735 /*
5736 if (options->window_size != 0)
5737 multiple_hits = TRUE;
5738 else
5739 multiple_hits = FALSE;
5740 */
5741 BlastGetFirstAndLastContext(prog_name, query_slp, &first_context, &last_context, options->strand_option);
5742
5743 if (query_slp)
5744 query_length = SeqLocLen(query_slp);
5745 else
5746 query_length = query_bsp->length;
5747
5748 hitlist_size = BlastSingleQueryResultSize(options);
5749
5750 /* AM: Query multiplexing */
5751 if( mult_queries )
5752 {
5753 for( i = 0; i < mult_queries->NumQueries; ++i )
5754 mult_queries->result_info[i].results
5755 = (BLASTResultHitlistPtr *)MemNew(
5756 (hitlist_size + 1)*sizeof( BLASTResultHitlistPtr ) );
5757
5758 mult_queries->max_results_per_query = hitlist_size;
5759 hitlist_size *= mult_queries->NumQueries;
5760 }
5761
5762 /* On the first call query length is used for the subject length. */
5763 search = BlastSearchBlkNewExtra(options->wordsize, query_length, dbname, multiple_hits, 0, options->threshold_second, hitlist_size, prog_name, NULL, first_context, last_context, rdfp, options->window_size);
5764
5765 if (search) {
5766 readdb_get_totals_ex(search->rdfp, &(dblen), &(search->dbseq_num), TRUE);
5767
5768 if (!options->ignore_gilist)
5769 {
5770 Boolean looking_for_gis = FALSE;
5771 /* Create virtual database if any of the databases have gi lists or
5772 ordinal id masks, or if gi list is provided from options */
5773 looking_for_gis = BlastProcessGiLists(search, options, gi_list, gi_list_total);
5774
5775 /* search->thr_info->blast_gi_list will be non-NULL if gi_list or
5776 * options->gilist or options->gifile was non-NULL and therefore
5777 * intersected with any oidlists in the search->rdfp(s). If this is the
5778 * case, we need to recalculate the database length and number of
5779 * sequences */
5780 if (search->thr_info->blast_gi_list && !options->use_real_db_size)
5781 readdb_get_totals_ex3(search->rdfp, &dblen, &search->dbseq_num,
5782 FALSE, TRUE, eApproximate);
5783
5784 if (looking_for_gis && search->thr_info->blast_gi_list == NULL)
5785 {
5786 ErrPostEx(SEV_WARNING, 0, 0, "Intersection of gilist and BLAST database ID's empty");
5787 search->query_invalid = TRUE;
5788 }
5789 }
5790
5791 /* command-line/options trump alias file. */
5792 if (options->db_length > 0)
5793 dblen = options->db_length;
5794 if (options->dbseq_num > 0)
5795 search->dbseq_num = options->dbseq_num;
5796 if (options->searchsp_eff > 0)
5797 searchsp_eff = options->searchsp_eff;
5798
5799 if (StringCmp(prog_name, "tblastn") == 0 ||
5800 StringCmp(prog_name, "tblastx") == 0 ||
5801 StringCmp(prog_name, "psitblastn") == 0) {
5802 dblen /= 3;
5803 searchsp_eff /= 3.0;
5804 }
5805 search->dblen = dblen;
5806 if (options->db_length > 0)
5807 search->dblen_eff = dblen;
5808 search->searchsp_eff = searchsp_eff;
5809 /* AM: Moved next two lines here to be able to use mult_queries
5810 in BLASTSetUpSearchInternalByLoc() */
5811 /* --KM put mult_queries, from Main, into the search structure */
5812 search->mult_queries = mult_queries;
5813 status = BLASTSetUpSearchInternalByLoc (search, query_slp, query_bsp, prog_name, qlen, options, callback);
5814 if (status != 0) {
5815 ErrPostEx(SEV_WARNING, 0, 0, "SetUpBlastSearch failed.");
5816 search->query_invalid = TRUE;
5817 }
5818
5819 if (search->pbp->mb_params)
5820 search = GreedyAlignMemAlloc(search);
5821 else
5822 search->abmp = NULL;
5823
5824 if (search->rdfp->parameters & READDB_CONTENTS_ALLOCATED)
5825 search->rdfp = ReadDBCloseMHdrAndSeqFiles(search->rdfp);
5826 }
5827
5828 if (options_alloc)
5829 options = BLASTOptionDelete(options);
5830
5831 return search;
5832 }
5833
5834 /*
5835 Performs setup for a BLAST search. This function must be used
5836 with a search file accessed through readdb.
5837 */
5838
5839 BlastSearchBlkPtr LIBCALL
BLASTSetUpSearchWithReadDb(BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,CharPtr dbname,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)))5840 BLASTSetUpSearchWithReadDb(BioseqPtr query_bsp, CharPtr prog_name, Int4 qlen, CharPtr dbname, BLAST_OptionsBlkPtr options, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)))
5841
5842 {
5843 return BLASTSetUpSearchWithReadDbInternal(NULL, query_bsp, prog_name, qlen, dbname, options, callback, NULL, NULL, 0, NULL);
5844 }
5845
5846 BlastSearchBlkPtr LIBCALL
BLASTSetUpSearchWithReadDbEx(BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,CharPtr dbname,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total)5847 BLASTSetUpSearchWithReadDbEx(BioseqPtr query_bsp, CharPtr prog_name, Int4 qlen, CharPtr dbname, BLAST_OptionsBlkPtr options, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total)
5848
5849 {
5850 return BLASTSetUpSearchWithReadDbInternal (NULL, query_bsp, prog_name, qlen, dbname, options, callback, seqid_list, gi_list, gi_list_total, NULL);
5851 }
5852
5853 /*
5854 Performs setup for a BLAST search. This function must be used
5855 with a search file accessed through readdb.
5856 */
5857
5858 BlastSearchBlkPtr LIBCALL
BLASTSetUpSearchByLocWithReadDb(SeqLocPtr query_slp,CharPtr prog_name,Int4 qlen,CharPtr dbname,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)))5859 BLASTSetUpSearchByLocWithReadDb(SeqLocPtr query_slp, CharPtr prog_name, Int4 qlen, CharPtr dbname, BLAST_OptionsBlkPtr options, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)))
5860
5861 {
5862 return BLASTSetUpSearchWithReadDbInternalMult (query_slp, NULL, prog_name, qlen, dbname, options, callback, NULL, NULL, 0, NULL, NULL);
5863 /* --KM pass NULL mult_queries */
5864 }
5865
5866
5867 BlastSearchBlkPtr LIBCALL
BLASTSetUpSearchByLocWithReadDbEx(SeqLocPtr query_slp,CharPtr prog_name,Int4 qlen,CharPtr dbname,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,QueriesPtr mult_queries)5868 BLASTSetUpSearchByLocWithReadDbEx(SeqLocPtr query_slp, CharPtr prog_name, Int4 qlen, CharPtr dbname, BLAST_OptionsBlkPtr options, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total, QueriesPtr mult_queries)
5869 /* --KM added mult_queries param */
5870
5871 {
5872 return BLASTSetUpSearchWithReadDbInternalMult (query_slp, NULL, prog_name, qlen, dbname, options, callback, seqid_list, gi_list, gi_list_total, NULL, mult_queries);
5873 /* --KM pass mult_queries */
5874 }
5875 static BlastSearchBlkPtr
BLASTSetUpSearchEx(SeqLocPtr query_slp,BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,Int8 dblen,BlastAllWordPtr all_words,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)))5876 BLASTSetUpSearchEx (SeqLocPtr query_slp, BioseqPtr query_bsp, CharPtr prog_name, Int4 qlen, Int8 dblen, BlastAllWordPtr all_words, BLAST_OptionsBlkPtr options, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)))
5877
5878 {
5879 BlastSearchBlkPtr search;
5880 Boolean options_alloc=FALSE, multiple_hits;
5881 Int2 status, first_context, last_context;
5882 Int4 actual_query_length=0;
5883 Nlm_FloatHi searchsp_eff=0;
5884 Int4 hitlist_size;
5885
5886 /* Allocate default options if no are allocated yet. */
5887 if (options == NULL)
5888 {
5889 options = BLASTOptionNew(prog_name, FALSE);
5890 options_alloc = TRUE;
5891 }
5892
5893 multiple_hits = options->multiple_hits_only;
5894 /*
5895 if (options->window_size != 0)
5896 multiple_hits = TRUE;
5897 else
5898 multiple_hits = FALSE;
5899 */
5900 if (query_slp == NULL && query_bsp == NULL)
5901 return NULL;
5902
5903 if (query_slp)
5904 actual_query_length = SeqLocLen(query_slp);
5905 else if (query_bsp)
5906 actual_query_length = query_bsp->length;
5907
5908 if (qlen <= 0)
5909 {
5910 qlen = actual_query_length;
5911 }
5912
5913 /* If dblen is not set, use qlen. */
5914 if (dblen <= 0)
5915 dblen = qlen;
5916
5917 BlastGetFirstAndLastContext(prog_name, query_slp, &first_context, &last_context, options->strand_option);
5918
5919 hitlist_size = BlastSingleQueryResultSize(options);
5920
5921 /* On the first call query length is used for the subject length. */
5922 search = BlastSearchBlkNew(options->wordsize, actual_query_length, NULL, multiple_hits, 0, options->threshold_second, hitlist_size, prog_name, all_words, first_context, last_context, options->window_size);
5923
5924 if (search)
5925 {
5926 search->subject->length = dblen;
5927 /* Options setting overrides parameter. */
5928 if (options->db_length > 0)
5929 dblen = options->db_length;
5930 if (options->searchsp_eff > 0)
5931 searchsp_eff = options->searchsp_eff;
5932 if (StringCmp(prog_name, "tblastn") == 0
5933 || StringCmp(prog_name, "tblastx") == 0
5934 || StringCmp(prog_name, "psitblastn") == 0)
5935 {
5936 dblen /= 3;
5937 searchsp_eff /= 3.0;
5938 }
5939 if (options->dbseq_num > 0)
5940 search->dbseq_num = options->dbseq_num;
5941 else
5942 search->dbseq_num = (Int4) dblen/qlen;
5943
5944 if (search->dbseq_num <=0)
5945 search->dbseq_num = 1;
5946
5947 search->dblen = dblen;
5948 /* If searchsp_eff is > 0 it will be used. */
5949 search->searchsp_eff = searchsp_eff;
5950 if (options->is_megablast_search)
5951 search->pbp->mb_params = MegaBlastParameterBlkNew(options);
5952 if (search->pbp->mb_params)
5953 status = MegaBlastSetUpSearchInternalByLoc (search, query_slp, query_bsp, prog_name, qlen, options, callback);
5954 else
5955 status = BLASTSetUpSearchInternalByLoc(search, query_slp, query_bsp, prog_name, qlen, options, callback);
5956 if (status != 0)
5957 {
5958 ErrPostEx(SEV_WARNING, 0, 0, "SetUpBlastSearch failed.");
5959 search->query_invalid = TRUE;
5960 }
5961
5962 if (search->pbp->mb_params)
5963 search = GreedyAlignMemAlloc(search);
5964 }
5965
5966 if (options_alloc)
5967 options = BLASTOptionDelete(options);
5968
5969 return search;
5970 }
5971
5972 /*
5973 Performs necessary setup for a BLAST search. The arguments are:
5974
5975 - search: BlastSearchBlkPtr created by BlastSearchBlkNew
5976 - query_bsp: BioseqPtr for the query
5977 - matrix: CharPtr containing the name of the matrix
5978 - prog_name: CharPtr containing name of the program
5979 - qlen: Int4 with length of the query, if a lenght should be
5980 specified (for statistical calculations); if this argument is
5981 zero, then query_bsp->length is used.
5982 -dblen: Int8 with length of the database.
5983 - e_cutoff: BLAST_Score specifying the "expect" value.
5984 - number_of_processors: number of processors to use.
5985 - gap_decay_rate: between zero and one, related to prob. of # of HSP's.
5986 - gap_size: largest allowable gap if "small" gaps are used.
5987 - gap_prob: probability of "small" gap model being correct.
5988 - multiple_hits: if TRUE, multiple hits method is used.
5989 - window: window size for multiple hits method
5990 - threshold_second: initial hit threshold for 2nd pass
5991 - discontiguous: should discontiguous words be used?
5992 - old_stats: should the old statistics be used?
5993 - is_prot: is this a protein?
5994
5995
5996 */
5997
5998 BlastSearchBlkPtr LIBCALL
BLASTSetUpSearch(BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,Int8 dblen,BlastAllWordPtr all_words,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)))5999 BLASTSetUpSearch (BioseqPtr query_bsp, CharPtr prog_name, Int4 qlen, Int8 dblen, BlastAllWordPtr all_words, BLAST_OptionsBlkPtr options, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)))
6000
6001 {
6002 return BLASTSetUpSearchEx (NULL, query_bsp, prog_name, qlen, dblen, all_words, options, callback);
6003 }
6004
6005 BlastSearchBlkPtr LIBCALL
BLASTSetUpSearchByLoc(SeqLocPtr query_slp,CharPtr prog_name,Int4 qlen,Int8 dblen,BlastAllWordPtr all_words,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)))6006 BLASTSetUpSearchByLoc (SeqLocPtr query_slp, CharPtr prog_name, Int4 qlen, Int8 dblen, BlastAllWordPtr all_words, BLAST_OptionsBlkPtr options, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)))
6007
6008 {
6009 return BLASTSetUpSearchEx (query_slp, NULL, prog_name, qlen, dblen, all_words, options, callback);
6010 }
6011
6012 static int LIBCALLBACK
diag_compare_hsps(VoidPtr v1,VoidPtr v2)6013 diag_compare_hsps(VoidPtr v1, VoidPtr v2)
6014 {
6015 BLAST_HSPPtr h1, h2;
6016
6017 h1 = *((BLAST_HSPPtr PNTR) v1);
6018 h2 = *((BLAST_HSPPtr PNTR) v2);
6019
6020 return (h1->query.offset - h1->subject.offset) -
6021 (h2->query.offset - h2->subject.offset);
6022 }
6023
6024 /*
6025 Shifts all HSP coordinates according to where the partial sequence
6026 started in the large sequence
6027 */
AdjustOffsetsInBLASTHitList(BLAST_HitListPtr hitlist,Int4 start)6028 void AdjustOffsetsInBLASTHitList(BLAST_HitListPtr hitlist, Int4 start)
6029 {
6030 Int4 index;
6031 BLAST_HSPPtr hsp;
6032
6033 for (index=0; index<hitlist->hspcnt; index++) {
6034 hsp = hitlist->hsp_array[index];
6035 hsp->subject.offset += start;
6036 hsp->subject.end += start;
6037 hsp->subject.gapped_start += start;
6038 if (hsp->gap_info)
6039 hsp->gap_info->start2 += start;
6040 }
6041 }
6042
6043 #define DBSEQ_CHUNK_OVERLAP 100
6044 #define OVERLAP_DIAG_CLOSE 10
6045 /* This is a hard merge, i.e. the two HSPs will be merged only
6046 if they intersect
6047 */
6048 static Boolean
BLASTMergeHsps(BLAST_HSPPtr hsp1,BLAST_HSPPtr hsp2,Int4 start)6049 BLASTMergeHsps(BLAST_HSPPtr hsp1, BLAST_HSPPtr hsp2, Int4 start)
6050 {
6051 BLASTHSPSegmentPtr segments1, segments2, new_segment1, new_segment2;
6052 GapXEditScriptPtr esp1, esp2, esp;
6053 Int4 end = start + DBSEQ_CHUNK_OVERLAP - 1;
6054 Int4 min_diag, max_diag, num1, num2, dist, next_dist=0;
6055 Int4 diag1_start, diag1_end, diag2_start, diag2_end;
6056 Int4 index;
6057 Uint1 intersection_found;
6058 Uint1 op_type;
6059
6060 if (!hsp1->gap_info || !hsp2->gap_info) {
6061 /* Assume that this is an ungapped alignment, hence simply compare
6062 diagonals. Do not merge if they are on different diagonals */
6063 if (diag_compare_hsps(&hsp1, &hsp2) == 0 &&
6064 hsp1->query.end >= hsp2->query.offset) {
6065 hsp1->query.end = hsp2->query.end;
6066 hsp1->subject.end = hsp2->subject.end;
6067 hsp1->query.length = hsp1->query.end - hsp1->query.offset;
6068 hsp1->subject.length = hsp1->subject.end - hsp1->subject.offset;
6069 return TRUE;
6070 } else
6071 return FALSE;
6072 }
6073 /* Find whether these HSPs have an intersection point */
6074 segments1 = (BLASTHSPSegmentPtr) MemNew(sizeof(BLASTHSPSegment));
6075
6076 esp1 = hsp1->gap_info->esp;
6077 esp2 = hsp2->gap_info->esp;
6078
6079 segments1->q_start = hsp1->query.offset;
6080 segments1->s_start = hsp1->subject.offset;
6081 while (segments1->s_start < start) {
6082 if (esp1->op_type == GAPALIGN_INS)
6083 segments1->q_start += esp1->num;
6084 else if (segments1->s_start + esp1->num < start) {
6085 if (esp1->op_type == GAPALIGN_SUB) {
6086 segments1->s_start += esp1->num;
6087 segments1->q_start += esp1->num;
6088 } else if (esp1->op_type == GAPALIGN_DEL)
6089 segments1->s_start += esp1->num;
6090 } else
6091 break;
6092 esp1 = esp1->next;
6093 }
6094 /* Current esp is the first segment within the overlap region */
6095 segments1->s_end = segments1->s_start + esp1->num - 1;
6096 if (esp1->op_type == GAPALIGN_SUB)
6097 segments1->q_end = segments1->q_start + esp1->num - 1;
6098 else
6099 segments1->q_end = segments1->q_start;
6100
6101 new_segment1 = segments1;
6102
6103 for (esp = esp1->next; esp; esp = esp->next) {
6104 new_segment1->next = (BLASTHSPSegmentPtr)
6105 MemNew(sizeof(BLASTHSPSegment));
6106 new_segment1->next->q_start = new_segment1->q_end + 1;
6107 new_segment1->next->s_start = new_segment1->s_end + 1;
6108 new_segment1 = new_segment1->next;
6109 if (esp->op_type == GAPALIGN_SUB) {
6110 new_segment1->q_end += esp->num - 1;
6111 new_segment1->s_end += esp->num - 1;
6112 } else if (esp->op_type == GAPALIGN_INS) {
6113 new_segment1->q_end += esp->num - 1;
6114 new_segment1->s_end = new_segment1->s_start;
6115 } else {
6116 new_segment1->s_end += esp->num - 1;
6117 new_segment1->q_end = new_segment1->q_start;
6118 }
6119 }
6120
6121 /* Now create the second segments list */
6122
6123 segments2 = (BLASTHSPSegmentPtr) MemNew(sizeof(BLASTHSPSegment));
6124 segments2->q_start = hsp2->query.offset;
6125 segments2->s_start = hsp2->subject.offset;
6126 segments2->q_end = segments2->q_start + esp2->num - 1;
6127 segments2->s_end = segments2->s_start + esp2->num - 1;
6128
6129 new_segment2 = segments2;
6130
6131 for (esp = esp2->next; esp && new_segment2->s_end < end;
6132 esp = esp->next) {
6133 new_segment2->next = (BLASTHSPSegmentPtr)
6134 MemNew(sizeof(BLASTHSPSegment));
6135 new_segment2->next->q_start = new_segment2->q_end + 1;
6136 new_segment2->next->s_start = new_segment2->s_end + 1;
6137 new_segment2 = new_segment2->next;
6138 if (esp->op_type == GAPALIGN_INS) {
6139 new_segment2->s_end = new_segment2->s_start;
6140 new_segment2->q_end = new_segment2->q_start + esp->num - 1;
6141 } else if (esp->op_type == GAPALIGN_DEL) {
6142 new_segment2->s_end = new_segment2->s_start + esp->num - 1;
6143 new_segment2->q_end = new_segment2->q_start;
6144 } else if (esp->op_type == GAPALIGN_SUB) {
6145 new_segment2->s_end = new_segment2->s_start + esp->num - 1;
6146 new_segment2->q_end = new_segment2->q_start + esp->num - 1;
6147 }
6148 }
6149
6150 new_segment1 = segments1;
6151 new_segment2 = segments2;
6152 intersection_found = 0;
6153 num1 = num2 = 0;
6154 while (new_segment1 && new_segment2 && !intersection_found) {
6155 if (new_segment1->s_end < new_segment2->s_start ||
6156 new_segment1->q_end < new_segment2->q_start) {
6157 new_segment1 = new_segment1->next;
6158 num1++;
6159 continue;
6160 }
6161 if (new_segment2->s_end < new_segment1->s_start ||
6162 new_segment2->q_end < new_segment1->q_start) {
6163 new_segment2 = new_segment2->next;
6164 num2++;
6165 continue;
6166 }
6167 diag1_start = new_segment1->s_start - new_segment1->q_start;
6168 diag2_start = new_segment2->s_start - new_segment2->q_start;
6169 diag1_end = new_segment1->s_end - new_segment1->q_end;
6170 diag2_end = new_segment2->s_end - new_segment2->q_end;
6171
6172 if (diag1_start == diag1_end && diag2_start == diag2_end &&
6173 diag1_start == diag2_start) {
6174 /* Both segments substitutions, on same diagonal */
6175 intersection_found = 1;
6176 dist = new_segment2->s_end - new_segment1->s_start + 1;
6177 break;
6178 } else if (diag1_start != diag1_end && diag2_start != diag2_end) {
6179 /* Both segments gaps - must intersect */
6180 intersection_found = 3;
6181
6182 dist = new_segment2->s_end - new_segment1->s_start + 1;
6183 op_type = GAPALIGN_INS;
6184 next_dist = new_segment2->q_end - new_segment1->q_start - dist + 1;
6185 if (new_segment2->q_end - new_segment1->q_start < dist) {
6186 dist = new_segment2->q_end - new_segment1->q_start + 1;
6187 op_type = GAPALIGN_DEL;
6188 next_dist = new_segment2->s_end - new_segment1->s_start - dist + 1;
6189 }
6190 break;
6191 } else if (diag1_start != diag1_end) {
6192 max_diag = MAX(diag1_start, diag1_end);
6193 min_diag = MIN(diag1_start, diag1_end);
6194 if (diag2_start >= min_diag && diag2_start <= max_diag) {
6195 intersection_found = 2;
6196 dist = diag2_start - min_diag + 1;
6197 if (new_segment1->s_end == new_segment1->s_start)
6198 next_dist = new_segment2->s_end - new_segment1->s_end + 1;
6199 else
6200 next_dist = new_segment2->q_end - new_segment1->q_end + 1;
6201 break;
6202 }
6203 } else if (diag2_start != diag2_end) {
6204 max_diag = MAX(diag2_start, diag2_end);
6205 min_diag = MIN(diag2_start, diag2_end);
6206 if (diag1_start >= min_diag && diag1_start <= max_diag) {
6207 intersection_found = 2;
6208 next_dist = max_diag - diag1_start + 1;
6209 if (new_segment2->s_end == new_segment2->s_start)
6210 dist = new_segment2->s_start - new_segment1->s_start + 1;
6211 else
6212 dist = new_segment2->q_start - new_segment1->q_start + 1;
6213 break;
6214 }
6215 }
6216 if (new_segment1->s_end <= new_segment2->s_end) {
6217 new_segment1 = new_segment1->next;
6218 num1++;
6219 } else {
6220 new_segment2 = new_segment2->next;
6221 num2++;
6222 }
6223 }
6224
6225 if (intersection_found) {
6226 esp = NULL;
6227 for (index = 0; index < num1-1; index++)
6228 esp1 = esp1->next;
6229 for (index = 0; index < num2-1; index++) {
6230 esp = esp2;
6231 esp2 = esp2->next;
6232 }
6233 if (intersection_found < 3) {
6234 if (num1 > 0)
6235 esp1 = esp1->next;
6236 if (num2 > 0) {
6237 esp = esp2;
6238 esp2 = esp2->next;
6239 }
6240 }
6241 switch (intersection_found) {
6242 case 1:
6243 esp1->num = dist;
6244 esp1->next = esp2->next;
6245 esp2->next = NULL;
6246 break;
6247 case 2:
6248 esp1->num = dist;
6249 esp2->num = next_dist;
6250 esp1->next = esp2;
6251 if (esp)
6252 esp->next = NULL;
6253 break;
6254 case 3:
6255 esp1->num += dist;
6256 esp2->op_type = op_type;
6257 esp2->num = next_dist;
6258 esp1->next = esp2;
6259 if (esp)
6260 esp->next = NULL;
6261 break;
6262 default: break;
6263 }
6264 hsp1->query.end = hsp2->query.end;
6265 hsp1->subject.end = hsp2->subject.end;
6266 hsp1->query.length = hsp1->query.end - hsp1->query.offset;
6267 hsp1->subject.length = hsp1->subject.end - hsp1->subject.offset;
6268 }
6269
6270 return (Boolean) intersection_found;
6271 }
6272
BLASTHspContained(BLAST_HSPPtr hsp1,BLAST_HSPPtr hsp2)6273 static Boolean BLASTHspContained(BLAST_HSPPtr hsp1, BLAST_HSPPtr hsp2)
6274 {
6275 Boolean hsp_start_is_contained=FALSE, hsp_end_is_contained=FALSE;
6276
6277 if (hsp1->score > hsp2->score ||
6278 SIGN(hsp2->query.frame) != SIGN(hsp1->query.frame) ||
6279 SIGN(hsp2->subject.frame) != SIGN(hsp1->subject.frame))
6280 return FALSE;
6281
6282 if (CONTAINED_IN_HSP(hsp2->query.offset, hsp2->query.end, hsp1->query.offset, hsp2->subject.offset, hsp2->subject.end, hsp1->subject.offset) == TRUE) {
6283 hsp_start_is_contained = TRUE;
6284 }
6285 if (CONTAINED_IN_HSP(hsp2->query.offset, hsp2->query.end, hsp1->query.end, hsp2->subject.offset, hsp2->subject.end, hsp1->subject.end) == TRUE) {
6286 hsp_end_is_contained = TRUE;
6287 }
6288
6289 return (hsp_start_is_contained && hsp_end_is_contained);
6290 }
6291
6292 /*
6293 Merges the hits from different chunks of the subject sequence that
6294 have been searched separately
6295 */
6296 static BLAST_HitListPtr
BLASTMergeHitLists(BlastSearchBlkPtr search,BLAST_HitListPtr hitlist1,BLAST_HitListPtr hitlist2,Int4 start,Boolean merge_hsps)6297 BLASTMergeHitLists(BlastSearchBlkPtr search, BLAST_HitListPtr hitlist1,
6298 BLAST_HitListPtr hitlist2, Int4 start, Boolean merge_hsps)
6299 {
6300 BLAST_HSPPtr hsp, hsp_var, PNTR hspp1, PNTR hspp2;
6301 Int4 index, index1, index2;
6302 Int4 hspcnt1, hspcnt2, new_hspcnt = 0;
6303 BLAST_HSPPtr PNTR new_hsp_array;
6304
6305 if (hitlist1 == NULL) {
6306 hitlist1 = (BLAST_HitListPtr)
6307 MemDup(hitlist2, sizeof(BLAST_HitList));
6308 hitlist1->hsp_array = (BLAST_HSPPtr PNTR)
6309 MemNew(hitlist2->hspmax*sizeof(BLAST_HSPPtr));
6310 MemCpy(hitlist1->hsp_array, hitlist2->hsp_array,
6311 hitlist2->hspcnt*sizeof(BLAST_HSPPtr));
6312 return hitlist1;
6313 } else {
6314 /* In case these have changed */
6315 hitlist1->exact_match_array = hitlist2->exact_match_array;
6316 hitlist1->exact_match_max = hitlist2->exact_match_max;
6317 }
6318
6319 hspcnt1 = hspcnt2 = 0;
6320
6321 /* Put all HSPs that intersect the overlap region at the front of the
6322 respective HSP arrays. */
6323 for (index = 0; index < hitlist1->hspcnt; index++) {
6324 hsp = hitlist1->hsp_array[index];
6325 if (hsp->subject.end > start) {
6326 /* At least part of this HSP lies in the overlap strip. */
6327 hsp_var = hitlist1->hsp_array[hspcnt1];
6328 hitlist1->hsp_array[hspcnt1] = hsp;
6329 hitlist1->hsp_array[index] = hsp_var;
6330 ++hspcnt1;
6331 }
6332 }
6333 for (index = 0; index < hitlist2->hspcnt; index++) {
6334 hsp = hitlist2->hsp_array[index];
6335 if (hsp->subject.offset < start + DBSEQ_CHUNK_OVERLAP) {
6336 /* At least part of this HSP lies in the overlap strip. */
6337 hsp_var = hitlist2->hsp_array[hspcnt2];
6338 hitlist2->hsp_array[hspcnt2] = hsp;
6339 hitlist2->hsp_array[index] = hsp_var;
6340 ++hspcnt2;
6341 }
6342 }
6343 hspp1 = hitlist1->hsp_array;
6344 hspp2 = hitlist2->hsp_array;
6345
6346 HeapSort(hspp1, hspcnt1, sizeof(BLAST_HSPPtr), diag_compare_hsps);
6347 HeapSort(hspp2, hspcnt2, sizeof(BLAST_HSPPtr), diag_compare_hsps);
6348
6349 for (index=0; index<hspcnt1; index++) {
6350 for (index1=0; index1<hspcnt2; index1++) {
6351 if (hspp2[index1] &&
6352 hspp2[index1]->query.frame == hspp1[index]->query.frame &&
6353 hspp2[index1]->subject.frame == hspp1[index]->subject.frame &&
6354 ABS(diag_compare_hsps(&hspp1[index], &hspp2[index1])) <
6355 OVERLAP_DIAG_CLOSE) {
6356 if (merge_hsps) {
6357 if (BLASTMergeHsps(hspp1[index], hspp2[index1], start)) {
6358 /* Free the second HSP. */
6359 hspp2[index1] = BLAST_HSPFree(hspp2[index1]);
6360 }
6361 } else { /* No gap information available */
6362 if (BLASTHspContained(hspp1[index], hspp2[index1])) {
6363 /* Point the first HSP to the new HSP; */
6364 hspp1[index] = BLAST_HSPFree(hspp1[index]);
6365 hspp1[index] = hspp2[index1];
6366 hspp2[index1] = NULL;
6367 /* This HSP has been removed, so break out of the inner
6368 loop */
6369 break;
6370 } else if (BLASTHspContained(hspp2[index1], hspp1[index])) {
6371 hspp2[index1] = BLAST_HSPFree(hspp2[index1]);
6372 }
6373 }
6374 } else {
6375 /* This and remaining HSPs are too far from the one being
6376 checked */
6377 break;
6378 }
6379 }
6380 }
6381
6382 HspArrayPurge(hitlist2->hsp_array, hitlist2->hspcnt, FALSE);
6383
6384 /* The new number of HSPs is now the sum of the remaining counts in the
6385 two lists, but if there is a restriction on the number of HSPs to keep,
6386 it might have to be reduced. */
6387 new_hspcnt = hitlist2->hspcnt + hitlist1->hspcnt;
6388 if (search->pbp->hsp_num_max)
6389 new_hspcnt = MIN(new_hspcnt, search->pbp->hsp_num_max);
6390
6391 if (new_hspcnt >= hitlist1->hspmax-1 && hitlist1->do_not_reallocate == FALSE) {
6392 Int4 new_allocated = 2*new_hspcnt;
6393 if (search->pbp->hsp_num_max)
6394 new_allocated = MIN(new_allocated, search->pbp->hsp_num_max);
6395 new_hsp_array = (BLAST_HSPPtr PNTR)
6396 Realloc(hitlist1->hsp_array, new_allocated*sizeof(BLAST_HSPPtr));
6397 if (new_hsp_array == NULL) {
6398 ErrPostEx(SEV_WARNING, 0, 0, "UNABLE to reallocate in BlastSaveCurrentHsp for ordinal id %ld, continuing with fixed array of %ld HSP's", (long) search->subject_id, (long) hitlist1->hspmax);
6399 hitlist1->do_not_reallocate = TRUE;
6400 } else {
6401 hitlist1->hsp_array = new_hsp_array;
6402 hitlist1->hspmax = new_allocated;
6403 }
6404 new_hspcnt = MIN(new_hspcnt, hitlist1->hspmax);
6405 }
6406
6407 if (new_hspcnt >= hitlist2->hspcnt + hitlist1->hspcnt) {
6408 /* All HSPs from both arrays are saved */
6409 for (index=hitlist1->hspcnt, index1=0;
6410 index1<hitlist2->hspcnt; index1++) {
6411 if (hitlist2->hsp_array[index1] != NULL)
6412 hitlist1->hsp_array[index++] = hitlist2->hsp_array[index1];
6413 }
6414 } else {
6415 /* Not all HSPs are be saved; sort both arrays by score and save only
6416 the new_hspcnt best ones.
6417 For the merged set of HSPs, allocate array the same size as in the
6418 old HSP list. */
6419 new_hsp_array = (BLAST_HSP**)
6420 malloc(hitlist1->hspmax*sizeof(BLAST_HSP*));
6421 HeapSort(hitlist1->hsp_array, hitlist1->hspcnt,
6422 sizeof(BLAST_HSP*), score_compare_hsps);
6423 HeapSort(hitlist2->hsp_array, hitlist2->hspcnt, sizeof(BLAST_HSP*),
6424 score_compare_hsps);
6425 index1 = index2 = 0;
6426 for (index = 0; index < new_hspcnt; ++index) {
6427 if (index1 < hitlist1->hspcnt &&
6428 (index2 >= hitlist2->hspcnt ||
6429 (hitlist1->hsp_array[index1]->score >=
6430 hitlist2->hsp_array[index2]->score))) {
6431 new_hsp_array[index] = hitlist1->hsp_array[index1];
6432 ++index1;
6433 } else {
6434 new_hsp_array[index] = hitlist2->hsp_array[index2];
6435 ++index2;
6436 }
6437 }
6438 /* Free the extra HSPs that could not be saved */
6439 for ( ; index1 < hitlist1->hspcnt; ++index1) {
6440 hitlist1->hsp_array[index1] =
6441 BLAST_HSPFree(hitlist1->hsp_array[index1]);
6442 }
6443 for ( ; index2 < hitlist2->hspcnt; ++index2) {
6444 hitlist2->hsp_array[index2] =
6445 BLAST_HSPFree(hitlist2->hsp_array[index2]);
6446 }
6447 /* Point hitlist1's HSP array to the new one */
6448 hitlist1->hsp_array = (BLAST_HSP**) MemFree(hitlist1->hsp_array);
6449 hitlist1->hsp_array = new_hsp_array;
6450 }
6451
6452 hitlist1->hspcnt = index;
6453 /* Second HSP list now does not own any HSPs */
6454 hitlist2->hspcnt = 0;
6455
6456 return hitlist1;
6457 }
6458
6459 /* Remove HSPs that do not touch the overlap region and have initial evalue
6460 estimate more than 10 times higher than the cutoff.
6461 */
6462 static BlastSearchBlkPtr
BlastReapPartialHitlistByEvalue(BlastSearchBlkPtr search,Int4 start)6463 BlastReapPartialHitlistByEvalue(BlastSearchBlkPtr search, Int4 start)
6464 {
6465 BLAST_HSPPtr hsp;
6466 Int4 index, hspcnt;
6467 FloatHi searchsp_eff;
6468 BLAST_KarlinBlkPtr PNTR kbp;
6469 Int4 context;
6470 Uint4 query_num; /* AM: Support for query concatenation. */
6471
6472 if (search->pbp->gapped_calculation)
6473 kbp = search->sbp->kbp_gap;
6474 else
6475 kbp = search->sbp->kbp;
6476
6477 hspcnt = search->current_hitlist->hspcnt;
6478 for (index=0; index<hspcnt; index++) {
6479 hsp = search->current_hitlist->hsp_array[index];
6480
6481 if (hsp->subject.offset > start + DBSEQ_CHUNK_OVERLAP) {
6482 if (search->pbp->mb_params)
6483 context = BinarySearchInt4(hsp->query.offset,
6484 search->query_context_offsets,
6485 (Int4) (search->last_context+1));
6486 else
6487 context = (Int4) hsp->context;
6488
6489 /* AM: Changed to support query concatenation. */
6490 if( !search->mult_queries )
6491 searchsp_eff = (FloatHi) search->dblen_eff *
6492 (FloatHi) search->context[context].query->effective_length;
6493 else
6494 {
6495 query_num = GetQueryNum( search->mult_queries,
6496 hsp->query.offset,
6497 hsp->query.end,
6498 hsp->query.frame );
6499 searchsp_eff = search->mult_queries->SearchSpEff[query_num];
6500 }
6501
6502 if (kbp[context]) {
6503 /* kbp[context] == NULL means that this alignment has been
6504 extended across the boundary between different query sequences.
6505 Leave it like this for now */
6506 hsp->evalue = BlastKarlinStoE_simple(hsp->score, kbp[context],
6507 searchsp_eff);
6508
6509 if (hsp->evalue > 10*search->pbp->cutoff_e) {
6510 hsp = BLAST_HSPFree(hsp);
6511 search->current_hitlist->hsp_array[index] = NULL;
6512 }
6513 }
6514 }
6515 }
6516 search->current_hitlist->hspcnt =
6517 HspArrayPurge(search->current_hitlist->hsp_array, hspcnt, FALSE);
6518 return search;
6519 }
6520
6521 /*
6522 Performs a BLAST search using a sequence from obtained from readdb.
6523 */
6524 Int2 LIBCALL
BLASTPerformSearchWithReadDb(BlastSearchBlkPtr search,Int4 sequence_number)6525 BLASTPerformSearchWithReadDb (BlastSearchBlkPtr search, Int4 sequence_number)
6526
6527 {
6528 Int4 subject_length;
6529 Uint1Ptr subject_seq=NULL;
6530
6531 /* This mutex should not be necessary - readdb seems to have
6532 * synchronization issues when dealing with multiple volumes
6533 * from multiple threads. This mutex fixes the symptom. */
6534
6535 static int init_mutex = 0;
6536 static TNlmMutex wrap_readdb_mutex = 0;
6537
6538 if (! init_mutex) {
6539 init_mutex++;
6540 NlmMutexInit(& wrap_readdb_mutex);
6541 }
6542
6543 NlmMutexLock(wrap_readdb_mutex);
6544
6545 subject_length = readdb_get_sequence(search->rdfp, sequence_number, &subject_seq);
6546
6547 NlmMutexUnlock(wrap_readdb_mutex);
6548
6549 search->dblen_eff_real += MAX(subject_length-search->length_adjustment, 1);
6550 search->subject_id = sequence_number;
6551
6552 return BLASTPerformSearch(search, subject_length, subject_seq);
6553 }
6554
6555 /*
6556 Performs a BLAST search with a subject sequence that is passed in.
6557 Used when an entire database is being scanned (by
6558 BLASTPerformSearchWithReadDb) and when only two seqs are being
6559 compared.
6560 */
6561 Int2 LIBCALL
BLASTPerformSearch(BlastSearchBlkPtr search,Int4 subject_length,Uint1Ptr subject_seq)6562 BLASTPerformSearch (BlastSearchBlkPtr search, Int4 subject_length, Uint1Ptr subject_seq)
6563
6564 {
6565 Int2 status;
6566
6567 if (search->pbp->two_pass_method)
6568 {
6569 status = BLASTPerform2PassSearch(search, subject_length, subject_seq);
6570 }
6571 else
6572 {
6573 status = BLASTPerformFinalSearch(search, subject_length, subject_seq);
6574 }
6575
6576 return status;
6577 }
6578
6579 /*
6580
6581 Performs a BLAST search using the two-pass method: the first pass
6582 looks for multiple initial hits and then performs a second pass
6583 (with single hits extended) wiht a lower T value.
6584
6585 Arguments are:
6586
6587 - search: BlastSearchBlkPtr returned by SetUpBlastSearch, call
6588 SetUpBlastSearch before calling this function.
6589 - sequence_number: number assigned to sequence (by user). The
6590 "readdb" library uses this number to access the sequence.
6591 This number should be zero if it's not important.
6592 - subject_length: the length of the database sequence (not the length
6593 allocated in *subject_seq).
6594 - subject_seq: CharPtr pointing to the sequence.
6595
6596 NOTE: static variables in PerformBlastSearch for subject_seq and
6597 allocated_length are not an option as they can't be deallocated
6598 after the last call and they are NOT MP-safe.
6599 */
6600
6601 Int2 LIBCALL
BLASTPerform2PassSearch(BlastSearchBlkPtr search,Int4 subject_length,Uint1Ptr subject_seq)6602 BLASTPerform2PassSearch (BlastSearchBlkPtr search, Int4 subject_length, Uint1Ptr subject_seq)
6603
6604 {
6605 Int2 outer_frame, outer_frame_max, status, outer_frame_min;
6606 Int4 prot_length;
6607 Uint1Ptr prot_seq;
6608
6609 search->current_hitlist_purge = TRUE; /* The default. */
6610 outer_frame_max = 1;
6611
6612 if (StringCmp(search->prog_name, "tblastn") == 0
6613 || StringCmp(search->prog_name, "tblastx") == 0
6614 || StringCmp(search->prog_name, "psitblastn") == 0)
6615 {
6616 outer_frame_min = -3;
6617 outer_frame_max = 3;
6618 }
6619 else
6620 {
6621 outer_frame_min = 0;
6622 outer_frame_max = 0;
6623 }
6624
6625 for (outer_frame=outer_frame_min; outer_frame<=outer_frame_max; outer_frame++)
6626 {
6627 search->subject->frame = outer_frame;
6628 if (StringCmp("tblastn", search->prog_name) == 0
6629 || StringCmp("tblastx", search->prog_name) == 0
6630 || StringCmp("psitblastn", search->prog_name) == 0)
6631 {
6632 if (outer_frame == 0)
6633 continue;
6634 prot_seq = search->translation_buffer;
6635 prot_length = BlastTranslateUnambiguousSequence(search, subject_length, prot_seq, subject_seq, outer_frame);
6636
6637 if(search->pbp->is_rps_blast) {
6638 /* SEG Filtering of query DNA sequence */
6639
6640 rpsBlastFilterSequence(search, outer_frame,
6641 prot_seq, prot_length,
6642 subject_length);
6643 }
6644
6645 BlastSequenceAddSequence(search->subject, NULL, prot_seq, prot_length, subject_length, 0);
6646 }
6647 else
6648 {
6649 BlastSequenceAddSequence(search->subject, NULL, subject_seq-1, subject_length, subject_length, 0);
6650 }
6651
6652 search->prelim = TRUE;
6653 search->wfp = search->wfp_first;
6654
6655 /* First pass with multiple hits. */
6656 status = BlastExtendWordSearch(search, TRUE);
6657 /* status = 0 means NO significant matches found on first pass.*/
6658 if (status > 0)
6659 { /* Match found on initial pass, DO second pass. */
6660 status = BLASTPerformFinalSearch(search, subject_length, subject_seq);
6661 break;
6662 }
6663 else
6664 { /* NULL out the sequence to prevent unintentional FREE's
6665 (it's in "*subject_seq"), but delete the descriptor. */
6666 search->subject->sequence = NULL;
6667 }
6668
6669 if (status < 0)
6670 { /* Error */
6671 ErrPostEx(SEV_FATAL, 1, 0, "BlastExtendWordSearch returned non-zero status");
6672 return 1;
6673 }
6674 }
6675
6676 /* NULL out the sequence, leave in the proper length which is still needed
6677 for the significance evaluation. */
6678 search->subject->length = subject_length;
6679 search->subject->sequence = NULL;
6680 search->subject->sequence_start = NULL;
6681
6682 return 0;
6683 }
6684
/*

    Performs the final (second-pass) BLAST search of one subject sequence
    and leaves the resulting HSPs in search->current_hitlist.

    Arguments are:

    - search: BlastSearchBlkPtr returned by SetUpBlastSearch, call
      SetUpBlastSearch before calling this function.
    - subject_length: the length of the database sequence (not the length
      allocated in *subject_seq).
    - subject_seq: pointer to the sequence.

    Outline of what the code below does:

    - The current hit list is purged; a zero-length subject returns 0
      immediately.
    - For tblastn, tblastx and psitblastn the subject is translated in
      frames -3..-1 and 1..3; blastn uses frame 1, everything else
      frame 0.
    - The (translated) subject is searched in chunks of at most
      MAX_DBSEQ_LEN letters that overlap by DBSEQ_CHUNK_OVERLAP.  Each
      chunk is run through BlastExtendWordSearch, the HSPs are sorted by
      score and, depending on the parameters, gapped scores are computed.
    - With more than one chunk the per-chunk hit lists are shifted to
      whole-sequence coordinates, pruned and merged with
      BLASTMergeHitLists; for single-chunk tblastn/psitblastn searches the
      hit lists of the individual frames are merged the same way.
    - A merged list, if any, finally replaces search->current_hitlist.

    Returns 0 on success, 1 if BlastExtendWordSearch fails, or the
    negative status of BlastNTPreliminaryGappedScore /
    BlastNTGetGappedScore.

    NOTE: static variables in PerformBlastSearch for subject_seq and
    allocated_length are not an option as they can't be deallocated
    after the last call and they are NOT MP-safe.
*/

Int2 LIBCALL
BLASTPerformFinalSearch (BlastSearchBlkPtr search, Int4 subject_length, Uint1Ptr subject_seq)

{
    BLAST_HitListPtr current_hitlist, hitlist = NULL;
    Int2 inner_frame, inner_frame_max, inner_frame_min, status;
    Int4 real_length, length, start = 0, num_chunks, index;
    Uint1Ptr prot_seq;

    BlastHitListPurge(search->current_hitlist);
    if (subject_length == 0)
        /* Normal return */
        return 0;

    /* Attach the subject; subject_seq-1 is passed as the sequence start,
       so the block's sequence pointer becomes subject_seq itself. */
    BlastSequenceAddSequence(search->subject, NULL, subject_seq-1, subject_length, subject_length, 0);
    search->current_hitlist_purge = TRUE; /* The default. */
    inner_frame_max = 1;
    /* Frame range to scan, depending on the program. */
    if (search->prog_number == blast_type_tblastn
        || search->prog_number == blast_type_tblastx
        || search->prog_number == blast_type_psitblastn) {
        inner_frame_min = -3;
        inner_frame_max = 3;
    } else if (search->prog_number == blast_type_blastn) {
        inner_frame_min = 1;
        inner_frame_max = 1;
    } else {
        inner_frame_min = 0;
        inner_frame_max = 0;
    }

    /* Match found on initial pass, DO second pass. */
    for (inner_frame=inner_frame_min; inner_frame<=inner_frame_max; inner_frame++) {
        search->subject->frame = inner_frame;
        if (search->prog_number == blast_type_tblastn
            || search->prog_number == blast_type_tblastx
            || search->prog_number == blast_type_psitblastn) {
            if (inner_frame == inner_frame_min) /* Purge on 1st call. */
                search->current_hitlist_purge = TRUE;
            else
                search->current_hitlist_purge = FALSE;
            /* Frame 0 is not a translation frame. */
            if (inner_frame == 0)
                continue;
            /* Every frame starts again at the beginning of the subject. */
            start = 0;
            prot_seq = search->translation_buffer;
            real_length = BlastTranslateUnambiguousSequence(search, subject_length, prot_seq, subject_seq, inner_frame);

            if(search->pbp->is_rps_blast) {
                /* SEG Filtering of query DNA sequence */

                rpsBlastFilterSequence(search, inner_frame,
                                       prot_seq, real_length,
                                       subject_length);
            }

            /* subject seq stays the same, except for tblast[nx]. */
            BlastSequenceAddSequence(search->subject, NULL, prot_seq, real_length, subject_length, 0);
            /* Nothing to search in an empty translation. */
            if (real_length == 0)
                continue;
        } else
            real_length = subject_length;

        search->prelim = FALSE;
        /* Calculate some cutoff scores, these depend upon the seq lengths.*/
        /* For blastn and gapped calc. use the cutoff's originally found. */
        if (!search->pbp->gapped_calculation &&
            search->prog_number != blast_type_blastn) {
            CalculateSecondCutoffScore(search, search->subject->length, &search->pbp->ignore_small_gaps, &search->pbp->cutoff_s_second, &search->pbp->cutoff_big_gap);
        }

#ifdef BLAST_COLLECT_STATS
        search->second_pass_trys++;
#endif

        if (search->pbp->mb_params)
            /* sequence_start is reserved for ncbi4na encoded sequence
               in this case */
            search->subject->sequence_start = NULL;

        length = real_length;
        /* Split subject sequence into chunks if it is too long */
        num_chunks = (length - DBSEQ_CHUNK_OVERLAP) /
            (MAX_DBSEQ_LEN - DBSEQ_CHUNK_OVERLAP) + 1;
        search->subject->original_length = 0;
        if (search->pbp->mb_params && !search->rdfp) {
            /* Coming from the 2 sequences engine: save the entire
               ncbi4na sequence in search->subject->sequence_start
            */
            Uint1Ptr seq_blastna, seq_2na;
            Uint1 rem;

            /* One extra byte for the leading sentinel written below. */
            search->subject->sequence_start =
                (Uint1Ptr) MemNew(subject_length + 1);
            seq_blastna = search->subject->sequence_start;
            seq_2na = search->subject->sequence;
            rem = 3;
            *seq_blastna = (Uint1) ncbi4na_to_blastna[NULLB];
            seq_blastna++;
            /* Unpack one base per iteration; rem counts down 3..0 within
               a packed byte before moving on to the next byte. */
            for (index=0; index<subject_length; index++) {
                *seq_blastna =
                    (Uint1) ncbi4na_to_blastna[(1 << READDB_UNPACK_BASE_N(*seq_2na, rem))];
                seq_blastna++;
                if (rem>0) rem--;
                else {
                    rem = 3;
                    seq_2na++;
                }
            }
        }

        for (index=0; index<num_chunks; index++) {
            /* The last chunk may be shorter than MAX_DBSEQ_LEN. */
            length = MIN(real_length-start, MAX_DBSEQ_LEN);
            search->subject->length = length;
            /* THE BLAST SEARCH _IS_ HERE! */
            /* NOTE(review): the error returns inside this loop do not
               release a partially merged `hitlist` - confirm whether
               that can leak. */
            if (BlastExtendWordSearch(search, search->pbp->multiple_hits_only) < 0) {
                /* Error occurred in BlastExtendWordSearch */
                return 1;
            }
            /* HSP's were not saved in any special order, sort. */
            current_hitlist = search->current_hitlist;
            if (current_hitlist && current_hitlist->do_not_reallocate == FALSE)
                HeapSort(current_hitlist->hsp_array, current_hitlist->hspcnt,sizeof(BLAST_HSPPtr), score_compare_hsps);
            if (search->pbp->gapped_calculation &&
                search->prog_number != blast_type_blastn) {
                /* NOTE(review): unlike the nucleotide branch below, the
                   status of these two calls is never checked; the first
                   one is overwritten immediately. */
                status = BlastPreliminaryGappedScore(search, search->subject->sequence, search->subject->length, inner_frame);
                status = BlastGetGappedScore(search, search->subject->length, search->subject->sequence, inner_frame);
            }
#if 1
            else if (!search->pbp->do_sum_stats && !search->pbp->mb_params) {
                status = BlastNTPreliminaryGappedScore(search, search->subject->sequence, search->subject->length);
                if (status < 0)
                    return status;
                status = BlastNTGetGappedScore(search, search->subject->length, search->subject->sequence);
                if (status < 0)
                    return status;
            }
#endif
            if (num_chunks > 1) {
                /* Shift the chunk-relative offsets by the chunk start. */
                AdjustOffsetsInBLASTHitList(search->current_hitlist, start);

                if (search->current_hitlist->hspcnt > 0) {
                    search = BlastReapPartialHitlistByEvalue(search, start);
                    /* HSPs are merged (rather than only checked for
                       containment) when mb_params is set. */
                    hitlist = BLASTMergeHitLists(search, hitlist,
                                                 search->current_hitlist, start,
                                                 (search->pbp->mb_params != NULL));
                }
                /* Advance to the next chunk, keeping the overlap. */
                start += length - DBSEQ_CHUNK_OVERLAP;
                search->subject->original_length = start;
                /* For blastn the pointer advances by the letter count
                   divided by READDB_COMPRESSION_RATIO. */
                if (search->prog_number == blast_type_blastn)
                    search->subject->sequence +=
                        (length - DBSEQ_CHUNK_OVERLAP)/READDB_COMPRESSION_RATIO;
                else
                    search->subject->sequence += length - DBSEQ_CHUNK_OVERLAP;
                /* The HSPs now belong to the merged list. */
                search->current_hitlist->hspcnt =
                    search->current_hitlist->hspcnt_max = 0;
            }
            else if (search->prog_number == blast_type_tblastn ||
                     search->prog_number == blast_type_psitblastn)
            {
                /* Single chunk: accumulate the hits of the individual
                   frames, dropping HSPs contained in one another. */
                hitlist = BLASTMergeHitLists(search, hitlist, search->current_hitlist, 0, FALSE);
                MemSet((VoidPtr) search->current_hitlist->hsp_array, 0,
                       sizeof(BLAST_HSPPtr)*(search->current_hitlist->hspcnt_max));
                search->current_hitlist->hspcnt = search->current_hitlist->hspcnt_max = 0;
            }
        }
    } /* for (inner_frame=inner_frame_min; inner_frame */

    if (hitlist) {
        /* Replace the current hit list by the merged one: the old pointer
           array is released, the list header is copied over and the
           now-empty header of the merged list is freed. */
        MemFree(search->current_hitlist->hsp_array);
        MemCpy(search->current_hitlist, hitlist, sizeof(BLAST_HitList));
        MemFree(hitlist);
        /* NOTE(review): this assignment is overwritten with NULL a few
           lines below and looks like a dead store. */
        if (!search->pbp->mb_params)
            search->subject->sequence = search->subject->sequence_start + 1;
    }

    /* NULL out the sequence, leave in the proper length which is still
       needed for the significance evaluation. */
    search->subject->length = subject_length;
    search->subject->sequence = NULL;
    if (!search->pbp->mb_params)
        /* This holds the ncbi4na-encoded sequence for Mega BLAST */
        search->subject->sequence_start = NULL;

    return 0;
}
6891
6892
6893
6894 /*
6895 Gets the translation array for a give genetic code.
6896 This array is optimized for the NCBI2na alphabet.
6897 The reverse complement can also be spcified.
6898
6899 Int4 id: The number of the NCBI genetic code,
6900 CharPtr name: The name of the NCBI genetic code,
6901 (only one of id or name must be specified).
6902 Boolean reverse_complement: translations for reverse
6903 complement are needed.
6904 */
6905
6906 Uint1Ptr
GetPrivatTranslationTable(CharPtr genetic_code,Boolean reverse_complement)6907 GetPrivatTranslationTable(CharPtr genetic_code, Boolean reverse_complement)
6908
6909 {
6910 Int2 index1, index2, index3, bp1, bp2, bp3;
6911 Int2 codon;
6912 SeqMapTablePtr smtp;
6913 Uint1Ptr translation;
6914 /* The next array translate between the ncbi2na rep's and
6915 the rep's used by the genetic_code tables. The rep used by the
6916 genetic code arrays is in mapping: T=0, C=1, A=2, G=3 */
6917 static Uint1 mapping[4] = {2, /* A in ncbi2na */
6918 1, /* C in ncbi2na. */
6919 3, /* G in ncbi2na. */
6920 0 /* T in ncbi2na. */ };
6921
6922
6923 if (genetic_code == NULL)
6924 return NULL;
6925
6926 translation = MemNew(64*sizeof(Uint1));
6927 if (translation == NULL)
6928 return NULL;
6929
6930 smtp = SeqMapTableFind(Seq_code_ncbistdaa, Seq_code_ncbieaa);
6931
6932 for (index1=0; index1<4; index1++)
6933 {
6934 for (index2=0; index2<4; index2++)
6935 {
6936 for (index3=0; index3<4; index3++)
6937 {
6938 /*
6939 The reverse complement codon is saved in it's orginal (non-complement)
6940 form AND with the high-order bits reversed from the non-complement form,
6941 as this is how they appear in the sequence.
6942 */
6943 if (reverse_complement)
6944 {
6945 bp1 = 3 - index1;
6946 bp2 = 3 - index2;
6947 bp3 = 3 - index3;
6948 codon = (mapping[bp1]<<4) + (mapping[bp2]<<2) + (mapping[bp3]);
6949 translation[(index3<<4) + (index2<<2) + index1] = SeqMapTableConvert(smtp, genetic_code[codon]);
6950 }
6951 else
6952 {
6953 codon = (mapping[index1]<<4) + (mapping[index2]<<2) + (mapping[index3]);
6954 translation[(index1<<4) + (index2<<2) + index3] = SeqMapTableConvert(smtp, genetic_code[codon]);
6955 }
6956
6957 }
6958 }
6959 }
6960 return translation;
6961 } /* GetPrivatTranslationTable */
6962
6963 /* Attach the "sequence" pointer to the BlastSequenceBlkPtr. sequence_start may be the
6964 actual start of the sequence (this pointer is kept for deallocation purposes). The
6965 sequence may start before "sequence" starts as there may be a sentinel (i.e., NULLB)
6966 before the start of the sequence. When the extension function extends this way it
6967 can tell that there is a NULLB there and stop the extension.
6968
6969 */
6970
6971 Int2 LIBCALL
BlastSequenceAddSequence(BlastSequenceBlkPtr sequence_blk,Uint1Ptr sequence,Uint1Ptr sequence_start,Int4 length,Int4 original_length,Int4 effective_length)6972 BlastSequenceAddSequence (BlastSequenceBlkPtr sequence_blk, Uint1Ptr sequence, Uint1Ptr sequence_start, Int4 length, Int4 original_length, Int4 effective_length)
6973
6974 {
6975 if (sequence_blk == NULL)
6976 return 1;
6977
6978 if (sequence == NULL && sequence_start != NULL)
6979 {
6980 sequence_blk->sequence = sequence_start+1;
6981 }
6982 else if (sequence != NULL)
6983 {
6984 sequence_blk->sequence = sequence;
6985 }
6986 sequence_blk->sequence_start = sequence_start;
6987 sequence_blk->length = length;
6988 sequence_blk->original_length = original_length;
6989 sequence_blk->effective_length = effective_length;
6990
6991 return 0;
6992 }
6993
6994 /*
6995 Select the appropriate wordfinder and then perform the search.
6996 The "wordfinder's" called here look through the already found
6997 words and extend those above a set limit ("T").
6998
6999 These wordfinders operate in two modes. One is the "preliminary"
7000 mode (search->prelim is TRUE); the wordfinders attempt to extend
7001 an initial hit. If they succeed at all, they return a positive
7002 return status. On the second pass (search->prelim is FALSE)
7003 only those db seqs with hits are further investigated.
7004
7005 */
7006 static Int4
BlastExtendWordSearch(BlastSearchBlkPtr search,Boolean multiple_hits)7007 BlastExtendWordSearch(BlastSearchBlkPtr search, Boolean multiple_hits)
7008 {
7009 Int4 status=0;
7010
7011
7012 /* multiple hits structure needed to perform mh extensions. */
7013 if (multiple_hits == TRUE && search->ewp_params->multiple_hits == FALSE)
7014 return -1;
7015
7016 if (multiple_hits == TRUE)
7017 status = BlastWordFinder_mh(search);
7018 else
7019 status = BlastWordFinder(search);
7020
7021 return status;
7022 }
7023
7024 /*---------- search a sequence with 1 Context, 1 Letter per byte ---------*/
7025 static Int4
BlastWordFinder(BlastSearchBlkPtr search)7026 BlastWordFinder(BlastSearchBlkPtr search)
7027 {
7028 BLAST_WordFinderPtr wfp;
7029 LookupTablePtr lookup;
7030 BLAST_ParameterBlkPtr pbp;
7031
7032 pbp = search->pbp;
7033 if (search->prelim == TRUE)
7034 {
7035 wfp=search->wfp_first;
7036 if (pbp->cutoff_s2_set == TRUE)
7037 pbp->cutoff_s2 = pbp->cutoff_s2_max;
7038 else
7039 pbp->cutoff_s2 = MIN(pbp->cutoff_s_first, pbp->cutoff_s2_max);
7040 pbp->X = pbp->dropoff_1st_pass;
7041 }
7042 else
7043 {
7044 wfp=search->wfp_second;
7045 if (!search->pbp->mb_params) {
7046 if (pbp->cutoff_s2_set == TRUE)
7047 pbp->cutoff_s2 = pbp->cutoff_s2_max;
7048 else
7049 pbp->cutoff_s2 = MIN(pbp->cutoff_s_second,
7050 pbp->cutoff_s2_max);
7051 }
7052 pbp->X = pbp->dropoff_2nd_pass;
7053 }
7054
7055 lookup = wfp->lookup;
7056
7057 if (search->prog_number == blast_type_blastn)
7058 {
7059 if (search->pbp->mb_params)
7060 return MegaBlastWordFinder(search, lookup);
7061 else
7062 return BlastNtWordFinder(search, lookup);
7063 }
7064 else
7065 {
7066 return BlastWordFinder_contig(search, lookup);
7067 }
7068 }
7069
7070 /* This function was updated to use mod_lt instead of the original lookup table,
7071 * but was not heavily optimized or tested.
7072 * (Modifications listed in comments before BlastWordFinder_mh_contig.)
7073 * -cfj
7074 */
7075
7076 /*
7077 Search a sequence with contiguous words.
7078 */
/*
 * BlastWordFinder_contig -- scan the subject for contiguous-word hits and
 * try to extend each of them (no second hit on the diagonal is required).
 *
 * search: supplies the subject sequence, the per-context extension state
 *         (context[].ewp), the extension parameters (ewp_params) and the
 *         current hitlist, which is created here when it is still NULL and
 *         purged when current_hitlist_purge is set.
 * lookup: lookup table; lookup->mod_lt[] is indexed by the rolling word
 *         index built from char_size/mask.
 *
 * Returns 1 right away when search->prelim is set, 0 when the subject is
 * shorter than the word size, 3 when an extension routine returns
 * non-zero, and search->current_hitlist->hspcnt otherwise.
 *
 * The scan loop is written out twice: once for several contexts (the
 * context is decoded from every hit) and once for a single context.
 */
7079 static Int4
BlastWordFinder_contig(BlastSearchBlkPtr search,LookupTablePtr lookup)7080 BlastWordFinder_contig(BlastSearchBlkPtr search, LookupTablePtr lookup)
7081 {
7082 register Uint1Ptr s, s_end;
7083 register Int4 char_size, lookup_index, mask;
7084 register BLAST_Diag diag, diag_tmp, real_diag;
7085 BLAST_ExtendWordPtr ewp;
7086 BLAST_ExtendWordParamsPtr ewp_params;
7087 Boolean prelim, succeed_to_right;
7088 Uint1Ptr subject0;
7089 CfjModStruct *combo_array;
7090 Int4 index=0;
7091 register ModLookupPosition hit_info;
7092 Int2 context;
7093 Int4 q_off, s_off, offset, word_width;
7094 register Int4 bits_to_shift, min_diag_length, min_diag_mask;
7095 Int8 number_of_hits=0;
7096 register Int4 num_hits;
7097 register ModLookupPositionPtr lookup_pos;
7098 ModLAEntry *mod_lt=lookup->mod_lt;
7099 ewp_params=search->ewp_params;
7100 prelim = search->prelim;
7101
7102 /* this function only does final run, prelim is done by BlastWordFinder_mh_contig */
7103 if (prelim)
7104 return 1;
7105
7106 char_size = lookup->char_size;
7107 mask = lookup->mask;
7108 offset = ewp_params->offset;
7109 subject0 = s = search->subject->sequence;
7110 min_diag_length = ewp_params->min_diag_length;
/* bits_to_shift is loaded here but not read anywhere else in this function. */
7111 bits_to_shift = ewp_params->bits_to_shift;
7112 min_diag_mask = ewp_params->min_diag_mask;
7113
7114 /* The word_width tells how "long" a word is; if it's contiguous then it's
7115 the size of the word. */
7116 word_width = lookup->wordsize;
7117
7118
7119 if (search->current_hitlist == NULL)
7120 {
7121 search->current_hitlist = BlastHitListNew(search);
7122 }
7123 else
7124 { /* Scrub the hitlist. */
7125 if (search->current_hitlist_purge)
7126 BlastHitListPurge(search->current_hitlist);
7127 }
7128
7129 /* subject is too short to find anything! */
7130 if (word_width > search->subject->length)
7131 return 0;
7132
/* lookup_find_init returns the position to resume scanning from and stores
   the starting word index in "index".
   NOTE(review): presumably it folds in the first letters of the subject;
   confirm against its definition. */
7133 s = lookup_find_init(lookup, &index, s);
7134 lookup_index = index;
7135
7136 /* Determines when to stop scanning the database. */
7137 s_end = subject0 + search->subject->length;
7138 if ((search->last_context-search->first_context+1) > 1)
7139 {
7140 for (;;)
7141 {
/* Advance until a word with at least one lookup entry is found.  *s is
   read before the s == s_end test, so the byte at s_end must be readable. */
7142 do {
7143 /* lookup a contiguous word. */
7144 s++;
7145 lookup_index = (((lookup_index) & mask)<<char_size) + *s;
7146 if (s == s_end)
7147 goto NormalReturn;
7148 } while (mod_lt[lookup_index].num_used == 0);
7149
/* The first hit is read straight from the table entry. */
7150 num_hits = mod_lt[lookup_index].num_used;
7151 lookup_pos = mod_lt[lookup_index].entries;
7152 hit_info = *((Uint4 *) lookup_pos);
7153 lookup_pos++;
7154
/* With more than 3 hits the slot after the first hit refers to the
   remaining hits: a byte offset into mod_lookup_table_memory for
   RPS-BLAST, a pointer otherwise. */
7155 if(num_hits > 3){
7156 if(search->pbp->is_rps_blast) {
7157 lookup_pos = (ModLookupPositionPtr) ((Uint1Ptr) lookup->mod_lookup_table_memory + (Uint4) *lookup_pos);
7158 } else {
7159 lookup_pos=*((ModLookupPositionPtr PNTR) lookup_pos);
7160 }
7161 }
7162
7163 s_off = s-subject0;
/* Bias by min_diag_length; the extension routines subtract it again. */
7164 diag_tmp = s_off + min_diag_length;
7165 /* Extend each hit in the linked list */
7166 do {
7167 #ifdef BLAST_COLLECT_STATS
7168 number_of_hits++;
7169 #endif
7170 q_off = hinfo_get_pos(hit_info);
7171 context = hinfo_get_context(hit_info);
7172 num_hits--;
/* Loaded one iteration ahead; after the last hit this reads one element
   past the final hit and the value is never used. */
7173 hit_info = *((Uint4 *) lookup_pos); /* load next hit_info */
7174 lookup_pos++;
7175
7176 diag = diag_tmp - q_off;
7177
/* real_diag is the slot in combo_array used for this diagonal. */
7178 real_diag = diag & min_diag_mask;
7179 ewp=search->context[context].ewp;
7180 combo_array = ewp->combo_array;
7181
/* Skip the hit when the level recorded for this diagonal slot is already
   beyond the current subject position ("continue" goes to the num_hits test). */
7182 if (combo_array[real_diag].diag_level > (s_off+offset))
7183 {
7184 continue;
7185 }
/* Position-based searches use BlastNewWordExtend, all others BlastWordExtend. */
7186 if (!(search->positionBased)) {
7187 if (BlastWordExtend(search, q_off, s_off, word_width, diag, real_diag, &succeed_to_right, context) != 0)
7188 goto ErrorReturn;
7189 }
7190 else {
7191 if (BlastNewWordExtend(search, q_off, s_off, word_width, diag, real_diag, &succeed_to_right, context) != 0)
7192 goto ErrorReturn;
7193 }
7194 } while (num_hits>0);
7195 }
7196 }
7197 else /* only one context. */
7198 {
/* Same scan as above, with ewp/combo_array fixed once up front.
   NOTE(review): the extension calls below pass the literal 0 as context
   while ewp comes from search->first_context; this looks like it assumes
   first_context == 0 whenever there is a single context -- confirm. */
7199 ewp=search->context[search->first_context].ewp;
7200 combo_array = ewp->combo_array;
7201 for (;;)
7202 {
7203 do {
7204 /* lookup a contiguous word. */
7205 lookup_index = (((lookup_index) & mask)<<char_size);
7206 s++;
7207 lookup_index += *s;
7208 if (s == s_end)
7209 goto NormalReturn;
7210 } while (mod_lt[lookup_index].num_used == 0);
7211
7212
7213 num_hits = mod_lt[lookup_index].num_used;
7214 lookup_pos = mod_lt[lookup_index].entries;
7215 hit_info = *((Uint4 *) lookup_pos);
7216 lookup_pos++;
7217
/* More than 3 hits: follow the reference to the remaining hits (see above). */
7218 if(num_hits > 3){
7219 if(search->pbp->is_rps_blast) {
7220 lookup_pos = (ModLookupPositionPtr) ((Uint1Ptr) lookup->mod_lookup_table_memory + (Uint4) *lookup_pos);
7221 } else {
7222 lookup_pos=*((ModLookupPositionPtr PNTR) lookup_pos);
7223 }
7224 }
7225
7226 s_off = s-subject0;
7227 diag_tmp = s_off + min_diag_length;
7228 /* Extend each hit in the linked list */
7229 do {
7230 #ifdef BLAST_COLLECT_STATS
7231 number_of_hits++;
7232 #endif
7233 q_off = hinfo_get_pos(hit_info);
7234 num_hits--;
7235 hit_info = *((Uint4 *) lookup_pos); /* load next hit_info */
7236 lookup_pos++;
7237
7238 diag = diag_tmp - q_off;
7239 real_diag = diag & min_diag_mask;
7240 if (combo_array[real_diag].diag_level > (s_off+offset))
7241 {
7242 continue;
7243 }
7244 if (!(search->positionBased)) {
7245 if (BlastWordExtend(search, q_off, s_off, word_width, diag, real_diag, &succeed_to_right, 0) != 0)
7246 goto ErrorReturn;
7247 }
7248 else {
7249 if (BlastNewWordExtend(search, q_off, s_off, word_width, diag, real_diag, &succeed_to_right, 0) != 0)
7250 goto ErrorReturn;
7251 }
7252 } while (num_hits>0);
7253 }
7254 }
7255
/* number_of_hits is only incremented when BLAST_COLLECT_STATS is defined.
   The function returns early when prelim is set, so the first branch below
   is not expected to be taken. */
7256 NormalReturn:
7257 if (search->prelim)
7258 search->first_pass_hits += number_of_hits;
7259 else
7260 search->second_pass_hits += number_of_hits;
7261 BlastExtendWordExit(search);
7262 return search->current_hitlist->hspcnt;
7263
/* An extension routine returned non-zero. */
7264 ErrorReturn:
7265 BlastExtendWordExit(search);
7266 return 3;
7267 }
7268
7269 /***************************************************************************
7270 * This function is called once for each subject sequence.
7271 *
7272 * New (experimental) version of the Word Finder that makes use of
7273 * an idea of Stephen Altschul's. Multiple hits are found before a
7274 * hit is extended.
7275
7276 * "diagpos" is an Int4 array that is as long as the query sequence
7277 * and the longest database sequence. An efficient comparison of
7278 * whether a new hit is in the same window as the last one is done
7279 * by keeping track of how far along an "imaginary" array (i.e.,
7280 * increment) one is; this array changes every time this function is
7281 * called by the subject length plus window.
7282 *
7283 ***************************************************************************/
7284 /*---------- search a sequence with 1 Context, 1 Letter per byte ---------*/
7285 static Int4
BlastWordFinder_mh(BlastSearchBlkPtr search)7286 BlastWordFinder_mh(BlastSearchBlkPtr search)
7287 {
7288 BLAST_WordFinderPtr wfp;
7289 LookupTablePtr lookup;
7290 BLAST_ParameterBlkPtr pbp;
7291
7292 pbp = search->pbp;
7293 if (search->prelim == TRUE)
7294 {
7295 wfp=search->wfp_first;
7296 if (pbp->cutoff_s2_set == TRUE)
7297 pbp->cutoff_s2 = pbp->cutoff_s2_max;
7298 else
7299 pbp->cutoff_s2 = MIN(pbp->cutoff_s_first, pbp->cutoff_s2_max);
7300 pbp->X = pbp->dropoff_1st_pass;
7301 }
7302 else
7303 {
7304 wfp=search->wfp_second;
7305 if (pbp->cutoff_s2_set == TRUE)
7306 pbp->cutoff_s2 = pbp->cutoff_s2_max;
7307 else
7308 pbp->cutoff_s2 = MIN(pbp->cutoff_s_second, pbp->cutoff_s2_max);
7309 pbp->X = search->pbp->dropoff_2nd_pass;
7310 }
7311
7312 lookup = wfp->lookup;
7313
7314 if (search->prog_number == blast_type_blastn)
7315 {
7316 return BlastNtWordFinder_mh(search, lookup);
7317 }
7318 else
7319 {
7320 return BlastWordFinder_mh_contig(search, lookup);
7321 }
7322 }
7323
7324 /****************************************************************************
7325
7326 This function scans the database, looking for matches to the words in
7327 the 'lookup_index'.
7328
7329 In order to keep track of how far along a certain diagonal has already
7330 been extended an Int4 array that is twice as long as the shortest sequence
7331 is used (actually it is the power of two that is more than twice as long as the
7332 shortest sequence). There is a need for a mapping from 'true' diagonals (which would
7333 be the length of both query and database sequence) to the pseudo-diagonals
7334 used here (i.e., the Int4 array). This is done below with the 'version'.
7335 The procedure is as follows:
7336
7337 1.) diag_tmp is calculated with the 'subject' offset + min_diag_length: s_off + min_diag_length
7338 (min_diag_length is 2**n such that n is large enough to make min_diag_length larger
7339 than the shorter of the query and database sequence).
7340
7341 2.) diag is calculated with diag_tmp - q_off. This is the 'real' diagonal, except
7342 for the sum min_diag_length.
7343
7344 3.) real_diag is calculated by keeping only those bits in diag that are less than
7345 min_diag_length-1. This provides a unique number within a range.
7346
7347 4.) the version is calculated by shifting over 'bits_to_shift', which
7348 corresponds to dividing by min_diag_length.
7349
7350 5.) the combination of the version and the 'real_diag' provide a unique location
7351 for the diagonal.
7352
7353
7354
7355 modifications (cfj):
7356 - changed hash_table entries to reduce cache misses (see comments in lookup.c)
7357 - when walking through sequence, precompute next_index and prefetch the entry
7358 - combined last_hit/version/diag_level into array of struct for better locality.
7359 - eliminated the need for the version[] array by changing the value stored as diag_level.
7360 (This is done by measuring diag_level along s (rather than q) -- With this measure,
7361 previous hits found in XX[real_diag] will either really be from the same diag, or will
7362 have a diag_level and last_hit much smaller (by at least min_diag_length) than the current
7363 position.)
7364
7365
7366 ******************************************************************************/
7367
7368
7369
/*
 * BlastWordFinder_mh_contig -- multiple-hit scan of one subject sequence
 * with contiguous words.
 *
 * For every subject position whose word has entries in the lookup table,
 * each query hit is mapped to a diagonal slot (real_diag) and compared
 * with the last hit recorded in that slot (diff = s_pos - last_hit):
 *   - diff >= window:            only remember this hit (last_hit = s_pos);
 *   - wordsize <= diff < window: try an extension, provided diag_level for
 *                                the slot is not beyond s_off + offset;
 *   - diff < wordsize:           the hit is ignored.
 *
 * The scan loop is written out four times: with and without
 * lookup->pv_array, each for several contexts and for a single context.
 *
 * Returns 0 when the subject is shorter than the word size, 3 when an
 * extension routine returns non-zero, and otherwise
 * search->current_hitlist->hspcnt.  When search->prelim is set, the scan
 * stops right after the first extension attempt that leaves an HSP in the
 * hitlist.
 */
7370 static Int4
BlastWordFinder_mh_contig(BlastSearchBlkPtr search,LookupTablePtr lookup)7371 BlastWordFinder_mh_contig(BlastSearchBlkPtr search, LookupTablePtr lookup)
7372 {
7373 register Uint1Ptr s;
7374 register Uint1Ptr s_end;
7375 Uint1Ptr subject0;
7376 BLAST_Diag diag, diag_tmp, real_diag;
/* ewp_pointer[] and ca_ptr[] are indexed by context number.
   NOTE(review): both have a fixed size of 40 and are filled up to
   search->last_context without a bounds check -- confirm the maximum
   number of contexts. */
7377 BLAST_ExtendWordPtr ewp, ewp_pointer[40];
7378 Uint4 q_off;
7379 register Int4 s_off;
7380 Uint2 context;
7381 Int4 diff, offset, s_pos, window;
7382 Int4 min_diag_length, min_diag_mask;
7383 Int4 *last_hit_p;
7384 CfjModStruct *combo_array;
7385 CfjModStruct *ca_ptr[40];
7386 register ModLookupPositionPtr lookup_pos;
7387 register Uint4 hit_info;
7388
7389 Int4 char_size, lookup_index, mask, wordsize;
7390 Int4 next_lindex;
7391 Int4 * next_nhits_addr;
7392 Int4 word_width, index=0;
7393 Int8 number_of_hits=0;
7394 register Int4 num_hits;
7395 register Int4 next_nhits;
7396
7397 BLAST_ExtendWordParamsPtr ewp_params;
7398 Boolean prelim, succeed_to_right;
7399 ModLAEntry *mod_lt=lookup->mod_lt;
7400 PV_ARRAY_TYPE *pv_array = lookup->pv_array;
7401 register PV_ARRAY_TYPE PNTR next_pv_array_addr;
7402 register PV_ARRAY_TYPE next_pv_val,pv_val;
7403
7404 ewp = NULL; /* Gets rid of a warning. */
7405
7406 ewp_params=search->ewp_params;
7407 prelim = search->prelim;
7408
7409 /* The word_width tells how "long" a word is; for a contiguous word it's
7410 the length of the word. */
7411 word_width = lookup->wordsize;
7412
/* wordsize holds the same value as word_width; it is compared against
   diff below, while word_width is passed to the extension routines. */
7413 wordsize = lookup->wordsize;
7414 char_size = lookup->char_size;
7415 mask = lookup->mask;
7416 subject0 = s = (Uint1Ptr) search->subject->sequence;
7417
7418 window = ewp_params->window;
7419 offset = ewp_params->offset;
7420 min_diag_length = ewp_params->min_diag_length;
7421 min_diag_mask = ewp_params->min_diag_mask;
7422
7423 if (search->current_hitlist == NULL) {
7424 search->current_hitlist = BlastHitListNew(search);
7425 } else { /* Scrub the hitlist. */
7426 if (search->current_hitlist_purge)
7427 BlastHitListPurge(search->current_hitlist);
7428 }
7429
7430 /* subject is too short to find anything! */
7431 if (word_width > search->subject->length)
7432 return 0;
7433
7434 /* Move along string to appropriate starting point. */
7435 s = lookup_find_init(lookup, &index, s);
7436 lookup_pos=NULL;
7437 lookup_index = index;
7438 /* Determines when to stop scanning the database. */
7439 s_end = subject0 + search->subject->length;
7440
/* ---- Variant 1: pv_array available, several contexts ---- */
7441 if (pv_array) {
7442 if ((search->last_context-search->first_context+1) > 1) {
7443 /* Only used if more than one context. */
7444 for (index=search->first_context; index<=search->last_context; index++){
7445 ewp_pointer[index] = search->context[index].ewp;
7446 ca_ptr[index]=ewp_pointer[index]->combo_array;
7447 }
7448 s_off = (Int4) (s - subject0);
/* The word index and its pv_array element are computed one letter ahead
   of the scan position, so *(s+1) is read here and in the loop below. */
7449 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
7450 next_pv_val = pv_array[next_lindex>>PV_ARRAY_BTS];
7451
7452 for (;;) {
/* Advance until the pv_array bit for the current word index is set.  The
   look-ahead read of *(s+1) touches the byte at s_end on the last pass. */
7453 do {
7454 /* lookup a contiguous word. */
7455 s++;
7456 lookup_index = next_lindex;
7457
7458 if (s == s_end)
7459 goto NormalReturn;
7460
7461 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
7462 next_pv_array_addr = &pv_array[next_lindex>>PV_ARRAY_BTS];
7463 pv_val = next_pv_val;
7464 next_pv_val = *next_pv_array_addr;
7465
7466 } while ((pv_val&(((PV_ARRAY_TYPE) 1)<<(lookup_index&PV_ARRAY_MASK))) == 0);
7467
7468 num_hits = mod_lt[lookup_index].num_used;
7469
7470 /* Changed by TLM. */
/* The first hit is read straight from the table entry. */
7471 lookup_pos = mod_lt[lookup_index].entries;
7472 hit_info = *((Uint4 *) lookup_pos);
7473 lookup_pos++;
7474
/* With more than 3 hits the slot after the first hit refers to the
   remaining hits: a byte offset into mod_lookup_table_memory for
   RPS-BLAST, a pointer otherwise. */
7475 if(num_hits > 3){
7476 if(search->pbp->is_rps_blast) {
7477 lookup_pos = (ModLookupPositionPtr) ((Uint1Ptr) lookup->mod_lookup_table_memory + (Uint4) *lookup_pos);
7478 } else {
7479 lookup_pos=*((ModLookupPositionPtr PNTR) lookup_pos);
7480 }
7481 }
7482 /* Changed by TLM. */
7483 s_off = (Int4) (s - subject0);
7484
/* s_pos is the value compared with / stored in last_hit; diag_tmp carries
   the min_diag_length bias that the extension routines subtract again. */
7485 s_pos = s_off + offset;
7486 diag_tmp = s_off + min_diag_length;
7487
7488 /* Extend each hit in the linked list */
7489 /* Each link corresponds to different hits on the query sequence */
7490 do { /* for each hit */
7491
7492 #ifdef BLAST_COLLECT_STATS
7493 number_of_hits++;
7494 #endif
7495 q_off = hinfo_get_pos(hit_info);
7496 context = hinfo_get_context(hit_info);
/* Loaded one iteration ahead; after the last hit the value is never used. */
7497 hit_info = *((Uint4 *) lookup_pos); /* load next hit_info */
7498
7499 diag = diag_tmp - q_off;
7500 real_diag = (diag_tmp - q_off) & min_diag_mask;
7501 /* context dependent values */
7502 combo_array = ca_ptr[context];
7503
7504 last_hit_p = &combo_array[real_diag].last_hit;
7505 diff = s_pos - *last_hit_p;
7506 num_hits--;
7507 lookup_pos++;
7508
7509 /* diff is always greater than window for the first time in a function. */
7510 if (diff >= window) {
/* Too far from the previous hit: just remember this one. */
7511 *last_hit_p = s_pos;
7512 } else if (diff >= wordsize) {
/* Second, non-overlapping hit inside the window.  succeed_to_right is
   preset to TRUE, so when the extension is skipped because diag_level is
   already beyond this position, last_hit is reset to 0 below. */
7513 succeed_to_right = TRUE;
7514 if (combo_array[real_diag].diag_level <= (s_off+offset)) {
7515 ewp = ewp_pointer[context];
7516 ewp->actual_window = diff;
7517 if (!(search->positionBased)) {
7518 if (BlastWordExtend_prelim(search, q_off, s_off, word_width, diag, real_diag, &succeed_to_right, context) != 0)
7519 goto ErrorReturn;
7520 } else {
7521 if (BlastNewWordExtend_prelim(search, q_off, s_off, word_width, diag, real_diag, &succeed_to_right, context) != 0)
7522 goto ErrorReturn;
7523 }
/* In the preliminary pass one saved HSP is enough. */
7524 if (search->current_hitlist->hspcnt > 0 && prelim)
7525 goto NormalReturn;
7526
7527 }
7528 if (succeed_to_right)
7529 *last_hit_p = 0;
7530 else
7531 *last_hit_p = s_pos;
7532 }
7533 } while(num_hits>0); /* end for pos_cnt... */
7534 }
7535 } else { /* Only one context. */
7536
/* ---- Variant 2: pv_array available, single context ----
   Same scan as variant 1 with ewp/combo_array fixed once.  q_off is taken
   as the raw hit_info (no hinfo_get_pos) and the literal 0 is passed as
   context.
   NOTE(review): this looks like it assumes the single context is
   context 0 -- confirm. */
7537 ewp=search->context[search->first_context].ewp;
7538 combo_array=ewp->combo_array;
7539
7540 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
7541 next_pv_val = pv_array[next_lindex>>PV_ARRAY_BTS];
7542
7543 for (;;) {
7544 do {
7545 /* lookup a contiguous word. */
7546 s++;
7547 lookup_index = next_lindex;
7548
7549 if (s == s_end)
7550 goto NormalReturn;
7551
7552 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
7553 next_pv_array_addr = &pv_array[next_lindex>>PV_ARRAY_BTS];
7554 pv_val = next_pv_val;
7555 next_pv_val = *next_pv_array_addr;
7556
7557 } while ((pv_val&(((PV_ARRAY_TYPE) 1)<<(lookup_index&PV_ARRAY_MASK))) == 0);
7558
7559 num_hits = mod_lt[lookup_index].num_used;
7560
7561 /* Changed by TLM. */
7562 lookup_pos = mod_lt[lookup_index].entries;
7563 hit_info = *((Uint4 *) lookup_pos);
7564 lookup_pos++;
7565
/* More than 3 hits: follow the reference to the remaining hits. */
7566 if(num_hits > 3){
7567 if(search->pbp->is_rps_blast) {
7568 lookup_pos = (ModLookupPositionPtr) ((Uint1Ptr) lookup->mod_lookup_table_memory + (Uint4) *lookup_pos);
7569 } else {
7570 lookup_pos=*((ModLookupPositionPtr PNTR) lookup_pos);
7571 }
7572 }
7573
7574 /* Changed by TLM. */
7575 s_off = (Int4) (s - subject0);
7576 s_pos = s_off + offset;
7577 diag_tmp = s_off + min_diag_length;
7578
7579 /* Extend each hit in the linked list */
7580 /* Each link corresponds to different hits on the query sequence */
7581 do { /* for each hit */
7582
7583 #ifdef BLAST_COLLECT_STATS
7584 number_of_hits++;
7585 #endif
7586 /* Changed by TLM. */
7587 q_off = hit_info;
7588 num_hits--;
7589 hit_info = *((Uint4 *) lookup_pos); /* load next hit_info */
7590 lookup_pos++;
7591
7592 diag = diag_tmp - q_off;
7593 real_diag = diag & min_diag_mask;
7594
7595 last_hit_p = &combo_array[real_diag].last_hit;
7596 diff = s_pos - *last_hit_p;
7597
7598
7599 /* diff is always greater than window for the first time in a function. */
7600 if (diff >= window) {
7601 *last_hit_p = s_pos;
7602 } else if (diff >= wordsize) {
7603 succeed_to_right = TRUE;
7604 if (combo_array[real_diag].diag_level <= (s_off+offset)) {
7605 ewp->actual_window = diff;
7606 if (!(search->positionBased)) {
7607 if (BlastWordExtend_prelim(search, q_off, s_off, word_width, diag, real_diag, &succeed_to_right, 0) != 0)
7608 goto ErrorReturn;
7609 } else {
7610 if (BlastNewWordExtend_prelim(search, q_off, s_off, word_width, diag, real_diag, &succeed_to_right, 0) != 0)
7611 goto ErrorReturn;
7612 }
7613 if (search->current_hitlist->hspcnt > 0 && prelim)
7614 goto NormalReturn;
7615
7616 }
7617 if (succeed_to_right)
7618 *last_hit_p = 0;
7619 else
7620 *last_hit_p = s_pos;
7621 }
7622 } while(num_hits > 0); /* end for pos_cnt... */
7623
7624
7625 }
7626 }
7627 } else {
/* ---- Variant 3: no pv_array, several contexts ----
   Word presence is tested through mod_lt[].num_used, fetched one letter
   ahead of the scan position. */
7628 if ((search->last_context-search->first_context+1) > 1) {
7629 /* Only used if more than one context. */
7630 for (index=search->first_context; index<=search->last_context; index++){
7631 ewp_pointer[index] = search->context[index].ewp;
7632 ca_ptr[index]=ewp_pointer[index]->combo_array;
7633 }
7634 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
7635 next_nhits_addr=&mod_lt[next_lindex].num_used ;
7636 next_nhits=*next_nhits_addr;
7637 s_off = (Int4) (s - subject0);
7638 for (;;) {
7639 do {
7640 /* lookup a contiguous word. */
7641 lookup_index = next_lindex;
7642 s++;
7643
7644 if (s == s_end) goto NormalReturn;
7645
7646 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
7647
7648 next_nhits_addr = &mod_lt[next_lindex].num_used;
7649
/* num_hits is the count fetched on the previous step for the current word. */
7650 num_hits = next_nhits;
7651 next_nhits=*next_nhits_addr;
7652
7653 } while (num_hits == 0);
7654
7655 /* Changed by TLM. */
7656 lookup_pos = mod_lt[lookup_index].entries;
7657 hit_info = *((Uint4 *) lookup_pos);
7658 lookup_pos++;
7659
/* More than 3 hits: follow the reference to the remaining hits. */
7660 if(num_hits>3){
7661 if(search->pbp->is_rps_blast) {
7662 lookup_pos = (ModLookupPositionPtr) ((Uint1Ptr) lookup->mod_lookup_table_memory + (Uint4) *lookup_pos);
7663 } else {
7664 lookup_pos=*((ModLookupPositionPtr PNTR) lookup_pos);
7665 }
7666 }
7667 /* Changed by TLM. */
7668 s_off = (Int4) (s - subject0);
7669
7670 s_pos = s_off + offset;
7671 diag_tmp = s_off + min_diag_length;
7672
7673 /* Extend each hit in the linked list */
7674 /* Each link corresponds to different hits on the query sequence */
7675 /* printf(" dtmp:%3d ",diag_tmp); */
7676 do{ /* for each hit */
7677
7678 #ifdef BLAST_COLLECT_STATS
7679 number_of_hits++;
7680 #endif
7681 q_off = hinfo_get_pos(hit_info);
7682 context = hinfo_get_context(hit_info);
7683 hit_info = *((Uint4 *) lookup_pos); /* load next hit_info */
7684
7685 diag = diag_tmp - q_off;
7686 real_diag = (diag_tmp - q_off) & min_diag_mask;
7687 /* context dependent values */
7688 combo_array = ca_ptr[context];
7689
7690 last_hit_p = &combo_array[real_diag].last_hit;
7691 diff = s_pos - *last_hit_p;
7692 num_hits--;
7693 lookup_pos++;
7694
7695 /* diff is always greater than window for the first time in a function. */
7696 if (diff >= window) {
7697 *last_hit_p = s_pos;
7698 } else if (diff >= wordsize) {
7699 succeed_to_right = TRUE;
7700 if (combo_array[real_diag].diag_level <= (s_off+offset)) {
7701 ewp = ewp_pointer[context];
7702 ewp->actual_window = diff;
7703 if (!(search->positionBased)) {
7704 if (BlastWordExtend_prelim(search, q_off, s_off, word_width, diag, real_diag, &succeed_to_right, context) != 0)
7705 goto ErrorReturn;
7706 } else {
7707 if (BlastNewWordExtend_prelim(search, q_off, s_off, word_width, diag, real_diag, &succeed_to_right, context) != 0)
7708 goto ErrorReturn;
7709 }
7710 if (search->current_hitlist->hspcnt > 0 && prelim)
7711 goto NormalReturn;
7712
7713 }
7714 if (succeed_to_right)
7715 *last_hit_p = 0;
7716 else
7717 *last_hit_p = s_pos;
7718 }
7719 } while(num_hits>0); /* end for pos_cnt... */
7720 }
7721 } else { /* Only one context. */
7722
/* ---- Variant 4: no pv_array, single context ----
   As variant 3, with ewp/combo_array fixed once, q_off taken as the raw
   hit_info and the literal 0 passed as context (same assumption as in
   variant 2). */
7723 ewp=search->context[search->first_context].ewp;
7724 combo_array=ewp->combo_array;
7725
7726 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
7727 /* CHanged by TLM.
7728 next_nhits_addr=&mod_lt[next_lindex].num_used ;
7729 next_nhits=*next_nhits_addr;
7730 */
7731 next_nhits=mod_lt[next_lindex].num_used ;
7732 for (;;) {
7733 do {
7734 /* lookup a contiguous word. */
7735 lookup_index = next_lindex;
7736 s++;
7737 if (s == s_end) goto NormalReturn;
7738
7739 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
7740 /* CHanged by TLM.
7741 next_nhits_addr = &mod_lt[next_lindex].num_used;
7742 */
7743
7744 num_hits = next_nhits;
7745 next_nhits=mod_lt[next_lindex].num_used;
7746
7747 } while (num_hits == 0);
7748
7749
7750 /* Changed by TLM. */
7751 lookup_pos = mod_lt[lookup_index].entries;
7752 hit_info = *((Uint4 *) lookup_pos);
7753 lookup_pos++;
7754
/* More than 3 hits: follow the reference to the remaining hits. */
7755 if(num_hits > 3){
7756 if(search->pbp->is_rps_blast) {
7757 lookup_pos = (ModLookupPositionPtr) ((Uint1Ptr) lookup->mod_lookup_table_memory + (Uint4) *lookup_pos);
7758 } else {
7759 lookup_pos=*((ModLookupPositionPtr PNTR) lookup_pos);
7760 }
7761 }
7762
7763 /* Changed by TLM. */
7764 s_off = (Int4) (s - subject0);
7765 s_pos = s_off + offset;
7766 diag_tmp = s_off + min_diag_length;
7767
7768 /* Extend each hit in the linked list */
7769 /* Each link corresponds to different hits on the query sequence */
7770 do { /* for each hit */
7771
7772 #ifdef BLAST_COLLECT_STATS
7773 number_of_hits++;
7774 #endif
7775 /* Changed by TLM. */
7776 q_off = hit_info;
7777 num_hits--;
7778 hit_info = *((Uint4 *) lookup_pos); /* load next hit_info */
7779 lookup_pos++;
7780
7781 diag = diag_tmp - q_off;
7782 real_diag = diag & min_diag_mask;
7783
7784 last_hit_p = &combo_array[real_diag].last_hit;
7785 diff = s_pos - *last_hit_p;
7786
7787 /* diff is always greater than window for the first time in a function. */
7788 if (diff >= window) {
7789 *last_hit_p = s_pos;
7790 } else if (diff >= wordsize) {
7791 succeed_to_right = TRUE;
7792 if (combo_array[real_diag].diag_level <= (s_off+offset)) {
7793 ewp->actual_window = diff;
7794 if (!(search->positionBased)) {
7795 if (BlastWordExtend_prelim(search, q_off, s_off, word_width, diag, real_diag, &succeed_to_right, 0) != 0)
7796 goto ErrorReturn;
7797 } else {
7798 if (BlastNewWordExtend_prelim(search, q_off, s_off, word_width, diag, real_diag, &succeed_to_right, 0) != 0)
7799 goto ErrorReturn;
7800 }
7801 if (search->current_hitlist->hspcnt > 0 && prelim)
7802 goto NormalReturn;
7803
7804 }
7805 if (succeed_to_right)
7806 *last_hit_p = 0;
7807 else
7808 *last_hit_p = s_pos;
7809 }
7810 } while(num_hits>0); /* end for pos_cnt... */
7811 } /* for(;;) */
7812 }
7813 }
7814
/* number_of_hits is only incremented when BLAST_COLLECT_STATS is defined. */
7815 NormalReturn:
7816 if (search->prelim)
7817 search->first_pass_hits += number_of_hits;
7818 else
7819 search->second_pass_hits += number_of_hits;
7820 BlastExtendWordExit(search);
7821 return search->current_hitlist->hspcnt;
7822
/* An extension routine returned non-zero. */
7823 ErrorReturn:
7824 BlastExtendWordExit(search);
7825 return 3;
7826 }
7827
7828 /* BlastWordExtend -- extend a word-sized hit to a longer match */
7829 static Int2
BlastWordExtend(BlastSearchBlkPtr search,Int4 q_off,Int4 s_off,Int4 word_width,BLAST_Diag diag,BLAST_Diag real_diag,Boolean PNTR succeed_to_right,Int2 context)7830 BlastWordExtend(BlastSearchBlkPtr search, Int4 q_off, Int4 s_off, Int4 word_width, BLAST_Diag diag, BLAST_Diag real_diag, Boolean PNTR succeed_to_right, Int2 context)
7831 {
7832 BLAST_ExtendWordPtr ewp;
7833 BLAST_ParameterBlkPtr pbp;
7834 BLAST_ScoreBlkPtr sbp;
7835 BLAST_Score leftsum, rightsum, rightscore, leftscore;
7836 Uint1Ptr query;
7837 register Uint1Ptr q, s;
7838 register Uint1Ptr q_right, q_left, s_left, q_best_right, q_best_left;
7839 register BLAST_Score score, sum;
7840 register BLAST_ScorePtr PNTR matrix;
7841 register BLAST_Score x, X;
7842
7843
7844 q_best_left = NULL; /* Gets rid of warning. */
7845 q_best_right = NULL; /* Gets rid of warning. */
7846
7847 #ifdef BLAST_COLLECT_STATS
7848 if (search->prelim)
7849 search->first_pass_extends++;
7850 else
7851 search->second_pass_extends++;
7852 #endif
7853
7854 *succeed_to_right = FALSE;
7855
7856 ewp=search->context[context].ewp;
7857
7858 diag -= search->ewp_params->min_diag_length;
7859
7860 sbp=search->sbp;
7861 pbp=search->pbp;
7862
7863 query = search->context[context].query->sequence;
7864 q = query + q_off;
7865 s = search->subject->sequence + s_off;
7866
7867 X=pbp->X;
7868 matrix = sbp->matrix;
7869
7870 score=0;
7871 sum = 0;
7872 q_left = q - word_width;
7873 q_right = q;
7874
7875 /* Look for the highest scoring region in the initial word. */
7876 while (q > q_left)
7877 {
7878 if ((sum += matrix[*q][*s]) > score)
7879 {
7880 score = sum;
7881 q_best_right = q_right;
7882 q_best_left = q;
7883 }
7884 else if (sum <= 0)
7885 {
7886 sum = 0;
7887 q_right = q-1;
7888 }
7889 q--; s--;
7890 }
7891
7892 leftsum = rightsum = rightscore = 0;
7893
7894 /* q_left is the where the "attempted" extension along the query was
7895 stopped (and may be picked up again if the "goto Extend_Left" is used).
7896 q_best_left is the "best" extension along the query that should be
7897 reported. Analogous logic applies to q_right and q_best_right. */
7898
7899 q_left = q_best_left;
7900 q_right = q_best_right;
7901
7902 q = q_left;
7903 s = search->subject->sequence + (q - query) + diag;
7904 sum = leftsum;
7905
7906 x = X;
7907 while (sum > x)
7908 {
7909 q--; s--;
7910 if ((sum += matrix[*q][*s]) > 0)
7911 {
7912 do {
7913 score += sum;
7914 q_best_left = q;
7915 q--; s--;
7916 } while ((sum = matrix[*q][*s]) > 0);
7917 }
7918 }
7919
7920 if (score > rightscore && rightsum > X && -rightscore > X)
7921 {
7922 leftscore = score;
7923 leftsum = sum;
7924 q_left = q;
7925
7926 q = q_right;
7927 s = search->subject->sequence + (q - query) + diag;
7928 sum = rightsum;
7929
7930 /* "score" is actually the "maxscore", if sum drops by "score", then the
7931 total new score is zero and the extension can stop. */
7932 if ((x = -score) < X)
7933 x = X;
7934
7935 while (sum > x)
7936 {
7937 q++; s++;
7938 if ((sum += matrix[*q][*s]) > 0)
7939 {
7940 do {
7941 score += sum;
7942 q_best_right = q;
7943 q++; s++;
7944 } while ((sum = matrix[*q][*s]) > 0);
7945 /* do this if score changes. */
7946 if ((x = -score) < X)
7947 x = X;
7948 }
7949 }
7950
7951 q_right = q;
7952 }
7953
7954 /* Record how far this diagonal has been traversed,
7955 "q_right" was the last position on the query sequence.
7956 ewp_params->offset is added to provide the proper "zero-point" */
7957 ewp->combo_array[real_diag].diag_level = q_right - query - q_off + word_width + s_off + search->ewp_params->offset;
7958
7959 if (score >= pbp->cutoff_s2) /* Score is reportable */
7960 {
7961
7962 #ifdef BLAST_COLLECT_STATS
7963 if (search->prelim)
7964 search->first_pass_good_extends++;
7965 else
7966 search->second_pass_good_extends++;
7967 #endif
7968 s_left = search->subject->sequence + (q_best_left - query) + diag;
7969 BlastSaveCurrentHsp(search, score, (q_best_left-query), (s_left-search->subject->sequence), (q_best_right-q_best_left+1), context);
7970 }
7971
7972 return 0;
7973 }
7974 /*AAS*/
7975 /* BlastWordExtend -- extend a word-sized hit to a longer match,
7976 BlastNewWordExtend is position based */
7977 static Int2
BlastNewWordExtend(BlastSearchBlkPtr search,Int4 q_off,Int4 s_off,Int4 word_width,BLAST_Diag diag,BLAST_Diag real_diag,Boolean PNTR succeed_to_right,Int2 context)7978 BlastNewWordExtend(BlastSearchBlkPtr search, Int4 q_off, Int4 s_off, Int4 word_width, BLAST_Diag diag, BLAST_Diag real_diag, Boolean PNTR succeed_to_right, Int2 context)
7979 {
7980 BLAST_ExtendWordPtr ewp;
7981 BLAST_ParameterBlkPtr pbp;
7982 BLAST_Score leftsum, rightsum, rightscore, leftscore;
7983 Uint1Ptr query;
7984 register Uint1Ptr q, s;
7985 register Uint1Ptr q_right, q_left, s_left, q_best_right, q_best_left;
7986 register BLAST_Score score, sum;
7987 register BLAST_Score x, X;
7988
7989
7990 #ifdef BLAST_COLLECT_STATS
7991 if (search->prelim)
7992 search->first_pass_extends++;
7993 else
7994 search->second_pass_extends++;
7995 #endif
7996
7997 *succeed_to_right = FALSE;
7998
7999 ewp=search->context[context].ewp;
8000
8001 diag -= search->ewp_params->min_diag_length;
8002
8003 pbp=search->pbp;
8004
8005 query = search->context[context].query->sequence;
8006 q = query + q_off;
8007 s = search->subject->sequence + s_off;
8008
8009 X=pbp->X;
8010
8011 score=0;
8012 sum = 0;
8013 q_left = q - word_width;
8014 q_right = q;
8015 q_best_left = q;
8016 q_best_right = q; /*AAS*/
8017
8018 /* Look for the highest scoring region in the initial word. */
8019 while (q > q_left)
8020 {
8021 if ((sum += MtrxScorePosSearch(search->sbp,
8022 (Int4) (q - query),*s)) > score)
8023 {
8024 score = sum;
8025 q_best_right = q_right;
8026 q_best_left = q;
8027 }
8028 else if (sum <= 0)
8029 {
8030 sum = 0;
8031 q_right = q-1;
8032 }
8033 q--; s--;
8034 }
8035
8036 if ((x = -score) < X)
8037 x = X;
8038
8039 leftsum = rightsum = rightscore = 0;
8040
8041 /* q_left is the where the "attempted" extension along the query was
8042 stopped (and may be picked up again if the "goto Extend_Left" is used).
8043 q_best_left is the "best" extension along the query that should be
8044 reported. Analogous logic applies to q_right and q_best_right. */
8045
8046 q_left = q_best_left;
8047 q_right = q_best_right;
8048
8049 q = q_left;
8050 s = search->subject->sequence + (q - query) + diag;
8051 sum = leftsum;
8052 x = X;
8053
8054 do
8055 {
8056 q--; s--;
8057 if (((q -query) >=0) &&
8058 (sum += MtrxScorePosSearch(search->sbp,
8059 (Int4) (q - query),*s)) > 0)
8060 {
8061 do {
8062 score += sum;
8063 q_best_left = q;
8064 q--; s--;
8065 } while (((q -query) >= 0) &&
8066 ((sum = MtrxScorePosSearch(search->sbp,
8067 (Int4) (q - query),*s)) > 0));
8068 }
8069 } while (((q -query) >= 0) && (sum >= x));
8070
8071
8072 if (score > rightscore && rightsum > X && -rightscore > X)
8073 {
8074 leftscore = score;
8075 leftsum = sum;
8076 q_left = q;
8077
8078 q = q_right;
8079 s = search->subject->sequence + (q - query) + diag;
8080 sum = rightsum;
8081
8082 /* "score" is actually the "maxscore", if sum drops by "score", then the
8083 total new score is zero and the extension can stop. */
8084 if ((x = -score) < X)
8085 x = X;
8086
8087 do
8088 {
8089 q++; s++;
8090 if ((sum += MtrxScorePosSearch(search->sbp,
8091 (Int4) (q - query),*s)) > 0)
8092 {
8093 do {
8094 score += sum;
8095 q_best_right = q;
8096 q++; s++;
8097 } while ((sum = MtrxScorePosSearch(search->sbp,
8098 (Int4) (q - query),*s)) > 0);
8099 /* do this if score changes. */
8100 if ((x = -score) < X)
8101 x = X;
8102 }
8103 } while (sum >= x);
8104
8105 q_right = q;
8106 }
8107
8108 /* Record how far this diagonal has been traversed,
8109 "q_right" was the last position on the query sequence.
8110 ewp_params->offset is added to provide the proper "zero-point" */
8111 ewp->combo_array[real_diag].diag_level = q_right - query - q_off + word_width + s_off + search->ewp_params->offset;
8112
8113 if (score >= pbp->cutoff_s2) /* Score is reportable */
8114 {
8115
8116 #ifdef BLAST_COLLECT_STATS
8117 if (search->prelim)
8118 search->first_pass_good_extends++;
8119 else
8120 search->second_pass_good_extends++;
8121 #endif
8122 s_left = search->subject->sequence + (q_best_left - query) + diag;
8123 BlastSaveCurrentHsp(search, score, (q_best_left-query), (s_left-search->subject->sequence), (q_best_right-q_best_left+1), context);
8124 }
8125
8126 return 0;
8127 }
8128
8129
8130
8131 /* BlastWordExtend_prelim -- for timing purposes. */
8132 static Int2
BlastWordExtend_prelim(BlastSearchBlkPtr search,Int4 q_off,Int4 s_off,Int4 word_width,BLAST_Diag diag,BLAST_Diag real_diag,Boolean PNTR succeed_to_right,Int2 context)8133 BlastWordExtend_prelim(BlastSearchBlkPtr search, Int4 q_off, Int4 s_off, Int4 word_width, BLAST_Diag diag, BLAST_Diag real_diag, Boolean PNTR succeed_to_right, Int2 context)
8134 {
8135 BLAST_ExtendWordPtr ewp;
8136 BLAST_ParameterBlkPtr pbp;
8137 BLAST_ScoreBlkPtr sbp;
8138 register Uint1Ptr q, s, query;
8139 register Uint1Ptr q_right, q_left, s_left, q_best_right, q_best_left;
8140 register BLAST_Score score, sum;
8141 register BLAST_ScorePtr PNTR matrix;
8142 register BLAST_Score x, X;
8143 Uint4 query_num; /* AM: Support for query multiplexing. */
8144
8145
8146
8147 q_best_left = NULL; /* Gets rid of warning. */
8148 q_best_right = NULL; /* Gets rid of warning. */
8149
8150 #ifdef BLAST_COLLECT_STATS
8151 if (search->prelim)
8152 search->first_pass_extends++;
8153 else
8154 search->second_pass_extends++;
8155 #endif
8156
8157 *succeed_to_right = FALSE;
8158
8159 ewp=search->context[context].ewp;
8160
8161 diag -= search->ewp_params->min_diag_length;
8162
8163 sbp=search->sbp;
8164 pbp=search->pbp;
8165
8166 query = search->context[context].query->sequence;
8167 q = query + q_off;
8168 s = search->subject->sequence + s_off;
8169
8170 /* AM: Support for query multiplexing. */
8171 if( search->prog_number == blast_type_tblastn && search->mult_queries )
8172 {
8173 query_num = GetQueryNum( search->mult_queries, q_off - word_width + 1,
8174 q_off + 1, 0 );
8175 X = search->mult_queries->dropoff_2nd_pass_array[query_num];
8176 }
8177 else X=pbp->X;
8178
8179 matrix = sbp->matrix;
8180
8181 score=0;
8182 sum = 0;
8183 q_left = q - word_width;
8184 q_right = q;
8185
8186 /* Look for the highest scoring region in the initial word. */
8187 while (q > q_left)
8188 {
8189 sum += matrix[*q][*s];
8190 if (sum > score)
8191 {
8192 score = sum;
8193 q_best_right = q_right;
8194 q_best_left = q;
8195 }
8196 else if (sum <= 0)
8197 {
8198 sum = 0;
8199 q_right = q-1;
8200 }
8201 q--; s--;
8202 }
8203
8204 q = q_left = q_best_left;
8205 s = s_left = search->subject->sequence + (q_left - query) + diag;
8206
8207 q_left--;
8208
8209 /******************************************************************
8210
8211 The extension procedure used here is to:
8212
8213 1.) keep on extending as long as it increases the total score so far, record this
8214 maximum score and the corresponding extents as each new maximum score is reached.
8215
8216 2.) if extending decreases the total score so far then keep on extending
8217 until the score has dropped by "X" from the last maximum score to explore
8218 whether it is only a local minima that has been encountered:
8219
8220 a.) if the score drops by "X" from the last maximum score, then stop
8221 the extension and record the last maximum score as well as the
8222 corresponding extents for query and subject.
8223
8224 b.) if the score recovers again and becomes higher than the last maximum
8225 score, reset the maximum score so far as well as the corresponding
8226 query and subject offsets.
8227
8228
8229 3.) When the end of a sequence (either query or subject) is encountered record the last
8230 maximum score as well as the corresponding extents.
8231
8232
8233
8234 In the "while" loop below the maximum score is the variable "score" and "sum"
8235 is the change since the maximum score was last recorded (i.e., the variable
8236 "score" was modified).
8237
8238 Both x and X are negative and the outer "while" loops continues
8239 as long as sum is less negative than x. Iterations of the "while"
8240 loop with "sum" containing a negative value corresponds to 2.) above.
8241
8242 The inner do-while loop is executed only as long as each extension
8243 increases the maximum score, corresponding to 1.) above.
8244
8245 There is no explicit check for the end of a sequence here, but
8246 between sequences in the blast database there is a "sentinel"
8247 byte. If this sentinel byte is encountered then matrix[*q][*s]
8248 will be much more negative than "X" so that the extension will
8249 stop. This corresponds to 3.) above.
8250
8251 *******************************************************************/
8252
8253 sum = 0;
8254 x = X;
8255 while (sum > x)
8256 {
8257 q--; s--;
8258 if ((sum += matrix[*q][*s]) > 0)
8259 {
8260 do {
8261 score += sum;
8262 q--; s--;
8263 } while ((sum = matrix[*q][*s]) > 0);
8264 q_left = q;
8265 }
8266 }
8267 /* Adjust for extra decrement in do-while loop above. */
8268 q_left++;
8269 s_left = search->subject->sequence + (q_left - query) + diag;
8270
8271
8272 /* Extend towards the right (for this preliminary run) if
8273 q_off - q_left is greater than the window. */
8274 if (((query+q_off)-q_left) >= ewp->actual_window)
8275 {
8276 *succeed_to_right = TRUE;
8277 q = q_right = q_best_right;
8278 s = search->subject->sequence + (q - query) + diag;
8279 sum = 0;
8280 q_right++; /* pre-increment in case while() loop doesn't run */
8281
8282 /**************************************************************
8283
8284 The extension to the right is performed in the same way as the extension
8285 to the left, except that the extension can stop if the score
8286 drops by X or becomes negative, in which case the last maximum score
8287 is recorded.
8288
8289 *****************************************************************/
8290 if ((x = -score) < X)
8291 x = X;
8292 while (sum > x)
8293 {
8294 q++; s++;
8295 if ((sum += matrix[*q][*s]) > 0)
8296 {
8297 do {
8298 score += sum;
8299 q++; s++;
8300 } while ((sum = matrix[*q][*s]) > 0);
8301 q_right = q;
8302 /* do this if score changes. */
8303 if ((x = -score) < X)
8304 x = X;
8305 }
8306 }
8307 /* Adjust for extra increment in do-while loop above. */
8308 q_right--;
8309 }
8310
8311 /* Record how far this diagonal has been traversed,
8312 "q" was the last position on the query sequence.
8313 ewp->offset is added to provide the proper "zero-point" */
8314 ewp->combo_array[real_diag].diag_level = q - query - q_off + word_width + s_off + search->ewp_params->offset;
8315
8316 if (score >= pbp->cutoff_s2) /* Score is reportable */
8317 {
8318
8319 #ifdef BLAST_COLLECT_STATS
8320 if (search->prelim)
8321 search->first_pass_good_extends++;
8322 else
8323 search->second_pass_good_extends++;
8324 #endif
8325
8326 BlastSaveCurrentHsp(search, score, (q_left-query), (s_left-search->subject->sequence), (q_right-q_left+1), context);
8327 }
8328
8329 return 0;
8330 }
8331
8332 /*AAS*/
8333 /* BlastWordExtend_prelim -- for timing purposes. */
8334 static Int2
BlastNewWordExtend_prelim(BlastSearchBlkPtr search,Int4 q_off,Int4 s_off,Int4 word_width,BLAST_Diag diag,BLAST_Diag real_diag,Boolean PNTR succeed_to_right,Int2 context)8335 BlastNewWordExtend_prelim(BlastSearchBlkPtr search, Int4 q_off, Int4 s_off, Int4 word_width, BLAST_Diag diag, BLAST_Diag real_diag, Boolean PNTR succeed_to_right, Int2 context)
8336 {
8337 BLAST_ExtendWordPtr ewp;
8338 BLAST_ParameterBlkPtr pbp;
8339 register Uint1Ptr q, s, query;
8340 register Uint1Ptr q_right, q_left, s_left, q_best_right, q_best_left;
8341 register BLAST_Score score, sum;
8342 register BLAST_Score x, X;
8343
8344
8345
8346 #ifdef BLAST_COLLECT_STATS
8347 if (search->prelim)
8348 search->first_pass_extends++;
8349 else
8350 search->second_pass_extends++;
8351 #endif
8352
8353 *succeed_to_right = FALSE;
8354
8355 ewp=search->context[context].ewp;
8356
8357 diag -= search->ewp_params->min_diag_length;
8358
8359 pbp=search->pbp;
8360
8361 query = search->context[context].query->sequence;
8362 q = query + q_off;
8363 s = search->subject->sequence + s_off;
8364
8365 X=pbp->X;
8366
8367 score=0;
8368 sum = 0;
8369 q_left = q - word_width;
8370 q_right = q+1;
8371 q_best_left = q;
8372 q_best_right = q; /*AAS*/
8373
8374 /* Look for the highest scoring region in the initial word. */
8375 while (q > q_left)
8376 {
8377 sum += MtrxScorePosSearch(search->sbp,(Int4) (q - query),*s);
8378 if (sum > score)
8379 {
8380 score = sum;
8381 q_best_right = q_right;
8382 q_best_left = q;
8383 }
8384 else if (sum <= 0)
8385 {
8386 sum = 0;
8387 q_right = q;
8388 }
8389 q--; s--;
8390 }
8391
8392 q = q_left = q_best_left;
8393 s = s_left = search->subject->sequence + (q_left - query) + diag;
8394
8395 q_left--;
8396
8397 sum = 0;
8398 x = X;
8399 while (((q - query) >= 0) && (sum > x))
8400 {
8401 q--; s--;
8402 if (((q - query) >= 0) &&
8403 ((sum += MtrxScorePosSearch(search->sbp,
8404 (Int4) (q - query),*s)) > 0))
8405 {
8406 do {
8407 score += sum;
8408 q--; s--;
8409 } while (((q -query) >= 0) &&
8410 ((sum = MtrxScorePosSearch(search->sbp,
8411 (Int4) ( q- query),*s)) > 0));
8412 q_left = q;
8413 }
8414 }
8415 /* Adjust for extra decrement in do-while loop above. */
8416 q_left++;
8417 s_left = search->subject->sequence + (q_left - query) + diag;
8418
8419 /* Extend towards the right (for this preliminary run) if
8420 q_off - q_left is greater than the window. */
8421 if (((query+q_off)-q_left) >= ewp->actual_window)
8422 {
8423 *succeed_to_right = TRUE;
8424 q = q_right = q_best_right;
8425 q--;
8426 s = search->subject->sequence + (q - query) + diag;
8427 sum = 0;
8428
8429 /* "score" is actually the "maxscore", if sum drops by "score", then the
8430 total new score is zero and the extension can stop. */
8431 if ((x = -score) < X)
8432 x = X;
8433 while (sum > x)
8434 {
8435 q++; s++;
8436 if ((sum += MtrxScorePosSearch(search->sbp,
8437 (Int4) (q - query),*s)) > 0)
8438 {
8439 do {
8440 score += sum;
8441 q++; s++;
8442 } while ((sum = MtrxScorePosSearch(search->sbp,
8443 (Int4) (q - query),*s)) > 0);
8444 q_right = q;
8445 /* do this if score changes. */
8446 if ((x = -score) < X)
8447 x = X;
8448 }
8449 }
8450 /* Adjust for extra increment in do-while loop above. */
8451 q_right--;
8452 }
8453
8454 /* Record how far this diagonal has been traversed,
8455 "q" was the last position on the query sequence.
8456 ewp->offset is added to provide the proper "zero-point" */
8457 ewp->combo_array[real_diag].diag_level = q - query -q_off + word_width + s_off + search->ewp_params->offset;
8458
8459 if (score >= pbp->cutoff_s2) /* Score is reportable */
8460 {
8461
8462 #ifdef BLAST_COLLECT_STATS
8463 if (search->prelim)
8464 search->first_pass_good_extends++;
8465 else
8466 search->second_pass_good_extends++;
8467 #endif
8468
8469 BlastSaveCurrentHsp(search, score, (q_left-query), (s_left-search->subject->sequence), (q_right-q_left+1), context);
8470 }
8471
8472 return 0;
8473 }
8474
8475
8476 /* Ungapped extension a blastn type word hit, to be used in Mega BLAST with
8477 discontiguous word models.
8478
8479 BlastSearchBlkPtr search: main BLAST structure,
8480 Int4 q_off: offset of query sequence,
8481 Int4 s_off: offset of subject sequence, divided by four!
8482 Return: true if ungapped score below cutoff (to indicate that this
8483 HSP should be deleted.
8484 */
8485
8486 Boolean
BlastNtWordUngappedExtend(BlastSearchBlkPtr search,Int4 q_off,Int4 s_off,Int4 cutoff)8487 BlastNtWordUngappedExtend(BlastSearchBlkPtr search, Int4 q_off, Int4 s_off,
8488 Int4 cutoff)
8489 {
8490 register Uint1Ptr q;
8491 register BLAST_ScorePtr PNTR matrix;
8492 register BLAST_Score sum, score;
8493 Uint1 ch;
8494 Uint1Ptr query0, subject0, sf, q_beg, q_end, s_beg, s_end, s, start;
8495 BLAST_Score X;
8496 Int2 remainder, base;
8497 BLAST_ParameterBlkPtr pbp;
8498 BLAST_ScoreBlkPtr sbp;
8499 Int4 q_avail, s_avail;
8500
8501 base = 3 - (s_off % 4);
8502
8503 sbp=search->sbp;
8504 pbp=search->pbp;
8505
8506 matrix = sbp->matrix;
8507 matrix = sbp->matrix;
8508 query0 = (Uint1Ptr) search->context[search->first_context].query->sequence;
8509 subject0 = (Uint1Ptr) search->subject->sequence;
8510 q_avail = search->context[search->first_context].query->length - q_off;
8511 s_avail = search->subject->length - s_off;
8512
8513 q = q_beg = q_end = query0 + q_off;
8514 s = s_beg = s_end = subject0 + s_off/READDB_COMPRESSION_RATIO;
8515 if (q_off < s_off) {
8516 start = (Uint1Ptr) search->subject->sequence +
8517 (s_off-q_off)/READDB_COMPRESSION_RATIO;
8518 remainder = 3 - ((s_off-q_off)%READDB_COMPRESSION_RATIO);
8519 } else {
8520 start = (Uint1Ptr) search->subject->sequence;
8521 remainder = 3;
8522 }
8523
8524 /* Find where positive scoring starts & ends within the word hit */
8525 score = 0;
8526 sum = 0;
8527
8528 X = pbp->X;
8529
8530 /* extend to the left */
8531 do {
8532 if (base == 3) {
8533 s--;
8534 base = 0;
8535 } else
8536 base++;
8537 ch = *s;
8538 if ((sum += matrix[*--q][READDB_UNPACK_BASE_N(ch, base)]) > 0) {
8539 q_beg = q;
8540 score += sum;
8541 sum = 0;
8542 } else if (sum < X)
8543 break;
8544 } while ((s > start) || (s == start && base <= remainder));
8545
8546 if (score >= cutoff)
8547 return FALSE;
8548
8549 if (q_avail < s_avail) {
8550 sf = subject0 + (s_off + q_avail)/READDB_COMPRESSION_RATIO;
8551 remainder = 3 - ((s_off + q_avail)%READDB_COMPRESSION_RATIO);
8552 } else {
8553 sf = subject0 + (search->subject->length)/READDB_COMPRESSION_RATIO;
8554 remainder = 3 - ((search->subject->length)%READDB_COMPRESSION_RATIO);
8555 }
8556 /* extend to the right */
8557 q = q_end;
8558 s = s_end;
8559 sum = 0;
8560 base = 3 - (s_off % 4);
8561
8562 while (s < sf || (s == sf && base >= remainder)) {
8563 ch = *s;
8564 if ((sum += matrix[*q++][READDB_UNPACK_BASE_N(ch, base)]) > 0) {
8565 q_end = q;
8566 score += sum;
8567 sum = 0;
8568 } else if (sum < X)
8569 break;
8570 if (base == 0) {
8571 base = 3;
8572 s++;
8573 } else
8574 base--;
8575 }
8576
8577 return (score < cutoff);
8578 }
8579
8580 /* Extend a blastn type word hit.
8581
8582 BlastSearchBlkPtr search: main BLAST structure,
8583 Int4 q_off: offset of query sequence,
8584 Int4 s_off: offset of subject sequence, divided by four!
8585 BLAST_Diag real_diag: diagonal,
8586 Int2 context: must be 0 (plus strand) or 1 (minus strand).
8587 */
8588 Int2
BlastNtWordExtend(BlastSearchBlkPtr search,Int4 q_off,Int4 s_off,BLAST_Diag real_diag,Int2 context)8589 BlastNtWordExtend(BlastSearchBlkPtr search, Int4 q_off, Int4 s_off, BLAST_Diag real_diag, Int2 context)
8590 {
8591 register Uint1Ptr q;
8592 register BLAST_ScorePtr PNTR matrix;
8593 register BLAST_Score sum, score;
8594 Uint1 ch;
8595 Uint1Ptr query0, subject0, sf, q_beg, q_end, s_beg, s_end, s, start;
8596 BLAST_Score X;
8597 Int2 remainder;
8598 BLAST_ExtendWordPtr ewp;
8599 BLAST_ParameterBlkPtr pbp;
8600 BLAST_ScoreBlkPtr sbp;
8601 Int4 q_avail, s_avail;
8602
8603 #ifdef BLAST_COLLECT_STATS
8604 search->second_pass_extends++;
8605 #endif
8606 ewp=search->context[context].ewp;
8607
8608 sbp=search->sbp;
8609 pbp=search->pbp;
8610
8611 matrix = sbp->matrix;
8612 matrix = sbp->matrix;
8613 query0 = (Uint1Ptr) search->context[context].query->sequence;
8614 subject0 = (Uint1Ptr) search->subject->sequence;
8615 q_avail = search->context[context].query->length - q_off;
8616 s_avail = search->subject->length - s_off*READDB_COMPRESSION_RATIO;
8617 if (q_avail < s_avail)
8618 {
8619 sf = subject0 + s_off + q_avail/READDB_COMPRESSION_RATIO;
8620 remainder = q_avail%READDB_COMPRESSION_RATIO;
8621 }
8622 else
8623 {
8624 sf = subject0 + (search->subject->length)/READDB_COMPRESSION_RATIO;
8625 remainder = (search->subject->length)%READDB_COMPRESSION_RATIO;
8626 }
8627
8628 q = q_beg = q_end = query0 + q_off;
8629 s = s_beg = s_end = subject0 + s_off;
8630 if (q_off < s_off*READDB_COMPRESSION_RATIO)
8631 {
8632 start = (Uint1Ptr) search->subject->sequence + (s_off-q_off/READDB_COMPRESSION_RATIO);
8633 }
8634 else
8635 {
8636 start = (Uint1Ptr) search->subject->sequence;
8637 }
8638
8639 /* Find where positive scoring starts & ends within the word hit */
8640 score = sum = 0;
8641
8642 X = pbp->X;
8643
8644 /* extend to the left */
8645 do {
8646 s--;
8647 ch = *s;
8648 if ((sum += matrix[*--q][READDB_UNPACK_BASE_4(ch)]) > 0) {
8649 q_beg = q;
8650 score += sum;
8651 sum = 0;
8652 }
8653 else
8654 if (sum < X)
8655 break;
8656 if ((sum += matrix[*--q][READDB_UNPACK_BASE_3(ch)]) > 0) {
8657 q_beg = q;
8658 score += sum;
8659 sum = 0;
8660 }
8661 else
8662 if (sum < X)
8663 break;
8664 if ((sum += matrix[*--q][READDB_UNPACK_BASE_2(ch)]) > 0) {
8665 q_beg = q;
8666 score += sum;
8667 sum = 0;
8668 }
8669 else
8670 if (sum < X)
8671 break;
8672 if ((sum += matrix[*--q][READDB_UNPACK_BASE_1(ch)]) > 0) {
8673 q_beg = q;
8674 score += sum;
8675 sum = 0;
8676 }
8677 else
8678 if (sum < X)
8679 break;
8680 } while (s > start);
8681
8682 /* There is still another partial byte to be extended through. */
8683 if (sum >= X && start != (Uint1Ptr) search->subject->sequence)
8684 {
8685 s--;
8686 ch = *s;
8687 while (q > query0)
8688 {
8689 if ((sum += matrix[*--q][READDB_UNPACK_BASE_4(ch)]) > 0)
8690 {
8691 q_beg = q;
8692 score += sum;
8693 sum = 0;
8694 }
8695 else if (sum < X)
8696 {
8697 break;
8698 }
8699 ch >>= 2;
8700 }
8701 }
8702
8703 /* extend to the right */
8704 q = q_end;
8705 s = s_end;
8706 sum = 0;
8707 while (s < sf)
8708 {
8709 ch = *s;
8710 if ((sum += matrix[*q++][READDB_UNPACK_BASE_1(ch)]) > 0)
8711 {
8712 q_end = q;
8713 score += sum;
8714 sum = 0;
8715 }
8716 else if (sum < X)
8717 {
8718 break;
8719 }
8720
8721 if ((sum += matrix[*q++][READDB_UNPACK_BASE_2(ch)]) > 0)
8722 {
8723 q_end = q;
8724 score += sum;
8725 sum = 0;
8726 }
8727 else if (sum < X)
8728 {
8729 break;
8730 }
8731
8732 if ((sum += matrix[*q++][READDB_UNPACK_BASE_3(ch)]) > 0)
8733 {
8734 q_end = q;
8735 score += sum;
8736 sum = 0;
8737 }
8738 else if (sum < X)
8739 {
8740 break;
8741 }
8742
8743 if ((sum += matrix[*q++][READDB_UNPACK_BASE_4(ch)]) > 0)
8744 {
8745 q_end = q;
8746 score += sum;
8747 sum = 0;
8748 }
8749 else if (sum < X)
8750 {
8751 break;
8752 }
8753 s++;
8754 }
8755
8756 /* extend into the final, partially packed byte (if one exists) */
8757 /* If the query ends before the subject, then don't extend any more as the query
8758 has no remainder. */
8759 if (remainder > 0 && sum >= X)
8760 {
8761 ch = *sf;
8762
8763 while (remainder > 0)
8764 {
8765 if ((sum += matrix[*q++][READDB_UNPACK_BASE_1(ch)]) > 0)
8766 {
8767 q_end = q;
8768 score += sum;
8769 sum = 0;
8770 }
8771 else if (sum < X)
8772 {
8773 break;
8774 }
8775 #ifdef OLD_BYTE_ORDER
8776 ch >>= 2;
8777 #else
8778 ch <<= 2;
8779 #endif
8780 remainder--;
8781 }
8782 } /* End ungapped alignment */
8783
8784 /* Record how far this diagonal has been traversed */
8785 /* ewp->combo_array[real_diag].diag_level = q_end - query0 + search->ewp_params->offset; */
8786 ewp->combo_array[real_diag].diag_level = (q_end - query0 - q_off) + s_off*READDB_COMPRESSION_RATIO + search->ewp_params->offset;
8787
8788
8789
8790 if (score >= pbp->cutoff_s2) /* Score is reportable */
8791 {
8792 #ifdef BLAST_COLLECT_STATS
8793 search->second_pass_good_extends++;
8794 #endif
8795 if(search->pbp->gapped_calculation)
8796 BlastNtSaveCurrentHsp(search, score, (q_beg-query0),
8797 (q_beg-query0+READDB_COMPRESSION_RATIO*s_off-q_off),
8798 (q_end-q_beg), context,
8799 q_off - 5, READDB_COMPRESSION_RATIO*s_off - 5);
8800 else
8801 BlastSaveCurrentHsp(search, score,
8802 (q_beg-query0),
8803 (q_beg-query0+READDB_COMPRESSION_RATIO*s_off-q_off),
8804 (q_end-q_beg), context);
8805 }
8806
8807 return 0;
8808 }
8809
8810 /*
8811 search_nt_orig -- an adaptation of the original search_nt() function
8812 of BLASTN
8813
8814 * Can this ever be called?
8815 * - It is only called for blastn, only from BlastWordFinder_mh().
8816 * - BlastWordFinder_mh() is only called if BlastExtendWordSearch() is called w/ multiphe_hits==TRUE
8817 * - BlastExtendWordSearch() is called in 2 places:
8818 * BLASTPerform2PassSearch(called w/ multiple_hits=TRUE)
8819 * which is called from BLASTPerfromSearch if search->pbp->two_pass_method
8820 * BLASTPerformFinalSearch(called w/ search->pbp->multiple_hits_only)
8821 * * so multiple_hits_only, or two_pass_method must be set for this to be called.
8822 * For blastn, these are set to false in blastool, blastutl.
8823 * These can be set to TRUE in blastpgp.c, but can blastn alse be true in this case?
8824 *
8825 * I have updated the array accesses to use the new mod_lt[], as the other WordFinder routines
8826 * now do, but I have not been able to test this change.
8827 * -cfj
8828 */
8829 static Int4
BlastNtWordFinder_mh(BlastSearchBlkPtr search,LookupTablePtr lookup)8830 BlastNtWordFinder_mh(BlastSearchBlkPtr search, LookupTablePtr lookup)
8831 {
8832 register Uint1Ptr s, s_end;
8833 Uint1Ptr subject0;
8834 BLAST_Diag diag, diag_tmp, real_diag;
8835 BLAST_ExtendWordPtr ewp;
8836 BLAST_ExtendWordParamsPtr ewp_params;
8837 BLAST_WordFinderPtr wfp;
8838 CfjModStruct *combo_array;
8839 register Int4 diff, window, lookup_index, mask;
8840 Int4 char_size, index=0, current_count;
8841 register ModLookupPosition hit_info;
8842 Int4 s_pos, q_off, s_off, offset, virtual_wordsize, wordsize, compressed_wordsize, compression_factor;
8843 register Int4 bits_to_shift, min_diag_length, min_diag_mask;
8844 register Int4 num_hits;
8845 register Int4 next_nhits;
8846 Int4 * next_nhits_addr;
8847 register ModLookupPositionPtr lookup_pos;
8848 int next_lindex;
8849 PV_ARRAY_TYPE *pv_array = lookup->pv_array;
8850 register PV_ARRAY_TYPE PNTR next_pv_array_addr;
8851 register PV_ARRAY_TYPE next_pv_val,pv_val;
8852 #if 1
8853 ModLAEntry *mod_lt=lookup->mod_lt;
8854 #endif
8855
8856 ewp_params=search->ewp_params;
8857
8858 wfp = search->wfp_second;
8859 char_size = lookup->char_size;
8860 mask = lookup->mask;
8861 offset = ewp_params->offset;
8862 window = ewp_params->window;
8863 subject0 = s = (Uint1Ptr) search->subject->sequence;
8864 min_diag_length = ewp_params->min_diag_length;
8865 bits_to_shift = ewp_params->bits_to_shift;
8866 min_diag_mask = ewp_params->min_diag_mask;
8867
8868 if (search->current_hitlist == NULL)
8869 {
8870 search->current_hitlist = BlastHitListNew(search);
8871 }
8872 else
8873 { /* Scrub the hitlist. */
8874 if (search->current_hitlist_purge)
8875 BlastHitListPurge(search->current_hitlist);
8876 }
8877
8878 compressed_wordsize = lookup->wordsize;
8879 wordsize = wfp->wordsize;
8880
8881 /* The subject sequence is too short, exit this function now. */
8882 if (wordsize > search->subject->length)
8883 goto NormalReturn;
8884
8885 s = lookup_find_init(lookup, &index, s);
8886 lookup_index = index;
8887 /* Determines when to stop scanning the database; does not include remainder. */
8888 s_end = subject0 + (search->subject->length)/READDB_COMPRESSION_RATIO;
8889 compression_factor = wfp->compression_ratio*compressed_wordsize;
8890 virtual_wordsize = READDB_COMPRESSION_RATIO*compressed_wordsize;
8891
8892 /* conxtext dependent values */
8893 ewp=search->context[search->first_context].ewp;
8894 combo_array=ewp->combo_array;
8895
8896 if (pv_array)
8897 {
8898 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
8899 next_pv_val = pv_array[next_lindex>>PV_ARRAY_BTS];
8900
8901 for (;;) {
8902 do {
8903 /* lookup a contiguous word. */
8904 lookup_index = next_lindex;
8905 s++;
8906
8907 if (s == s_end) goto NormalReturn;
8908
8909 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
8910 next_pv_array_addr = &pv_array[next_lindex>>PV_ARRAY_BTS];
8911 pv_val = next_pv_val;
8912 next_pv_val = *next_pv_array_addr;
8913 } while ((pv_val&(((PV_ARRAY_TYPE) 1)<<(lookup_index&PV_ARRAY_MASK))) == 0);
8914
8915
8916 num_hits = mod_lt[lookup_index].num_used;
8917 lookup_pos = mod_lt[lookup_index].entries;
8918 hit_info = *((Uint4 *) lookup_pos);
8919 lookup_pos++;
8920
8921 if(num_hits>3){
8922 lookup_pos=*((ModLookupPositionPtr PNTR) lookup_pos);
8923 }
8924
8925 s_off = s-subject0+1;
8926 diag_tmp = s_off*READDB_COMPRESSION_RATIO + min_diag_length;
8927 s_pos = (s-subject0)*READDB_COMPRESSION_RATIO+offset;
8928 /* Extend each hit in the linked list */
8929 do {
8930 q_off = hit_info;
8931 num_hits--;
8932 hit_info = *((Uint4 *) lookup_pos); /* load next hit_info */
8933 lookup_pos++;
8934
8935 diag = diag_tmp - q_off;
8936 real_diag = diag & min_diag_mask;
8937
8938 diff = s_pos - combo_array[real_diag].last_hit;
8939
8940 if (diff >= window)
8941 {
8942 combo_array[real_diag].last_hit = s_pos;
8943 }
8944 else if (diff >= virtual_wordsize)
8945 {
8946 #ifdef BLAST_COLLECT_STATS
8947 search->second_pass_hits++;
8948 #endif
8949 current_count = search->current_hitlist->hspcnt;
8950 if (combo_array[real_diag].diag_level <= (s_off*READDB_COMPRESSION_RATIO+offset))
8951 {
8952 if (BlastNtWordExtend(search, q_off, s_off, real_diag, search->first_context) != 0)
8953 goto ErrorReturn;
8954 }
8955 /* If no HSP's saved, save last hit. */
8956 if (current_count == search->current_hitlist->hspcnt)
8957 combo_array[real_diag].last_hit = s_pos;
8958 else
8959 combo_array[real_diag].last_hit = 0;
8960 }
8961
8962 } while (num_hits>0);
8963 }
8964 }
8965 else
8966 {
8967 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
8968 next_nhits_addr=&mod_lt[next_lindex].num_used ;
8969 next_nhits=*next_nhits_addr;
8970
8971 for (;;) {
8972 do {
8973 /* lookup a contiguous word. */
8974 lookup_index = next_lindex;
8975 s++;
8976
8977 if (s == s_end) goto NormalReturn;
8978
8979 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
8980
8981 next_nhits_addr = &mod_lt[next_lindex].num_used;
8982
8983 num_hits = next_nhits;
8984 next_nhits=*next_nhits_addr;
8985 } while (num_hits == 0);
8986
8987 lookup_pos = mod_lt[lookup_index].entries;
8988 hit_info = *((Uint4 *) lookup_pos);
8989 lookup_pos++;
8990
8991 if(num_hits>3){
8992 lookup_pos=*((ModLookupPositionPtr PNTR) lookup_pos);
8993 }
8994
8995 s_off = s-subject0+1;
8996 diag_tmp = s_off*READDB_COMPRESSION_RATIO + min_diag_length;
8997 s_pos = (s-subject0)*READDB_COMPRESSION_RATIO+offset;
8998 /* Extend each hit in the linked list */
8999 do {
9000 q_off = hit_info;
9001 num_hits--;
9002 hit_info = *((Uint4 *) lookup_pos); /* load next hit_info */
9003 lookup_pos++;
9004
9005 diag = diag_tmp - q_off;
9006 real_diag = diag & min_diag_mask;
9007
9008 diff = s_pos - combo_array[real_diag].last_hit;
9009
9010 if (diff >= window)
9011 {
9012 combo_array[real_diag].last_hit = s_pos;
9013 }
9014 else if (diff >= virtual_wordsize)
9015 {
9016 #ifdef BLAST_COLLECT_STATS
9017 search->second_pass_hits++;
9018 #endif
9019 current_count = search->current_hitlist->hspcnt;
9020 if (combo_array[real_diag].diag_level <= (s_off*READDB_COMPRESSION_RATIO+offset))
9021 {
9022 if (BlastNtWordExtend(search, q_off, s_off, real_diag, search->first_context) != 0)
9023 goto ErrorReturn;
9024 }
9025 /* If no HSP's saved, save last hit. */
9026 if (current_count == search->current_hitlist->hspcnt)
9027 combo_array[real_diag].last_hit = s_pos;
9028 else
9029 combo_array[real_diag].last_hit = 0;
9030 }
9031
9032 } while (num_hits>0);
9033 }
9034 }
9035
9036 NormalReturn:
9037 BlastExtendWordExit(search);
9038 return search->current_hitlist->hspcnt;
9039
9040 ErrorReturn:
9041 BlastExtendWordExit(search);
9042 return 3;
9043 }
9044 /*
9045 search_nt_orig -- an adaptation of the original search_nt() function
9046 of BLASTN
9047 */
9048 static Int4
BlastNtWordFinder(BlastSearchBlkPtr search,LookupTablePtr lookup)9049 BlastNtWordFinder(BlastSearchBlkPtr search, LookupTablePtr lookup)
9050 {
9051 BLASTContextStructPtr search_context;
9052 register Uint1Ptr s, s_end;
9053 Uint1Ptr q, q_end, subject0, query0;
9054 Uint1 p, packed_query, p_start;
9055 BLAST_Diag diag, diag_tmp, real_diag;
9056 BLAST_ExtendWordPtr ewp;
9057 BLAST_ExtendWordParamsPtr ewp_params;
9058 BLAST_WordFinderPtr wfp;
9059 CfjModStruct *combo_array;
9060 Int4 lookup_index, mask;
9061 Int4 char_size, index=0, query_length=0;
9062
9063 register ModLookupPosition hit_info;
9064
9065 register PV_ARRAY_TYPE PNTR next_pv_array_addr;
9066 register PV_ARRAY_TYPE next_pv_val,pv_val;
9067 register ModLookupPositionPtr lookup_pos;
9068 register Int4 num_hits;
9069 register Int4 next_nhits;
9070 Int4 * next_nhits_addr;
9071
9072
9073 Int2 left, right;
9074 Int4 q_off, s_off, offset, virtual_wordsize, wordsize, compressed_wordsize, compression_factor;
9075 Int4 extra_bytes, extra_bytes_needed, my_index;
9076 Int4 bits_to_shift, min_diag_length, min_diag_mask;
9077 int next_lindex;
9078 PV_ARRAY_TYPE *pv_array = lookup->pv_array;
9079 ModLAEntry *mod_lt=lookup->mod_lt;
9080
9081 query0 = NULL; /* Gets rid of a warning. */
9082 p = 255; /* Gets rid of a warning. */
9083 ewp_params=search->ewp_params;
9084
9085 wfp = search->wfp_second;
9086 char_size = lookup->char_size;
9087 mask = lookup->mask;
9088 offset = ewp_params->offset;
9089 subject0 = s = (Uint1Ptr) search->subject->sequence;
9090 min_diag_length = ewp_params->min_diag_length;
9091 bits_to_shift = ewp_params->bits_to_shift;
9092 min_diag_mask = ewp_params->min_diag_mask;
9093
9094
9095 if (search->current_hitlist == NULL)
9096 {
9097 search->current_hitlist = BlastHitListNew(search);
9098 }
9099 else
9100 { /* Scrub the hitlist. */
9101 if (search->current_hitlist_purge)
9102 BlastHitListPurge(search->current_hitlist);
9103 }
9104
9105 compressed_wordsize = lookup->reduced_wordsize;
9106 wordsize = wfp->wordsize;
9107 extra_bytes = lookup->wordsize - compressed_wordsize;
9108
9109 /* The subject sequence is too short, exit this function now. */
9110 if (wordsize > search->subject->length)
9111 goto NormalReturn;
9112
9113 s = lookup_find_init(lookup, &index, s);
9114 lookup_index = index;
9115 /* Determines when to stop scanning the database; does not include remainder. */
9116 s_end = subject0 + (search->subject->length)/READDB_COMPRESSION_RATIO;
9117 compression_factor = wfp->compression_ratio*compressed_wordsize;
9118 virtual_wordsize = wordsize - READDB_COMPRESSION_RATIO*lookup->wordsize;
9119 search_context = search->context;
9120 query_length = search_context[search->first_context].query->length;
9121 extra_bytes_needed = extra_bytes;
9122 query0 = search_context[search->first_context].query->sequence;
9123 q_end = query0 + query_length;
9124 ewp = search_context[search->first_context].ewp;
9125 combo_array = ewp->combo_array;
9126
9127 if (extra_bytes_needed) {
9128
9129 /** The first for() loop is optimized for sparse tables (which rarely hit), the second for dense */
9130 if(pv_array){
9131 /* We use the pv_array here, since (on short-med queries) most lookups fail */
9132
9133 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
9134 next_pv_val = pv_array[next_lindex>>PV_ARRAY_BTS];
9135
9136 for (;;) {
9137 do {
9138 /* lookup a contiguous word. */
9139 s++;
9140 lookup_index = next_lindex;
9141
9142 if (s == s_end)
9143 goto NormalReturn;
9144
9145 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
9146 next_pv_array_addr = &pv_array[next_lindex>>PV_ARRAY_BTS];
9147 pv_val = next_pv_val;
9148 next_pv_val = *next_pv_array_addr;
9149
9150 }while ((pv_val&(((PV_ARRAY_TYPE) 1)<<(lookup_index&PV_ARRAY_MASK))) == 0);
9151
9152 num_hits = mod_lt[lookup_index].num_used;
9153 s_off = s-subject0+1;
9154 diag_tmp = s_off*READDB_COMPRESSION_RATIO + min_diag_length;
9155 lookup_pos = mod_lt[lookup_index].entries;
9156 hit_info = *((Uint4 *) lookup_pos);
9157 lookup_pos++;
9158
9159 if(num_hits>3){
9160 lookup_pos=*((ModLookupPositionPtr PNTR) lookup_pos);
9161 }
9162
9163 p_start = *((Uint1Ptr) search->subject->sequence + s_off);
9164 /* Extend each hit in the linked list */
9165 do {
9166 q_off = hit_info;
9167 num_hits--;
9168 hit_info = *((Uint4 *) lookup_pos); /* load next hit_info */
9169 lookup_pos++;
9170
9171 diag = diag_tmp - q_off;
9172
9173 /* Check for extra bytes if required for longer words. */
9174 /* extend to the right */
9175 p = p_start;
9176 q = query0 + q_off;
9177 my_index=0;
9178 while (extra_bytes_needed)
9179 {
9180 /* Note: no check is done that q[0-3] is not an ambiguity code. Could be done, but might slow things down. */
9181 packed_query = (q[0]<<6) + (q[1]<<4) + (q[2]<<2) + q[3];
9182 if (p != packed_query)
9183 break;
9184 q += 4;
9185 extra_bytes_needed--;
9186 my_index++;
9187 p = *((Uint1Ptr) search->subject->sequence + s_off + my_index);
9188 }
9189 if (extra_bytes_needed)
9190 { /* extra_bytes_needed next round. */
9191 extra_bytes_needed = extra_bytes;
9192 continue; /* not enough bytes found. */
9193 }
9194 extra_bytes_needed = extra_bytes;
9195
9196
9197 q = query0 + q_off - compression_factor;
9198 if (s_off > compressed_wordsize)
9199 p = *(subject0 + s_off - compressed_wordsize - 1);
9200
9201 /* extend to the left */
9202 if (s_off == compressed_wordsize || READDB_UNPACK_BASE_4(p) != *--q || q < query0)
9203 {
9204 left = 0;
9205 }
9206 else
9207 {
9208 if (READDB_UNPACK_BASE_3(p) != *--q || q < query0)
9209 {
9210 left = 1;
9211 }
9212 else
9213 {
9214 if (READDB_UNPACK_BASE_2(p) != *--q || q < query0)
9215 {
9216 left = 2;
9217 }
9218 else
9219 {
9220 if (READDB_UNPACK_BASE_1(p) != *--q || q < query0)
9221 {
9222 left = 3;
9223 }
9224 else
9225 {
9226 left = 4;
9227 }
9228 }
9229 }
9230 }
9231 /* extend to the right */
9232 p = *(subject0 + s_off + extra_bytes_needed);
9233 q = query0 + q_off + 4*extra_bytes_needed;
9234 if (s+extra_bytes_needed >= s_end || READDB_UNPACK_BASE_1(p) != *q++ || q >= q_end)
9235 {
9236 right = 0;
9237 }
9238 else
9239 {
9240 if (READDB_UNPACK_BASE_2(p) != *q++ || q >= q_end)
9241 {
9242 right = 1;
9243 }
9244 else
9245 {
9246 if (READDB_UNPACK_BASE_3(p) != *q++ || q >= q_end)
9247 {
9248 right = 2;
9249 }
9250 else
9251 {
9252 if (READDB_UNPACK_BASE_4(p) != *q++ || q >= q_end)
9253 {
9254 right = 3;
9255 }
9256 else
9257 {
9258 right = 4;
9259 }
9260 }
9261 }
9262 }
9263 if (left + right >= virtual_wordsize)
9264 {
9265 /* Check if this diagonal has already been explored. */
9266 real_diag = diag & min_diag_mask;
9267 if (combo_array[real_diag].diag_level >= (s_off*READDB_COMPRESSION_RATIO+offset))
9268 {
9269 continue;
9270 }
9271 #ifdef BLAST_COLLECT_STATS
9272 search->second_pass_hits++;
9273 #endif
9274 if (BlastNtWordExtend(search, q_off, s_off, real_diag, search->first_context) != 0)
9275 goto ErrorReturn;
9276 }
9277 } while (num_hits>0);
9278 } /* end for(;;) */
9279 }else{
9280 /* Dense version - doesn't use pv_array */
9281 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
9282 next_nhits_addr=&mod_lt[next_lindex].num_used ;
9283 next_nhits=*next_nhits_addr;
9284
9285 for (;;) {
9286 do {
9287 /* lookup a contiguous word. */
9288 lookup_index = next_lindex;
9289 s++;
9290
9291 if (s == s_end) goto NormalReturn;
9292
9293 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
9294
9295 next_nhits_addr = &mod_lt[next_lindex].num_used;
9296
9297 num_hits = next_nhits;
9298 next_nhits=*next_nhits_addr;
9299 } while (num_hits == 0);
9300
9301 lookup_pos = mod_lt[lookup_index].entries;
9302
9303 s_off = s-subject0+1;
9304 diag_tmp = s_off*READDB_COMPRESSION_RATIO + min_diag_length;
9305 hit_info = *((Uint4 *) lookup_pos);
9306 lookup_pos++;
9307
9308
9309 if(num_hits>3){
9310 lookup_pos=*((ModLookupPositionPtr PNTR) lookup_pos);
9311 }
9312
9313 p_start = *((Uint1Ptr) search->subject->sequence + s_off);
9314 /* Extend each hit in the linked list */
9315 do {
9316 q_off = hit_info;
9317 num_hits--;
9318 hit_info = *((Uint4 *) lookup_pos); /* load next hit_info */
9319 lookup_pos++;
9320
9321 diag = diag_tmp - q_off;
9322
9323
9324 /* Check for extra bytes if required for longer words. */
9325 /* extend to the right */
9326 p = p_start;
9327 q = query0 + q_off;
9328 my_index=0;
9329 while (extra_bytes_needed)
9330 {
9331 /* Note: no check is done that q[0-3] is not an ambiguity code. Could be done, but might slow things down. */
9332 packed_query = (q[0]<<6) + (q[1]<<4) + (q[2]<<2) + q[3];
9333 if (p != packed_query)
9334 break;
9335 q += 4;
9336 extra_bytes_needed--;
9337 my_index++;
9338 p = *((Uint1Ptr) search->subject->sequence + s_off + my_index);
9339 }
9340 if (extra_bytes_needed)
9341 { /* extra_bytes_needed next round. */
9342 extra_bytes_needed = extra_bytes;
9343 continue; /* not enough bytes found. */
9344 }
9345 extra_bytes_needed = extra_bytes;
9346
9347 q = query0 + q_off - compression_factor;
9348 if (s_off > compressed_wordsize)
9349 p = *(subject0 + s_off - compressed_wordsize - 1);
9350
9351 /* extend to the left */
9352 if (s_off == compressed_wordsize || READDB_UNPACK_BASE_4(p) != *--q || q < query0)
9353 {
9354 left = 0;
9355 }
9356 else
9357 {
9358 if (READDB_UNPACK_BASE_3(p) != *--q || q < query0)
9359 {
9360 left = 1;
9361 }
9362 else
9363 {
9364 if (READDB_UNPACK_BASE_2(p) != *--q || q < query0)
9365 {
9366 left = 2;
9367 }
9368 else
9369 {
9370 if (READDB_UNPACK_BASE_1(p) != *--q || q < query0)
9371 {
9372 left = 3;
9373 }
9374 else
9375 {
9376 left = 4;
9377 }
9378 }
9379 }
9380 }
9381 /* extend to the right */
9382 p = *(subject0 + s_off + extra_bytes_needed);
9383 q = query0 + q_off + 4*extra_bytes_needed;
9384 if (s+extra_bytes_needed >= s_end || READDB_UNPACK_BASE_1(p) != *q++ || q >= q_end)
9385 {
9386 right = 0;
9387 }
9388 else
9389 {
9390 if (READDB_UNPACK_BASE_2(p) != *q++ || q >= q_end)
9391 {
9392 right = 1;
9393 }
9394 else
9395 {
9396 if (READDB_UNPACK_BASE_3(p) != *q++ || q >= q_end)
9397 {
9398 right = 2;
9399 }
9400 else
9401 {
9402 if (READDB_UNPACK_BASE_4(p) != *q++ || q >= q_end)
9403 {
9404 right = 3;
9405 }
9406 else
9407 {
9408 right = 4;
9409 }
9410 }
9411 }
9412 }
9413 if (left + right >= virtual_wordsize)
9414 {
9415 /* Check if this diagonal has already been explored. */
9416 real_diag = diag & min_diag_mask;
9417 if (combo_array[real_diag].diag_level >= (s_off*READDB_COMPRESSION_RATIO+offset))
9418 {
9419 continue;
9420 }
9421 #ifdef BLAST_COLLECT_STATS
9422 search->second_pass_hits++;
9423 #endif
9424 if (BlastNtWordExtend(search, q_off, s_off, real_diag, search->first_context) != 0)
9425 goto ErrorReturn;
9426 }
9427 } while (num_hits>0);
9428 }
9429 }
9430 }
9431 else /* extra_bytes not needed. */
9432 {
9433 /** The first for() loop is optimized for sparse tables (which rarely hit), the second for dense */
9434 if(pv_array){
9435 /* We use the pv_array here, since (on short-med queries) most lookups fail */
9436 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
9437 next_pv_val = pv_array[next_lindex>>PV_ARRAY_BTS];
9438
9439 for (;;) {
9440 do {
9441 /* lookup a contiguous word. */
9442 s++;
9443 lookup_index = next_lindex;
9444
9445 if (s == s_end)
9446 goto NormalReturn;
9447
9448 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
9449 next_pv_array_addr = &pv_array[next_lindex>>PV_ARRAY_BTS];
9450 pv_val = next_pv_val;
9451 next_pv_val = *next_pv_array_addr;
9452
9453 } while ((pv_val&(((PV_ARRAY_TYPE) 1)<<(lookup_index&PV_ARRAY_MASK))) == 0);
9454
9455 num_hits = mod_lt[lookup_index].num_used;
9456 s_off = s-subject0+1;
9457 diag_tmp = s_off*READDB_COMPRESSION_RATIO + min_diag_length;
9458 lookup_pos = mod_lt[lookup_index].entries;
9459 hit_info = *((Uint4 *) lookup_pos);
9460 lookup_pos++;
9461
9462 if(num_hits>3){
9463 lookup_pos=*((ModLookupPositionPtr PNTR) lookup_pos);
9464 }
9465
9466 /* Extend each hit in the linked list */
9467 do {
9468 q_off = hit_info;
9469 num_hits--;
9470 hit_info = *((Uint4 *) lookup_pos); /* load next hit_info */
9471 lookup_pos++;
9472
9473
9474 q = query0 + q_off - compression_factor;
9475
9476 if (s_off > compressed_wordsize)
9477 p = *(subject0 + s_off - compressed_wordsize - 1);
9478
9479 diag = diag_tmp - q_off;
9480
9481 /* extend to the left */
9482 if (s_off == compressed_wordsize || READDB_UNPACK_BASE_4(p) != *--q || q < query0)
9483 {
9484 left = 0;
9485 }
9486 else
9487 {
9488 if (READDB_UNPACK_BASE_3(p) != *--q || q < query0)
9489 {
9490 left = 1;
9491 }
9492 else
9493 {
9494 if (READDB_UNPACK_BASE_2(p) != *--q || q < query0)
9495 {
9496 left = 2;
9497 }
9498 else
9499 {
9500 if (READDB_UNPACK_BASE_1(p) != *--q || q < query0)
9501 {
9502 left = 3;
9503 }
9504 else
9505 {
9506 left = 4;
9507 }
9508 }
9509 }
9510 }
9511 /* extend to the right */
9512 p = *(subject0 + s_off);
9513 q = query0 + q_off;
9514 if (s >= s_end || READDB_UNPACK_BASE_1(p) != *q++ || q >= q_end)
9515 {
9516 right = 0;
9517 }
9518 else
9519 {
9520 if (READDB_UNPACK_BASE_2(p) != *q++ || q >= q_end)
9521 {
9522 right = 1;
9523 }
9524 else
9525 {
9526 if (READDB_UNPACK_BASE_3(p) != *q++ || q >= q_end)
9527 {
9528 right = 2;
9529 }
9530 else
9531 {
9532 if (READDB_UNPACK_BASE_4(p) != *q++ || q >= q_end)
9533 {
9534 right = 3;
9535 }
9536 else
9537 {
9538 right = 4;
9539 }
9540 }
9541 }
9542 }
9543 if (left + right >= virtual_wordsize)
9544 {
9545 /* Check if this diagonal has already been explored. */
9546 real_diag = diag & min_diag_mask;
9547 if (combo_array[real_diag].diag_level >= (s_off*READDB_COMPRESSION_RATIO+offset))
9548 {
9549 continue;
9550 }
9551 #ifdef BLAST_COLLECT_STATS
9552 search->second_pass_hits++;
9553 #endif
9554 if (BlastNtWordExtend(search, q_off, s_off, real_diag, search->first_context) != 0)
9555 goto ErrorReturn;
9556 }
9557 } while (num_hits>0);
9558 } /* end for(;;) */
9559
9560 }else{
9561 /* Dense version - doesn't use pv_array */
9562 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
9563 next_nhits_addr=&mod_lt[next_lindex].num_used ;
9564 next_nhits=*next_nhits_addr;
9565
9566 for (;;) {
9567 do {
9568 /* lookup a contiguous word. */
9569 lookup_index = next_lindex;
9570 s++;
9571
9572 if (s == s_end) goto NormalReturn;
9573
9574 next_lindex = (((lookup_index) & mask)<<char_size) + *(s+1);
9575
9576 next_nhits_addr = &mod_lt[next_lindex].num_used;
9577
9578 num_hits = next_nhits;
9579 next_nhits=*next_nhits_addr;
9580 } while (num_hits == 0);
9581
9582 lookup_pos = mod_lt[lookup_index].entries;
9583
9584 s_off = s-subject0+1;
9585 diag_tmp = s_off*READDB_COMPRESSION_RATIO + min_diag_length;
9586 hit_info = *((Uint4 *) lookup_pos);
9587 lookup_pos++;
9588
9589
9590 if(num_hits>3){
9591 lookup_pos=*((ModLookupPositionPtr PNTR) lookup_pos);
9592 }
9593
9594 /* Extend each hit in the linked list */
9595 do {
9596 q_off = hit_info;
9597 num_hits--;
9598 hit_info = *((Uint4 *) lookup_pos); /* load next hit_info */
9599 lookup_pos++;
9600
9601
9602 q = query0 + q_off - compression_factor;
9603
9604 if (s_off > compressed_wordsize)
9605 p = *(subject0 + s_off - compressed_wordsize - 1);
9606
9607 diag = diag_tmp - q_off;
9608
9609 /* extend to the left */
9610 if (s_off == compressed_wordsize || READDB_UNPACK_BASE_4(p) != *--q || q < query0)
9611 {
9612 left = 0;
9613 }
9614 else
9615 {
9616 if (READDB_UNPACK_BASE_3(p) != *--q || q < query0)
9617 {
9618 left = 1;
9619 }
9620 else
9621 {
9622 if (READDB_UNPACK_BASE_2(p) != *--q || q < query0)
9623 {
9624 left = 2;
9625 }
9626 else
9627 {
9628 if (READDB_UNPACK_BASE_1(p) != *--q || q < query0)
9629 {
9630 left = 3;
9631 }
9632 else
9633 {
9634 left = 4;
9635 }
9636 }
9637 }
9638 }
9639 /* extend to the right */
9640 p = *(subject0 + s_off);
9641 q = query0 + q_off;
9642 if (s >= s_end || READDB_UNPACK_BASE_1(p) != *q++ || q >= q_end)
9643 {
9644 right = 0;
9645 }
9646 else
9647 {
9648 if (READDB_UNPACK_BASE_2(p) != *q++ || q >= q_end)
9649 {
9650 right = 1;
9651 }
9652 else
9653 {
9654 if (READDB_UNPACK_BASE_3(p) != *q++ || q >= q_end)
9655 {
9656 right = 2;
9657 }
9658 else
9659 {
9660 if (READDB_UNPACK_BASE_4(p) != *q++ || q >= q_end)
9661 {
9662 right = 3;
9663 }
9664 else
9665 {
9666 right = 4;
9667 }
9668 }
9669 }
9670 }
9671 if (left + right >= virtual_wordsize)
9672 {
9673 /* Check if this diagonal has already been explored. */
9674 real_diag = diag & min_diag_mask;
9675 if (combo_array[real_diag].diag_level >= (s_off*READDB_COMPRESSION_RATIO+offset))
9676 {
9677 continue;
9678 }
9679 #ifdef BLAST_COLLECT_STATS
9680 search->second_pass_hits++;
9681 #endif
9682 if (BlastNtWordExtend(search, q_off, s_off, real_diag, search->first_context) != 0)
9683 goto ErrorReturn;
9684 }
9685 } while (num_hits>0);
9686 }
9687 }
9688 }
9689
9690 NormalReturn:
9691 BlastExtendWordExit(search);
9692 return search->current_hitlist->hspcnt;
9693
9694 ErrorReturn:
9695 BlastExtendWordExit(search);
9696 return 3;
9697 }
9698
9699 static Int4
BlastPurgeResultList(BLASTResultHitlistPtr PNTR results,Int4 hitlist_count)9700 BlastPurgeResultList(BLASTResultHitlistPtr PNTR results, Int4 hitlist_count)
9701 {
9702 Int4 index, index_new;
9703
9704 for (index=0; index<hitlist_count; index++)
9705 {
9706 if (results[index]->num_ref <= 0) {
9707 if (results[index]->seqalign)
9708 SeqAlignSetFree(results[index]->seqalign);
9709 results[index] = BLASTResultHitlistFree(results[index]);
9710 }
9711 }
9712
9713 index_new=0;
9714 for (index=0; index < hitlist_count; index++)
9715 {
9716 if (results[index] != NULL)
9717 {
9718 results[index_new] = results[index];
9719 index_new++;
9720 }
9721 }
9722 for (index=index_new; index<hitlist_count; index++)
9723 results[index] = NULL;
9724
9725 return index_new;
9726 }
9727
9728 /* CC: Changed to have the same tie-breakers as score_compare_hsps */
BLASTResultHspScoreCmp(VoidPtr v1,VoidPtr v2)9729 int LIBCALLBACK BLASTResultHspScoreCmp(VoidPtr v1, VoidPtr v2)
9730 {
9731 BLASTResultHspPtr hsp1 = (BLASTResultHspPtr) v1;
9732 BLASTResultHspPtr hsp2 = (BLASTResultHspPtr) v2;
9733 int result = 0; /* the result of the comparison */
9734 int query_end1, query_end2;
9735 int subject_end1, subject_end2;
9736
9737 /* Null HSPs are "greater" than any non-null ones, so they go to the end
9738 of a sorted list. */
9739 if (!hsp1 && !hsp2)
9740 return 0;
9741 else if (!hsp1)
9742 return 1;
9743 else if (!hsp2)
9744 return -1;
9745
9746 query_end1 = hsp1->query_offset + hsp1->query_length;
9747 query_end2 = hsp2->query_offset + hsp2->query_length;
9748 subject_end1 = hsp1->subject_offset + hsp1->subject_length;
9749 subject_end2 = hsp2->subject_offset + hsp2->subject_length;
9750
9751 if (0 == (result = BLAST_CMP(hsp2->score, hsp1->score)) &&
9752 0 == (result = BLAST_CMP(hsp1->subject_offset, hsp2->subject_offset)) &&
9753 0 == (result = BLAST_CMP(subject_end2, subject_end1)) &&
9754 0 == (result = BLAST_CMP(hsp1->query_offset, hsp2->query_offset))) {
9755 /* if all other test can't distinguish the HSPs, then the final
9756 test is the result */
9757 result = BLAST_CMP(query_end2, query_end1);
9758 }
9759 return result;
9760 }
9761 /*
9762 Move the "current_hitlist" to the BLASTResultHitlistPtr
9763 result_hitlist. This function should be called after a
9764 subject sequence has been thoroughly investigated.
9765 If a hitlist is not significant, it will be deleted. Note that
9766 the actual sequence is not saved. This can be retrieved later
9767 with readdb when the formatting is done.
9768
9769 The number of significant HSP's is returned.
9770 */
9771
9772 Int4 LIBCALL
BlastSaveCurrentHitlist(BlastSearchBlkPtr search)9773 BlastSaveCurrentHitlist(BlastSearchBlkPtr search)
9774 {
9775 BLASTResultHitlistPtr result_hitlist, PNTR results;
9776 BLASTResultsStructPtr result_struct;
9777 BLAST_HitListPtr current_hitlist;
9778 BLAST_HSPPtr hsp;
9779 BLAST_KarlinBlkPtr kbp;
9780 BLASTResultHspPtr hsp_array;
9781 Int4 hspcnt, index, index1, new_index, old_index, low_index, high_index;
9782 Int4 hitlist_count, hitlist_max, hspmax, hspset_cnt, high_score=0, retval;
9783 Nlm_FloatHi current_evalue=DBL_MAX;
9784 Int2 deleted;
9785 Int4 query_length;
9786
9787 /* AM: Query multiplexing. */
9788 QueriesPtr mult_queries = NULL;
9789 Uint4 current_query = 0;
9790 MQ_ResultInfoPtr result_info = NULL;
9791 Int4 mq_new_index=0, del_index;
9792 BLASTResultHitlistPtr mq_worst_result = NULL;
9793 Uint4 tmp_num_results;
9794
9795 if (search == NULL)
9796 return 0;
9797
9798 if (search->current_hitlist == NULL || search->current_hitlist->hspcnt <= 0) /* No hits to save. */
9799 {
9800 search->subject_info = BLASTSubjectInfoDestruct(search->subject_info);
9801 return 0;
9802 }
9803
9804 /* AM: Support for query concatenation. */
9805 if( !search->mult_queries )
9806 current_hitlist = search->current_hitlist;
9807 else
9808 current_hitlist = search->mult_queries->HitListArray[
9809 search->mult_queries->current_query];
9810
9811 retval = current_hitlist->hspcnt;
9812
9813 /* AM: Support for query concatenation. */
9814 if( search->mult_queries && !retval ) return 0;
9815
9816
9817 if (search->pbp->gapped_calculation)
9818 kbp = search->sbp->kbp_gap[search->first_context];
9819 else
9820 kbp = search->sbp->kbp[search->first_context];
9821
9822 if (search->prog_number==blast_type_blastn)
9823 query_length = search->query_context_offsets[search->first_context+1] - 1;
9824 result_hitlist = BLASTResultHitlistNew(current_hitlist->hspcnt);
9825 if (result_hitlist != NULL)
9826 {
9827 result_hitlist->subject_id = search->subject_id;
9828 result_hitlist->subject_info = search->subject_info;
9829 search->subject_info = NULL;
9830
9831 hspcnt = result_hitlist->hspcnt;
9832 hsp_array = result_hitlist->hsp_array;
9833 index1 = 0;
9834 hspmax = current_hitlist->hspcnt_max;
9835
9836 hsp = current_hitlist->hsp_array[0];
9837 hspset_cnt = -1;
9838
9839 for (index=0; index<hspcnt; index++)
9840 {
9841 while (hsp == NULL && index1 < hspmax)
9842 {
9843 index1++;
9844 hsp = current_hitlist->hsp_array[index1];
9845 }
9846 if (index1==hspmax) break;
9847 if (current_evalue > hsp->evalue)
9848 current_evalue = hsp->evalue;
9849 if (high_score < hsp->score)
9850 high_score = hsp->score;
9851 hsp_array[index].ordering_method = hsp->ordering_method;
9852 hsp_array[index].number = hsp->num;
9853 hsp_array[index].score = hsp->score;
9854 hsp_array[index].e_value = hsp->evalue;
9855 hsp_array[index].num_ident = hsp->num_ident;
9856 hsp_array[index].bit_score = ((hsp->score*kbp->Lambda) -
9857 kbp->logK)/NCBIMATH_LN2;
9858 if (search->prog_number==blast_type_blastn) {
9859 if (search->last_context > 0 &&
9860 hsp->query.offset >=
9861 search->query_context_offsets[search->last_context]) {
9862 hsp->context = 1;
9863 hsp->query.offset -=
9864 search->query_context_offsets[hsp->context];
9865 hsp->query.gapped_start -=
9866 search->query_context_offsets[hsp->context];
9867 }
9868
9869 if (hsp->context & 1)
9870 hsp_array[index].query_frame = -1;
9871 else
9872 hsp_array[index].query_frame = 1;
9873 hsp_array[index].query_gapped_start = hsp->query.gapped_start;
9874 hsp_array[index].subject_gapped_start = hsp->subject.gapped_start;
9875 } else {
9876 hsp_array[index].query_frame = hsp->query.frame;
9877 hsp_array[index].query_gapped_start = hsp->query.gapped_start;
9878 hsp_array[index].subject_gapped_start =
9879 hsp->subject.gapped_start;
9880 }
9881 hsp_array[index].context = hsp->context;
9882 hsp_array[index].query_offset = hsp->query.offset;
9883 hsp_array[index].query_length = hsp->query.length;
9884 hsp_array[index].subject_offset = hsp->subject.offset;
9885 hsp_array[index].subject_length = hsp->subject.length;
9886 hsp_array[index].subject_frame = hsp->subject.frame;;
9887 hsp_array[index].point_back = result_hitlist;
9888
9889 if (hsp->start_of_chain)
9890 { /* starting new set of HSP's, incr count.*/
9891 hspset_cnt++;
9892 }
9893 hsp_array[index].hspset_cnt = hspset_cnt;
9894
9895 index1++;
9896 if (index1 >= hspmax)
9897 break;
9898 hsp = current_hitlist->hsp_array[index1];
9899 }
9900 /* Check if there were less HSPs than expected */
9901 result_hitlist->hspcnt = index1;
9902 result_hitlist->best_evalue = current_evalue;
9903 result_hitlist->high_score = high_score;
9904 }
9905
9906 /* For MP BLAST we check that no other thread is attempting to insert results. */
9907 if (search->thr_info->results_mutex)
9908 NlmMutexLock(search->thr_info->results_mutex);
9909
9910 /* This is the structure that is identical on every thread. */
9911 result_struct = search->result_struct;
9912 hitlist_count = result_struct->hitlist_count;
9913 hitlist_max = result_struct->hitlist_max;
9914 results = result_struct->results;
9915
9916 /* AM: Query multiplexing. */
9917 if( search->mult_queries )
9918 {
9919 mult_queries = search->mult_queries;
9920 current_query = mult_queries->current_query;
9921 result_info = mult_queries->result_info + current_query;
9922 }
9923
9924 /* Record the worst evalue for ReevaluateWithAmbiguities. */
9925 if (hitlist_count == hitlist_max)
9926 {
9927 search->worst_evalue = results[hitlist_count-1]->best_evalue;
9928 }
9929
9930 /* New hit is less significant than all the other hits. */
9931 if (hitlist_count > 0 && (current_evalue > results[hitlist_count-1]->best_evalue ||
9932 (current_evalue >= results[hitlist_count-1]->best_evalue &&
9933 high_score < results[hitlist_count-1]->high_score)))
9934 {
9935 if (hitlist_count == hitlist_max)
9936 { /* Array is full, delete the entry. */
9937 if( !mult_queries ) /* AM: Query multiplexing. */
9938 search->current_hitlist =
9939 BlastHitListDestruct(search->current_hitlist);
9940 else search->mult_queries->delete_current_hitlist = TRUE;
9941
9942 result_hitlist = BLASTResultHitlistFreeEx(search, result_hitlist);
9943 if (search->thr_info->results_mutex)
9944 NlmMutexUnlock(search->thr_info->results_mutex); /* Free mutex. */
9945 return 0;
9946 }
9947 else
9948 {
9949 /* AM: Query multiplexing. */
9950 if( !mult_queries )
9951 /* Add to end of array. */
9952 deleted = BlastInsertList2Heap(search, result_hitlist);
9953 else
9954 {
9955 if( result_info->NumResults
9956 == mult_queries->max_results_per_query )
9957 { /* AM: No more results for this query. */
9958 search->mult_queries->delete_current_hitlist = TRUE;
9959 result_hitlist
9960 = BLASTResultHitlistFreeEx( search, result_hitlist );
9961
9962 if( search->thr_info->results_mutex )
9963 NlmMutexUnlock( search->thr_info->results_mutex );
9964
9965 return 0;
9966 }
9967 else /* AM: Append to results_struct and to local. */
9968 deleted = BlastInsertList2Heap(search, result_hitlist);
9969 }
9970 }
9971
9972 if (deleted == 1)
9973 {
9974 /* AM: Query multiplexing. */
9975 if( mult_queries ) MQ_UpdateResultLists( mult_queries );
9976
9977 hitlist_count = result_struct->hitlist_count =
9978 BlastPurgeResultList(results, hitlist_count);
9979 }
9980 else if (deleted == 0)
9981 {
9982 result_hitlist = BLASTResultHitlistFreeEx(search, result_hitlist);
9983 if (search->thr_info->results_mutex)
9984 NlmMutexUnlock(search->thr_info->results_mutex); /* Free mutex. */
9985 return retval;
9986 }
9987 new_index = hitlist_count;
9988
9989 /* AM: Query multiplexing. */
9990 if( mult_queries ) mq_new_index = result_info->NumResults;
9991 }
9992 else
9993 {
9994 if (hitlist_count != 0) /* The array is all NULL's if hitlist_count==0 */
9995 {
9996 deleted = BlastInsertList2Heap(search, result_hitlist);
9997 if (deleted == 1)
9998 {
9999 /* AM: Query multiplexing. */
10000 if( mult_queries ) MQ_UpdateResultLists( mult_queries );
10001
10002 hitlist_count = result_struct->hitlist_count =
10003 BlastPurgeResultList(results, hitlist_count);
10004 }
10005 else if (deleted == 0) {
10006 result_hitlist = BLASTResultHitlistFreeEx(search, result_hitlist);
10007 if (search->thr_info->results_mutex)
10008 NlmMutexUnlock(search->thr_info->results_mutex); /* Free mutex. */
10009 return retval;
10010 }
10011 if (hitlist_count > 0)
10012 {
10013 high_index=0;
10014 low_index=hitlist_count-1;
10015 new_index = (high_index+low_index)/2;
10016 old_index = new_index;
10017 for (index=0; index<BLAST_SAVE_ITER_MAX; index++)
10018 {
10019 if (results[new_index]->best_evalue > current_evalue)
10020 {
10021 low_index = new_index;
10022 }
10023 else if (results[new_index]->best_evalue < current_evalue)
10024 {
10025 high_index = new_index;
10026 }
10027 else
10028 { /* If e-values are the same, use high score. */
10029 /* If scores are the same, use ordinal number. */
10030 if (results[new_index]->high_score < high_score)
10031 low_index = new_index;
10032 else if (results[new_index]->high_score > high_score)
10033 high_index = new_index;
10034 else if (results[new_index]->subject_id < search->subject_id)
10035 low_index = new_index;
10036 else
10037 high_index = new_index;
10038 }
10039
10040 new_index = (high_index+low_index)/2;
10041 if (old_index == new_index)
10042 {
10043 if (results[new_index]->best_evalue < current_evalue)
10044 { /* Perform this check as new_index get rounded DOWN above.*/
10045 new_index++;
10046 }
10047 else if (results[new_index]->best_evalue == current_evalue && results[new_index]->high_score > high_score)
10048 {
10049 new_index++;
10050 }
10051 break;
10052 }
10053 old_index = new_index;
10054 }
10055
10056 /* AM: Query multiplexing. */
10057 if( !mult_queries )
10058 {
10059 if (hitlist_count == hitlist_max)
10060 { /* The list is full, delete the last entry. */
10061 BlastFreeHeap(search, results[hitlist_max-1]);
10062 if (results[hitlist_max-1]->seqalign)
10063 SeqAlignSetFree(results[hitlist_max-1]->seqalign);
10064 results[hitlist_max-1] = BLASTResultHitlistFreeEx(search, results[hitlist_max-1]);
10065 result_struct->hitlist_count--;
10066 hitlist_count = result_struct->hitlist_count;
10067 }
10068 if (hitlist_max > 1)
10069 Nlm_MemMove((results+new_index+1), (results+new_index), (hitlist_count-new_index)*sizeof(results[0]));
10070 }
10071 else
10072 {
10073 new_index = ResultIndex( current_evalue, high_score,
10074 search->subject_id,
10075 results, hitlist_count );
10076
10077 tmp_num_results = result_info->NumResults;
10078 del_index = hitlist_count;
10079 mq_new_index = ResultIndex( current_evalue, high_score,
10080 search->subject_id,
10081 result_info->results,
10082 result_info->NumResults );
10083
10084 if( mq_new_index == mult_queries->max_results_per_query )
10085 { /* AM: The list is full and new result is too low --- do nothing. */
10086 search->mult_queries->delete_current_hitlist = TRUE;
10087 result_hitlist
10088 = BLASTResultHitlistFreeEx( search, result_hitlist );
10089
10090 if( search->thr_info->results_mutex )
10091 NlmMutexUnlock( search->thr_info->results_mutex );
10092
10093 return 0;
10094 }
10095
10096 if( result_info->NumResults
10097 == mult_queries->max_results_per_query )
10098 { /* AM: must remove the worst result for this query. */
10099 mq_worst_result
10100 = result_info->results[result_info->NumResults - 1];
10101 --tmp_num_results;
10102 del_index = ResultIndex1( mq_worst_result,
10103 results, hitlist_count );
10104 BlastFreeHeap( search, results[del_index] );
10105
10106 if( results[del_index]->seqalign )
10107 SeqAlignSetFree( results[del_index]->seqalign );
10108
10109 results[del_index]
10110 = BLASTResultHitlistFreeEx( search,
10111 results[del_index] );
10112 hitlist_count = --result_struct->hitlist_count;
10113 }
10114
10115 if( hitlist_max > 1 )
10116 if( new_index < del_index )
10117 Nlm_MemMove( results + new_index + 1,
10118 results + new_index,
10119 (del_index - new_index)
10120 *sizeof( results[0] ) );
10121 else if( del_index < new_index )
10122 Nlm_MemMove( results + del_index,
10123 results + del_index + 1,
10124 (new_index - del_index)
10125 *sizeof( results[0] ) );
10126
10127 if( mult_queries->max_results_per_query > 1 )
10128 Nlm_MemMove( result_info->results + mq_new_index + 1,
10129 result_info->results + mq_new_index,
10130 (result_info->NumResults - mq_new_index)
10131 *sizeof( results[0] ) );
10132
10133 result_info->NumResults = tmp_num_results;
10134 }
10135 }
10136 else
10137 { /* Case of K=1 and the first hit is eliminated */
10138 new_index = 0;
10139 BlastInsertList2Heap(search, result_hitlist);
10140
10141 /* AM: Query multiplexing. */
10142 if( mult_queries ) mq_new_index = 0;
10143 }
10144 }
10145 else
10146 { /* First hit to be stored. */
10147 new_index = 0;
10148 BlastInsertList2Heap(search, result_hitlist);
10149
10150 /* AM: Query multiplexing. */
10151 if( mult_queries ) mq_new_index = 0;
10152 }
10153 }
10154
10155 if (new_index < hitlist_max)
10156 {
10157 results[new_index] = result_hitlist;
10158 result_struct->hitlist_count++;
10159
10160 /* AM: Query multiplexing. */
10161 if( mult_queries )
10162 {
10163 result_info->results[mq_new_index] = result_hitlist;
10164 ++result_info->NumResults;
10165 }
10166 }
10167
10168 /* We need to sort all hits by score/e_value in results[new_index] */
10169
10170 HeapSort(results[new_index]->hsp_array, results[new_index]->hspcnt,
10171 sizeof(BLASTResultHsp), BLASTResultHspScoreCmp);
10172
10173 /* --------------------------------------------------------------- */
10174
10175 if (search->thr_info->results_mutex)
10176 NlmMutexUnlock(search->thr_info->results_mutex); /* Free mutex. */
10177 return retval;
10178 }
10179
10180 Int2
blast_set_parameters(BlastSearchBlkPtr search,Nlm_FloatHi dropoff_number_of_bits_1st_pass,Nlm_FloatHi dropoff_number_of_bits_2nd_pass,Nlm_FloatHi avglen,Nlm_FloatHi searchsp,Int4 window)10181 blast_set_parameters(BlastSearchBlkPtr search,
10182 Nlm_FloatHi dropoff_number_of_bits_1st_pass,
10183 Nlm_FloatHi dropoff_number_of_bits_2nd_pass,
10184 Nlm_FloatHi avglen, /* Average length of a sequence. */
10185 Nlm_FloatHi searchsp, /* total search space. */
10186 Int4 window) /* length where two hits must be found to count. */
10187 {
10188 BLAST_ExtendWordPtr ewp;
10189 BLAST_KarlinBlkPtr kbp, kbp_gap;
10190 BLAST_ParameterBlkPtr pbp;
10191 BLAST_ScoreBlkPtr sbp;
10192 BLAST_Score s, s2;
10193 BLAST_Score dropoff_1st_pass, dropoff_2nd_pass;
10194 Int2 index;
10195 Int4 i; /* AM: Support for query multiplexing. */
10196
10197 Nlm_FloatHi meff, e, e2;
10198 Int2 last_context;
10199
10200 if (search == NULL)
10201 return 1;
10202
10203 pbp = search->pbp;
10204 if (pbp == NULL)
10205 return 1;
10206
10207 sbp = search->sbp;
10208 if (sbp == NULL)
10209 return 1;
10210
10211 /* Do for first context only, should this be changed?? */
10212 kbp_gap = sbp->kbp_gap[search->first_context];
10213 kbp = sbp->kbp[search->first_context];
10214 if (kbp == NULL && kbp_gap == NULL)
10215 return 1;
10216
10217 last_context = (search->prog_number == blast_type_blastn) ?
10218 search->first_context : search->last_context;
10219 for (index=search->first_context; index<=last_context; index++)
10220 {
10221 ewp = search->context[index].ewp;
10222 if (ewp == NULL && !pbp->mb_params)
10223 return 1;
10224
10225 }
10226
10227 s = pbp->cutoff_s;
10228 e = pbp->cutoff_e;
10229 s2 = pbp->cutoff_s2;
10230 e2 = pbp->cutoff_e2;
10231 if (pbp->cutoff_s_set && !pbp->cutoff_e_set)
10232 e = 0.;
10233
10234 meff = (Nlm_FloatHi) search->context[search->first_context].query->length;
10235 if (pbp->mb_params)
10236 BlastCutoffs(&s, &e, kbp, searchsp, TRUE, search->pbp->gap_decay_rate );
10237 else
10238 {
10239 if (pbp->gapped_calculation)
10240 { /* AM: Changed to support query concatenation. */
10241 if( !search->mult_queries )
10242 BlastCutoffs(&s, &e, kbp_gap, searchsp, FALSE, 0.0 );
10243 else
10244 BlastCutoffs( &s, &e, kbp_gap,
10245 search->mult_queries->MinSearchSpEff,
10246 FALSE, 0.0 );
10247 }
10248 else
10249 { /* AM: Changed to support query concatenation. */
10250 if( !search->mult_queries )
10251 BlastCutoffs(&s, &e, kbp, searchsp, FALSE, 0.0 );
10252 else
10253 BlastCutoffs( &s, &e, kbp, search->mult_queries->MinSearchSpEff,
10254 FALSE, 0.0 );
10255 }
10256 }
10257 /* Determine the secondary cutoff score, S2, to use */
10258 if (e2 == 0. && !pbp->cutoff_s2_set)
10259 s2 = s;
10260
10261 if ((pbp->cutoff_e2_set && !pbp->cutoff_s2_set && e2 == 0.) ||
10262 (pbp->cutoff_s2_set && s2 > s))
10263 {
10264 e2 = 0., s2 = s;
10265 }
10266 else
10267 {
10268 /* e2 = MIN(e, e2); */
10269 if (pbp->cutoff_s2_set && !pbp->cutoff_e2_set)
10270 e2 = 0.;
10271 /*
10272 BlastCutoffs(&s2, &e2, kbp, meff, avglen, TRUE);
10273 */
10274 if (pbp->gapped_calculation)
10275 {
10276 if( !search->mult_queries )
10277 BlastCutoffs(&s2, &e2, kbp_gap, (FloatHi) MIN(avglen,meff) * (FloatHi) avglen,
10278 TRUE, search->pbp->gap_decay_rate );
10279 else
10280 BlastCutoffs( &s2, &e2, kbp_gap,
10281 (FloatHi) MIN( avglen,search->mult_queries->MinLen ) * (FloatHi) avglen,
10282 TRUE, search->pbp->gap_decay_rate );
10283 }
10284 else
10285 { /* AM: Changed to support query concatenation. */
10286 if( !search->mult_queries )
10287 BlastCutoffs(&s2, &e2, kbp, (FloatHi) MIN(avglen,meff) * (FloatHi) avglen,
10288 TRUE, search->pbp->gap_decay_rate );
10289 else
10290 BlastCutoffs( &s2, &e2, kbp,
10291 (FloatHi) MIN(avglen,2*(search->mult_queries->MinLen)) * (FloatHi) avglen,
10292 TRUE, search->pbp->gap_decay_rate );
10293 }
10294 /* Adjust s2 to be in line with s, as necessary */
10295 s2 = MAX(s2, 1);
10296 if (s2 > s)
10297 s2 = s;
10298 /*
10299 e2 = BlastKarlinStoE_simple(s2, kbp, searchsp);
10300 */
10301 }
10302
10303 if (pbp->cutoff_s2_set)
10304 pbp->cutoff_s2_max = s2;
10305 else
10306 pbp->cutoff_s2_max = s;
10307
10308 if (pbp->do_sum_stats)
10309 pbp->cutoff_s1 = s2;
10310 else
10311 pbp->cutoff_s1 = s;
10312
10313 if (pbp->gapped_calculation && search->prog_number != blast_type_blastn)
10314 {
10315 pbp->gap_trigger = MIN(pbp->gap_trigger, s2);
10316 s2 = MIN(pbp->gap_trigger, s2);
10317 }
10318
10319 dropoff_1st_pass = (BLAST_Score) ceil((Nlm_FloatHi) dropoff_number_of_bits_1st_pass * NCBIMATH_LN2 / kbp->Lambda);
10320 dropoff_1st_pass = (BLAST_Score) MIN((Nlm_FloatHi) dropoff_1st_pass, s);
10321
10322 /* AM: Change to support query multiplexing. */
10323 if( search->prog_number == blast_type_tblastn && search->mult_queries )
10324 dropoff_2nd_pass = (BLAST_Score)ceil(
10325 (Nlm_FloatHi)dropoff_number_of_bits_2nd_pass*NCBIMATH_LN2
10326 /search->mult_queries->LambdaMin );
10327 else
10328 dropoff_2nd_pass = (BLAST_Score) ceil((Nlm_FloatHi) dropoff_number_of_bits_2nd_pass * NCBIMATH_LN2 / kbp->Lambda);
10329
10330 dropoff_2nd_pass = (BLAST_Score) MIN((Nlm_FloatHi) dropoff_2nd_pass, s);
10331
10332 /* AM: Change to support query multiplexing. */
10333 if( search->prog_number == blast_type_tblastn && search->mult_queries )
10334 for( i = 0; i < search->mult_queries->NumQueries; ++i )
10335 search->mult_queries->dropoff_2nd_pass_array[i]
10336 = - (BLAST_Score)ceil( (Nlm_FloatHi)dropoff_number_of_bits_2nd_pass*NCBIMATH_LN2
10337 /search->mult_queries->lambda_array[i] );
10338
10339 /* The drop-off parameter MUST be negative. */
10340 pbp->dropoff_1st_pass = -dropoff_1st_pass;
10341 pbp->dropoff_2nd_pass = -dropoff_2nd_pass;
10342 pbp->cutoff_s = s;
10343 pbp->cutoff_e = e;
10344 pbp->cutoff_s2 = s2;
10345 pbp->cutoff_e2 = e2;
10346
10347 /* The first and second pass S2 values are from formula by Stephen Altschul.*/
10348 /* If no bits were specified on the command line, then the following
10349 formula is used:
10350 calculate ln(25000*query_length*K)/lambda
10351
10352 and
10353
10354 21(bits)*ln2/lammbda
10355
10356 Take the smaller of those two formulas.
10357 */
10358 if (pbp->number_of_bits == 0.0)
10359 {
10360 pbp->cutoff_s_first = (BLAST_Score) MIN(log((Nlm_FloatHi)(25000*(kbp->K)*(search->context[search->first_context].query->length)))/kbp->Lambda, 21*NCBIMATH_LN2/kbp->Lambda);
10361 /* Adjust the cutoff value for translating searches. */
10362 pbp->cutoff_s_first += (BLAST_Score)
10363 (log((Nlm_FloatHi)search->context_factor)/kbp->Lambda);
10364 }
10365 else
10366 {
10367 pbp->cutoff_s_first = (BLAST_Score) (pbp->number_of_bits*NCBIMATH_LN2 / kbp->Lambda);
10368 }
10369
10370 /* This value is used only if the "old" statistics are used. If not an
10371 individual cutoff score is calculated for each subject sequence in
10372 CalculateSecondCutoffScore. */
10373
10374 pbp->cutoff_s_second = s2;
10375
10376 /* If we're just collecting HSP's, use one cutoff. */
10377 if (!pbp->gapped_calculation && !pbp->do_sum_stats)
10378 {
10379 pbp->cutoff_s2 = MAX(pbp->cutoff_s, pbp->cutoff_s2);
10380 pbp->cutoff_s2_max = MAX(pbp->cutoff_s, pbp->cutoff_s2);
10381 }
10382
10383 return 0;
10384 }
10385
10386 /*
10387 Arrange the HSP's (on every HitList) for linking by "link_hsps".
10388
10389 link_hsps requires an array of HSP's and the first member of this
10390 array is used just to hold the HSP's (i.e., not a real HSP).
10391
10392 Could this all be integrated with link_hsp's??
10393 */
10394 static Int2
10395 new_link_hsps(BlastSearchBlkPtr search, BLAST_HitListPtr hitlist);
10396
10397 Int2 LIBCALL
BlastLinkHsps(BlastSearchBlkPtr search)10398 BlastLinkHsps (BlastSearchBlkPtr search)
10399
10400 {
10401 BLAST_HitListPtr hitlist, orig_hitlist;
10402 BLAST_HSPPtr hsp;
10403 Int4 index;
10404 Int2 status = 0;
10405
10406 /* AM: Support for query concatenation. */
10407 if( search->mult_queries && search->mult_queries->use_mq )
10408 {
10409 orig_hitlist = search->current_hitlist;
10410 search->current_hitlist = search->mult_queries->HitListArray[
10411 search->mult_queries->current_query];
10412 }
10413
10414 hitlist = search->current_hitlist;
10415
10416 if (hitlist && hitlist->hspcnt > 0)
10417 {
10418 /* For ungapped blastn, assign frames to all HSPs,
10419 since this is necessary for linking, and frames have not yet been
10420 assigned. Do it only if both strands are searched. Note that
10421 we don't assign context numbers here because the offsets have
10422 not yet been adjusted to be relative to individual contexts. */
10423 if (search->prog_number == blast_type_blastn &&
10424 search->last_context > search->first_context) {
10425
10426 for (index = 0; index < hitlist->hspcnt; ++index) {
10427 if (hitlist->hsp_array[index]->query.offset >=
10428 search->query_context_offsets[search->last_context]) {
10429 hitlist->hsp_array[index]->query.frame = -1;
10430 }
10431 }
10432 }
10433
10434
10435 /* Link up the HSP's for this hitlist. */
10436 if (search->pbp->longest_intron <= 0 ||
10437 (search->prog_number != blast_type_tblastn &&
10438 search->prog_number != blast_type_psitblastn &&
10439 search->prog_number != blast_type_blastx))
10440 {
10441 hsp = link_hsps(search, hitlist, hitlist->hsp_array);
10442 /* The HSP's may be in a different order than they were before,
10443 but hsp contains the first one. */
10444 for (index=0; index<hitlist->hspcnt; index++) {
10445 hitlist->hsp_array[index] = hsp;
10446 hsp = hsp->next;
10447 }
10448 } else {
10449 status = new_link_hsps(search, hitlist);
10450 }
10451 }
10452
10453 /* AM: Support for query concatenation. */
10454 if( search->mult_queries && search->mult_queries->use_mq )
10455 search->current_hitlist = orig_hitlist;
10456
10457 return status;
10458 }
10459
10460 /*
10461 Sort the HSP's by starting position of the query. Called by HeapSort.
10462 The first function sorts in forward, the second in reverse order.
10463 */
10464
10465 static int LIBCALLBACK
fwd_compare_hsps(VoidPtr v1,VoidPtr v2)10466 fwd_compare_hsps(VoidPtr v1, VoidPtr v2)
10467
10468 {
10469 BLAST_HSPPtr h1, h2;
10470 BLAST_HSPPtr PNTR hp1, PNTR hp2;
10471
10472 hp1 = (BLAST_HSPPtr PNTR) v1;
10473 hp2 = (BLAST_HSPPtr PNTR) v2;
10474 h1 = *hp1;
10475 h2 = *hp2;
10476
10477 if (SIGN(h1->query.frame) != SIGN(h2->query.frame))
10478 {
10479 if (h1->query.frame < h2->query.frame)
10480 return 1;
10481 else
10482 return -1;
10483 }
10484 if (h1->query.offset < h2->query.offset)
10485 return -1;
10486 if (h1->query.offset > h2->query.offset)
10487 return 1;
10488 /* Necessary in case both HSP's have the same query offset. */
10489 if (h1->subject.offset < h2->subject.offset)
10490 return -1;
10491 if (h1->subject.offset > h2->subject.offset)
10492 return 1;
10493
10494 return 0;
10495 }
10496
10497 static int LIBCALLBACK
rev_compare_hsps(VoidPtr v1,VoidPtr v2)10498 rev_compare_hsps(VoidPtr v1, VoidPtr v2)
10499
10500 {
10501 BLAST_HSPPtr h1, h2;
10502 BLAST_HSPPtr PNTR hp1, PNTR hp2;
10503
10504 hp1 = (BLAST_HSPPtr PNTR) v1;
10505 hp2 = (BLAST_HSPPtr PNTR) v2;
10506 h1 = *hp1;
10507 h2 = *hp2;
10508
10509 if (SIGN(h1->query.frame) != SIGN(h2->query.frame))
10510 {
10511 if (h1->query.frame > h2->query.frame)
10512 return 1;
10513 else
10514 return -1;
10515 }
10516
10517 if (h1->query.offset < h2->query.offset)
10518 return 1;
10519 if (h1->query.offset > h2->query.offset)
10520 return -1;
10521 if (h1->query.end < h2->query.end)
10522 return 1;
10523 if (h1->query.end > h2->query.end)
10524 return -1;
10525 if (h1->subject.offset < h2->subject.offset)
10526 return 1;
10527 if (h1->subject.offset > h2->subject.offset)
10528 return -1;
10529 if (h1->subject.end < h2->subject.end)
10530 return 1;
10531 if (h1->subject.end > h2->subject.end)
10532 return -1;
10533 return 0;
10534 }
10535
10536
10537 static int LIBCALLBACK
rev_compare_hsps_cfj(VoidPtr v1,VoidPtr v2)10538 rev_compare_hsps_cfj(VoidPtr v1, VoidPtr v2)
10539
10540 {
10541 BLAST_HSPPtr h1, h2;
10542 BLAST_HSPPtr PNTR hp1, PNTR hp2;
10543
10544 hp1 = (BLAST_HSPPtr PNTR) v1;
10545 hp2 = (BLAST_HSPPtr PNTR) v2;
10546 h1 = *hp1;
10547 h2 = *hp2;
10548
10549 if (SIGN(h1->query.frame) != SIGN(h2->query.frame))
10550 {
10551 if (h1->query.frame > h2->query.frame)
10552 return -1;
10553 else
10554 return 1;
10555 }
10556
10557 if (SIGN(h1->subject.frame) != SIGN(h2->subject.frame))
10558 {
10559 if (h1->subject.frame > h2->subject.frame)
10560 return 1;
10561 else
10562 return -1;
10563 }
10564
10565 if (h1->query.offset < h2->query.offset)
10566 return 1;
10567 if (h1->query.offset > h2->query.offset)
10568 return -1;
10569 if (h1->query.end < h2->query.end)
10570 return 1;
10571 if (h1->query.end > h2->query.end)
10572 return -1;
10573 if (h1->subject.offset < h2->subject.offset)
10574 return 1;
10575 if (h1->subject.offset > h2->subject.offset)
10576 return -1;
10577 if (h1->subject.end < h2->subject.end)
10578 return 1;
10579 if (h1->subject.end > h2->subject.end)
10580 return -1;
10581 return 0;
10582 }
10583
SumHSPEvalue(BlastSearchBlkPtr search,BLAST_HSPPtr head_hsp,BLAST_HSPPtr hsp,Nlm_FloatHi * xsum)10584 static FloatHi SumHSPEvalue(BlastSearchBlkPtr search, BLAST_HSPPtr head_hsp,
10585 BLAST_HSPPtr hsp, Nlm_FloatHi *xsum)
10586 {
10587 FloatHi gap_decay_rate, sum_evalue;
10588 Int4 gap_size, num, subject_length;
10589
10590 /* AM: The following are added for query multiplexing. */
10591 Int4 effective_length, length_adjustment;
10592 Uint4 qnum;
10593 Int8 dblen_eff;
10594
10595 if( search->mult_queries )
10596 {
10597 qnum = GetQueryNum( search->mult_queries,
10598 head_hsp->query.offset,
10599 head_hsp->query.end,
10600 head_hsp->query.frame );
10601
10602 effective_length = search->mult_queries->EffLengths[qnum];
10603 length_adjustment = search->mult_queries->Adjustments[qnum];
10604 dblen_eff = search->mult_queries->DbLenEff[qnum];
10605 }
10606 else
10607 {
10608 effective_length = search->context[search->first_context].query->effective_length;
10609 length_adjustment = search->length_adjustment;
10610 dblen_eff = search->dblen_eff;
10611 }
10612
10613 gap_size = search->pbp->gap_size;
10614 gap_decay_rate = search->pbp->gap_decay_rate;
10615 num = head_hsp->num + hsp->num;
10616 subject_length = MAX((search->subject->length - length_adjustment), 1);
10617
10618 if (search->prog_number == blast_type_tblastn ||
10619 search->prog_number == blast_type_blastx ||
10620 search->prog_number == blast_type_psitblastn) {
10621 subject_length /= 3;
10622 }
10623 subject_length = MAX(subject_length, 1);
10624
10625 *xsum = head_hsp->xsum + hsp->xsum;
10626
10627 sum_evalue =
10628 BlastUnevenGapSumE(LINK_HSP_OVERLAP + search->pbp->gap_size + 1,
10629 search->pbp->longest_intron + LINK_HSP_OVERLAP + 1,
10630 num, *xsum,
10631 effective_length,
10632 subject_length,
10633 dblen_eff,
10634 BlastGapDecayDivisor(gap_decay_rate, num));
10635
10636 return sum_evalue;
10637 }
10638
10639
10640 static int LIBCALLBACK
xsum_compare_hsps(VoidPtr v1,VoidPtr v2)10641 xsum_compare_hsps(VoidPtr v1, VoidPtr v2)
10642
10643 {
10644 BLAST_HSPPtr h1, h2;
10645 BLAST_HSPPtr PNTR hp1, PNTR hp2;
10646
10647 hp1 = (BLAST_HSPPtr PNTR) v1;
10648 hp2 = (BLAST_HSPPtr PNTR) v2;
10649 h1 = *hp1;
10650 h2 = *hp2;
10651
10652 if (h1 == NULL) {
10653 return (h2 == NULL) ? 0 : 1;
10654 } else if (h2 == NULL) {
10655 return -1;
10656 }
10657
10658 if (h1->xsum < h2->xsum)
10659 return 1;
10660 if (h1->xsum > h2->xsum)
10661 return -1;
10662
10663 return score_compare_hsps(&h1, &h2);
10664 }
10665
10666
10667 /** Merges HSPs from two linked HSP sets into an array of HSPs, sorted
10668 * in increasing order of contexts and increasing order of query
10669 * offsets.
10670 * @param hsp_set1 First linked set. [in]
10671 * @param hsp_set2 Second linked set. [in]
10672 * @param merged_size The total number of HSPs in two sets. [out]
10673 * @return The array of pointers to HSPs representing a merged set.
10674 */
10675 static BLAST_HSPPtr *
BLAST_HSPMergedLinkedSet(BLAST_HSPPtr hsp_set1,BLAST_HSPPtr hsp_set2,Int4 * merged_size)10676 BLAST_HSPMergedLinkedSet(BLAST_HSPPtr hsp_set1, BLAST_HSPPtr hsp_set2,
10677 Int4* merged_size)
10678 {
10679 Int4 index;
10680 Int4 length;
10681 BLAST_HSPPtr * merged_hsps;
10682
10683 /* Find the first link of the old HSP chain. */
10684 while (hsp_set1->prev)
10685 hsp_set1 = hsp_set1->prev;
10686 /* Find first and last link in the new HSP chain. */
10687 while (hsp_set2->prev)
10688 hsp_set2 = hsp_set2->prev;
10689
10690 *merged_size = length = hsp_set1->num + hsp_set2->num;
10691
10692 if( *merged_size == 0 ) return NULL;
10693
10694 merged_hsps = (BLAST_HSPPtr*) MemNew(length*sizeof(BLAST_HSPPtr));
10695
10696 index = 0;
10697 while (hsp_set1 || hsp_set2) {
10698 /* NB: HSP sets for which some HSPs have identical query
10699 offsets cannot possibly be admissible, so it doesn't matter
10700 how to deal with equal offsets. */
10701 if (!hsp_set2 || (hsp_set1 &&
10702 hsp_set1->query.offset < hsp_set2->query.offset)) {
10703 merged_hsps[index] = hsp_set1;
10704 hsp_set1 = hsp_set1->next;
10705 } else {
10706 merged_hsps[index] = hsp_set2;
10707 hsp_set2 = hsp_set2->next;
10708 }
10709 ++index;
10710 }
10711
10712 return merged_hsps;
10713 }
10714
10715
10716 /** Combines two linked sets of HSPs into a single set; the original
10717 * linked sets are consumed by this operation.
10718 *
10719 * @param hsp_set1 First set of HSPs [in]
10720 * @param hsp_set2 Second set of HSPs [in]
10721 * @param sum_score The sum score of the combined linked set
10722 * @param evalue The E-value of the combined linked set
10723 * @return Combined linked set.
10724 */
10725 static BLAST_HSPPtr
BLAST_HSPCombineLinkedSets(BLAST_HSPPtr hsp_set1,BLAST_HSPPtr hsp_set2,Nlm_FloatHi sum_score,Nlm_FloatHi evalue)10726 BLAST_HSPCombineLinkedSets(BLAST_HSPPtr hsp_set1, BLAST_HSPPtr hsp_set2,
10727 Nlm_FloatHi sum_score, Nlm_FloatHi evalue)
10728 {
10729 BLAST_HSPPtr* merged_hsps;
10730 BLAST_HSPPtr head_hsp;
10731 Int4 index, new_num;
10732
10733 if (!hsp_set2)
10734 return hsp_set1;
10735 else if (!hsp_set1)
10736 return hsp_set2;
10737
10738 merged_hsps = BLAST_HSPMergedLinkedSet(hsp_set1, hsp_set2, &new_num);
10739
10740 head_hsp = merged_hsps[0];
10741 head_hsp->start_of_chain = TRUE;
10742 head_hsp->prev = NULL;
10743 for (index = 0; index < new_num; ++index) {
10744 BLAST_HSPPtr link = merged_hsps[index];
10745 if (index < new_num - 1) {
10746 BLAST_HSPPtr next_link = merged_hsps[index+1];
10747 link->next = next_link;
10748 next_link->prev = link;
10749 } else {
10750 link->next = NULL;
10751 }
10752 link->xsum = sum_score;
10753 link->evalue = evalue;
10754 link->num = new_num;
10755 link->linked_set = TRUE;
10756 if (link != head_hsp)
10757 link->start_of_chain = FALSE;
10758 }
10759
10760 MemFree(merged_hsps);
10761 return head_hsp;
10762 }
10763
10764
10765 /** Given an array of HSPs (H), sorted in increasing order of query
10766 * offsets, fills an array of indices into array H such that for each
10767 * i, the index is the smallest HSP index, for which query ending
10768 * offset is >= than query ending offset of H[i]. This indexing is
10769 * performed before any of the HSPs in H are linked.
10770 *
10771 * @param hsp_array Array HSPs [in]
10772 * @param hspcnt Size of the hsp_array. [in]
10773 * @param qend_index_ptr Pointer to the new array of indices.
10774 */
10775 static Int2
BLAST_HSPArrayIndexQueryEnds(BLAST_HSPPtr * hsp_array,Int4 hspcnt,Int4 ** qend_index_ptr)10776 BLAST_HSPArrayIndexQueryEnds(BLAST_HSPPtr* hsp_array, Int4 hspcnt,
10777 Int4** qend_index_ptr)
10778 {
10779 Int4 index;
10780 Int4* qend_index_array = NULL;
10781 BLAST_HSPPtr link;
10782 Int4 current_end = 0;
10783 Int4 current_index = 0;
10784
10785 /* Allocate the array. */
10786 *qend_index_ptr = qend_index_array =
10787 (Int4*) Nlm_Calloc(hspcnt, sizeof(Int4));
10788 if (!qend_index_array)
10789 return -1;
10790
10791 current_end = hsp_array[0]->query.end;
10792
10793 for (index = 1; index < hspcnt; ++index) {
10794 link = hsp_array[index];
10795 if (link->context > hsp_array[current_index]->context ||
10796 link->query.end > current_end) {
10797 current_index = index;
10798 current_end = link->query.end;
10799 }
10800 qend_index_array[index] = current_index;
10801 }
10802 return 0;
10803 }
10804
10805
10806 /** Find an HSP on the same context as the one given, with closest
10807 * start offset that is greater than a specified value. The list of
10808 * HSPs to search must be sorted by query offset and in increasing
10809 * order of contexts.
10810 * @param hsp_array Array of pointers to HSPs [in]
10811 * @param size Number of elements in the array [in]
10812 * @param context Context of the target HSP [in]
10813 * @param offset The target offset to search for [in]
10814 * @return The index in the array of the HSP whose start/end offset
10815 * is closest to but >= the value 'offset'
10816 */
10817 static Int4
BLAST_HSPOffsetBinarySearch(BLAST_HSPPtr * hsp_array,Int4 size,Int4 context,Int4 offset)10818 BLAST_HSPOffsetBinarySearch(BLAST_HSPPtr* hsp_array, Int4 size,
10819 Int4 context, Int4 offset)
10820 {
10821 Int4 index, begin, end;
10822
10823 begin = 0;
10824 end = size;
10825 while (begin < end) {
10826 index = (begin + end) / 2;
10827
10828 if (hsp_array[index]->context < context)
10829 begin = index + 1;
10830 else if (hsp_array[index]->context > context)
10831 end = index;
10832 else {
10833 if (hsp_array[index]->query.offset >= offset)
10834 end = index;
10835 else
10836 begin = index + 1;
10837 }
10838 }
10839
10840 return end;
10841 }
10842
10843
10844 /** Find an HSP in an array sorted in increasing order of query
10845 * offsets and increasing order of contexts, with the smallest index
10846 * such that its query end is >= to a given offset.
10847 *
10848 * @param hsp_array Array of pointers to HSPs. [in]
10849 * @param size Number of elements in the array [in]
10850 * @param qend_index_array Array indexing query ends in the hsp_array [in]
10851 * @param context Context of the target HSP [in]
10852 * @param offset The target offset to search for [in]
10853 * @return The found index in the hsp_array.
10854 */
10855 static Int4
BLAST_HSPOffsetEndBinarySearch(BLAST_HSPPtr * hsp_array,Int4 size,Int4 * qend_index_array,Int4 context,Int4 offset)10856 BLAST_HSPOffsetEndBinarySearch(BLAST_HSPPtr* hsp_array, Int4 size,
10857 Int4* qend_index_array, Int4 context,
10858 Int4 offset)
10859 {
10860 Int4 begin, end;
10861
10862 begin = 0;
10863 end = size;
10864 while (begin < end) {
10865 Int4 right_index = (begin + end) / 2;
10866 Int4 left_index = qend_index_array[right_index];
10867
10868 if (hsp_array[right_index]->context < context)
10869 begin = right_index + 1;
10870 else if (hsp_array[right_index]->context > context)
10871 end = left_index;
10872 else {
10873 if (hsp_array[left_index]->query.end >= offset)
10874 end = left_index;
10875 else
10876 begin = right_index + 1;
10877 }
10878 }
10879
10880 return end;
10881 }
10882
10883
10884 /** Checks if new candidate HSP is admissible to be linked to a set of
10885 * HSPs on the left. The new HSP must start strictly before the parent
10886 * HSP in both query and subject, and its end must lie within an
10887 * interval from the parent HSP's start, determined by the allowed gap
10888 * and overlap sizes in query and subject. This function also
10889 * indicates whether parent is already too far to the right of the
10890 * candidate HSP, via a boolean pointer.
10891 *
10892 * @param hsp_set1 First linked set of HSPs. [in]
10893 * @param hsp_set2 Second linked set of HSPs. [in]
10894 * @param overlap_size Amount by which HSPs are allowed to overlap. [in]
10895 * @param gap_size Size of the maximum permitted gap in the
10896 * query. [in]
10897 * @param longest_intron Size of the maximum permitted gap in the
10898 * sujbect. [in]
10899 * @return Do the two sets satisfy the admissibility
10900 * criteria to form a combined set?
10901 */
10902 static Boolean
BLAST_HSPLinkedSetsAdmissible(BLAST_HSPPtr hsp_set1,BLAST_HSPPtr hsp_set2,Int4 overlap_size,Int4 gap_size,Int4 longest_intron)10903 BLAST_HSPLinkedSetsAdmissible(BLAST_HSPPtr hsp_set1,
10904 BLAST_HSPPtr hsp_set2,
10905 Int4 overlap_size, Int4 gap_size,
10906 Int4 longest_intron)
10907 {
10908 BLAST_HSPPtr* merged_hsps;
10909 Int4 combined_size = 0;
10910 Int4 index;
10911
10912 if (!hsp_set1 || !hsp_set2 )
10913 return FALSE;
10914
10915 /* The first input HSP must be the head of its set. */
10916 if (hsp_set1->prev)
10917 return FALSE;
10918
10919 /* The second input HSP may not be the head of its set. Hence
10920 follow the previous pointers to get to the head. */
10921 for ( ; hsp_set2->prev; hsp_set2 = hsp_set2->prev);
10922
10923 /* If left and right HSP are the same, return inadmissible
10924 status. */
10925 if (hsp_set1 == hsp_set2)
10926 return FALSE;
10927
10928 /* Check if these HSPs are for the same protein sequence (same
10929 context) */
10930 if (hsp_set1->context != hsp_set2->context)
10931 return FALSE;
10932
10933 /* Check if new HSP and hsp_set2 are on the same nucleotide
10934 sequence strand. (same sign of subject frame) */
10935 if (SIGN(hsp_set1->subject.frame) !=
10936 SIGN(hsp_set2->subject.frame))
10937 return FALSE;
10938
10939 /* Merge the two sets into an array with increasing order of query
10940 offsets. */
10941 merged_hsps =
10942 BLAST_HSPMergedLinkedSet(hsp_set1, hsp_set2, &combined_size);
10943
10944 for (index = 0; index < combined_size - 1; ++index) {
10945 BLAST_HSPPtr left_hsp = merged_hsps[index];
10946 BLAST_HSPPtr right_hsp = merged_hsps[index+1];
10947
10948
10949 /* If the new HSP is too far to the left from the right_hsp,
10950 indicate this by setting the boolean output value to
10951 TRUE. */
10952 if (left_hsp->query.end < right_hsp->query.offset - gap_size)
10953 break;
10954
10955 /* Check if the left HSP's query offset is to the right of the
10956 right HSP's offset, i.e. they came in wrong order. */
10957 if (left_hsp->query.offset >= right_hsp->query.offset)
10958 break;
10959
10960 /* Check the remaining condition for query offsets: left HSP
10961 cannot end further than the maximal allowed overlap from
10962 the right HSP's offset; and left HSP must end before the
10963 right HSP. */
10964 if (left_hsp->query.end > right_hsp->query.offset + overlap_size ||
10965 left_hsp->query.end >= right_hsp->query.end)
10966 break;
10967
10968 /* Check the subject offsets conditions. */
10969 if (left_hsp->subject.end >
10970 right_hsp->subject.offset + overlap_size ||
10971 left_hsp->subject.end <
10972 right_hsp->subject.offset - longest_intron ||
10973 left_hsp->subject.offset >= right_hsp->subject.offset ||
10974 left_hsp->subject.end >= right_hsp->subject.end)
10975 break;
10976 }
10977
10978 MemFree(merged_hsps);
10979
10980 if (index < combined_size - 1)
10981 return FALSE;
10982
10983 return TRUE;
10984 }
10985
10986
10987 /**
10988 * Swap the role of the query and subject within an HSP; used by
10989 * new_link_hsps to implement HSP linking for blastx using the code
10990 * for blastn.
10991 */
10992 static void
BLAST_HSPArraySwapSequences(BLAST_HSPPtr PNTR hsp_array,Int4 hspcnt)10993 BLAST_HSPArraySwapSequences(BLAST_HSPPtr PNTR hsp_array, Int4 hspcnt)
10994 {
10995 Int4 i;
10996
10997 for(i = 0; i < hspcnt; i++ ) {
10998 BLAST_Seg seg = hsp_array[i]->query;
10999 hsp_array[i]->query = hsp_array[i]->subject;
11000 hsp_array[i]->subject = seg;
11001 }
11002 }
11003
11004
11005 /**
11006 * Prepares an array of HSPs for linking within new_link_hsps.
11007 *
11008 * @param search Parameters for a blast search. [in]
11009 * @param kbp_array Array of Karlin-Altchul statitic parameters, on
11010 * block for each contex. [in]
11011 * @param hsp_array Array of HSPs. [in/out]
11012 * @pamam hspcnt Size of hsp_array. [in]
11013 */
11014 static void
new_link_hsps_setup(BlastSearchBlkPtr search,BLAST_KarlinBlkPtr * kbp_array,BLAST_HSPPtr PNTR hsp_array,Int4 hspcnt)11015 new_link_hsps_setup(BlastSearchBlkPtr search,
11016 BLAST_KarlinBlkPtr * kbp_array,
11017 BLAST_HSPPtr PNTR hsp_array, Int4 hspcnt)
11018 {
11019 Int4 index;
11020 BLAST_HSPPtr hsp;
11021 Nlm_FloatHi gap_decay_divisor =
11022 BlastGapDecayDivisor(search->pbp->gap_decay_rate, 1);
11023
11024 /* Find e-values for single HSPs */
11025 s_RoundDownOddScores(search->sbp, search->current_hitlist);
11026 BlastGetNonSumStatsEvalue(search);
11027
11028 for (index=0; index<hspcnt; index++) {
11029 hsp = hsp_array[index];
11030
11031 hsp->num = 1;
11032 hsp->linked_set = FALSE;
11033 hsp->start_of_chain = FALSE;
11034 hsp->next = NULL;
11035 hsp->prev = NULL;
11036
11037 hsp->ordering_method = 3;
11038 hsp->evalue /= gap_decay_divisor;
11039
11040 hsp->xsum = kbp_array[hsp->context]->Lambda * hsp->score -
11041 kbp_array[hsp->context]->logK;
11042 }
11043 }
11044
11045 /** Greedy algorithm to link HSPs with uneven gaps. Sorts HSPs by
11046 * score. Starting with the highest scoring HSP, finds an HSP that
11047 * produces the best sum e-value when added to the HSP set under
11048 * consideration. The neighboring HSPs in a set must have endpoints
11049 * within a window of each other on the protein axis, and within the
11050 * longest allowed intron length on the nucleotide axis. When no more
11051 * HSPs can be added to the highest scoring set, the next highest
11052 * scoring HSP is considered that is not yet part of any set.
11053 *
11054 * @param search Paramters for a blast search. [in]
11055 * @param hitlist A hitlist of HSPs to be linked. [in/out]
11056 * @param hsp_array An array of HSPs to be linked (redundantly contained
11057 * within hitlist.) [in/out]
11058 * @returns Status: 0 on success, -1 if bad input.
11059 */
11060 static Int2
new_link_hsps(BlastSearchBlkPtr search,BLAST_HitListPtr hitlist)11061 new_link_hsps(BlastSearchBlkPtr search, BLAST_HitListPtr hitlist)
11062 {
11063 BLAST_HSPPtr PNTR hsp_array; /* Original HSP array. */
11064 BLAST_HSPPtr PNTR score_hsp_array; /* an array of HSPs sorted by
11065 decreasing score */
11066 BLAST_HSPPtr PNTR offset_hsp_array; /* an array of HSPs sorted by
11067 increasing query offset */
11068 BLAST_HSPPtr head_hsp;
11069 BLAST_KarlinBlkPtr PNTR kbp_array;
11070
11071 Int4 hspcnt, index, index1;
11072 Int4 overlap_size; /* Maximal overlap size in query or
11073 subject */
11074 Int4 gap_size; /* Maximal gap size in query */
11075 Int4 longest_intron; /* Maximum gap size in subject */
11076
11077 Int4* qend_index_array = NULL;
11078
11079 /* Check input arguments. */
11080 if (!search || !hitlist)
11081 return -1;
11082
11083 hsp_array = hitlist->hsp_array;
11084
11085 if(search->pbp->gapped_calculation) {
11086 kbp_array = search->sbp->kbp_gap;
11087 } else {
11088 kbp_array = search->sbp->kbp;
11089 }
11090 hspcnt = hitlist->hspcnt;
11091
11092 /* Set up the HSP array to be an array of singleton sets, with
11093 * correct evalue, num and sumscore */
11094 new_link_hsps_setup(search, kbp_array, hsp_array, hspcnt);
11095
11096 /* If there is a single HSP, don't try to link, just use the
11097 * evalue set in new_link_hsps_setup */
11098 if(hitlist->hspcnt == 1)
11099 return 0;
11100
11101 overlap_size = LINK_HSP_OVERLAP;
11102 gap_size = search->pbp->gap_size;
11103 longest_intron = search->pbp->longest_intron;
11104
11105 if(search->prog_number == blast_type_blastx) {
11106 BLAST_HSPArraySwapSequences(hsp_array, hspcnt);
11107 }
11108
11109 /* Allocate, fill and sort the auxiliary arrays. */
11110 score_hsp_array = (BLAST_HSPPtr PNTR) Malloc(hspcnt*sizeof(BLAST_HSPPtr));
11111 MemCpy(score_hsp_array, hsp_array, hspcnt*sizeof(BLAST_HSPPtr));
11112 HeapSort(score_hsp_array, hspcnt, sizeof(BLAST_HSPPtr), xsum_compare_hsps);
11113
11114 offset_hsp_array = (BLAST_HSPPtr PNTR) Malloc(hspcnt*sizeof(BLAST_HSPPtr));
11115 MemCpy(offset_hsp_array, hsp_array, hspcnt*sizeof(BLAST_HSPPtr));
11116 HeapSort(offset_hsp_array, hspcnt, sizeof(BLAST_HSPPtr), fwd_compare_hsps);
11117
11118 BLAST_HSPArrayIndexQueryEnds(offset_hsp_array, hspcnt, &qend_index_array);
11119
11120 /* head_hsp is set to NULL whenever there is no current linked set
11121 that is being worked on. */
11122 head_hsp = NULL;
11123 for (index = 0; index < hspcnt && score_hsp_array[index]; ) {
11124 double best_evalue, best_sum_score = 0;
11125 BLAST_HSPPtr best_hsp = NULL;
11126 BLAST_HSPPtr tail_hsp = NULL;
11127 Int4 hsp_index_left, hsp_index_right;
11128 Int4 left_offset;
11129
11130 if (!head_hsp) {
11131 /* Find the highest scoring HSP that is not yet part of a
11132 linked set. An HSP is part of a linked set if and only
11133 if either prev or next pointer is not NULL. */
11134 while (index<hspcnt && score_hsp_array[index] &&
11135 (score_hsp_array[index]->next ||
11136 score_hsp_array[index]->prev))
11137 index++;
11138 if (index==hspcnt)
11139 break;
11140 head_hsp = score_hsp_array[index];
11141 }
11142 /* Find the last link in the current HSP set. */
11143 for (tail_hsp = head_hsp; tail_hsp->next; tail_hsp = tail_hsp->next);
11144
11145 best_evalue = head_hsp->evalue;
11146 best_sum_score = head_hsp->xsum;
11147 /* left_offset is the leftmost point where an HSP can end to be
11148 admissible for linking with head_hsp. */
11149 left_offset = head_hsp->query.offset - gap_size;
11150
11151 /* Find the smallest index in the offset array, for which an
11152 HSP can possibly be added to the set currently being
11153 explored. */
11154 hsp_index_left =
11155 BLAST_HSPOffsetEndBinarySearch(offset_hsp_array,
11156 hspcnt, qend_index_array,
11157 head_hsp->context, left_offset);
11158
11159 /* Find the largest index in the offset array, for which an HSP
11160 can be possibly added to the currently explored set. */
11161 hsp_index_right =
11162 BLAST_HSPOffsetBinarySearch(offset_hsp_array, hspcnt,
11163 tail_hsp->context,
11164 tail_hsp->query.end + gap_size);
11165
11166 for (index1 = hsp_index_left; index1 < hsp_index_right; ++index1) {
11167 BLAST_HSPPtr lhsp = offset_hsp_array[index1];
11168
11169 /* From each previously linked HSP set consider only one
11170 representative - the leftmost HSP whose query end is >=
11171 left_offset. */
11172 if (lhsp->prev && lhsp->prev->query.end >= left_offset)
11173 continue;
11174
11175 if (BLAST_HSPLinkedSetsAdmissible(head_hsp, lhsp,
11176 overlap_size, gap_size,
11177 longest_intron)) {
11178 double evalue, sum_score;
11179 /* Check if the e-value for the new combined HSP set is
11180 better than for the previously obtained set. */
11181 if ((evalue = SumHSPEvalue(search, head_hsp,
11182 lhsp, &sum_score)) <
11183 MIN(best_evalue, lhsp->evalue)) {
11184 best_hsp = lhsp;
11185 best_evalue = evalue;
11186 best_sum_score = sum_score;
11187 }
11188 }
11189 }
11190
11191 /* Link the new HSP to the set, if it qualified. */
11192 if (best_hsp) {
11193 head_hsp = BLAST_HSPCombineLinkedSets(head_hsp, best_hsp,
11194 best_sum_score, best_evalue);
11195 } else {
11196 head_hsp = NULL;
11197 ++index;
11198 }
11199 }
11200
11201 /* Free the auxiliary arrays. */
11202 MemFree(score_hsp_array);
11203 MemFree(offset_hsp_array);
11204 MemFree(qend_index_array);
11205
11206 if(search->prog_number == blast_type_blastx) {
11207 BLAST_HSPArraySwapSequences(hsp_array, hspcnt);
11208 }
11209
11210 /* Make sure that HSPs are sorted by individual score at exit. */
11211 HeapSort(hitlist->hsp_array, hitlist->hspcnt,
11212 sizeof(BLAST_HSPPtr), score_compare_hsps);
11213
11214 return 0;
11215 }
11216
11217
11218 /*
11219 This function orders and "links" the HSP's. It does this
11220 by first ordering them backwards (with "rev_compare_hsps") and
11221 then (as the function moves forwards through the list of HSP's)
11222 comparing them with the previous HSP's. They then end up
11223 in the "correct" order.
11224
11225 The HSP hp_start is used as a "hook" into the chain of HSP's.
11226 As HSP's are assigned to a set, they are removed from the linked
11227 list, and further consideration. hp_start always points to the first
11228 "real" HSP remaining.
11229
11230 Two attempts are made to order the HSP's:
11231 one has a maximum gap ("gap"), the other has no maximum.
11232
11233 This function works with the HSP's resulting from one query
11234 sequence and one subject sequence.
11235
11236 ******* Comments -cfj
11237 This function is, in the worst case, O(N^3) in the number of hsps, and often becomes
11238 the main bottleneck on large query strings.
11239 I've made a bunch of changes to try to speed up this function (by constant factors),
11240 while still producing the identical output.
11241
11242 ******* major changes -cfj
11243 - Use separate search (ie. separate frame loops) for each subject frame sign.
11244 - use lh_helper array to store most commonly used info. This helps by ordering
11245 accesses, and keeping them close.
11246 - For index=0, we can break out of loop when q_off gets large enough.
11247 - For index=1, we can break out of loop when remaining maxsum is small enough.
11248 - Keep track of which hsps are 'linked-to'. When we pull out the high scoring
11249 chain, if none of the removed hsp's are linked to by someone outside the change,
11250 then no other hsp's score will change, so we don't need to recompute scores, just
11251 find the new largest.
11252 - if max-path was unchanged, it is still the max, so don't recompute all scores
11253 - dynamically keep track of next hsp in chain with a larger score, this allows us to
11254 jump over low scores when walking down list
11255 - Whan computing the score for each hsp, if the linked_to (from last iter) has not changed,
11256 then this item is unchanged also.
11257 ********
11258 */
11259
11260 static BLAST_HSPPtr
link_hsps(BlastSearchBlkPtr search,BLAST_HitListPtr hitlist,BLAST_HSPPtr PNTR hsp_array)11261 link_hsps(BlastSearchBlkPtr search, BLAST_HitListPtr hitlist, BLAST_HSPPtr PNTR hsp_array)
11262 {
11263 BLAST_HSPPtr H, H2, best[2], first_hsp, last_hsp, hp_frame_start[3*2];
11264 BLAST_HSP hp_start;
11265 BLAST_KarlinBlkPtr PNTR kbp;
11266 BLAST_Score maxscore, cutoff[2];
11267 Boolean frame_change, linked_set, ignore_small_gaps;
11268 Nlm_FloatHi gap_decay_rate, gap_prob, prob[2];
11269 Int4 index, index1, ordering_method, num_links, frame_index, number_of_query_frames;
11270 Int4 hp_frame_number[3*2];
11271 Int4 start_range_size; /* Number of positions at which next HSP can
11272 start around the end point of the previous HSP. */
11273 Int4 subject_length, number_of_hsps, total_number_of_hsps;
11274 VoidPtr link;
11275 Int4 H2_index,H_index;
11276 Int4 i;
11277 Int4 max_q_diff=0;
11278 Int4 path_changed; /* will be set if an element is removed that may change an existing path */
11279 Int4 first_pass, use_current_max;
11280 LinkHelpStruct *lh_helper=0;
11281 Uint4 query_num=0; /* AM: to support query concatenation. */
11282 const Int4 overlap_size = LINK_HSP_OVERLAP; /* Maximal allowed overlap
11283 between to successive
11284 HSPs in a linked set. */
11285 Int4 trim_size = (overlap_size+1)/2; /* Distance by which HSPs are
11286 trimmed to remove the potential
11287 overlap. */
11288
11289 if (search == NULL || hitlist == NULL)
11290 return NULL;
11291
11292 /* enlarge helper array if necessary */
11293 if (hitlist->lh_helper_size <= (hitlist->hspcnt+5)){
11294 hitlist->lh_helper_size = MAX(1024,hitlist->hspcnt+5);
11295 MemFree(hitlist->lh_helper);
11296 hitlist->lh_helper = (LinkHelpStruct *) MemNew(sizeof(LinkHelpStruct)*hitlist->lh_helper_size);
11297 }
11298 lh_helper= hitlist->lh_helper;
11299
11300 if (search->pbp->gapped_calculation)
11301 {
11302 kbp = search->sbp->kbp_gap;
11303 }
11304 else
11305 {
11306 kbp = search->sbp->kbp;
11307 }
11308
11309 total_number_of_hsps = hitlist->hspcnt;
11310
11311 /* AM: Support for query concatenation */
11312 if( !search->mult_queries )
11313 subject_length = MAX((search->subject->length - search->length_adjustment), 1);
11314 else
11315 {
11316 query_num = GetQueryNum( search->mult_queries, hsp_array[0]->query.offset,
11317 hsp_array[0]->query.end, hsp_array[0]->query.frame );
11318 subject_length = MAX((search->subject->length
11319 - search->mult_queries->Adjustments[query_num]), 1);
11320 }
11321
11322 if (StringCmp(search->prog_name, "tblastn") == 0
11323 || StringCmp(search->prog_name, "tblastx") == 0
11324 ||StringCmp(search->prog_name, "psitblastn") == 0)
11325 {
11326 subject_length /= 3;
11327 }
11328 subject_length = MAX(subject_length, 1);
11329 number_of_hsps = total_number_of_hsps;
11330 start_range_size = search->pbp->gap_size + overlap_size + 1;
11331 gap_prob = search->pbp->gap_prob;
11332 gap_decay_rate = search->pbp->gap_decay_rate;
11333 /* Sort by (reverse) position. */
11334 HeapSort(hsp_array,total_number_of_hsps,sizeof(BLAST_HSPPtr), rev_compare_hsps_cfj);
11335
11336 cutoff[0] = search->pbp->cutoff_s_second;
11337 cutoff[1] = search->pbp->cutoff_big_gap;
11338 ignore_small_gaps = search->pbp->ignore_small_gaps;
11339
11340 if (StringICmp(search->prog_name, "blastn") == 0 || StringICmp(search->prog_name, "blastx") == 0 || StringICmp(search->prog_name, "tblastx") == 0)
11341 {
11342 number_of_query_frames = 2;
11343 }
11344 else
11345 {
11346 number_of_query_frames = 1;
11347 }
11348
11349 /* hook up the HSP's */
11350 hp_frame_start[0] = hsp_array[0];
11351 hp_frame_number[0] = hp_frame_number[1] = 0;
11352 hp_frame_number[2] = hp_frame_number[3] = 0;
11353 frame_change = FALSE;
11354
11355
11356 /* Put entries with different frame parity into separate 'query_frame's. -cfj */
11357 {
11358 Int4 cur_frame=0;
11359 for (index=0;index<number_of_hsps;index++)
11360 {
11361 H=hsp_array[index];
11362 hp_frame_number[cur_frame]++;
11363
11364 H->prev= index ? hsp_array[index-1] : NULL;
11365 H->next= index<(number_of_hsps-1) ? hsp_array[index+1] : NULL;
11366 if (H->prev != NULL &&
11367 ( (SIGN(H->query.frame) != SIGN(H->prev->query.frame))
11368 || (SIGN(H->subject.frame) != SIGN(H->prev->subject.frame))
11369
11370 ))
11371 { /* If frame switches, then start new list. */
11372 hp_frame_number[cur_frame]--;
11373 hp_frame_number[++cur_frame]++;
11374 hp_frame_start[cur_frame] = H;
11375 H->prev->next = NULL;
11376 H->prev = NULL;
11377 frame_change = TRUE;
11378 }
11379 }
11380 number_of_query_frames = cur_frame+1;
11381 }
11382
11383 /* max_q_diff is the maximum amount q.offset can differ from q.offset_trim */
11384 /* This is used to break out of H2 loop early */
11385 if (search->pbp->old_stats == FALSE)
11386 {
11387 for (index=0;index<number_of_hsps;index++)
11388 {
11389 H=hsp_array[index];
11390 H->query.offset_trim = H->query.offset +
11391 MIN(((H->query.length)/4), trim_size);
11392 H->query.end_trim = H->query.end -
11393 MIN(((H->query.length)/4), trim_size);
11394 H->subject.offset_trim = H->subject.offset +
11395 MIN(((H->subject.length)/4), trim_size);
11396 H->subject.end_trim = H->subject.end -
11397 MIN(((H->subject.length)/4), trim_size);
11398 }
11399 max_q_diff = trim_size;
11400 }
11401 else
11402 {
11403 for (index=0;index<number_of_hsps;index++)
11404 {
11405 H=hsp_array[index];
11406 H->query.offset_trim = H->query.offset + (H->query.length)/8;
11407 H->query.end_trim = H->query.end - (H->query.length)/8;
11408 H->subject.offset_trim = H->subject.offset + (H->subject.length)/8;
11409 H->subject.end_trim = H->subject.end - (H->subject.length)/8;
11410 max_q_diff=MAX(max_q_diff,(H->query.length/8));
11411 }
11412 }
11413
11414 for (frame_index=0; frame_index<number_of_query_frames; frame_index++)
11415 {
11416 MemFill(&hp_start, 0, sizeof(hp_start));
11417 hp_start.next = hp_frame_start[frame_index];
11418 hp_frame_start[frame_index]->prev = &hp_start;
11419 number_of_hsps = hp_frame_number[frame_index];
11420
11421 lh_helper[0].ptr = &hp_start;
11422 lh_helper[0].q_off_trim = 0;
11423 lh_helper[0].s_off_trim = 0;
11424 lh_helper[0].maxsum1 = -10000;
11425 lh_helper[0].next_larger = 0;
11426
11427 /* lh_helper[0] = empty = end marker that I added
11428 * lh_helper[1] = hsp_start = empty entry used in original code
11429 * lh_helper[2] = hsp_array->next = hsp_array[0]
11430 * lh_helper[i] = ... = hsp_array[i-2] (for i>=2)
11431 */
11432 first_pass=1; /* do full search */
11433 path_changed=1;
11434 for (H=hp_start.next; H!=NULL; H=H->next)
11435 H->hsp_link.changed=1;
11436
11437 while (number_of_hsps > 0)
11438 {
11439 Int4 last[3];
11440 Int4 max[3];
11441 last[0]=last[1]=last[2]=0;
11442 max[0]=max[1]=max[2]=-10000;
11443 /* Initialize the 'best' parameter */
11444 best[0] = best[1] = NULL;
11445
11446
11447 /* See if we can avoid recomputing all scores:
11448 * - Find the max paths (based on old scores).
11449 * - If no paths were changed by removal of nodes (ie research==0)
11450 * then these max paths are still the best.
11451 * - else if these max paths were unchanged, then they are still the best.
11452 */
11453 use_current_max=0;
11454 if (!first_pass){
11455 Int4 max0,max1;
11456 /* Find the current max sums */
11457 if(!ignore_small_gaps){
11458 max0 = -cutoff[0];
11459 max1 = -cutoff[1];
11460 for (H=hp_start.next; H!=NULL; H=H->next) {
11461 Int4 sum0=H->hsp_link.sum[0];
11462 Int4 sum1=H->hsp_link.sum[1];
11463 if(sum0>=max0)
11464 {
11465 max0=sum0;
11466 best[0]=H;
11467 }
11468 if(sum1>=max1)
11469 {
11470 max1=sum1;
11471 best[1]=H;
11472 }
11473 }
11474 }else{
11475 maxscore = -cutoff[1];
11476 for (H=hp_start.next; H!=NULL; H=H->next) {
11477 Int4 sum=H->hsp_link.sum[1];
11478 if(sum>=maxscore)
11479 {
11480 maxscore=sum;
11481 best[1]=H;
11482 }
11483 }
11484 }
11485 if(path_changed==0){
11486 /* No path was changed, use these max sums. */
11487 use_current_max=1;
11488 }
11489 else{
11490 /* If max path hasn't chaged, we can use it */
11491 /* Walk down best, give up if we find a removed item in path */
11492 use_current_max=1;
11493 if(!ignore_small_gaps){
11494 for (H=best[0]; H!=NULL; H=H->hsp_link.link[0])
11495 if (H->linked_to==-1000) {use_current_max=0; break;}
11496 }
11497 if(use_current_max)
11498 for (H=best[1]; H!=NULL; H=H->hsp_link.link[1])
11499 if (H->linked_to==-1000) {use_current_max=0; break;}
11500
11501 }
11502 }
11503
11504 /* reset helper_info */
11505 /* Inside this while loop, the linked list order never changes
11506 * So here we initialize an array of commonly used info,
11507 * and in this loop we access these arrays instead of the actual list
11508 */
11509 if(!use_current_max){
11510 for (H=&hp_start,H_index=1; H!=NULL; H=H->next,H_index++) {
11511 Int4 s_frame = H->subject.frame;
11512 Int4 s_off_t = H->subject.offset_trim;
11513 Int4 q_off_t = H->query.offset_trim;
11514 lh_helper[H_index].ptr = H;
11515 lh_helper[H_index].q_off_trim = q_off_t;
11516 lh_helper[H_index].s_off_trim = s_off_t;
11517 for(i=0;i<BLAST_NUMBER_OF_ORDERING_METHODS;i++)
11518 lh_helper[H_index].sum[i] = H->hsp_link.sum[i];
11519 /* lh_helper[H_index].s_frame = SIGN(s_frame);
11520 * lh_helper[H_index].prev_same = last[SIGN(s_frame)+1];
11521 * last[SIGN(s_frame)+1]=H_index;
11522 */
11523 max[SIGN(s_frame)+1]=MAX(max[SIGN(s_frame)+1],H->hsp_link.sum[1]);
11524 lh_helper[H_index].maxsum1 =max[SIGN(s_frame)+1];
11525
11526 /* set next_larger to link back to closest entry with a sum1 larger than this */
11527 {
11528 Int4 cur_sum=lh_helper[H_index].sum[1];
11529 Int4 prev = H_index-1;
11530 Int4 prev_sum = lh_helper[prev].sum[1];
11531 while((cur_sum>=prev_sum) && (prev>0)){
11532 prev=lh_helper[prev].next_larger;
11533 prev_sum = lh_helper[prev].sum[1];
11534 }
11535 lh_helper[H_index].next_larger = prev;
11536 }
11537 H->linked_to = 0;
11538 }
11539
11540 lh_helper[1].maxsum1 = -10000;
11541
11542
11543 /****** loop iter for index = 0 **************************/
11544 if(!ignore_small_gaps)
11545 {
11546 index=0;
11547 maxscore = -cutoff[index];
11548 H_index = 2;
11549 for (H=hp_start.next; H!=NULL; H=H->next,H_index++)
11550 {
11551 Int4 H_hsp_num=0;
11552 Int4 H_hsp_sum=0;
11553 Nlm_FloatHi H_hsp_xsum=0.0;
11554 VoidPtr H_hsp_link=NULL;
11555 if (H->score > cutoff[index]) {
11556 Int4 H_query_etrim = H->query.end_trim;
11557 Int4 H_sub_etrim = H->subject.end_trim;
11558 Int4 H_q_et_gap = H_query_etrim+start_range_size;
11559 Int4 H_s_et_gap = H_sub_etrim+start_range_size;
11560
11561 /* We only walk down hits with the same frame sign */
11562 /* for (H2=H->prev; H2!=NULL; H2=H2->prev,H2_index--) */
11563 for (H2_index=H_index-1; H2_index>1; H2_index=H2_index-1)
11564 {
11565 Int4 b1,b2,b4,b5;
11566 Int4 q_off_t,s_off_t,sum;
11567
11568 /* s_frame = lh_helper[H2_index].s_frame; */
11569 q_off_t = lh_helper[H2_index].q_off_trim;
11570 s_off_t = lh_helper[H2_index].s_off_trim;
11571
11572 /* combine tests to reduce mispredicts -cfj */
11573 b1 = q_off_t <= H_query_etrim;
11574 b2 = s_off_t <= H_sub_etrim;
11575 /* b3 = s_frame - H_sign_sframe; */
11576 sum = lh_helper[H2_index].sum[index];
11577
11578
11579 b4 = ( q_off_t > H_q_et_gap ) ;
11580 b5 = ( s_off_t > H_s_et_gap ) ;
11581
11582 /* list is sorted by q_off, so q_off should only increase.
11583 * q_off_t can only differ from q_off by max_q_diff
11584 * So once q_off_t is large enough (ie it exceeds limit
11585 * by max_q_diff), we can stop. -cfj
11586 */
11587 if(q_off_t > (H_q_et_gap+max_q_diff))
11588 break;
11589
11590 if (b1|b2|b5|b4) continue;
11591
11592
11593 if (sum>H_hsp_sum)
11594 {
11595 H2=lh_helper[H2_index].ptr;
11596 H_hsp_num=H2->hsp_link.num[index];
11597 H_hsp_sum=H2->hsp_link.sum[index];
11598 H_hsp_xsum=H2->hsp_link.xsum[index];
11599 H_hsp_link=H2;
11600
11601 }
11602 } /* end for H2... */
11603 }
11604 {
11605 BLAST_Score score=H->score;
11606 Nlm_FloatHi new_xsum =
11607 H_hsp_xsum +
11608 (score*(kbp[H->context]->Lambda)) - kbp[H->context]->logK;
11609 Int4 new_sum = H_hsp_sum + (score - cutoff[index]);
11610
11611 H->hsp_link.sum[index] = new_sum;
11612 H->hsp_link.num[index] = H_hsp_num+1;
11613 H->hsp_link.link[index] = H_hsp_link;
11614 lh_helper[H_index].sum[index] = new_sum;
11615 if (new_sum >= maxscore)
11616 {
11617 maxscore=new_sum;
11618 best[index]=H;
11619 }
11620 H->hsp_link.xsum[index] = new_xsum;
11621 if(H_hsp_link)
11622 ((BLAST_HSPPtr)H_hsp_link)->linked_to++;
11623 }
11624 } /* end for H=... */
11625 }
11626 /****** loop iter for index = 1 **************************/
11627 index=1;
11628 maxscore = -cutoff[index];
11629 H_index = 2;
11630 for (H=hp_start.next; H!=NULL; H=H->next,H_index++)
11631 {
11632 Int4 H_hsp_num=0;
11633 Int4 H_hsp_sum=0;
11634 Nlm_FloatHi H_hsp_xsum=0.0;
11635 VoidPtr H_hsp_link=NULL;
11636
11637 H->hsp_link.changed=1;
11638 H2 = H->hsp_link.link[index];
11639 if ( (!first_pass) &&
11640 ( (H2==0) || (H2->hsp_link.changed==0) )
11641 )
11642 {
11643 /* If The best choice last time has not been changed, then it is still the
11644 best choice, so no need to walk down list. */
11645 if(H2){
11646 H_hsp_num=H2->hsp_link.num[index];
11647 H_hsp_sum=H2->hsp_link.sum[index];
11648 H_hsp_xsum=H2->hsp_link.xsum[index];
11649 }
11650 H_hsp_link=H2;
11651 H->hsp_link.changed=0;
11652 } else
11653 if (H->score > cutoff[index]) {
11654 Int4 H_query_etrim = H->query.end_trim;
11655 Int4 H_sub_etrim = H->subject.end_trim;
11656
11657 /* Here we look at what was the best choice last time (if it's still around)
11658 * and set this to the initial choice. By setting the best score to
11659 * a (potentially) large value initially, we can reduce the number of
11660 * hsps checked. -cfj
11661 */
11662
11663 /* Currently we set the best score to a value just less than the real value. This
11664 * is not really necessary, but doing this ensures that in the case of a tie, we
11665 * make the same selection the original code did.
11666 */
11667
11668 #if 1
11669 if(!first_pass&&H2&&H2->linked_to>=0){
11670 if(1){
11671 /* We set this to less than the real value to keep the original ordering
11672 * in case of ties. */
11673 H_hsp_sum=H2->hsp_link.sum[index]-1;
11674 }else{
11675 H_hsp_num=H2->hsp_link.num[index];
11676 H_hsp_sum=H2->hsp_link.sum[index];
11677 H_hsp_xsum=H2->hsp_link.xsum[index];
11678 H_hsp_link=H2;
11679 }
11680 }
11681 #endif
11682
11683 /* We now only walk down hits with the same frame sign */
11684 /* for (H2=H->prev; H2!=NULL; H2=H2->prev,H2_index--) */
11685 for (H2_index=H_index-1; H2_index>1;)
11686 {
11687 Int4 b0,b1,b2;
11688 Int4 q_off_t,s_off_t,sum,next_larger;
11689 LinkHelpStruct * H2_helper=&lh_helper[H2_index];
11690 sum = H2_helper->sum[index];
11691 next_larger = H2_helper->next_larger;
11692
11693 s_off_t = H2_helper->s_off_trim;
11694 q_off_t = H2_helper->q_off_trim;
11695
11696 b0 = sum <= H_hsp_sum;
11697
11698 /* Compute the next H2_index */
11699 H2_index--;
11700 if(b0){ /* If this sum is too small to beat H_hsp_sum, advance to a larger sum */
11701 H2_index=next_larger;
11702 }
11703
11704 /* combine tests to reduce mispredicts -cfj */
11705 b1 = q_off_t <= H_query_etrim;
11706 b2 = s_off_t <= H_sub_etrim;
11707
11708 if(0) if(H2_helper->maxsum1<=H_hsp_sum)break;
11709
11710 if (!(b0|b1|b2) )
11711 {
11712 H2 = H2_helper->ptr;
11713
11714 H_hsp_num=H2->hsp_link.num[index];
11715 H_hsp_sum=H2->hsp_link.sum[index];
11716 H_hsp_xsum=H2->hsp_link.xsum[index];
11717 H_hsp_link=H2;
11718 }
11719
11720 } /* end for H2_index... */
11721 } /* end if(H->score>cuttof[]) */
11722 {
11723 BLAST_Score score=H->score;
11724 Nlm_FloatHi new_xsum =
11725 H_hsp_xsum +
11726 (score*(kbp[H->context]->Lambda)) - kbp[H->context]->logK;
11727 Int4 new_sum = H_hsp_sum + (score - cutoff[index]);
11728
11729 H->hsp_link.sum[index] = new_sum;
11730 H->hsp_link.num[index] = H_hsp_num+1;
11731 H->hsp_link.link[index] = H_hsp_link;
11732 lh_helper[H_index].sum[index] = new_sum;
11733 lh_helper[H_index].maxsum1 = MAX(lh_helper[H_index-1].maxsum1, new_sum);
11734 /* Update this entry's 'next_larger' field */
11735 {
11736 Int4 cur_sum=lh_helper[H_index].sum[1];
11737 Int4 prev = H_index-1;
11738 Int4 prev_sum = lh_helper[prev].sum[1];
11739 while((cur_sum>=prev_sum) && (prev>0)){
11740 prev=lh_helper[prev].next_larger;
11741 prev_sum = lh_helper[prev].sum[1];
11742 }
11743 lh_helper[H_index].next_larger = prev;
11744 }
11745
11746 if (new_sum >= maxscore)
11747 {
11748 maxscore=new_sum;
11749 best[index]=H;
11750 }
11751 H->hsp_link.xsum[index] = new_xsum;
11752 if(H_hsp_link)
11753 ((BLAST_HSPPtr)H_hsp_link)->linked_to++;
11754 }
11755 }
11756 path_changed=0;
11757 first_pass=0;
11758 }
11759
11760 if (search->pbp->old_stats == FALSE && search->pbp->use_large_gaps == FALSE)
11761 {
11762 if (!ignore_small_gaps)
11763 {
11764 /* Select the best ordering method.
11765 First we add back in the value cutoff[index] * the number
11766 of links, as this was subtracted out for purposes of the
11767 comparison above. */
11768 best[0]->hsp_link.sum[0] += (best[0]->hsp_link.num[0])*cutoff[0];
11769
11770 /* AM: Support for query concatenation. */
11771 if( best[0]->hsp_link.num[0] > 1 && gap_prob == 0 ) {
11772 prob[0] = INT4_MAX;
11773 } else {
11774 if( !search->mult_queries )
11775 prob[0] =
11776 BlastSmallGapSumE(start_range_size,
11777 best[0]->hsp_link.num[0],
11778 best[0]->hsp_link.xsum[0],
11779 search->context[search->first_context].
11780 query->effective_length,
11781 subject_length,
11782 search->dblen_eff,
11783 BlastGapDecayDivisor(gap_decay_rate,
11784 best[0]->
11785 hsp_link.num[0]));
11786 else
11787 prob[0] =
11788 BlastSmallGapSumE( start_range_size,
11789 best[0]->hsp_link.num[0],
11790 best[0]->hsp_link.xsum[0],
11791 search->mult_queries->
11792 EffLengths[query_num],
11793 subject_length,
11794 search->mult_queries->
11795 DbLenEff[query_num],
11796 BlastGapDecayDivisor(gap_decay_rate,
11797 best[0]->
11798 hsp_link.num[0]));
11799 if( best[0]->hsp_link.num[0] > 1 ) {
11800 prob[0] /= gap_prob;
11801 if( prob[0] > INT4_MAX ) prob[0] = INT4_MAX;
11802 }
11803 }
11804
11805 best[1]->hsp_link.sum[1] += (best[1]->hsp_link.num[1])*cutoff[1];
11806
11807 /* AM: Support for query concatenation. */
11808 if( 1 - gap_prob == 0.0 && best[1]->hsp_link.num[1] > 1 ) {
11809 prob[1] = INT4_MAX;
11810 } else{
11811 if( !search->mult_queries )
11812 prob[1] =
11813 BlastLargeGapSumE(best[1]->hsp_link.num[1],
11814 best[1]->hsp_link.xsum[1],
11815 search->context[search->first_context].
11816 query->effective_length,
11817 subject_length,
11818 search->dblen_eff,
11819 BlastGapDecayDivisor(gap_decay_rate,
11820 best[1]->
11821 hsp_link.num[1]));
11822 else
11823 prob[1] =
11824 BlastLargeGapSumE( best[1]->hsp_link.num[1],
11825 best[1]->hsp_link.xsum[1],
11826 search->mult_queries->
11827 EffLengths[query_num],
11828 subject_length,
11829 search->mult_queries->
11830 DbLenEff[query_num],
11831 BlastGapDecayDivisor(gap_decay_rate,
11832 best[1]->
11833 hsp_link.num[1]));
11834
11835 if( best[1]->hsp_link.num[1] > 1 ) {
11836 prob[1] /= 1 - gap_prob;
11837 if( prob[1] > INT4_MAX ) prob[1] = INT4_MAX;
11838 }
11839 }
11840 ordering_method = prob[0]<=prob[1] ? 0:1;
11841 }
11842 else
11843 {
11844 /* We only consider the case of big gaps. */
11845 best[1]->hsp_link.sum[1] += (best[1]->hsp_link.num[1])*cutoff[1];
11846 /* gap_prob=0 here as small gaps are NOT considered. */
11847
11848 /* AM: Support for query concatenation. */
11849 if( !search->mult_queries )
11850 prob[1] =
11851 BlastLargeGapSumE(best[1]->hsp_link.num[1],
11852 best[1]->hsp_link.xsum[1],
11853 search->context[search->first_context].
11854 query->effective_length,
11855 subject_length,
11856 search->dblen_eff,
11857 BlastGapDecayDivisor(gap_decay_rate,
11858 best[1]->
11859 hsp_link.num[1]));
11860 else
11861 prob[1] =
11862 BlastLargeGapSumE( best[1]->hsp_link.num[1],
11863 best[1]->hsp_link.xsum[1],
11864 search->mult_queries->EffLengths[query_num],
11865 subject_length,
11866 search->mult_queries->
11867 DbLenEff[query_num],
11868 BlastGapDecayDivisor(gap_decay_rate,
11869 best[1]->
11870 hsp_link.num[1]));
11871 ordering_method = 1;
11872 }
11873 }
11874 else
11875 {
11876 /* We only consider the case of big gaps. */
11877 best[1]->hsp_link.sum[1] += (best[1]->hsp_link.num[1])*cutoff[1];
11878 /* gap_prob=0 here as small gaps are NOT considered. */
11879
11880 /* AM: Support for query concatenation. */
11881 if( !search->mult_queries )
11882 prob[1] =
11883 BlastLargeGapSumE( best[1]->hsp_link.num[1],
11884 best[1]->hsp_link.xsum[1],
11885 search->context[search->first_context].
11886 query->effective_length,
11887 subject_length,
11888 search->dblen_eff,
11889 BlastGapDecayDivisor(gap_decay_rate,
11890 best[1]->
11891 hsp_link.num[1]));
11892 else
11893 prob[1] =
11894 BlastLargeGapSumE( best[1]->hsp_link.num[1],
11895 best[1]->hsp_link.xsum[1],
11896 search->mult_queries->EffLengths[query_num],
11897 subject_length,
11898 search->mult_queries->
11899 DbLenEff[query_num],
11900 BlastGapDecayDivisor(gap_decay_rate,
11901 best[1]->
11902 hsp_link.num[1]));
11903
11904 ordering_method = 1;
11905 }
11906
11907 best[ordering_method]->start_of_chain = TRUE;
11908 best[ordering_method]->evalue = prob[ordering_method];
11909
11910 /* remove the links that have been ordered already. */
11911 if (best[ordering_method]->hsp_link.link[ordering_method])
11912 {
11913 linked_set = TRUE;
11914 }
11915 else
11916 {
11917 linked_set = FALSE;
11918 }
11919 if (best[ordering_method]->linked_to>0) path_changed=1;
11920 for (H=best[ordering_method]; H!=NULL;
11921 H=H->hsp_link.link[ordering_method])
11922 {
11923 if (H->linked_to>1) path_changed=1;
11924 H->linked_to=-1000;
11925 H->hsp_link.changed=1;
11926 /* record whether this is part of a linked set. */
11927 H->linked_set = linked_set;
11928 if (ordering_method == 0)
11929 H->ordering_method = BLAST_SMALL_GAPS;
11930 else
11931 H->ordering_method = BLAST_LARGE_GAPS;
11932 H->evalue = prob[ordering_method];
11933 if (H->next)
11934 (H->next)->prev=H->prev;
11935 if (H->prev)
11936 (H->prev)->next=H->next;
11937 number_of_hsps--;
11938 }
11939
11940 } /* end while num_hsps... */
11941 } /* end for frame_index ... */
11942
11943
11944 HeapSort(hsp_array,total_number_of_hsps,sizeof(BLAST_HSPPtr), rev_compare_hsps);
11945 /* Sort by starting position. */
11946
11947 HeapSort(hsp_array, total_number_of_hsps,sizeof(BLAST_HSPPtr), fwd_compare_hsps);
11948
11949 for (index=0, last_hsp=NULL;index<total_number_of_hsps; index++)
11950 {
11951 H = hsp_array[index];
11952 H->prev = NULL;
11953 H->next = NULL;
11954 }
11955
11956 /* hook up the HSP's. */
11957 first_hsp = NULL;
11958 for (index=0, last_hsp=NULL;index<total_number_of_hsps; index++)
11959 {
11960 H = hsp_array[index];
11961
11962 /* If this is not a single piece or the start of a chain, then Skip it. */
11963 if (H->linked_set == TRUE && H->start_of_chain == FALSE)
11964 continue;
11965
11966 /* If the HSP has no "link" connect the "next", otherwise follow the "link"
11967 chain down, connecting them with "next" and "prev". */
11968 if (last_hsp == NULL)
11969 first_hsp = H;
11970 H->prev = last_hsp;
11971 ordering_method = H->ordering_method;
11972 if (H->hsp_link.link[ordering_method] == NULL)
11973 {
11974 /* Grab the next HSP that is not part of a chain or the start of a chain */
11975 /* The "next" pointers are not hooked up yet in HSP's further down array. */
11976 index1=index;
11977 H2 = index1<(total_number_of_hsps-1) ? hsp_array[index1+1] : NULL;
11978 while (H2 && H2->linked_set == TRUE &&
11979 H2->start_of_chain == FALSE)
11980 {
11981 index1++;
11982 H2 = index1<(total_number_of_hsps-1) ? hsp_array[index1+1] : NULL;
11983 }
11984 H->next= H2;
11985 }
11986 else
11987 {
11988 /* The first one has the number of links correct. */
11989 num_links = H->hsp_link.num[ordering_method];
11990 link = H->hsp_link.link[ordering_method];
11991 while (link)
11992 {
11993 H->num = num_links;
11994 H->xsum = H->hsp_link.xsum[ordering_method];
11995 H->next = (BLAST_HSPPtr) link;
11996 H->prev = last_hsp;
11997 last_hsp = H;
11998 H = H->next;
11999 if (H != NULL)
12000 link = H->hsp_link.link[ordering_method];
12001 else
12002 break;
12003 }
12004 /* Set these for last link in chain. */
12005 H->num = num_links;
12006 H->xsum = H->hsp_link.xsum[ordering_method];
12007 /* Grab the next HSP that is not part of a chain or the start of a chain */
12008 index1=index;
12009 H2 = index1<(total_number_of_hsps-1) ? hsp_array[index1+1] : NULL;
12010 while (H2 && H2->linked_set == TRUE &&
12011 H2->start_of_chain == FALSE)
12012 {
12013 index1++;
12014 H2 = index1<(total_number_of_hsps-1) ? hsp_array[index1+1] : NULL;
12015 }
12016 H->next= H2;
12017 H->prev = last_hsp;
12018 }
12019 last_hsp = H;
12020 }
12021
12022 return first_hsp;
12023 }
12024
12025
12026 /*
12027 Checks Hitlist's for an HSP (or set of HSP's) with the
12028 minimum e-value. Discards those that do not meet the
12029 standard.
12030 */
12031
12032 Int2 LIBCALL
BlastReapHitlistByEvalue(BlastSearchBlkPtr search)12033 BlastReapHitlistByEvalue (BlastSearchBlkPtr search)
12034
12035 {
12036 BLAST_HitListPtr hitlist;
12037 BLAST_HSPPtr hsp;
12038 BLAST_HSPPtr PNTR hsp_array;
12039 Boolean hsp_deleted=FALSE;
12040 Int4 hsp_cnt=0;
12041 Int4 index;
12042 Nlm_FloatHi cutoff;
12043
12044 if (search == NULL)
12045 return 1;
12046
12047 cutoff = search->pbp->cutoff_e;
12048
12049 /* AM: Support for query concatenation. */
12050 if( !search->mult_queries
12051 || search->prog_number != blast_type_tblastn
12052 || !search->mult_queries->use_mq )
12053 hitlist = search->current_hitlist;
12054 else
12055 hitlist = search->mult_queries->HitListArray[
12056 search->mult_queries->current_query];
12057
12058 if (hitlist)
12059 {
12060 hitlist->hspcnt_max = hitlist->hspcnt;
12061 hsp_array = hitlist->hsp_array;
12062 for (index=0; index<hitlist->hspcnt; index++)
12063 {
12064 hsp = hsp_array[index];
12065 if (hsp->evalue > cutoff)
12066 {
12067 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
12068 hsp_deleted = TRUE;
12069 }
12070 else
12071 {
12072 /*hsp->pvalue = BlastKarlinEtoP(hsp->evalue);*/
12073 hsp_cnt++;
12074 }
12075 }
12076 if (hsp_deleted == TRUE)
12077 {
12078 HspArrayPurge(hitlist->hsp_array, hitlist->hspcnt, FALSE);
12079 }
12080
12081 hitlist->hspcnt = hsp_cnt;
12082 hitlist->hspcnt_max = hitlist->hspcnt;
12083 if (hitlist->hspcnt == 0)
12084 {
12085 BlastHitListPurge(hitlist);
12086 }
12087 else
12088 {
12089 NlmMutexLockEx(&search->thr_info->callback_mutex);
12090 search->thr_info->number_of_pos_hits++;
12091 NlmMutexUnlock(search->thr_info->callback_mutex);
12092 search->number_of_seqs_better_E++;
12093 }
12094 }
12095
12096 /* AM: Support for query concatenation. */
12097 if( !search->mult_queries
12098 || search->prog_number != blast_type_tblastn
12099 || !search->mult_queries->use_mq )
12100 search->current_hitlist = hitlist;
12101 else
12102 search->mult_queries->HitListArray[
12103 search->mult_queries->current_query] = hitlist;
12104
12105 return 0;
12106 }
12107
12108 /*
12109 Checks Hitlist's for an HSP (or set of HSP's) with the
12110 minimum e-value. Discards those that do not meet the
12111 standard.
12112 */
12113
12114 Int2 LIBCALL
BlastGetNonSumStatsEvalue(BlastSearchBlkPtr search)12115 BlastGetNonSumStatsEvalue (BlastSearchBlkPtr search)
12116 {
12117 BLAST_HitListPtr hitlist;
12118 BLAST_HSPPtr hsp;
12119 BLAST_HSPPtr PNTR hsp_array;
12120 BLAST_KarlinBlkPtr PNTR kbp;
12121 Int4 hsp_cnt;
12122 Int4 index;
12123 /* AM: Added to support query concatencation. */
12124 Int4 query_num;
12125
12126 if (search == NULL)
12127 return 1;
12128
12129 if (search->pbp->gapped_calculation)
12130 {
12131 kbp = search->sbp->kbp_gap;
12132 }
12133 else
12134 {
12135 kbp = search->sbp->kbp;
12136 }
12137
12138 hitlist = search->current_hitlist;
12139 if (hitlist)
12140 {
12141 hsp_cnt = hitlist->hspcnt;
12142 hsp_array = hitlist->hsp_array;
12143 for (index=0; index<hsp_cnt; index++)
12144 {
12145 hsp = hsp_array[index];
12146 if (!search->pbp->mb_params)
12147 {
12148 /* AM: changed to support query concatenation. */
12149 if( !search->mult_queries )
12150 hsp->evalue = BlastKarlinStoE_simple(hsp->score,
12151 kbp[hsp->context],
12152 search->searchsp_eff);
12153 else
12154 {
12155 query_num = GetQueryNum( search->mult_queries,
12156 hsp->query.offset,
12157 hsp->query.end,
12158 hsp->query.frame );
12159 hsp->evalue = BlastKarlinStoE_simple( hsp->score,
12160 kbp[hsp->context],
12161 search->mult_queries->SearchSpEff[query_num] );
12162 }
12163 }
12164 else {
12165 FloatHi searchsp_eff;
12166 hsp->context = BinarySearchInt4(hsp->query.offset,
12167 search->query_context_offsets, (Int4) (search->last_context+1));
12168 if (kbp[hsp->context]) {
12169 searchsp_eff = (FloatHi) search->dblen_eff *
12170 (FloatHi) search->context[hsp->context].query->effective_length;
12171 hsp->evalue = BlastKarlinStoE_simple(hsp->score,
12172 kbp[hsp->context], searchsp_eff);
12173 }
12174 }
12175 }
12176 }
12177 return 0;
12178 }
12179
12180 Int2 LIBCALL
BlastTimeFillStructure(BlastTimeKeeperPtr btkp)12181 BlastTimeFillStructure(BlastTimeKeeperPtr btkp)
12182
12183 {
12184 CPUTimePtr pTime;
12185
12186 if (btkp == NULL)
12187 return 1;
12188
12189 pTime = CPUTimeMeasure();
12190 if (pTime == NULL)
12191 return 1;
12192
12193 btkp->user = (Nlm_FloatLo) CPUTimeGetUser(pTime);
12194 btkp->system = (Nlm_FloatLo) CPUTimeGetSys(pTime);
12195 btkp->total = btkp->user + btkp->system;
12196
12197 CPUTimeFree(pTime);
12198
12199 return 0;
12200 }
12201
12202 /*
12203 starts the awake thread using static variables in this file.
12204 */
12205
12206 void
BlastStartAwakeThread(BlastThrInfoPtr thr_info)12207 BlastStartAwakeThread(BlastThrInfoPtr thr_info)
12208 {
12209 VoidPtr status=NULL;
12210
12211 if (!thr_info->tick_callback)
12212 return;
12213 /* If awake_thr is running from the last search, then wait for the join. */
12214 /* This pointer is NULL on the first search ever. */
12215 if (thr_info->awake_thr) {
12216 NlmThreadJoin(thr_info->awake_thr, &status);
12217 thr_info->awake_thr = NULL;
12218 }
12219
12220 if (NlmThreadsAvailable()) {
12221 thr_info->awake = TRUE;
12222 /* last tick is used by 'star_proc' */
12223 thr_info->awake_thr =
12224 NlmThreadCreate(star_proc, thr_info);
12225 }
12226
12227 return;
12228 }
12229
12230 /* Change the awake flag. This thread will die in one second. */
12231 void
BlastStopAwakeThread(BlastThrInfoPtr thr_info)12232 BlastStopAwakeThread(BlastThrInfoPtr thr_info)
12233 {
12234 thr_info->awake = FALSE;
12235
12236 }
12237