1 static char const rcsid[] = "$Id: blastpgp.c,v 6.140 2008/03/31 13:35:18 madden Exp $";
2
3 /* $Id: blastpgp.c,v 6.140 2008/03/31 13:35:18 madden Exp $ */
4 /**************************************************************************
5 * *
6 * COPYRIGHT NOTICE *
7 * *
8 * This software/database is categorized as "United States Government *
9 * Work" under the terms of the United States Copyright Act. It was *
10 * produced as part of the author's official duties as a Government *
11 * employee and thus can not be copyrighted. This software/database is *
12 * freely available to the public for use without a copyright notice. *
13 * Restrictions can not be placed on its present or future use. *
14 * *
15 * Although all reasonable efforts have been taken to ensure the accuracy *
16 * and reliability of the software and data, the National Library of *
17 * Medicine (NLM) and the U.S. Government do not and can not warrant the *
18 * performance or results that may be obtained by using this software, *
19 * data, or derivative works thereof. The NLM and the U.S. Government *
20 * disclaim any and all warranties, expressed or implied, as to the *
21 * performance, merchantability or fitness for any particular purpose or *
22 * use. *
23 * *
24 * In any work or product derived from this material, proper attribution *
25 * of the author(s) as the source of the software or data would be *
26 * appreciated. *
27 * *
28 **************************************************************************
29 * $Revision: 6.140 $
30 * $Log: blastpgp.c,v $
31 * Revision 6.140 2008/03/31 13:35:18 madden
32 * Change semantics of -c option, so that a new method for effective observations is used always and a new entropy-based method for column-specific PSI-BLAST pseudocounts is used by default. If default is used (-c 0), then all constants are defined in posit.c; if only the new method of effective observations is used, then the value of -c should be set by the user at approximately 30. (Changes
33 * submitted by Alejandro Schaffer).
34 *
35 * Revision 6.139 2008/01/02 20:16:11 madden
36 * XML output respects -v and -b option, JIRA SB-30
37 *
38 * Revision 6.138 2008/01/02 14:02:06 madden
39 * Make composition-based score adjustments the default for blastp and tblastn
40 *
41 * Revision 6.137 2007/03/14 17:55:27 madden
42 * - #include string.h to get a prototype for strcasecmp In
43 * - In tick_callback, suppress unused parameter warnings in tick_callback
44 * by casting to void.
45 * - In PGPFormatMainOutput, initialize local variables pruneSeed and
46 * prune to NULL.
47 * - In PGPReadBlastOptions, print a warning if a PSI-BLAST output
48 * checkpoint filename is specified, but does not have a file
49 * extension appropriate for its type.
50 * [from Mike Gertz]
51 *
52 * Revision 6.136 2006/07/17 17:32:54 coulouri
53 * from mike gertz: fix a memory leak in PGPFormatFooter
54 *
55 * Revision 6.135 2006/05/03 14:40:49 madden
56 * Changed usage of -t flag to also include specification of whether
57 * unified p-values should be used to calculate the significance of
58 * alignments when composition-based statistics is used. (from Mike Gertz)
59 *
60 * Revision 6.134 2006/01/24 18:33:51 papadopo
61 * from Mike Gertz: Use enumerated values, rather than #define'd constants, to specify the composition adjustment method
62 *
63 * Revision 6.133 2005/08/31 20:34:02 coulouri
64 * In PGPReadBlastOptions:
65 * - set the value of options->kappa_expect_value.
66 * - changed the wording of the warnings that
67 * composition-based statistics mode > 1 are experimental.
68 * - set mode for composition-based statistics to 1 if restarting
69 * from a checkpoint.
70 * In Main:
71 * - Remove outdated code for setting options->hitlist_size and
72 * options->expect_value
73 * - Set the value of kappa_expect_value for composition-based
74 * statistics, modes > 1.
75 * - Correctly set the evalue for preliminary alignments when more
76 * than one query is in the file.
77 *
78 * Revision 6.132 2005/07/28 16:16:46 coulouri
79 * remove dead code
80 *
81 * Revision 6.131 2005/06/30 15:10:03 coulouri
82 * remove -g option, use enum for argument handling
83 *
84 * Revision 6.130 2005/05/18 17:35:49 papadopo
85 * add warnings if new composition-based statistics options are selected
86 *
87 * Revision 6.129 2005/05/17 17:51:20 papadopo
88 * make the -t argument a string and handle values of 'T' or 'F' manually (required for backward compatibility)
89 *
90 * Revision 6.128 2005/05/16 17:41:00 papadopo
91 * From Alejandro Schaffer: Added support for compositional adjustment
92 * of matrices via enhanced -t flag, which now is integer to allow 4 options.
93 *
94 * Revision 6.127 2005/04/04 14:57:47 papadopo
95 * remove requirement for a fasta format query file if restarting from scoremat
96 *
97 * Revision 6.126 2004/10/12 15:14:39 papadopo
98 * add gap open and extend penalties to [BC]posComputation calls
99 *
100 * Revision 6.125 2004/07/19 17:16:19 papadopo
101 * add capability to perform input and output of residue frequencies in scoremat form
102 *
103 * Revision 6.124 2004/06/30 12:33:30 madden
104 * Add include for blfmtutl.h
105 *
106 * Revision 6.123 2004/06/25 20:58:49 dondosha
107 * Ungapped option not supported for multi-iterational search, but OK otherwise
108 *
109 * Revision 6.122 2004/06/24 21:48:16 dondosha
110 * Made ungapped search option deprecated
111 *
112 * Revision 6.121 2004/06/24 19:02:23 dondosha
113 * Exit with error status if PGPReadBlastOptions returns NULL
114 *
115 * Revision 6.120 2004/04/29 19:56:00 dondosha
116 * Mask filtered locations in query sequence lines in XML output
117 *
118 * Revision 6.119 2004/03/31 17:59:32 dondosha
119 * Fix for XML output for multiple queries: append full XML outputs in one file
120 *
121 * Revision 6.118 2003/09/26 16:01:34 madden
122 * Change threshold to 0.002
123 *
124 * Revision 6.117 2003/05/30 17:31:09 coulouri
125 * add rcsid
126 *
127 * Revision 6.116 2003/05/13 16:02:42 coulouri
128 * make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
129 *
130 * Revision 6.115 2003/03/20 14:47:16 madden
131 * StringSave on asn1_mode
132 *
133 * Revision 6.114 2003/03/20 13:44:23 madden
134 * Fix -m 10/11 output to make them SeqAnnots
135 *
136 * Revision 6.113 2002/12/31 22:47:16 boemker
137 * Added support for printing output as ASN (text, with -m 10, or binary, with
138 * -m 11).
139 *
140 * Revision 6.112 2002/11/12 16:04:24 dondosha
141 * Do not print the no hits found message for tabular output
142 *
143 * Revision 6.111 2002/09/23 21:00:00 madden
144 * Fix to correctly output SeqAlign for every query and iteration
145 *
146 * Revision 6.110 2002/09/19 13:25:00 camacho
147 * Fix incorrect index into myargs array
148 *
149 * Revision 6.109 2002/09/18 20:34:30 camacho
150 * Restored -P option
151 *
152 * Revision 6.108 2002/08/09 19:41:25 camacho
153 * 1) Added blast version number to command-line options
154 * 2) Added explanations for some default parameters
155 *
156 * Revision 6.107 2002/06/19 22:50:17 dondosha
157 * Added all queries information for tabular output with multiple queries
158 *
159 * Revision 6.106 2002/04/29 19:55:25 madden
160 * Use ARG_FLOAT for db length
161 *
162 * Revision 6.105 2002/03/13 22:48:06 dondosha
163 * Avoid freeing masking locations after each iteration with XML output
164 *
165 * Revision 6.104 2001/10/12 14:55:41 dondosha
166 * Changed description of the -t option
167 *
168 * Revision 6.103 2001/08/29 19:06:37 madden
169 * added variable posComputationcalled in Main, added parameter posComputationCalled to PGPrintPosMatrix
170 *
171 * Revision 6.102 2001/08/28 17:34:35 madden
172 * Add -m 9 as tabular output with comments
173 *
174 * Revision 6.101 2001/07/30 16:27:42 dondosha
175 * Do not call PGPOutTextMessages with XML output option
176 *
177 * Revision 6.100 2001/07/25 19:40:15 dondosha
178 * Multiply hitlist_size by 2 when going to next query if when tweak_parameters set
179 *
180 * Revision 6.99 2001/07/24 18:16:55 madden
181 * Set error_return when freeing
182 *
183 * Revision 6.98 2001/07/09 19:37:31 kans
184 * return 0 instead of NULL to fix Mac compiler error
185 *
186 * Revision 6.97 2001/07/03 20:50:33 madden
187 * Commented out call to PrintTabularOutputHeader
188 *
189 * Revision 6.96 2001/06/22 13:48:39 dondosha
190 * Declare variable vnp
191 *
192 * Revision 6.95 2001/06/21 21:56:37 dondosha
193 * Fixed memory leak: destroy all error returns; uncommented ObjMgrFreeCache
194 *
195 * Revision 6.94 2001/06/15 21:19:28 dondosha
196 * Added tabular output option -m 8
197 *
198 * Revision 6.93 2001/06/11 20:31:19 shavirin
199 * Added possibility to make Batch searches in iteractive mode.
200 *
201 * Revision 6.92 2001/04/13 20:46:01 madden
202 * Changed default e-value threshold for inclusion in multipass model from 0.002 to 0.005, Changed default constant in pseudocounts for multipass version from 7 to 9
203 *
204 * Revision 6.91 2001/04/10 19:20:52 madden
205 * Unsuppress some options suppressed for the release
206 *
207 * Revision 6.90 2001/03/30 22:02:04 madden
208 * Suppress options not yet ready for prime-time
209 *
210 * Revision 6.89 2000/11/30 17:10:13 shavirin
211 * Initialized to NULL xml pointer.
212 *
213 * Revision 6.88 2000/11/28 20:48:49 shavirin
214 * Adopted for usage with many-iterations XML.
215 *
216 * Revision 6.87 2000/11/22 21:57:08 shavirin
217 * Added passing of iteration number into XML output.
218 *
219 * Revision 6.86 2000/11/22 21:41:52 shavirin
220 * Removed problem with mutiple iteration XML output.
221 *
222 * Revision 6.85 2000/11/22 17:34:32 madden
223 * Change NULL to 0 for an intvalue
224 *
225 * Revision 6.84 2000/11/22 17:28:31 madden
226 * Enable decline-to-align
227 *
228 * Revision 6.83 2000/10/27 21:26:50 shavirin
229 * Produce valid XML output if no hits found.
230 *
231 * Revision 6.82 2000/10/27 19:14:41 madden
232 * Change description of -b option
233 *
234 * Revision 6.81 2000/10/24 13:51:33 shavirin
235 * Removed unused parameter from the function BXMLPrintOutput().
236 *
237 * Revision 6.80 2000/10/23 19:58:22 dondosha
238 * Open and close AsnIo outside of call(s) to BXMLPrintOutput
239 *
240 * Revision 6.79 2000/10/17 19:39:20 shavirin
241 * Fixed compilation problems on Mac.
242 *
243 * Revision 6.78 2000/10/13 20:58:01 madden
244 * Add YES_TO_DECLINE_TO_ALIGN define and disable same
245 *
246 * Revision 6.77 2000/09/28 15:48:30 dondosha
247 * Open <PRE> block in PGPFormatFooter
248 *
249 * Revision 6.76 2000/09/27 19:31:44 dondosha
250 * Use original square substitution matrix to pass to txalign on all iterations
251 *
252 * Revision 6.75 2000/09/21 17:54:46 dondosha
253 * Do not pass matrix to txalign in case of query-anchored formatting
254 *
255 * Revision 6.74 2000/09/13 18:34:35 dondosha
256 * Create BLAST_Matrix from ScoreBlk before converting it to txalign-style matrix
257 *
258 * Revision 6.73 2000/09/12 21:51:38 dondosha
259 * Pass the correct scoring matrix to ShowTextAlignFromAnnot
260 *
261 * Revision 6.72 2000/08/28 21:55:01 shavirin
262 * Added option (m = 7) to print XML output.
263 *
264 * Revision 6.71 2000/08/22 20:18:21 shavirin
265 * Added support for HTML output if decline-to-align parameter is set.
266 *
267 * Revision 6.70 2000/08/18 21:31:12 madden
268 * no longer need to raise E-value threshold for composition-based statistics
269 *
270 * Revision 6.69 2000/08/08 21:47:55 shavirin
271 * Added parameter decline_align and possibility to print then
272 * discontinuous alignment.
273 *
274 * Revision 6.68 2000/07/26 19:04:14 shavirin
275 * Set overriding default threshold only if input value != 0.
276 *
277 * Revision 6.67 2000/07/25 18:14:06 shavirin
278 * WARNING: This is no-turning-back changed related to S&W Blast from
279 * Alejandro Schaffer
280 *
281 * Revision 6.66 2000/07/21 20:46:40 madden
282 * Threshold set to default if arg is zero
283 *
284 * Revision 6.65 2000/06/27 15:25:19 madden
285 * Changed master-slave to query-anchored
286 *
287 * Revision 6.64 2000/03/29 17:32:55 madden
288 * Commented out yet another call to ObjMgrFreeCache
289 *
290 * Revision 6.63 2000/03/27 21:32:00 shavirin
291 * Use function ShowTextAlignFromAnnot2 instead of ShowTextAlignFromAnnot3
292 *
293 * Revision 6.62 2000/03/23 15:30:22 shavirin
294 * Removed call to ObjMgrFreeCache()
295 *
296 * Revision 6.61 2000/03/07 21:59:07 shavirin
297 * Now will use PSSM Matrix to show positives in PSI Blast
298 *
299 * Revision 6.60 2000/03/02 21:06:09 shavirin
300 * Added -U option, that allows to consider low characters in FASTA files
301 * as filtered regions (for blastn, blastp and tblastn).
302 *
303 * Revision 6.58 2000/01/07 22:01:04 shavirin
304 * Fixed problem with printing alignment.
305 *
306 * Revision 6.57 1999/12/21 21:34:05 shavirin
307 * Fixed some memory leaks.
308 *
309 * Revision 6.56 1999/11/08 19:12:41 shavirin
310 * Fixed minor SGI warning.
311 *
312 * Revision 6.55 1999/10/21 20:27:52 shavirin
313 * Fixed bug resulted in failue to print out seqannot. (-O)
314 *
315 * Revision 6.53 1999/10/14 15:52:51 shavirin
316 * Added possibility to make search by list of gis. Fixed some bugs.
317 *
318 * Revision 6.52 1999/10/05 17:41:08 shavirin
319 * Corrected in accordance to blast.c changes.
320 *
321 * Revision 6.51 1999/09/24 16:06:15 shavirin
322 * Matrix is set to NULL if no matrix calculation produced.
323 *
324 * Revision 6.50 1999/09/22 17:53:25 shavirin
325 * Now functions will collect messages in ValNodePtr before printing out.
326 *
327 * Revision 6.49 1999/09/21 16:01:46 shavirin
328 * Rearanged all file. Main function was disintegrated into few small
329 * functions.
330 *
331 * Revision 6.48 1999/08/27 18:17:42 shavirin
332 * Added new parameter to command line - decline_align.
333 *
334 * Revision 6.47 1999/08/26 14:58:06 madden
335 * Use float for db length
336 *
337 * Revision 6.46 1999/08/04 13:26:49 madden
338 * Added -B option
339 *
340 * Revision 6.45 1999/03/31 16:58:04 madden
341 * Removed static FindProt and FindNuc
342 *
343 * Revision 6.44 1999/03/24 18:16:33 madden
344 * zero-out groupOrder and featureOrder
345 *
346 * Revision 6.43 1999/03/21 19:43:23 madden
347 * Added -Q option to store ASCII version of last position-specific matrix in a file
348 *
349 * Revision 6.42 1999/03/04 14:17:20 egorov
350 * Added new parameter to BlastMaskTheResidues() function for correct masking
351 * when query is seqloc. The paramter is not used in this file and is 0.
352 *
353 * Revision 6.41 1999/02/18 21:13:11 madden
354 * Check for non-pattern search before call to BlastPruneSapStructDestruct
355 *
356 * Revision 6.40 1999/02/10 21:12:27 madden
357 * Added HTML and GI list option, fixed filtering
358 *
359 * Revision 6.39 1999/01/22 17:24:51 madden
360 * added line breaks for alignment views
361 *
362 * Revision 6.38 1998/12/29 20:03:14 kans
363 * calls UseLocalAsnloadDataAndErrMsg at startup
364 *
365 * Revision 6.37 1998/12/17 21:54:39 madden
366 * Added call to BlastPruneHitsFromSeqAlign for other than first round
367 *
368 * Revision 6.36 1998/12/16 14:13:36 madden
369 * Fix for more than one pattern in query
370 *
371 * Revision 6.35 1998/11/19 14:04:34 madden
372 * Changed message level to SEV_WARNING
373 *
374 * Revision 6.34 1998/11/16 16:29:41 madden
375 * Added ErrSetMessageLevel(SEV_INFO) and fixed call to PrintKAParametersExtra
376 *
377 * Revision 6.33 1998/09/28 12:33:02 madden
378 * Used BlastErrorPrint
379 *
380 * Revision 6.31 1998/09/17 19:54:31 madden
381 * Removed fillCandLambda
382 *
383 * Revision 6.29 1998/09/10 22:38:24 madden
384 * Moved convertSeqAlignListToValNodeList and convertValNodeListToSeqAlignList
385 *
386 * Revision 6.28 1998/09/09 21:20:02 madden
387 * AS fixed warnings, removed stderr fprintf, added call to PrintKAParametersExtra
388 *
389 * Revision 6.27 1998/09/09 16:10:49 madden
390 * PHI-BLAST changes
391 *
392 * Revision 6.26 1998/07/17 15:41:36 madden
393 * Added effective search space flag
394 *
395 * Revision 6.24 1998/06/14 19:44:46 madden
396 * Checkpointing fix
397 *
398 * Revision 6.23 1998/06/12 21:32:27 madden
399 * Removed extra FnPtr cast
400 *
401 * Revision 6.22 1998/06/10 13:33:39 madden
402 * change -h from 0.01 to 0.001
403 *
404 * Revision 6.21 1998/06/05 21:48:43 madden
405 * Added -K and -L options
406 *
407 * Revision 6.20 1998/05/18 18:01:05 madden
408 * Changed args to allow filter options to be changed
409 *
410 * Revision 6.19 1998/05/01 18:31:03 egorov
411 * Add new parametes to BLASTOptionSetGapParam()
412 *
413 * Revision 6.18 1998/04/30 14:32:33 madden
414 * init_buff_ex arg changed to 90 for reference
415 *
416 * Revision 6.17 1998/04/29 14:29:31 madden
417 * Made reference line longer
418 *
419 * Revision 6.16 1998/04/01 22:49:13 madden
420 * Print No hits found message
421 *
422 * Revision 6.15 1998/02/25 20:50:50 madden
423 * Added arg for db length
424 *
425 * Revision 6.14 1998/02/24 22:48:36 madden
426 * Removed options for culling
427 *
428 * Revision 6.13 1998/01/02 14:33:37 madden
429 * Added comma
430 *
431 * Revision 6.12 1997/12/31 17:48:56 madden
432 * Added wordsize option
433 *
434 * Revision 6.11 1997/12/23 21:09:44 madden
435 * Added -K and -L for range-dependent blast
436 *
437 * Revision 6.10 1997/12/23 20:57:44 madden
438 * Changes for checkpointing
439 *
440 * Revision 6.9 1997/11/19 14:26:46 madden
441 * Removed extra break statement
442 *
443 * Revision 6.8 1997/11/18 22:24:24 madden
444 * Added call to BLASTOptionSetGapParams
445 *
446 * Revision 6.7 1997/11/08 22:04:43 madden
447 * Called BlastOtherReturnsPrepare earlier to before posMatrix is deleted
448 *
449 * Revision 6.6 1997/10/27 22:26:49 madden
450 * Added call to ObjMgrFreeCache(0)
451 *
452 * Revision 6.5 1997/10/23 20:26:15 madden
453 * Use of init_buff_ex rather than init_buff
454 *
455 * Revision 6.4 1997/10/22 21:56:49 madden
456 * Changed default values
457 *
458 * Revision 6.3 1997/10/07 21:33:34 madden
459 * Added BLUNT option
460 *
461 * Revision 6.2 1997/09/18 22:25:02 madden
462 * b and v options now work
463 *
464 * Revision 6.1 1997/09/16 16:40:50 madden
465 * Dbinfo printing changed for multiple db searches
466 *
467 * Revision 6.0 1997/08/25 18:19:19 madden
468 * Revision changed to 6.0
469 *
470 * Revision 1.24 1997/08/24 19:38:23 madden
471 * Used function BlastOtherReturnsPrepare
472 *
473 * Revision 1.23 1997/08/14 21:48:57 madden
474 * Added descriptions and alignments options
475 *
476 * Revision 1.22 1997/07/29 19:33:05 madden
477 * Added TXALIGN_SHOW_QS flag
478 *
479 * Revision 1.21 1997/07/28 18:36:45 madden
480 * Replaced printf with ErrPostEx and fprintf
481 *
482 * Revision 1.20 1997/07/28 16:59:21 madden
483 * Added include for simutil.h
484 *
485 * Revision 1.19 1997/07/28 16:50:51 madden
486 * Changed call to ShowTextAlignFromAnnot
487 *
488 * Revision 1.18 1997/07/22 19:18:40 madden
489 * printing version, etc.
490 *
491 * Revision 1.17 1997/06/25 14:06:21 madden
492 * minor changes to check convergence
493 *
494 * Revision 1.16 1997/06/23 20:51:29 madden
495 * Added matrix option
496 *
497 * Revision 1.15 1997/06/20 19:30:00 madden
498 * Added align_type info, support for SeqAligns
499 *
500 * Revision 1.14 1997/05/23 20:54:48 madden
501 * Added -Z option for final gapped alignment
502 *
503 * Revision 1.13 1997/05/07 15:06:01 madden
504 * replacement of search by compactSearch
505 *
506 * Revision 1.12 1997/04/21 14:09:27 madden
507 * Added four new master-slave alignment types.
508 *
509 * Revision 1.11 1997/04/11 19:03:47 madden
510 * Changes to ignore query ID and show master-slave alignments.
511 *
512 * Revision 1.10 1997/04/10 19:28:28 madden
513 * COMMAND_LINE replaced by ALL_ROUNDS.
514 *
515 * Revision 1.9 1997/04/10 13:27:32 madden
516 * Added COMMAND_LINE define, option for multiple alignments.
517 *
518 * Revision 1.8 1997/04/07 21:44:50 madden
519 * Removed unused variable stats.
520 *
521 * Revision 1.7 1997/04/04 21:13:33 madden
522 * Used function BioseqBlastEngineCore, remove PrivateScoreFunc.
523 *
524 * Revision 1.6 1997/04/04 16:38:04 madden
525 * Changed filtering option, ObjMgrHold.
526 *
527 * Revision 1.5 1997/03/21 20:30:02 madden
528 * Expect value arg made a float.
529 *
530 * Revision 1.4 1997/03/13 21:18:51 madden
531 * Changed default extension value to 1 from 2.
532 *
533 * Revision 1.3 1997/02/19 21:43:04 madden
534 * Extensive changes for psi-blast. blastp runs now occur multiple times.
535 *
536 * Revision 1.2 1997/02/12 15:16:58 madden
537 * Changed from blast2 to new formatting.
538 *
539 * Revision 1.1 1997/01/16 21:35:42 madden
540 * Initial revision
541 *
542 * Revision 1.1 1997/01/16 21:34:23 madden
543 * Initial revision
544 *
545 */
546 #include <string.h>
547
548 #include <ncbi.h>
549 #include <objseq.h>
550 #include <objsset.h>
551 #include <sequtil.h>
552 #include <seqport.h>
553 #include <tofasta.h>
554 #include <blast.h>
555 #include <blastpri.h>
556 #include <txalign.h>
557 #include <simutil.h>
558 #include <gapxdrop.h>
559 #include <posit.h>
560 #include <seed.h>
561 #include <sqnutils.h>
562 #include <xmlblast.h>
563 #include <ddvcreate.h>
564 #include <blfmtutl.h>
565 #include <objscoremat.h>
566 #include <algo/blast/composition_adjustment/composition_constants.h>
567
568 /* Used by the callback function. */
569 FILE *global_fp=NULL;
570 /*
571 Callback to print out ticks, in UNIX only due to file systems
572 portability issues.
573 */
574
575 static int LIBCALLBACK
tick_callback(Int4 sequence_number,Int4 number_of_positive_hits)576 tick_callback(Int4 sequence_number, Int4 number_of_positive_hits)
577 {
578 (void) sequence_number;
579 (void) number_of_positive_hits;
580 #ifdef OS_UNIX
581
582 fprintf(global_fp, "%s", ".");
583 fflush(global_fp);
584 #endif
585 return 0;
586 }
587
588 #define EVALUE_EXPAND 10
589
590 #define YES_TO_DECLINE_TO_ALIGN
591
592 #define NUMARG (sizeof(myargs)/sizeof(myargs[0]))
593
594 typedef enum {
595 ARG_DATABASE = 0,
596 ARG_QUERY,
597 ARG_WINDOW_SIZE,
598 ARG_THRESHOLD,
599 ARG_EVALUE,
600 ARG_FORMAT,
601 ARG_OUT,
602 ARG_XDROP_UNGAPPED,
603 ARG_MULTIPLEHITS,
604 ARG_FILTER,
605 ARG_GAPOPEN,
606 ARG_GAPEXT,
607 ARG_XDROP_GAPPED,
608 ARG_GAP_TRIGGER,
609 ARG_REQUIRED_START,
610 ARG_REQUIRED_END,
611 ARG_THREADS,
612 ARG_SHOW_GIS,
613 ARG_EVALUE_INCLUSION_THRESHOLD,
614 ARG_PSEUDOCOUNT_CONSTANT,
615 ARG_MAX_PASSES,
616 ARG_BELIEVEQUERY,
617 ARG_XDROP_FINAL,
618 ARG_SEQALIGN_FILE,
619 ARG_MATRIX,
620 ARG_DESCRIPTIONS,
621 ARG_ALIGNMENTS,
622 ARG_CHECKPOINT_OUTPUT,
623 ARG_CHECKPOINT_INPUT,
624 ARG_WORDSIZE,
625 ARG_DBSIZE,
626 ARG_BEST_HITS,
627 ARG_SMITHWATERMAN,
628 ARG_SEARCHSP,
629 ARG_PHI_PROGRAM,
630 ARG_PHI_HITFILE,
631 ARG_HTML,
632 ARG_MATRIX_OUT,
633 ARG_ALIGNMENT_IN,
634 ARG_GILIST,
635 ARG_LCASE,
636 ARG_COMP_BASED_STATS,
637 ARG_SCOREMAT_INPUT,
638 ARG_SCOREMAT_OUTPUT,
639 ARG_COST_DECLINE_ALIGN
640 } BlastpgpArguments;
641
642 static Args myargs[] = {
643 { "Database", /* ARG_DATABASE */
644 "nr", NULL, NULL, FALSE, 'd', ARG_STRING, 0.0, 0, NULL},
645 { "Query File (not needed if restarting from scoremat)", /* ARG_QUERY */
646 "stdin", NULL, NULL, TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
647 { "Multiple Hits window size", /* ARG_WINDOW_SIZE */
648 "40", NULL, NULL, FALSE, 'A', ARG_INT, 0.0, 0, NULL},
649 { "Threshold for extending hits", /* ARG_THRESHOLD */
650 "11", NULL, NULL, FALSE, 'f', ARG_INT, 0.0, 0, NULL},
651 { "Expectation value (E)", /* ARG_EVALUE */
652 "10.0", NULL, NULL, FALSE, 'e', ARG_FLOAT, 0.0, 0, NULL},
653 { "alignment view options:\n0 = pairwise,\n1 = query-anchored showing identities,\n2 = query-anchored no identities,\n3 = flat query-anchored, show identities,\n4 = flat query-anchored, no identities,\n5 = query-anchored no identities and blunt ends,\n6 = flat query-anchored, no identities and blunt ends,\n7 = XML Blast output,\n8 = Tabular output, \n9 = Tabular output with comments\n10 = ASN, text\n11 = ASN, binary", /* ARG_FORMAT */
654 "0", NULL, NULL, FALSE, 'm', ARG_INT, 0.0, 0, NULL},
655 { "Output File for Alignment", /* ARG_OUT */
656 "stdout", NULL, NULL, TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
657 { "Dropoff (X) for blast extensions in bits (default if zero)", /* ARG_XDROP_UNGAPPED */
658 "7.0", NULL, NULL, FALSE, 'y', ARG_FLOAT, 0.0, 0, NULL},
659 { "0 for multiple hit, 1 for single hit", /* ARG_MULTIPLEHITS */
660 "0", NULL, NULL, FALSE, 'P', ARG_INT, 0.0, 0, NULL},
661 { "Filter query sequence with SEG", /* ARG_FILTER */
662 "F", NULL, NULL, FALSE, 'F', ARG_STRING, 0.0, 0, NULL},
663 { "Cost to open a gap", /* ARG_GAPOPEN */
664 "11", NULL, NULL, FALSE, 'G', ARG_INT, 0.0, 0, NULL},
665 { "Cost to extend a gap", /* ARG_GAPEXT */
666 "1", NULL, NULL, FALSE, 'E', ARG_INT, 0.0, 0, NULL},
667 { "X dropoff value for gapped alignment (in bits)", /* ARG_XDROP_GAPPED */
668 "15", NULL, NULL, FALSE, 'X', ARG_INT, 0.0, 0, NULL},
669 { "Number of bits to trigger gapping", /* ARG_GAP_TRIGGER */
670 "22.0", NULL, NULL, FALSE, 'N', ARG_FLOAT, 0.0, 0, NULL},
671 { "Start of required region in query", /* ARG_REQUIRED_START */
672 "1", NULL, NULL, FALSE, 'S', ARG_INT, 0.0, 0, NULL},
673 { "End of required region in query (-1 indicates end of query)", /* ARG_REQUIRED_END */
674 "-1", NULL, NULL, FALSE, 'H', ARG_INT, 0.0, 0, NULL},
675 { "Number of processors to use", /* ARG_THREADS */
676 "1", NULL, NULL, FALSE, 'a', ARG_INT, 0.0, 0, NULL},
677 { "Show GI's in deflines", /* ARG_SHOW_GIS */
678 "F", NULL, NULL, FALSE, 'I', ARG_BOOLEAN, 0.0, 0, NULL},
679 { "e-value threshold for inclusion in multipass model", /* ARG_EVALUE_INCLUSION_THRESHOLD */
680 "0.002", NULL, NULL, FALSE, 'h', ARG_FLOAT, 0.0, 0, NULL},
681 { "Constant in pseudocounts for multipass version; 0 uses entropy method; otherwise a value near 30 is recommended", /* ARG_PSEUDOCOUNT_CONSTANT */
682 "0", NULL, NULL, FALSE, 'c', ARG_INT, 0.0, 0, NULL},
683 { "Maximum number of passes to use in multipass version", /* ARG_MAX_PASSES */
684 "1", NULL, NULL, FALSE, 'j', ARG_INT, 0.0, 0, NULL},
685 { "Believe the query defline", /* ARG_BELIEVEQUERY */
686 "F", NULL, NULL, FALSE, 'J', ARG_BOOLEAN, 0.0, 0, NULL},
687 { "X dropoff value for final gapped alignment (in bits)", /* ARG_XDROP_FINAL */
688 "25", NULL, NULL, FALSE, 'Z', ARG_INT, 0.0, 0, NULL},
689 { "SeqAlign file ('Believe the query defline' must be TRUE)", /* ARG_SEQALIGN_FILE */
690 NULL, NULL, NULL, TRUE, 'O', ARG_FILE_OUT, 0.0, 0, NULL},
691 { "Matrix", /* ARG_MATRIX */
692 "BLOSUM62", NULL, NULL, FALSE, 'M', ARG_STRING, 0.0, 0, NULL},
693 { "Number of database sequences to show one-line descriptions for (V)", /* ARG_DESCRIPTIONS */
694 "500", NULL, NULL, FALSE, 'v', ARG_INT, 0.0, 0, NULL},
695 { "Number of database sequence to show alignments for (B)", /* ARG_ALIGNMENTS */
696 "250", NULL, NULL, FALSE, 'b', ARG_INT, 0.0, 0, NULL},
697 { "Output File for PSI-BLAST Checkpointing", /* ARG_CHECKPOINT_OUTPUT */
698 NULL, NULL, NULL, TRUE, 'C', ARG_FILE_OUT, 0.0, 0, NULL},
699 { "Input File for PSI-BLAST Restart", /* ARG_CHECKPOINT_INPUT */
700 NULL, NULL, NULL, TRUE, 'R', ARG_FILE_IN, 0.0, 0, NULL},
701 { "Word size", /* ARG_WORDSIZE */
702 "3", NULL, NULL, FALSE, 'W', ARG_INT, 0.0, 0, NULL},
703 { "Effective length of the database (use zero for the real size)", /* ARG_DBSIZE */
704 "0", NULL, NULL, FALSE, 'z', ARG_FLOAT, 0.0, 0, NULL},
705 { "Number of best hits from a region to keep", /* ARG_BEST_HITS */
706 "0", NULL, NULL, FALSE, 'K', ARG_INT, 0.0, 0, NULL},
707 { "Compute locally optimal Smith-Waterman alignments", /* ARG_SMITHWATERMAN */
708 "F", NULL, NULL, FALSE, 's', ARG_BOOLEAN, 0.0, 0, NULL},
709 { "Effective length of the search space (use zero for the real size)", /* ARG_SEARCHSP */
710 "0", NULL, NULL, FALSE, 'Y', ARG_FLOAT, 0.0, 0, NULL},
711 { "program option for PHI-BLAST", /* ARG_PHI_PROGRAM */
712 "blastpgp", NULL, NULL, FALSE, 'p', ARG_STRING, 0.0, 0, NULL},
713 { "Hit File for PHI-BLAST", /* ARG_PHI_HITFILE */
714 "hit_file", NULL, NULL, FALSE, 'k', ARG_FILE_IN, 0.0, 0, NULL},
715 { "Produce HTML output", /* ARG_HTML */
716 "F", NULL, NULL, FALSE, 'T', ARG_BOOLEAN, 0.0, 0, NULL},
717 { "Output File for PSI-BLAST Matrix in ASCII", /* ARG_MATRIX_OUT */
718 NULL, NULL, NULL, TRUE, 'Q', ARG_FILE_OUT, 0.0, 0, NULL},
719 { "Input Alignment File for PSI-BLAST Restart", /* ARG_ALIGNMENT_IN */
720 NULL, NULL, NULL, TRUE, 'B', ARG_FILE_IN, 0.0, 0, NULL},
721 { "Restrict search of database to list of GI's", /* ARG_GILIST */
722 NULL, NULL, NULL, TRUE, 'l', ARG_STRING, 0.0, 0, NULL},
723 {"Use lower case filtering of FASTA sequence", /* ARG_LCASE */
724 "F", NULL,NULL,TRUE,'U',ARG_BOOLEAN, 0.0,0,NULL},
725 { "Use composition based score adjustment\n" /* ARG_COMP_BASED_STATS */
726 "As first character:\n"
727 "0 or F or f: no composition-based statistics\n"
728 "2 or T or t: Composition-based score adjustment as in Bioinformatics 21:902-911, 2005, conditioned on sequence properties in round 1\n"
729 "1: Composition-based statistics as in NAR 29:2994--3005, 2001\n"
730 "3: Composition-based score adjustment as in Bioinformatics 21:902-911, "
731 "2005, unconditionally in round 1\n"
732 "As second character, if first character is equivalent to 1, 2, or 3:\n"
733 "U or u: unified p-value combining alignment p-value and "
734 "compositional p-value in round 1 only\n",
735 "2", NULL, NULL, FALSE, 't', ARG_STRING, 0.0, 0, NULL},
736 { "ASN.1 Scoremat input of checkpoint data:\n"
737 "0: no scoremat input\n"
738 "1: Restart is from ASCII scoremat checkpoint file,\n"
739 "2: Restart is from binary scoremat checkpoint file", /* ARG_SCOREMAT_INPUT */
740 "0", NULL, NULL, TRUE, 'q', ARG_INT, 0.0, 0, NULL},
741 { "ASN.1 Scoremat output of checkpoint data:\n"
742 "0: no scoremat output\n"
743 "1: Output is ASCII scoremat checkpoint file (requires -J),\n"
744 "2: Output is binary scoremat checkpoint file (requires -J)", /* ARG_SCOREMAT_OUTPUT */
745 "0", NULL, NULL, TRUE, 'u', ARG_INT, 0.0, 0, NULL},
746 #ifdef YES_TO_DECLINE_TO_ALIGN
747 { "Cost to decline alignment (disabled when 0)", /* ARG_COST_DECLINE_ALIGN */
748 "0", NULL, NULL, FALSE, 'L', ARG_INT, 0.0, 0, NULL},
749 #endif
750 };
751
752 typedef struct _pgp_blast_options {
753 BLAST_OptionsBlkPtr options;
754 CharPtr blast_database;
755 BioseqPtr query_bsp, fake_bsp;
756 Int4 number_of_descriptions, number_of_alignments;
757 FILE *infp, *outfp;
758 AsnIoPtr aip_out;
759 Boolean html;
760 Boolean believe_query;
761 Uint4 align_options, print_options;
762 /* PHI-PSI Blast variables */
763 Uint1 featureOrder[FEATDEF_ANY];
764 Uint1 groupOrder[FEATDEF_ANY];
765 Int4 program_flag;
766 CharPtr patfile;
767 FILE *patfp;
768 seedSearchItems *seedSearch;
769 Boolean is_xml_output;
770 Boolean is_asn1_output;
771 char* asn1_mode; /* "w" or "wb" */
772 } PGPBlastOptions, PNTR PGPBlastOptionsPtr;
773
PGPGetPrintOptions(Boolean gapped,Uint4Ptr align_options_out,Uint4Ptr print_options_out)774 void PGPGetPrintOptions(Boolean gapped, Uint4Ptr align_options_out,
775 Uint4Ptr print_options_out)
776 {
777 Uint4 print_options, align_options;
778
779 print_options = 0;
780 if (gapped == FALSE)
781 print_options += TXALIGN_SHOW_NO_OF_SEGS;
782
783 align_options = 0;
784 align_options += TXALIGN_COMPRESS;
785 align_options += TXALIGN_END_NUM;
786
787 if (myargs[ARG_SHOW_GIS].intvalue) {
788 align_options += TXALIGN_SHOW_GI;
789 print_options += TXALIGN_SHOW_GI;
790 }
791
792 if (myargs[ARG_HTML].intvalue) {
793 align_options += TXALIGN_HTML;
794 print_options += TXALIGN_HTML;
795 }
796
797 if (myargs[ARG_FORMAT].intvalue != 0) {
798 align_options += TXALIGN_MASTER;
799 if (myargs[ARG_FORMAT].intvalue == 1 || myargs[ARG_FORMAT].intvalue == 3)
800 align_options += TXALIGN_MISMATCH;
801 if (myargs[ARG_FORMAT].intvalue == 3 || myargs[ARG_FORMAT].intvalue == 4 || myargs[ARG_FORMAT].intvalue == 6)
802 align_options += TXALIGN_FLAT_INS;
803 if (myargs[ARG_FORMAT].intvalue == 5 || myargs[ARG_FORMAT].intvalue == 6)
804 align_options += TXALIGN_BLUNT_END;
805 } else {
806 align_options += TXALIGN_MATRIX_VAL;
807 align_options += TXALIGN_SHOW_QS;
808 }
809
810 *align_options_out = align_options;
811 *print_options_out = print_options;
812
813 return;
814 }
PGPFreeBlastOptions(PGPBlastOptionsPtr bop)815 void PGPFreeBlastOptions(PGPBlastOptionsPtr bop)
816 {
817
818 bop->options = BLASTOptionDelete(bop->options);
819
820 FileClose(bop->infp);
821 FileClose(bop->outfp);
822 bop->aip_out = AsnIoClose(bop->aip_out);
823
824 MemFree(bop->blast_database);
825 MemFree(bop->patfile);
826 MemFree(bop->seedSearch);
827
828 MemFree(bop);
829
830 return;
831 }
832
PGPReadBlastOptions(void)833 PGPBlastOptionsPtr PGPReadBlastOptions(void)
834 {
835 PGPBlastOptionsPtr bop;
836 BLAST_OptionsBlkPtr options;
837 SeqEntryPtr sep;
838 Boolean is_dna;
839 Int4 align_view;
840
841 align_view = myargs[ARG_FORMAT].intvalue;
842
843 bop = MemNew(sizeof(PGPBlastOptions));
844
845 bop->blast_database = StringSave(myargs[ARG_DATABASE].strvalue);
846
847 if (align_view != 7 && align_view != 10 && align_view != 11 && myargs[ARG_OUT].strvalue != NULL) {
848 if ((bop->outfp = FileOpen(myargs[ARG_OUT].strvalue, "w")) == NULL) {
849 ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open output "
850 "file %s\n", myargs[ARG_OUT].strvalue);
851 return NULL;
852 }
853 }
854
855 bop->number_of_descriptions = myargs[ARG_DESCRIPTIONS].intvalue;
856 bop->number_of_alignments = myargs[ARG_ALIGNMENTS].intvalue;
857
858 if (myargs[ARG_BELIEVEQUERY].intvalue != 0)
859 bop->believe_query = TRUE;
860
861 if (myargs[ARG_SCOREMAT_INPUT].intvalue != NO_SCOREMAT_IO &&
862 myargs[ARG_SCOREMAT_INPUT].intvalue != ASCII_SCOREMAT &&
863 myargs[ARG_SCOREMAT_INPUT].intvalue != BINARY_SCOREMAT) {
864 ErrPostEx(SEV_FATAL, 1, 0,"Invalid choice for scoremat input\n");
865 return NULL;
866 }
867 if (myargs[ARG_SCOREMAT_OUTPUT].intvalue != NO_SCOREMAT_IO &&
868 myargs[ARG_SCOREMAT_OUTPUT].intvalue != ASCII_SCOREMAT &&
869 myargs[ARG_SCOREMAT_OUTPUT].intvalue != BINARY_SCOREMAT) {
870 ErrPostEx(SEV_FATAL, 1, 0,"Invalid choice for scoremat output\n");
871 return NULL;
872 }
873 if (myargs[ARG_CHECKPOINT_OUTPUT].strvalue != NULL) {
874 const char * description = NULL;
875 const char * recommended_extension = NULL;
876 const char * actual_extension = NULL;
877
878 switch (myargs[ARG_SCOREMAT_OUTPUT].intvalue) {
879 case NO_SCOREMAT_IO:
880 description = "standard";
881 recommended_extension = ".chk";
882 break;
883 case ASCII_SCOREMAT:
884 description = "ascii ASN.1";
885 recommended_extension = ".asnt";
886 break;
887 case BINARY_SCOREMAT:
888 description = "binary ASN.1";
889 recommended_extension = ".asn";
890 break;
891 }
892 actual_extension =
893 strrchr(myargs[ARG_CHECKPOINT_OUTPUT].strvalue, '.');
894 if (NULL == actual_extension) { /* No extension */
895 actual_extension = "";
896 }
897 if (0 != strcasecmp(recommended_extension, actual_extension)) {
898 ErrPostEx(SEV_WARNING, 1, 0, "a %s checkpoint file "
899 "should have extension \"%s\".\n", description,
900 recommended_extension);
901 }
902 }
903 if (myargs[ARG_SCOREMAT_OUTPUT].intvalue != NO_SCOREMAT_IO) {
904 if (bop->believe_query == FALSE) {
905 ErrPostEx(SEV_FATAL, 1, 0,
906 "-J option must be TRUE for scoremat output");
907 return NULL;
908 }
909 }
910
911 if (myargs[ARG_SEQALIGN_FILE].strvalue != NULL) {
912
913 if (bop->believe_query == FALSE) {
914 ErrPostEx(SEV_FATAL, 1, 0,
915 "-J option must be TRUE for ASN.1 output");
916 return NULL;
917 }
918
919 if ((bop->aip_out = AsnIoOpen (myargs[ARG_SEQALIGN_FILE].strvalue,"w")) == NULL) {
920 ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open output "
921 "file %s\n", myargs[ARG_SEQALIGN_FILE].strvalue);
922 return NULL;
923 }
924 }
925 else if (align_view == 10 || align_view == 11)
926 {
927 const char* mode = (align_view == 10) ? "w" : "wb";
928 if (bop->believe_query == FALSE) {
929 ErrPostEx(SEV_FATAL, 1, 0,
930 "-J option must be TRUE for ASN.1 output");
931 return NULL;
932 }
933 if ((bop->aip_out = AsnIoOpen(myargs[ARG_OUT].strvalue, (char*) mode)) == NULL) {
934 ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open output file %s\n", myargs[ARG_OUT].strvalue);
935 return NULL;
936 }
937 bop->is_asn1_output = TRUE;
938 if(align_view == 10)
939 bop->asn1_mode = StringSave("w");
940 else
941 bop->asn1_mode = StringSave("wb");
942 }
943
944 options = BLASTOptionNew("blastp", TRUE);
945 bop->options = options;
946
947 /* read the query sequence */
948
949 if (myargs[ARG_SCOREMAT_INPUT].intvalue == NO_SCOREMAT_IO) {
950 if ((bop->infp = FileOpen(myargs[ARG_QUERY].strvalue, "r")) == NULL) {
951 ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open input file %s\n",
952 myargs[ARG_QUERY].strvalue);
953 return NULL;
954 }
955 if(myargs[ARG_LCASE].intvalue) {
956 if((sep = FastaToSeqEntryForDb (bop->infp, FALSE, NULL, bop->believe_query, NULL, NULL, &options->query_lcase_mask)) == NULL) {
957 ErrPostEx(SEV_FATAL, 1, 0, "Unable to read input FASTA file\n");
958 return NULL;
959 }
960 } else {
961 if((sep = FastaToSeqEntryEx(bop->infp, FALSE, NULL, bop->believe_query)) == NULL) {
962 ErrPostEx(SEV_FATAL, 1, 0, "Unable to read input FASTA file\n");
963 return NULL;
964 }
965 }
966
967 SeqEntryExplore(sep, &bop->query_bsp, FindProt);
968 if (bop->query_bsp == NULL) {
969 ErrPostEx(SEV_FATAL, 1, 0, "Unable to obtain bioseq\n");
970 return NULL;
971 }
972 }
973 else { /* recover the query sequence from scoremat */
974 AsnIoPtr scorematfile;
975 PssmWithParametersPtr scoremat = NULL;
976
977 if (myargs[ARG_CHECKPOINT_INPUT].strvalue == NULL) {
978 ErrPostEx(SEV_FATAL, 1, 0, "No restart file specified\n");
979 return NULL;
980 }
981
982 if (myargs[ARG_SCOREMAT_INPUT].intvalue == ASCII_SCOREMAT)
983 scorematfile = AsnIoOpen(myargs[ARG_CHECKPOINT_INPUT].strvalue, "r");
984 else /* binary scoremat */
985 scorematfile = AsnIoOpen(myargs[ARG_CHECKPOINT_INPUT].strvalue, "rb");
986
987 if (scorematfile == NULL) {
988 ErrPostEx(SEV_FATAL, 1, 0, "Unable to open scoremat file\n");
989 return NULL;
990 }
991
992 scoremat = PssmWithParametersAsnRead(scorematfile, NULL);
993 AsnIoClose(scorematfile);
994 if (scoremat == NULL) {
995 ErrPostEx(SEV_FATAL, 1, 0, "Could not read scoremat "
996 "for query sequence\n");
997 return NULL;
998 }
999
1000 if (scoremat->pssm == NULL ||
1001 scoremat->pssm->query == NULL ||
1002 scoremat->pssm->query->data.ptrvalue == NULL) {
1003 ErrPostEx(SEV_FATAL, 1, 0, "Cannot read query sequence "
1004 "from scoremat\n");
1005 return NULL;
1006 }
1007
1008 /* use the bioseq from 'scoremat'; remove from that
1009 structure so the bioseq is not freed along with the
1010 rest of the scoremat. Note that only the first
1011 bioseq in a bioseq-set is used */
1012
1013 bop->query_bsp = (BioseqPtr)(scoremat->pssm->query->data.ptrvalue);
1014 scoremat->pssm->query = ValNodeFree(scoremat->pssm->query);
1015 PssmWithParametersFree(scoremat);
1016 }
1017
1018 /* without a bioseq we cannot continue */
1019 if (bop->query_bsp == NULL) {
1020 ErrPostEx(SEV_FATAL, 1, 0, "Unable to obtain bioseq\n");
1021 return NULL;
1022 }
1023
1024 /* Set default gap params for matrix. */
1025 BLASTOptionSetGapParams(options, myargs[ARG_MATRIX].strvalue, 0, 0);
1026
1027 if(myargs[ARG_FORMAT].intvalue == 7) {
1028 bop->is_xml_output = TRUE;
1029 } else if (myargs[ARG_FORMAT].intvalue != 8 && myargs[ARG_FORMAT].intvalue != 9) {
1030 PGPGetPrintOptions(options->gapped_calculation, &bop->align_options,
1031 &bop->print_options);
1032 }
1033
1034 /* decrement by one to agree with program values. */
1035 options->required_start = myargs[ARG_REQUIRED_START].intvalue - 1;
1036 options->required_end = myargs[ARG_REQUIRED_END].intvalue;
1037 if (options->required_end != -1) {
1038 options->required_end--;
1039 }
1040
1041 options->window_size = myargs[ARG_WINDOW_SIZE].intvalue;
1042
1043 if(myargs[ARG_THRESHOLD].intvalue)
1044 options->threshold_second = (Int4) myargs[ARG_THRESHOLD].intvalue;
1045
1046 options->dropoff_2nd_pass = myargs[ARG_XDROP_UNGAPPED].floatvalue;
1047 options->expect_value = (Nlm_FloatHi) myargs[ARG_EVALUE].floatvalue;
1048 options->kappa_expect_value = options->expect_value;
1049 options->hitlist_size = MAX(bop->number_of_descriptions,
1050 bop->number_of_alignments);
1051
1052 if (myargs[ARG_MULTIPLEHITS].intvalue == 1) {
1053 options->two_pass_method = FALSE;
1054 options->multiple_hits_only = FALSE;
1055 } else {
1056 /* all other inputs, including the default 0 use 2-hit method */
1057 options->two_pass_method = FALSE;
1058 options->multiple_hits_only = TRUE;
1059 }
1060 options->gap_open = myargs[ARG_GAPOPEN].intvalue;
1061 options->gap_extend = myargs[ARG_GAPEXT].intvalue;
1062
1063 #ifdef YES_TO_DECLINE_TO_ALIGN
1064 if(myargs[ARG_COST_DECLINE_ALIGN].intvalue != 0) {
1065 options->discontinuous = TRUE;
1066 options->decline_align = myargs[ARG_COST_DECLINE_ALIGN].intvalue;
1067 } else {
1068 options->discontinuous = FALSE;
1069 options->decline_align = INT2_MAX;
1070 }
1071 #endif
1072
1073 options->gap_x_dropoff = myargs[ARG_XDROP_GAPPED].intvalue;
1074 options->gap_x_dropoff_final = myargs[ARG_XDROP_FINAL].intvalue;
1075 options->gap_trigger = myargs[ARG_GAP_TRIGGER].floatvalue;
1076
1077 if (StringICmp(myargs[ARG_FILTER].strvalue, "T") == 0) {
1078 options->filter_string = StringSave("S");
1079 } else {
1080 options->filter_string = StringSave(myargs[9].strvalue);
1081 }
1082
1083 options->number_of_cpus = (Int2) myargs[ARG_THREADS].intvalue;
1084
1085
1086 options->isPatternSearch = FALSE;
1087
1088 if (0 != (StringCmp("blastpgp",myargs[ARG_PHI_PROGRAM].strvalue))) {
1089 options->isPatternSearch = TRUE;
1090 options->discontinuous = FALSE;
1091 options->decline_align = INT2_MAX;
1092 bop->program_flag = convertProgramToFlag(myargs[ARG_PHI_PROGRAM].strvalue,
1093 &is_dna);
1094 }
1095
1096 if (options->isPatternSearch) {
1097 bop->patfile = StringSave(myargs[ARG_PHI_HITFILE].strvalue);
1098 if ((bop->patfp = FileOpen(bop->patfile, "r")) == NULL) {
1099 ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open pattern "
1100 "file %s\n", bop->patfile);
1101 return NULL;
1102 }
1103
1104 bop->seedSearch = (seedSearchItems *)
1105 ckalloc(sizeof(seedSearchItems));
1106 }
1107
1108 if(options->isPatternSearch)
1109 fillCandLambda(bop->seedSearch, myargs[ARG_MATRIX].strvalue, options);
1110
1111 options->ethresh = (Nlm_FloatHi) myargs[ARG_EVALUE_INCLUSION_THRESHOLD].floatvalue;
1112 options->pseudoCountConst = myargs[ARG_PSEUDOCOUNT_CONSTANT].intvalue;
1113 options->maxNumPasses = myargs[ARG_MAX_PASSES].intvalue;
1114 /*zero out e-value threshold if it will not be used*/
1115 if (options->maxNumPasses == 1)
1116 options->ethresh = 0.0;
1117 if (myargs[ARG_WORDSIZE].intvalue)
1118 options->wordsize = myargs[ARG_WORDSIZE].intvalue;
1119 if (myargs[ARG_DBSIZE].floatvalue)
1120 options->db_length = (Int8) myargs[ARG_DBSIZE].floatvalue;
1121
1122 options->hsp_range_max = myargs[ARG_BEST_HITS].intvalue;
1123 if (options->hsp_range_max != 0)
1124 options->perform_culling = TRUE;
1125
1126 options->block_width = 20; /* Default value - previously '-L' parameter */
1127
1128 if (myargs[ARG_SEARCHSP].floatvalue)
1129 options->searchsp_eff = (Nlm_FloatHi) myargs[ARG_SEARCHSP].floatvalue;
1130
1131 switch (myargs[ARG_COMP_BASED_STATS].strvalue[0]) {
1132 case 'F':
1133 case 'f':
1134 case '0':
1135 options->tweak_parameters = eNoCompositionBasedStats;
1136 break;
1137 case 'T':
1138 case 't':
1139 case '2':
1140 options->tweak_parameters = eCompositionMatrixAdjust;
1141 break;
1142 case '1':
1143 options->tweak_parameters = eCompositionBasedStats;
1144 break;
1145 case '3':
1146 ErrPostEx(SEV_WARNING, 1, 0, "the -t 3 argument "
1147 "is currently experimental\n");
1148 options->tweak_parameters = eCompoForceFullMatrixAdjust;
1149 break;
1150 default:
1151 ErrPostEx(SEV_FATAL, 1, 0, "invalid argument for composition-"
1152 "based statistics; see -t options\n");
1153 break;
1154 }
1155 options->unified_p = 0;
1156 if (options->tweak_parameters > eNoCompositionBasedStats) {
1157 switch (myargs[ARG_COMP_BASED_STATS].strvalue[1]) {
1158 case 'U':
1159 case 'u':
1160 options->unified_p = 1;
1161 ErrPostEx(SEV_WARNING, 1, 0, "unified p-values "
1162 "are currently experimental\n");
1163 break;
1164 case '\0':
1165 break;
1166 default:
1167 ErrPostEx(SEV_WARNING, 1, 0, "unrecognized second character"
1168 "in value of -t, ignoring it\n");
1169 break;
1170 }
1171 }
1172 if ((options->tweak_parameters > 1) &&
1173 ((NULL != myargs[ARG_CHECKPOINT_INPUT].strvalue) ||
1174 (NULL != myargs[ARG_ALIGNMENT_IN].strvalue))) {
1175 ErrPostEx(SEV_WARNING, 1, 0,
1176 "-t larger than 1 not supported when restarting "
1177 "from a checkpoint; setting -t to 1\n");
1178 options->tweak_parameters = 1;
1179 }
1180 options->smith_waterman = (Boolean) myargs[ARG_SMITHWATERMAN].intvalue;
1181
1182 /* Seting list of gis to restrict search */
1183 if (myargs[ARG_GILIST].strvalue) {
1184 options->gifile = StringSave(myargs[ARG_GILIST].strvalue);
1185 }
1186
1187 options = BLASTOptionValidate(options, "blastp");
1188
1189 if (options == NULL)
1190 return NULL;
1191
1192 if(bop->believe_query) {
1193 bop->fake_bsp = bop->query_bsp;
1194 } else {
1195 bop->fake_bsp = BlastMakeFakeBioseq(bop->query_bsp, NULL);
1196
1197 BLASTUpdateSeqIdInSeqInt(bop->options->query_lcase_mask,
1198 bop->fake_bsp->id);
1199 }
1200
1201 return bop;
1202 }
1203
PGPReadNextQuerySequence(PGPBlastOptionsPtr bop)1204 Boolean PGPReadNextQuerySequence(PGPBlastOptionsPtr bop)
1205 {
1206 SeqEntryPtr sep;
1207
1208 /* Cleaning up stuff from previous query run */
1209
1210 bop->options->query_lcase_mask = SeqLocSetFree(bop->options->query_lcase_mask);
1211
1212 if (bop->believe_query == TRUE) { /* Not corect - to be fixed */
1213 BioseqFree(bop->query_bsp);
1214 } else {
1215 BioseqFree(bop->fake_bsp);
1216 }
1217
1218 /* scoremats contain only one sequence */
1219 if(myargs[ARG_CHECKPOINT_INPUT].intvalue != NO_SCOREMAT_IO) {
1220 return FALSE;
1221 }
1222
1223 if(myargs[ARG_LCASE].intvalue) {
1224 if((sep = FastaToSeqEntryForDb (bop->infp, FALSE, NULL, bop->believe_query, NULL, NULL, &bop->options->query_lcase_mask)) == NULL) {
1225 return FALSE;
1226 }
1227 } else {
1228 if((sep = FastaToSeqEntryEx(bop->infp, FALSE, NULL,
1229 bop->believe_query)) == NULL) {
1230 return FALSE;
1231 }
1232 }
1233
1234 bop->query_bsp = NULL;
1235 SeqEntryExplore(sep, &bop->query_bsp, FindProt);
1236
1237 if (bop->query_bsp == NULL) {
1238 ErrPostEx(SEV_FATAL, 1, 0, "Unable to obtain bioseq\n");
1239 return 0;
1240 }
1241
1242 if(bop->believe_query) {
1243 bop->fake_bsp = bop->query_bsp;
1244 } else {
1245 bop->fake_bsp = BlastMakeFakeBioseq(bop->query_bsp, NULL);
1246
1247 BLASTUpdateSeqIdInSeqInt(bop->options->query_lcase_mask,
1248 bop->fake_bsp->id);
1249 }
1250
1251 return TRUE;
1252 }
1253
PGPFormatHeader(PGPBlastOptionsPtr bop)1254 Boolean PGPFormatHeader(PGPBlastOptionsPtr bop)
1255 {
1256 Boolean html = myargs[ARG_HTML].intvalue;
1257
1258 if (bop->outfp == NULL)
1259 return FALSE;
1260
1261 if (html)
1262 fprintf(bop->outfp, "<PRE>\n");
1263
1264 init_buff_ex(90);
1265 BlastPrintVersionInfo("blastp", html, bop->outfp);
1266 fprintf(bop->outfp, "\n");
1267 BlastPrintReference(html, 90, bop->outfp);
1268 fprintf(bop->outfp, "\n");
1269 if (bop->options->tweak_parameters > 1) {
1270 CAdjustmentPrintReference(html, 90, bop->outfp);
1271 fprintf(bop->outfp, "\n");
1272 }
1273 if ((1== bop->options->tweak_parameters) ||
1274 ((1 < bop->options->tweak_parameters) && (bop->options->maxNumPasses > 1))) {
1275 CBStatisticsPrintReference(html, 90, (1 == bop->options->tweak_parameters),
1276 (bop->options->maxNumPasses > 1), bop->outfp);
1277 fprintf(bop->outfp, "\n");
1278 }
1279 AcknowledgeBlastQuery(bop->query_bsp, 70,
1280 bop->outfp, bop->believe_query, html);
1281 PrintDbInformation(bop->blast_database, TRUE, 70, bop->outfp, html);
1282 free_buff();
1283
1284 return TRUE;
1285 }
PGPFormatFooter(PGPBlastOptionsPtr bop,BlastSearchBlkPtr search)1286 Boolean PGPFormatFooter(PGPBlastOptionsPtr bop, BlastSearchBlkPtr search)
1287 {
1288 ValNodePtr vnp;
1289 BLAST_KarlinBlkPtr ka_params=NULL, ka_params_gap=NULL;
1290 TxDfDbInfoPtr dbinfo=NULL, dbinfo_head;
1291 CharPtr params_buffer=NULL;
1292 ValNodePtr other_returns;
1293 Boolean html = myargs[ARG_HTML].intvalue;
1294
1295 if (bop->outfp == NULL)
1296 return FALSE;
1297
1298 if (html)
1299 fprintf(bop->outfp, "<PRE>\n");
1300
1301 other_returns = BlastOtherReturnsPrepare(search);
1302 for (vnp=other_returns; vnp; vnp = vnp->next) {
1303 switch (vnp->choice) {
1304 case TXDBINFO:
1305 dbinfo = vnp->data.ptrvalue;
1306 break;
1307 case TXKABLK_NOGAP:
1308 ka_params = vnp->data.ptrvalue;
1309 break;
1310 case TXKABLK_GAP:
1311 ka_params_gap = vnp->data.ptrvalue;
1312 break;
1313 case TXPARAMETERS:
1314 params_buffer = vnp->data.ptrvalue;
1315 break;
1316 default:
1317 break;
1318 }
1319 }
1320 init_buff_ex(85);
1321 dbinfo_head = dbinfo;
1322 while (dbinfo) {
1323 PrintDbReport(dbinfo, 70, bop->outfp);
1324 dbinfo = dbinfo->next;
1325 }
1326 if (ka_params && !bop->options->isPatternSearch) {
1327 PrintKAParameters(ka_params->Lambda, ka_params->K,
1328 ka_params->H, 70, bop->outfp, FALSE);
1329 }
1330 if (ka_params_gap) {
1331 if (bop->options->isPatternSearch)
1332 PrintKAParametersExtra(ka_params_gap->Lambda,
1333 ka_params_gap->K, ka_params_gap->H,
1334 bop->seedSearch->paramC, 70,
1335 bop->outfp, FALSE);
1336 else
1337 PrintKAParameters(ka_params_gap->Lambda, ka_params_gap->K,
1338 ka_params_gap->H, 70, bop->outfp,
1339 FALSE);
1340 }
1341 PrintTildeSepLines(params_buffer, 70, bop->outfp);
1342
1343 free_buff();
1344 BlastOtherReturnsFree(other_returns);
1345
1346 return TRUE;
1347 }
1348
PGPrintPosMatrix(CharPtr filename,posSearchItems * posSearch,compactSearchItems * compactSearch,Boolean posComputationCalled)1349 Boolean PGPrintPosMatrix(CharPtr filename, posSearchItems *posSearch,
1350 compactSearchItems *compactSearch,
1351 Boolean posComputationCalled)
1352 {
1353 FILE *fp;
1354
1355 if ((fp = FileOpen(filename, "w")) == NULL) {
1356 ErrPostEx(SEV_FATAL, 1, 0, "Unable to open matrix output file %s\n",
1357 filename);
1358 return FALSE;
1359 }
1360
1361 /* a diagnostic, partly an option with -Q. */
1362 outputPosMatrix(posSearch, compactSearch, fp, posComputationCalled);
1363 FileClose(fp);
1364
1365 return TRUE;
1366 }
1367
PGPSeedSearch(PGPBlastOptionsPtr bop,BlastSearchBlkPtr search,posSearchItems * posSearch,SeqLocPtr PNTR seqloc_duplicate,SeqAlignPtr PNTR PNTR lastSeqAligns,Int4Ptr numLastSeqAligns)1368 SeqAlignPtr PGPSeedSearch(PGPBlastOptionsPtr bop, BlastSearchBlkPtr search,
1369 posSearchItems *posSearch,
1370 SeqLocPtr PNTR seqloc_duplicate,
1371 SeqAlignPtr PNTR PNTR lastSeqAligns,
1372 Int4Ptr numLastSeqAligns)
1373 {
1374 Uint1Ptr query = NULL; /*query sequence read in*/
1375 Uint1Ptr unfilter_query = NULL; /*needed if seg will filter query*/
1376 SeqLocPtr seg_slp; /*pointer to structure for seg filtering*/
1377 Int4 i, queryLength; /*length of query sequence*/
1378 SeqAlignPtr head;
1379 SeqAnnotPtr annot;
1380 SeqFeatPtr sfp;
1381 SeqLocPtr seqloc, next;
1382 ValNodePtr seedReturn; /*return value from seedEngineCore, which
1383 is a list of lists of SeqAligns, one
1384 list of SeqAligns per pattern occurrence*/
1385
1386 ValNodePtr vnp, info_vnp; /* Output text messages from seedEngineCore() */
1387
1388 query = BlastGetSequenceFromBioseq(bop->fake_bsp, &queryLength);
1389 seg_slp = BlastBioseqFilter(bop->fake_bsp,
1390 bop->options->filter_string);
1391 if (seg_slp) {
1392 unfilter_query = MemNew((queryLength + 1) * sizeof(Uint1));
1393 for (i = 0; i < queryLength; i++)
1394 unfilter_query[i] = query[i];
1395 BlastMaskTheResidues(query,queryLength,21,seg_slp,FALSE, 0);
1396 }
1397
1398 search->gap_align = GapAlignBlkNew(1,1);
1399 search->gap_align->gap_open = bop->options->gap_open;
1400 search->gap_align->gap_extend = bop->options->gap_extend;
1401
1402
1403 search->gap_align->decline_align = INT2_MAX;
1404
1405 #ifdef YES_TO_DECLINE_TO_ALIGN
1406 if(myargs[ARG_COST_DECLINE_ALIGN].intvalue != 0) {
1407 search->gap_align->decline_align = myargs[ARG_COST_DECLINE_ALIGN].intvalue;
1408 } else {
1409 search->gap_align->decline_align = INT2_MAX;
1410 }
1411 #endif
1412
1413 /* search->gap_align->decline_align = (-(BLAST_SCORE_MIN)); */
1414 /* search->gap_align->decline_align = myargs[ARG_LCASE].intvalue; */
1415
1416 search->gap_align->x_parameter = bop->options->gap_x_dropoff
1417 *NCBIMATH_LN2/bop->seedSearch->paramLambda;
1418 search->gap_align->matrix = search->sbp->matrix;
1419 initProbs(bop->seedSearch);
1420 init_order(search->gap_align->matrix, bop->program_flag,
1421 FALSE, bop->seedSearch);
1422
1423 for(i = 0; i < queryLength; i++)
1424 query[i] = bop->seedSearch->order[query[i]];
1425
1426 if (unfilter_query) {
1427 for(i = 0; i < queryLength; i++)
1428 unfilter_query[i] = bop->seedSearch->order[unfilter_query[i]];
1429 }
1430
1431 seqloc = NULL;
1432 seedReturn = seedEngineCore(search, bop->options, query,
1433 unfilter_query,
1434 bop->blast_database, bop->patfile,
1435 bop->program_flag,
1436 bop->patfp, FALSE, FALSE,
1437 bop->seedSearch, bop->options->ethresh,
1438 myargs[ARG_SEARCHSP].floatvalue, posSearch,
1439 &seqloc, TRUE, &info_vnp);
1440 if (!bop->is_xml_output)
1441 PGPOutTextMessages(info_vnp, bop->outfp);
1442 ValNodeFreeData(info_vnp);
1443
1444 if (search->error_return) {
1445 BlastErrorPrint(search->error_return);
1446 for (vnp = search->error_return; vnp; vnp = vnp->next) {
1447 BlastDestroyErrorMessage((BlastErrorMsgPtr)vnp->data.ptrvalue);
1448 }
1449 search->error_return = ValNodeFree(search->error_return);
1450 }
1451 *seqloc_duplicate = seqloc;
1452 head = convertValNodeListToSeqAlignList(seedReturn, lastSeqAligns,
1453 numLastSeqAligns);
1454
1455 bop->featureOrder[FEATDEF_REGION] = 1;
1456 bop->groupOrder[FEATDEF_REGION] = 1;
1457 annot = bop->fake_bsp->annot = SeqAnnotNew();
1458 bop->fake_bsp->annot->type = 1; /* ftable. */
1459 while (seqloc) {
1460 next = seqloc->next;
1461 sfp = SeqFeatNew();
1462 sfp->location = seqloc;
1463 sfp->data.choice = SEQFEAT_REGION;
1464 sfp->data.value.ptrvalue = StringSave("pattern");
1465 annot->data = sfp;
1466 if (next) {
1467 annot->next = SeqAnnotNew();
1468 annot = annot->next;
1469 annot->type = 1;
1470 }
1471 seqloc = next;
1472 }
1473
1474 if (query != NULL)
1475 MemFree(query);
1476 if (unfilter_query != NULL)
1477 unfilter_query = MemFree(unfilter_query);
1478
1479 return head;
1480 }
1481
PGPFormatMainOutput(SeqAlignPtr head,PGPBlastOptionsPtr bop,BlastSearchBlkPtr search,Int4 thisPassNum,SeqAlignPtr PNTR lastSeqAligns,Int4 numLastSeqAligns,SeqLocPtr seed_seqloc,Int2Ptr posRepeatSequences)1482 void PGPFormatMainOutput(SeqAlignPtr head, PGPBlastOptionsPtr bop,
1483 BlastSearchBlkPtr search, Int4 thisPassNum,
1484 SeqAlignPtr PNTR lastSeqAligns,
1485 Int4 numLastSeqAligns, SeqLocPtr seed_seqloc,
1486 Int2Ptr posRepeatSequences)
1487 {
1488 SeqAnnotPtr seqannot;
1489 ValNodePtr pruneSeed = NULL, seedReturn;
1490 BlastPruneSapStructPtr prune = NULL;
1491 BLAST_MatrixPtr matrix;
1492 Int4Ptr PNTR txmatrix;
1493 BioseqPtr query_bsp;
1494
1495 if(head == NULL && myargs[ARG_FORMAT].intvalue < 7) {
1496 fprintf(bop->outfp, "\n\n ***** No hits found ******\n\n");
1497 return;
1498 }
1499
1500 if (myargs[ARG_FORMAT].intvalue == 8 || myargs[ARG_FORMAT].intvalue == 9) {
1501 query_bsp = (bop->believe_query) ? bop->query_bsp : bop->fake_bsp;
1502 if (myargs[ARG_FORMAT].intvalue == 9)
1503 PrintTabularOutputHeader(bop->blast_database, query_bsp, NULL,
1504 "blastp", thisPassNum, bop->believe_query,
1505 bop->outfp);
1506 BlastPrintTabulatedResults(head, query_bsp, NULL,
1507 bop->number_of_alignments,
1508 "blastp", FALSE,
1509 bop->believe_query, 0, 0, bop->outfp, FALSE);
1510 return;
1511 }
1512
1513 seqannot = SeqAnnotNew();
1514 seqannot->type = 2;
1515 AddAlignInfoToSeqAnnot(seqannot, 2);
1516 seqannot->data = head;
1517
1518 if (search->pbp->maxNumPasses != 1 && myargs[ARG_FORMAT].intvalue < 7) {
1519 fprintf(bop->outfp, "\nResults from round %d\n",
1520 thisPassNum);
1521 }
1522
1523 ObjMgrSetHold();
1524 init_buff_ex(85);
1525
1526 /* ------- Printing deflines for the BLAST output ------- */
1527
1528 if (thisPassNum == 1) {
1529 search->positionBased = FALSE;
1530 if (!bop->options->isPatternSearch) {
1531 prune = BlastPruneHitsFromSeqAlign(head,
1532 bop->number_of_descriptions, NULL);
1533 PrintDefLinesFromSeqAlign(prune->sap, 80, bop->outfp,
1534 bop->print_options, FIRST_PASS, NULL);
1535 } else {
1536 seedReturn = convertSeqAlignListToValNodeList(head,lastSeqAligns,
1537 numLastSeqAligns);
1538
1539 pruneSeed = SeedPruneHitsFromSeedReturn(seedReturn,
1540 bop->number_of_descriptions);
1541 PrintDefLinesExtra(pruneSeed, 80, bop->outfp, bop->print_options,
1542 FIRST_PASS, NULL, seed_seqloc);
1543 }
1544 } else {
1545 prune = BlastPruneHitsFromSeqAlign(head,
1546 bop->number_of_descriptions, NULL);
1547 if (ALL_ROUNDS) {
1548 PrintDefLinesFromSeqAlign(prune->sap, 80, bop->outfp,
1549 bop->print_options,
1550 NOT_FIRST_PASS_REPEATS,
1551 posRepeatSequences);
1552 PrintDefLinesFromSeqAlign(prune->sap, 80, bop->outfp,
1553 bop->print_options, NOT_FIRST_PASS_NEW,
1554 posRepeatSequences);
1555 } else
1556 PrintDefLinesFromSeqAlign(prune->sap, 80, bop->outfp,
1557 bop->print_options, FIRST_PASS, NULL);
1558 } /*thisPassNum == 1 */
1559
1560 /* ------- --------------------------------------- ------- */
1561
1562 if (ALL_ROUNDS && search->posConverged && myargs[ARG_FORMAT].intvalue < 7) {
1563 fprintf(bop->outfp, "\nCONVERGED!\n");
1564 }
1565
1566 free_buff();
1567
1568 matrix = NULL;
1569 txmatrix = NULL;
1570 if (search->sbp->matrix) {
1571 matrix = BLAST_MatrixFill(search->sbp, FALSE);
1572 txmatrix = BlastMatrixToTxMatrix(matrix);
1573 }
1574
1575 if (!(bop->options->isPatternSearch)) {
1576 prune = BlastPruneHitsFromSeqAlign(head, bop->number_of_alignments,
1577 prune);
1578 seqannot->data = prune->sap;
1579
1580 if(bop->options->discontinuous) {
1581 if(!DDV_DisplayBlastPairList(prune->sap, search->mask,
1582 bop->outfp, FALSE,
1583 bop->align_options,
1584 bop->align_options & TXALIGN_HTML ? 6 : 1)) {
1585 fprintf(stdout,
1586 "\n\n!!!\n "
1587 " -------- Failure to print alignment... --------"
1588 "\n!!!\n\n");
1589 fflush(stdout);
1590 }
1591 } else { /* Old type formating */
1592 if (myargs[ARG_FORMAT].intvalue != 0) {
1593 ShowTextAlignFromAnnot2(seqannot, 60, bop->outfp,
1594 bop->featureOrder, bop->groupOrder,
1595 bop->align_options, NULL,
1596 search->mask, NULL,
1597 NULL, NULL);
1598 } else {
1599 ShowTextAlignFromAnnot2(seqannot, 60, bop->outfp,
1600 bop->featureOrder, bop->groupOrder,
1601 bop->align_options, txmatrix,
1602 search->mask, FormatScoreFunc,
1603 NULL, NULL);
1604 }
1605 }
1606
1607 /* seqannot->data = head; */
1608
1609 } else {
1610 if (bop->number_of_alignments < bop->number_of_descriptions) {
1611 pruneSeed = SeedPruneHitsFromSeedReturn(pruneSeed,
1612 bop->number_of_alignments);
1613 }
1614
1615
1616 if (myargs[ARG_FORMAT].intvalue != 0) {
1617 ShowTextAlignFromAnnotExtra(bop->query_bsp, pruneSeed,
1618 seed_seqloc, 60, bop->outfp,
1619 bop->featureOrder, bop->groupOrder,
1620 bop->align_options, NULL,
1621 search->mask, NULL);
1622 } else {
1623 ShowTextAlignFromAnnotExtra(bop->query_bsp, pruneSeed,
1624 seed_seqloc, 60, bop->outfp,
1625 bop->featureOrder, bop->groupOrder,
1626 bop->align_options, txmatrix,
1627 search->mask, FormatScoreFunc);
1628 }
1629 }
1630
1631 if (!(bop->options->isPatternSearch)) {
1632 prune = BlastPruneSapStructDestruct(prune);
1633 }
1634
1635 search->positionBased = TRUE;
1636 ObjMgrClearHold();
1637 ObjMgrFreeCache(0);
1638
1639 matrix = BLAST_MatrixDestruct(matrix);
1640 if (txmatrix)
1641 txmatrix = TxMatrixDestruct(txmatrix);
1642
1643 seqannot->data = NULL;
1644 seqannot = SeqAnnotFree(seqannot);
1645
1646 return;
1647 }
1648
PGPSeqAlignOut(PGPBlastOptionsPtr bop,SeqAlignPtr head)1649 void PGPSeqAlignOut(PGPBlastOptionsPtr bop, SeqAlignPtr head)
1650 {
1651 SeqAnnotPtr seqannot;
1652
1653 if (!bop->aip_out || !head)
1654 return;
1655
1656 seqannot = SeqAnnotNew();
1657 seqannot->type = 2;
1658 AddAlignInfoToSeqAnnot(seqannot, 2);
1659 seqannot->data = head;
1660 SeqAnnotAsnWrite((SeqAnnotPtr) seqannot, bop->aip_out, NULL);
1661 AsnIoReset(bop->aip_out);
1662 /*
1663 bop->aip_out = AsnIoClose(bop->aip_out);
1664 */
1665 seqannot->data = NULL;
1666 seqannot = SeqAnnotFree(seqannot);
1667 }
1668
Main(void)1669 Int2 Main (void)
1670
1671 {
1672 PGPBlastOptionsPtr bop;
1673 BlastSearchBlkPtr search;
1674 SeqAlignPtr head = NULL;
1675 SeqLocPtr seed_seqloc = NULL;
1676 Char buf[256] = { '\0' };
1677
1678 /* used for psi-blast */
1679 Int4 thisPassNum;
1680 posSearchItems *posSearch;
1681 compactSearchItems *compactSearch;
1682 Boolean recoverCheckpoint = FALSE;
1683 Boolean alreadyRecovered = FALSE;
1684 Boolean freqCheckpoint = FALSE;
1685 Boolean alignCheckpoint = FALSE;
1686 Boolean posComputationCalled = FALSE;
1687 Boolean checkReturn = FALSE;
1688 Boolean next_query = FALSE;
1689 Boolean tabular_output;
1690 AsnIoPtr aip = NULL;
1691
1692 SeqAlignPtr PNTR lastSeqAligns = NULL;
1693 /*keeps track of the last SeqAlign in
1694 each list of seedReturn so that the
1695 2-level list can be converted to a 1-level
1696 list and then back to 2-level*/
1697 Int4 numLastSeqAligns = 0;
1698
1699 PSIXmlPtr psixp = NULL;
1700 ValNodePtr vnp;
1701
1702 /* ----- End of definitions ----- */
1703
1704 StringCpy(buf, "blastpgp ");
1705 StringNCat(buf, BlastGetVersionNumber(), sizeof(buf)-StringLen(buf)-1);
1706 if (! GetArgs (buf, NUMARG, myargs))
1707 return (1);
1708 ErrSetMessageLevel(SEV_WARNING);
1709
1710 UseLocalAsnloadDataAndErrMsg ();
1711
1712 if (! SeqEntryLoad())
1713 return 1;
1714
1715 if ((bop = PGPReadBlastOptions()) == NULL)
1716 return 1;
1717
1718 if(bop->is_xml_output) {
1719 if((aip = AsnIoOpen(myargs[ARG_OUT].strvalue, "wx")) == NULL)
1720 return 1;
1721 }
1722
1723 /* Here we will start main do/while loop over many query entries in
1724 the input file. If there is an option for checkpoint recover or
1725 Output File for PSI-BLAST Checkpointing all but first entries in
1726 the input file will be discarded */
1727
1728 do {
1729 search = BLASTSetUpSearchWithReadDb(bop->fake_bsp, "blastp",
1730 bop->query_bsp->length,
1731 bop->blast_database,
1732 bop->options, NULL);
1733
1734 if (search == NULL)
1735 return 1;
1736
1737
1738 if(bop->is_xml_output) {
1739 psixp = PSIXmlInit(aip, "blastp", bop->blast_database,
1740 bop->options,
1741 bop->fake_bsp, 0);
1742 }
1743
1744 /*AAS*/
1745 if ((NULL != myargs[ARG_CHECKPOINT_INPUT].strvalue) || (NULL != myargs[ARG_ALIGNMENT_IN].strvalue)) {
1746 recoverCheckpoint = TRUE;
1747 if (NULL != myargs[ARG_CHECKPOINT_INPUT].strvalue) {
1748 freqCheckpoint = TRUE;
1749 alignCheckpoint = FALSE;
1750 } else {
1751 freqCheckpoint = FALSE;
1752 alignCheckpoint = TRUE;
1753 }
1754 }
1755
1756 if (recoverCheckpoint)
1757 search->positionBased = TRUE;
1758 else
1759 search->positionBased = FALSE;
1760
1761 global_fp = bop->outfp;
1762 tabular_output = (myargs[ARG_FORMAT].intvalue == 8 || myargs[ARG_FORMAT].intvalue == 9);
1763
1764 if(!bop->is_xml_output && !tabular_output) {
1765 PGPFormatHeader(bop);
1766 }
1767
1768 posSearch = NULL;
1769 thisPassNum = 0;
1770 compactSearch = NULL;
1771 search->posConverged = FALSE;
1772 global_fp = bop->outfp;
1773 search->error_return = NULL;
1774 /*AAS*/
1775 if (recoverCheckpoint) {
1776 posSearch = (posSearchItems *) MemNew(1 * sizeof(posSearchItems));
1777 compactSearch = compactSearchNew(compactSearch);
1778 copySearchItems(compactSearch, search, bop->options->matrix);
1779 posInitializeInformation(posSearch,search);
1780 /*AAS*/
1781 if (freqCheckpoint) {
1782 checkReturn = posReadCheckpoint(posSearch, compactSearch, myargs[ARG_CHECKPOINT_INPUT].strvalue, myargs[ARG_SCOREMAT_INPUT].intvalue, &(search->error_return));
1783 search->sbp->posMatrix = posSearch->posMatrix;
1784 if (NULL == search->sbp->posFreqs)
1785 search->sbp->posFreqs = allocatePosFreqs(compactSearch->qlength, compactSearch->alphabetSize);
1786 copyPosFreqs(posSearch->posFreqs,search->sbp->posFreqs, compactSearch->qlength, compactSearch->alphabetSize);
1787 } else {
1788 search->sbp->posMatrix = BposComputation(posSearch, search, compactSearch, myargs[ARG_ALIGNMENT_IN].strvalue, myargs[ARG_CHECKPOINT_OUTPUT].strvalue, myargs[ARG_SCOREMAT_OUTPUT].intvalue, bop->query_bsp, myargs[ARG_GAPOPEN].intvalue, myargs[ARG_GAPEXT].intvalue, &(search->error_return));
1789 posComputationCalled = TRUE;
1790 if (NULL == search->sbp->posMatrix)
1791 checkReturn = FALSE;
1792 else
1793 checkReturn = TRUE;
1794 }
1795
1796
1797 if (search->error_return) {
1798 BlastErrorPrint(search->error_return);
1799 for (vnp = search->error_return; vnp; vnp = vnp->next) {
1800 BlastDestroyErrorMessage((BlastErrorMsgPtr)vnp->data.ptrvalue);
1801 }
1802 search->error_return = ValNodeFree(search->error_return);
1803 }
1804 if (!checkReturn) {
1805 ErrPostEx(SEV_FATAL, 1, 0, "blast: Error recovering from checkpoint");
1806 return 1;
1807 }
1808
1809 /* Print out Pos matrix if necessary */
1810 if (myargs[ARG_MATRIX_OUT].strvalue != NULL)
1811 PGPrintPosMatrix(myargs[ARG_MATRIX_OUT].strvalue, posSearch, compactSearch, posComputationCalled);
1812 }
1813
1814 do { /*AAS*/
1815 thisPassNum++;
1816 if (thisPassNum > 1)
1817 bop->options->isPatternSearch = FALSE;
1818
1819 if(head != NULL)
1820 head = SeqAlignSetFree(head);
1821
1822 #ifdef OS_UNIX
1823 if(!bop->is_xml_output && !tabular_output && !bop->is_asn1_output) {
1824 search->thr_info->tick_callback = tick_callback;
1825 fprintf(global_fp, "%s", "Searching");
1826 fflush(global_fp);
1827 }
1828 #endif
1829 if (1 == thisPassNum && (!recoverCheckpoint)) {
1830
1831 posSearch = (posSearchItems *)
1832 MemNew(1 * sizeof(posSearchItems));
1833 }
1834
1835 /* ----- Here is real BLAST search done ------- */
1836
1837 if (bop->options->isPatternSearch &&
1838 (1 == thisPassNum && (!recoverCheckpoint))) {
1839 head = PGPSeedSearch(bop, search, posSearch,
1840 &seed_seqloc,
1841 &lastSeqAligns, &numLastSeqAligns);
1842 } else {
1843 if ((bop->options->tweak_parameters > 1) &&
1844 (1 == thisPassNum && (!recoverCheckpoint))) {
1845 /* round 1 and not recovering from checkpoint;
1846 * relax the cutoff evalue so that we don't loose
1847 * too many hits for which compositional adjustment
1848 * improves the evalue. */
1849 bop->options->expect_value =
1850 bop->options->kappa_expect_value * EVALUE_EXPAND;
1851 search->pbp->cutoff_e = bop->options->expect_value;
1852 } else {
1853 /* For pass > 1, set all expect_values equal */
1854 search->pbp->cutoff_e =
1855 bop->options->expect_value =
1856 bop->options->kappa_expect_value;
1857 }
1858 if ((1 == thisPassNum) && (!recoverCheckpoint))
1859 head = BioseqBlastEngineCore(search, bop->options, NULL);
1860 else
1861 head = BioseqBlastEngineCore(search, bop->options,
1862 search->sbp->posMatrix);
1863 }
1864 /* ---------------------------------------------- */
1865
1866 if (recoverCheckpoint) {
1867 compactSearchDestruct(compactSearch);
1868 recoverCheckpoint = FALSE;
1869 alreadyRecovered = TRUE;
1870 }
1871
1872 if (thisPassNum == 1) {
1873 compactSearch = compactSearchNew(compactSearch);
1874 } else {
1875 MemFree(compactSearch->standardProb);
1876 }
1877
1878 copySearchItems(compactSearch, search, bop->options->matrix);
1879
1880
1881 /* The next two calls (after the "if") are diagnostics
1882 for Stephen. Don't perform this if only one pass will
1883 be made (i.e., standard BLAST) */
1884
1885 if (ALL_ROUNDS && 1 != search->pbp->maxNumPasses) {
1886 if ((1 == thisPassNum) && (!alreadyRecovered))
1887 posInitializeInformation(posSearch, search);
1888 posPrintInformation(posSearch, search, thisPassNum);
1889 }
1890
1891 #ifdef OS_UNIX
1892 if(!bop->is_xml_output && !tabular_output && !bop->is_asn1_output) {
1893 fprintf(global_fp, "%s", "done\n\n");
1894 }
1895 #endif
1896
1897 /* AAS */
1898 if (thisPassNum == 1) {
1899 ReadDBBioseqFetchEnable ("blastpgp", bop->blast_database,
1900 FALSE, TRUE);
1901 } else {
1902
1903 /* Have things converged? */
1904 if (ALL_ROUNDS && search->pbp->maxNumPasses != 1) {
1905 posConvergenceTest(posSearch, search, head, thisPassNum);
1906 }
1907 }
1908
1909 /*AAS*/
1910 search->positionBased = TRUE;
1911
1912 /* Here is all BLAST formating of the main output done */
1913
1914 if(bop->is_xml_output) {
1915 ValNodePtr other_returns;
1916 IterationPtr iterp;
1917 ValNodePtr vnp = search->mask;
1918 /* Avoid linking masking locations into other_returns,
1919 lest they will be freed */
1920 search->mask = NULL;
1921 other_returns = BlastOtherReturnsPrepare(search);
1922 search->mask = vnp;
1923
1924 if (head == NULL) {
1925 iterp = BXMLBuildOneIteration(head, other_returns,
1926 bop->options->is_ooframe,
1927 !bop->options->gapped_calculation, thisPassNum,
1928 "No hits found", search->mask);
1929 } else {
1930 BlastPruneSapStructPtr prune =
1931 BlastPruneHitsFromSeqAlign(head, bop->number_of_alignments, NULL);
1932 iterp = BXMLBuildOneIteration(prune->sap, other_returns,
1933 bop->options->is_ooframe,
1934 !bop->options->gapped_calculation, thisPassNum,
1935 (search->posConverged ? "CONVERGED" : NULL),
1936 search->mask);
1937 prune = BlastPruneSapStructDestruct(prune);
1938 }
1939
1940 IterationAsnWrite(iterp, psixp->aip, psixp->atp);
1941 AsnIoFlush(psixp->aip);
1942
1943 IterationFree(iterp);
1944 BlastOtherReturnsFree(other_returns);
1945 } else if(!bop->is_asn1_output){
1946 PGPFormatMainOutput(head, bop, search, thisPassNum,
1947 lastSeqAligns, numLastSeqAligns,
1948 seed_seqloc, posSearch->posRepeatSequences);
1949 }
1950
1951 if (alreadyRecovered) {
1952 posCheckpointFreeMemory(posSearch, compactSearch->qlength);
1953 alreadyRecovered = FALSE;
1954 }
1955
1956 if (ALL_ROUNDS && thisPassNum > 1) {
1957 posCleanup(posSearch, compactSearch);
1958 }
1959
1960 if (!search->posConverged && (search->pbp->maxNumPasses == 0 ||
1961 (thisPassNum < search->pbp->maxNumPasses))) {
1962 if (ALL_ROUNDS) {
1963 search->sbp->posMatrix =
1964 CposComputation(posSearch, search, compactSearch,
1965 head, myargs[ARG_CHECKPOINT_OUTPUT].strvalue,
1966 (bop->options->isPatternSearch &&
1967 (1== thisPassNum)),
1968 myargs[ARG_SCOREMAT_OUTPUT].intvalue,
1969 bop->query_bsp,
1970 myargs[ARG_GAPOPEN].intvalue,
1971 myargs[ARG_GAPEXT].intvalue,
1972 &(search->error_return), 1.0); /*AAS*/
1973 posComputationCalled = TRUE;
1974 if (search->error_return) {
1975 BlastErrorPrint(search->error_return);
1976 for (vnp = search->error_return; vnp; vnp = vnp->next) {
1977 BlastDestroyErrorMessage((BlastErrorMsgPtr)vnp->data.ptrvalue);
1978 }
1979 search->error_return = ValNodeFree(search->error_return);
1980 }
1981 } else {
1982 search->sbp->posMatrix =
1983 WposComputation(compactSearch, head, search->sbp->posFreqs);
1984 posComputationCalled = TRUE;
1985 }
1986 #if 0
1987 /* DEBUG Printing of the matrix */
1988 {{
1989 Int4 i, j;
1990 FILE *fd;
1991
1992 fd = FileOpen("omatrix.out", "w");
1993 for(i = 0; i < bop->query_bsp->length; i++) {
1994 for(j = 0; j < 26; j++) {
1995 fprintf(fd, "%d ", search->sbp->posMatrix[i][j]);
1996 }
1997 fprintf(fd, "\n");
1998 }
1999 FileClose(fd);
2000 }}
2001 #endif
2002 } else {
2003 search->sbp->posMatrix = NULL;
2004 }
2005
2006 if (ALL_ROUNDS && thisPassNum > 1) {
2007 MemFree(posSearch->posRepeatSequences);
2008 }
2009
2010 if (!search->posConverged && (0 == search->pbp->maxNumPasses ||
2011 thisPassNum < search->pbp->maxNumPasses)) {
2012
2013 /* Print out pos matrix if necessary */
2014 if (ALL_ROUNDS && (myargs[ARG_MATRIX_OUT].strvalue != NULL))
2015 PGPrintPosMatrix(myargs[ARG_MATRIX_OUT].strvalue, posSearch,
2016 compactSearch, posComputationCalled);
2017 }
2018
2019 if ((bop->is_asn1_output || bop->aip_out != NULL) && head != NULL)
2020 PGPSeqAlignOut(bop, head);
2021
2022 } while (( 0 == search->pbp->maxNumPasses || thisPassNum < (search->pbp->maxNumPasses)) && (! (search->posConverged)));
2023
2024 head = SeqAlignSetFree(head);
2025
2026 /* Here we will print out footer of BLAST output */
2027
2028 /*need to temporarily adjust cutoff_e for printing*/
2029 if(!bop->is_xml_output && !tabular_output) {
2030 PGPFormatFooter(bop, search);
2031 }
2032
2033 /* PGPOneQueryCleanup */
2034
2035 if (bop->options->isPatternSearch) {
2036 bop->seedSearch = MemFree(bop->seedSearch);
2037 FileClose(bop->patfp);
2038 }
2039
2040 if (ALL_ROUNDS) {
2041 posFreeInformation(posSearch);
2042 MemFree(posSearch);
2043 }
2044 compactSearchDestruct(compactSearch);
2045 search = BlastSearchBlkDestruct(search);
2046
2047 /* If these options are set we are not going to proceed with another
2048 queryes in the input file if any */
2049
2050 if(recoverCheckpoint || myargs[ARG_MATRIX_OUT].strvalue != NULL)
2051 break;
2052
2053 next_query = FALSE;
2054 next_query = PGPReadNextQuerySequence(bop);
2055
2056 if (psixp) {
2057 PSIXmlReset(psixp);
2058 }
2059 } while (next_query); /* End of main do {} while (); loop */
2060
2061 ReadDBBioseqFetchDisable();
2062
2063
2064 if(psixp != NULL) {
2065 PSIXmlClose(psixp);
2066 }
2067
2068 PGPFreeBlastOptions(bop);
2069
2070 return 0;
2071 }
2072
2073
2074 /* Nothing below this line is executable code */
2075
2076 #ifdef PRINT_ONLY_ALIGNMENT
2077 {{
2078 AsnIoPtr aip;
2079
2080 if (seqannot)
2081 seqannot = SeqAnnotFree(seqannot);
2082
2083 seqannot = SeqAnnotNew();
2084 seqannot->type = 2;
2085 AddAlignInfoToSeqAnnot(seqannot, 2);
2086 seqannot->data = head;
2087 aip = AsnIoOpen("stdout", "w");
2088 SeqAnnotAsnWrite(seqannot, aip, NULL);
2089 AsnIoReset(aip);
2090 AsnIoClose(aip);
2091
2092 seqannot->data = NULL;
2093 seqannot = SeqAnnotFree(seqannot);
2094
2095 return 0;
2096 }}
2097 #endif
2098