/* Revision-control identification string embedded in the compiled object. */
static const char rcsid[] =
    "$Id: fastacmd.c,v 6.36 2005/06/22 13:55:22 coulouri Exp $";
2
3 /* $Id: fastacmd.c,v 6.36 2005/06/22 13:55:22 coulouri Exp $
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name: $RCSfile: fastacmd.c,v $
29 *
30 * Author: Sergei Shavirin
31 *
32 * Initial Version Creation Date: 05/20/1997
33 *
34 * $Revision: 6.36 $
35 *
36 * File Description:
37 * FASTA retrieval system using ISAM indexes
38 *
39 * $Log: fastacmd.c,v $
40 * Revision 6.36 2005/06/22 13:55:22 coulouri
41 * add support for dumping accessions
42 *
43 * Revision 6.35 2005/05/05 15:54:06 dondosha
44 * Enhanced comment for the -s option and fixed typo in -i option description
45 *
46 * Revision 6.34 2004/12/04 03:39:48 camacho
47 * Set range of data on -D option
48 *
49 * Revision 6.33 2004/12/03 04:58:06 camacho
50 * Fix name conflict in enumeration for fastacmd dump types
51 *
52 * Revision 6.32 2004/12/02 20:37:40 camacho
53 * + fastacmd feature to dump list of gis
54 *
55 * Revision 6.31 2004/06/30 19:52:00 camacho
56 * Added #include <blfmtutl.h>
57 *
58 * Revision 6.30 2004/05/13 20:54:45 coulouri
59 * spell 'loci' correctly
60 *
61 * Revision 6.29 2003/05/30 17:31:09 coulouri
62 * add rcsid
63 *
64 * Revision 6.28 2003/04/15 19:09:41 camacho
65 * Added option to retrieve sequences by PIG
66 *
67 * Revision 6.27 2002/11/21 21:35:54 camacho
68 * Make sure the proper exit code is returned
69 *
70 * Revision 6.26 2002/08/12 12:40:55 camacho
71 * Fix for unresolved symbol in Win32 build
72 *
73 * Revision 6.25 2002/08/09 19:41:25 camacho
74 * 1) Added blast version number to command-line options
75 * 2) Added explanations for some default parameters
76 *
77 * Revision 6.24 2002/07/30 21:02:17 camacho
78 * Added explanation for -T option
79 *
80 * Revision 6.23 2002/07/30 15:30:47 camacho
81 * 1. Added explanation for -L option
82 * 2. Moved function to parse SeqLocs to readdb.c
83 *
84 * Revision 6.22 2002/07/18 22:17:49 madden
85 * Revert last change
86 *
87 * Revision 6.21 2002/07/18 18:49:43 madden
88 * Set SeqLoc to NULL always
89 *
90 * Revision 6.20 2002/07/14 21:02:08 camacho
91 * Added extra features to fastacmd
92 *
93 * Revision 6.19 2002/05/02 21:59:31 camacho
94 * Clarified database parameter default
95 *
96 * Revision 6.18 2001/12/18 13:01:52 camacho
97 * Added new flag -D to dump blast database in FASTA format
98 *
99 * Revision 6.17 2001/12/10 19:17:32 camacho
100 * Added option to allow fastacmd to use Ctrl-As as defline separators.
101 *
102 * Revision 6.16 2001/10/19 19:46:26 camacho
103 * Added new feature to dump FASTA files from blast databases, added support for the new database format
104 *
105 * Revision 6.15 2000/10/16 20:47:35 madden
106 * Add -o option to write output to file
107 *
108 * Revision 6.14 2000/06/28 16:56:52 madden
109 * Call Fastacmd_Search_ex, Boolean for target gi only
110 *
111 * Revision 6.13 2000/03/08 15:26:03 madden
112 * Add return statement to Main fct, purify nit
113 *
114 * Revision 6.12 2000/01/12 21:05:00 egorov
115 * Use Fastacmd API
116 *
117 * Revision 6.11 1999/12/21 21:26:22 egorov
118 * Use new parameter of readdb_gi2seq function
119 *
120 * Revision 6.10 1999/12/17 20:48:54 egorov
121 * Fix 'gcc -Wall' warnings and remove old stuff.
122 *
123 * Revision 6.9 1999/09/28 19:02:47 egorov
124 * In the new version of the 'fastacmd' there is not need to
125 * specify database name if search a GI.
126 *
127 * Revision 6.8 1999/02/23 17:17:32 madden
128 * Remove unused static _accession functions
129 *
130 * Revision 6.7 1998/02/11 18:06:43 madden
131 * Fix for reading IDs in from a file
132 *
133 * Revision 6.6 1998/02/06 18:26:35 madden
134 * Removed stripping of white spaces
135 *
136 * Revision 6.5 1998/01/29 19:47:02 madden
137 * Changed second call from BioseqRawToFasta to BioseqRawToFastaExtra
138 *
139 * Revision 6.4 1998/01/27 20:27:03 madden
140 * Added option to specify sequence line length
141 *
142 * Revision 6.3 1998/01/23 21:56:16 madden
143 * Error messages sent to stderr
144 *
145 * Revision 6.2 1998/01/16 22:04:20 madden
146 * Call to readdb_new_ex, fixed FUM
147 *
148 * Revision 6.1 1997/11/07 16:19:15 shavirin
149 * Added possibility to retrieve redundant accessions
150 *
151 * Revision 6.0 1997/08/25 18:19:56 madden
152 * Revision changed to 6.0
153 *
154 * Revision 5.2 1997/05/20 21:00:45 shavirin
155 * Remove spurious error message in accession/locus retrieval
156 *
157 * Revision 5.1 1997/05/20 15:47:30 shavirin
158 * Initial revision
159 *
160 *
161 * ==========================================================================
162 */
163
164 #include <ncbi.h>
165 #include <objseq.h>
166 #include <objsset.h>
167 #include <sequtil.h>
168 #include <seqport.h>
169 #include <tofasta.h>
170 #include <readdb.h>
171 #include <blast.h>
172 #include <blfmtutl.h>
173
174 static Args myargs [] = {
175 { "Database", /* 0 */
176 "nr", NULL, NULL, TRUE, 'd', ARG_STRING, 0.0, 0, NULL},
177 { "Type of file\n" /* 1 */
178 " G - guess mode (look for protein, then nucleotide)\n"
179 " T - protein \n"
180 " F - nucleotide",
181 "G", NULL,NULL,TRUE,'p',ARG_STRING,0.0,0,NULL},
182 { "Comma-delimited search string(s).\n"
183 " GIs, accessions, loci, or fullSeq-id strings may be used,\n"
184 " e.g. 555, AC147927, 'gnl|dbname|tag'", /* 2 */
185 NULL, NULL, NULL, TRUE, 's', ARG_STRING, 0.0, 0, NULL},
186 { "Input file with GIs/accessions/loci for batch\n"
187 " retrieval", /* 3 */
188 NULL, NULL, NULL, TRUE, 'i', ARG_STRING, 0.0, 0, NULL},
189 { "Retrieve duplicate accessions", /* 4 */
190 "F", NULL, NULL, TRUE, 'a', ARG_BOOLEAN, 0.0, 0, NULL},
191 { "Line length for sequence", /* 5 */
192 "80", NULL, NULL, TRUE, 'l', ARG_INT, 0.0, 0, NULL},
193 { "Definition line should contain target gi only", /* 6 */
194 "F", NULL, NULL, TRUE, 't', ARG_BOOLEAN, 0.0, 0, NULL},
195 { "Output file", /* 7 */
196 "stdout", NULL, NULL, TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
197 { "Use Ctrl-A's as non-redundant defline separator", /* 8 */
198 "F", NULL, NULL, TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
199 { "Dump the entire database as (default is not to dump anything):\n"
200 " 1 FASTA\n"
201 " 2 Gi list\n"
202 " 3 Accession.version list\n", /* 9 */
203 "0", "0", "3", TRUE, 'D', ARG_INT, 0.0, 0, NULL},
204 { "Range of sequence to extract (Format: start,stop)\n"
205 " 0 in 'start' refers to the beginning of the sequence\n"
206 " 0 in 'stop' refers to the end of the sequence",
207 "0,0", NULL, NULL, TRUE, 'L', ARG_STRING, 0.0, 0, NULL}, /* 10 */
208 { "Strand on subsequence (nucleotide only): 1 is top, 2 is bottom",
209 "1", NULL, NULL, FALSE, 'S', ARG_INT, 0.0, 0, NULL}, /* 11 */
210 { "Print taxonomic information for requested sequence(s)",
211 "F", NULL, NULL, FALSE, 'T', ARG_BOOLEAN, 0.0, 0, NULL}, /* 12 */
212 { "Print database information only (overrides all other options)",
213 "F", NULL, NULL, FALSE, 'I', ARG_BOOLEAN, 0.0, 0, NULL}, /* 13 */
214 { "Retrieve sequences with this PIG",
215 NULL, NULL, NULL, TRUE, 'P', ARG_INT, 0.0, 0, NULL}, /* 14 */
216 };
217
Main(void)218 Int2 Main (void)
219 {
220 CharPtr database, searchstr, batchfile;
221 Int4 linelen, pig;
222 Boolean dupl, target, use_ctrlAs, taxonomy_info, dbinfo_only;
223 EBlastDbDumpType dump_db = eNoDump;
224 Uint1 is_prot;
225 FILE *outfp = NULL;
226 CharPtr seqlocstr;
227 Uint1 strand;
228 Char buf[256] = { '\0' };
229 Int2 rv;
230
231 StringCpy(buf, "fastacmd ");
232 StringNCat(buf, BlastGetVersionNumber(), sizeof(buf)-StringLen(buf));
233 if (! GetArgs (buf, DIM(myargs), myargs)) {
234 return (1);
235 }
236
237 if( !ErrSetLogfile ("stderr", ELOG_APPEND) ) {
238 exit(1);
239 }
240
241 database = myargs[0].strvalue;
242 if (!StringICmp(myargs[1].strvalue, "T"))
243 is_prot = READDB_DB_IS_PROT;
244 else if (!StringICmp(myargs[1].strvalue, "F"))
245 is_prot = READDB_DB_IS_NUC;
246 else
247 is_prot = READDB_DB_UNKNOWN;
248
249 searchstr = myargs[2].strvalue;
250 batchfile = myargs[3].strvalue;
251 dupl = myargs[4].intvalue;
252 linelen = myargs[5].intvalue;
253 target = myargs[6].intvalue;
254 use_ctrlAs = myargs[8].intvalue;
255 dump_db = myargs[9].intvalue;
256 seqlocstr = myargs[10].strvalue;
257 strand = myargs[11].intvalue;
258 taxonomy_info = myargs[12].intvalue;
259 dbinfo_only = myargs[13].intvalue;
260 pig = myargs[14].intvalue == 0 ? PIG_NONE : myargs[14].intvalue;
261
262 if ((outfp = FileOpen(myargs[7].strvalue, "w")) == NULL) {
263 ErrPostEx(SEV_ERROR, 0, 0,"Could not open %s", myargs[7].strvalue);
264 return 1;
265 }
266
267 rv = Fastacmd_Search_ex (searchstr, database, is_prot, batchfile, dupl,
268 linelen, outfp, target, use_ctrlAs, dump_db, seqlocstr, strand,
269 taxonomy_info, dbinfo_only, pig);
270
271 FileClose(outfp);
272
273 return rv;
274 }
275