/* Revision-control identification string embedded in the compiled object. */
static const char rcsid[] =
    "$Id: fastacmd.c,v 6.36 2005/06/22 13:55:22 coulouri Exp $";
2 
3 /* $Id: fastacmd.c,v 6.36 2005/06/22 13:55:22 coulouri Exp $
4 * ===========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name:  $RCSfile: fastacmd.c,v $
29 *
30 * Author:  Sergei Shavirin
31 *
32 * Initial Version Creation Date: 05/20/1997
33 *
34 * $Revision: 6.36 $
35 *
36 * File Description:
37 *        FASTA retrieval system using ISAM indexes
38 *
39 * $Log: fastacmd.c,v $
40 * Revision 6.36  2005/06/22 13:55:22  coulouri
41 * add support for dumping accessions
42 *
43 * Revision 6.35  2005/05/05 15:54:06  dondosha
44 * Enhanced comment for the -s option and fixed typo in -i option description
45 *
46 * Revision 6.34  2004/12/04 03:39:48  camacho
47 * Set range of data on -D option
48 *
49 * Revision 6.33  2004/12/03 04:58:06  camacho
50 * Fix name conflict in enumeration for fastacmd dump types
51 *
52 * Revision 6.32  2004/12/02 20:37:40  camacho
53 * + fastacmd feature to dump list of gis
54 *
55 * Revision 6.31  2004/06/30 19:52:00  camacho
56 * Added #include <blfmtutl.h>
57 *
58 * Revision 6.30  2004/05/13 20:54:45  coulouri
59 * spell 'loci' correctly
60 *
61 * Revision 6.29  2003/05/30 17:31:09  coulouri
62 * add rcsid
63 *
64 * Revision 6.28  2003/04/15 19:09:41  camacho
65 * Added option to retrieve sequences by PIG
66 *
67 * Revision 6.27  2002/11/21 21:35:54  camacho
68 * Make sure the proper exit code is returned
69 *
70 * Revision 6.26  2002/08/12 12:40:55  camacho
71 * Fix for unresolved symbol in Win32 build
72 *
73 * Revision 6.25  2002/08/09 19:41:25  camacho
74 * 1) Added blast version number to command-line options
75 * 2) Added explanations for some default parameters
76 *
77 * Revision 6.24  2002/07/30 21:02:17  camacho
78 * Added explanation for -T option
79 *
80 * Revision 6.23  2002/07/30 15:30:47  camacho
81 * 1. Added explanation for -L option
82 * 2. Moved function to parse SeqLocs to readdb.c
83 *
84 * Revision 6.22  2002/07/18 22:17:49  madden
85 * Revert last change
86 *
87 * Revision 6.21  2002/07/18 18:49:43  madden
88 * Set SeqLoc to NULL always
89 *
90 * Revision 6.20  2002/07/14 21:02:08  camacho
91 * Added extra features to fastacmd
92 *
93 * Revision 6.19  2002/05/02 21:59:31  camacho
94 * Clarified database parameter default
95 *
96 * Revision 6.18  2001/12/18 13:01:52  camacho
97 * Added new flag -D to dump blast database in FASTA format
98 *
99 * Revision 6.17  2001/12/10 19:17:32  camacho
100 * Added option to allow fastacmd to use Ctrl-As as defline separators.
101 *
102 * Revision 6.16  2001/10/19 19:46:26  camacho
103 * Added new feature to dump FASTA files from blast databases, added support for the new database format
104 *
105 * Revision 6.15  2000/10/16 20:47:35  madden
106 * Add -o option to write output to file
107 *
108 * Revision 6.14  2000/06/28 16:56:52  madden
109 * Call Fastacmd_Search_ex, Boolean for target gi only
110 *
111 * Revision 6.13  2000/03/08 15:26:03  madden
112 * Add return statement to Main fct, purify nit
113 *
114 * Revision 6.12  2000/01/12 21:05:00  egorov
115 * Use Fastacmd API
116 *
117 * Revision 6.11  1999/12/21 21:26:22  egorov
118 * Use new parameter of readdb_gi2seq function
119 *
120 * Revision 6.10  1999/12/17 20:48:54  egorov
121 * Fix 'gcc -Wall' warnings and remove old stuff.
122 *
123 * Revision 6.9  1999/09/28 19:02:47  egorov
124 * In the new version of the 'fastacmd' there is not need to
125 * specify database name if search a GI.
126 *
127 * Revision 6.8  1999/02/23 17:17:32  madden
128 * Remove unused static _accession functions
129 *
130 * Revision 6.7  1998/02/11 18:06:43  madden
131 * Fix for reading IDs in from a file
132 *
133 * Revision 6.6  1998/02/06 18:26:35  madden
134 * Removed stripping of white spaces
135 *
136 * Revision 6.5  1998/01/29 19:47:02  madden
137 * Changed second call from BioseqRawToFasta to BioseqRawToFastaExtra
138 *
139 * Revision 6.4  1998/01/27 20:27:03  madden
140 * Added option to specify sequence line length
141 *
142 * Revision 6.3  1998/01/23 21:56:16  madden
143 * Error messages sent to stderr
144 *
145 * Revision 6.2  1998/01/16 22:04:20  madden
146 * Call to readdb_new_ex, fixed FUM
147 *
148 * Revision 6.1  1997/11/07 16:19:15  shavirin
149 * Added possibility to retrieve redundant accessions
150 *
151 * Revision 6.0  1997/08/25 18:19:56  madden
152 * Revision changed to 6.0
153 *
154 * Revision 5.2  1997/05/20 21:00:45  shavirin
155 * Remove spurious error message in accession/locus retrieval
156 *
157 * Revision 5.1  1997/05/20 15:47:30  shavirin
158 * Initial revision
159 *
160 *
161 * ==========================================================================
162 */
163 
164 #include <ncbi.h>
165 #include <objseq.h>
166 #include <objsset.h>
167 #include <sequtil.h>
168 #include <seqport.h>
169 #include <tofasta.h>
170 #include <readdb.h>
171 #include <blast.h>
172 #include <blfmtutl.h>
173 
174 static Args myargs [] = {
175     { "Database",                                               /* 0 */
176       "nr", NULL, NULL, TRUE, 'd', ARG_STRING, 0.0, 0, NULL},
177     { "Type of file\n"                                          /* 1 */
178       "         G - guess mode (look for protein, then nucleotide)\n"
179       "         T - protein   \n"
180       "         F - nucleotide",
181       "G", NULL,NULL,TRUE,'p',ARG_STRING,0.0,0,NULL},
182     { "Comma-delimited search string(s).\n"
183       "      GIs, accessions, loci, or fullSeq-id strings may be used,\n"
184       "      e.g. 555, AC147927, 'gnl|dbname|tag'",             /* 2 */
185       NULL, NULL, NULL, TRUE, 's', ARG_STRING, 0.0, 0, NULL},
186     { "Input file with GIs/accessions/loci for batch\n"
187       "      retrieval",                                        /* 3 */
188       NULL, NULL, NULL, TRUE, 'i', ARG_STRING, 0.0, 0, NULL},
189     { "Retrieve duplicate accessions",                          /* 4 */
190       "F", NULL, NULL, TRUE, 'a', ARG_BOOLEAN, 0.0, 0, NULL},
191     { "Line length for sequence",                               /* 5 */
192       "80", NULL, NULL, TRUE, 'l', ARG_INT, 0.0, 0, NULL},
193     { "Definition line should contain target gi only",          /* 6 */
194       "F", NULL, NULL, TRUE, 't', ARG_BOOLEAN, 0.0, 0, NULL},
195     { "Output file",                                            /* 7 */
196       "stdout", NULL, NULL, TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
197     { "Use Ctrl-A's as non-redundant defline separator",        /* 8 */
198       "F", NULL, NULL, TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
199     { "Dump the entire database as (default is not to dump anything):\n"
200       "      1 FASTA\n"
201       "      2 Gi list\n"
202       "      3 Accession.version list\n",                       /* 9 */
203       "0", "0", "3", TRUE, 'D', ARG_INT, 0.0, 0, NULL},
204     { "Range of sequence to extract (Format: start,stop)\n"
205       "      0 in 'start' refers to the beginning of the sequence\n"
206       "      0 in 'stop' refers to the end of the sequence",
207       "0,0", NULL, NULL, TRUE, 'L', ARG_STRING, 0.0, 0, NULL},  /* 10 */
208     { "Strand on subsequence (nucleotide only): 1 is top, 2 is bottom",
209       "1", NULL, NULL, FALSE, 'S', ARG_INT, 0.0, 0, NULL},      /* 11 */
210     { "Print taxonomic information for requested sequence(s)",
211       "F", NULL, NULL, FALSE, 'T', ARG_BOOLEAN, 0.0, 0, NULL},  /* 12 */
212     { "Print database information only (overrides all other options)",
213       "F", NULL, NULL, FALSE, 'I', ARG_BOOLEAN, 0.0, 0, NULL},  /* 13 */
214     { "Retrieve sequences with this PIG",
215       NULL, NULL, NULL, TRUE, 'P', ARG_INT, 0.0, 0, NULL},     /* 14 */
216 };
217 
Main(void)218 Int2 Main (void)
219 {
220     CharPtr	database, searchstr, batchfile;
221     Int4	linelen, pig;
222     Boolean	dupl, target, use_ctrlAs, taxonomy_info, dbinfo_only;
223     EBlastDbDumpType dump_db = eNoDump;
224     Uint1 is_prot;
225     FILE *outfp = NULL;
226     CharPtr seqlocstr;
227     Uint1 strand;
228     Char buf[256] = { '\0' };
229     Int2 rv;
230 
231     StringCpy(buf, "fastacmd ");
232     StringNCat(buf, BlastGetVersionNumber(), sizeof(buf)-StringLen(buf));
233     if (! GetArgs (buf, DIM(myargs), myargs)) {
234 	     return (1);
235     }
236 
237     if( !ErrSetLogfile ("stderr", ELOG_APPEND) ) {
238 	     exit(1);
239     }
240 
241     database = myargs[0].strvalue;
242     if (!StringICmp(myargs[1].strvalue, "T"))
243         is_prot = READDB_DB_IS_PROT;
244     else if (!StringICmp(myargs[1].strvalue, "F"))
245         is_prot = READDB_DB_IS_NUC;
246     else
247         is_prot = READDB_DB_UNKNOWN;
248 
249     searchstr     = myargs[2].strvalue;
250     batchfile     = myargs[3].strvalue;
251     dupl          = myargs[4].intvalue;
252     linelen       = myargs[5].intvalue;
253     target        = myargs[6].intvalue;
254     use_ctrlAs    = myargs[8].intvalue;
255     dump_db       = myargs[9].intvalue;
256     seqlocstr     = myargs[10].strvalue;
257     strand        = myargs[11].intvalue;
258     taxonomy_info = myargs[12].intvalue;
259     dbinfo_only   = myargs[13].intvalue;
260     pig           = myargs[14].intvalue == 0 ? PIG_NONE : myargs[14].intvalue;
261 
262     if ((outfp = FileOpen(myargs[7].strvalue, "w")) == NULL) {
263         ErrPostEx(SEV_ERROR, 0, 0,"Could not open %s", myargs[7].strvalue);
264         return 1;
265     }
266 
267     rv = Fastacmd_Search_ex (searchstr, database, is_prot, batchfile, dupl,
268             linelen, outfp, target, use_ctrlAs, dump_db, seqlocstr, strand,
269             taxonomy_info, dbinfo_only, pig);
270 
271     FileClose(outfp);
272 
273     return rv;
274 }
275