/* RCS identification string embedded in the object file. */
static const char rcsid[] = "$Id: formatdb.c,v 6.104 2007/08/21 20:07:49 kans Exp $";
2 
3 /*****************************************************************************
4 
5 
6                           PUBLIC DOMAIN NOTICE
7               National Center for Biotechnology Information
8 
9     This software/database is a "United States Government Work" under the
10     terms of the United States Copyright Act.  It was written as part of
11     the author's official duties as a United States Government employee
12     and thus cannot be copyrighted.  This software/database is freely
13     available to the public for use. The National Library of Medicine and
14     the U.S. Government have not placed any restriction on its use or
15     reproduction.
16 
17     Although all reasonable efforts have been taken to ensure the accuracy
18     and reliability of the software and data, the NLM and the U.S.
19     Government do not and cannot warrant the performance or results that
20     may be obtained by using this software or data. The NLM and the U.S.
21     Government disclaim all warranties, express or implied, including
22     warranties of performance, merchantability or fitness for any
23     particular purpose.
24 
25     Please cite the author in any work or product based on this material.
26 
27    ***************************************************************************
28 
29    File Name:  formatdb.c
30 
31    Author:  Sergei B. Shavirin
32 
33    Version Creation Date: 10/01/96
34 
35    $Revision: 6.104 $
36 
37    File Description:  formats FASTA databases for use by BLAST
38 
39    $Log: formatdb.c,v $
40    Revision 6.104  2007/08/21 20:07:49  kans
41    made static functions static, added cast of EFDBCleanOpt to fix CodeWarrior complaints
42 
43    Revision 6.103  2007/04/13 13:21:11  madden
44    Add call to ErrSetLogLevel
45 
46    Revision 6.102  2006/09/25 19:56:05  camacho
47    Added s_SetDBListNameMultiVolDereference to fix bug when creating alias files over est
48 
49    Revision 6.101  2006/06/19 17:20:14  coulouri
50    Extend 1GB default volume size to all platforms and impose a hard limit of 4G. rt#15171398
51 
52    Revision 6.100  2006/05/04 20:07:27  camacho
53    Report fatal error in case of failure to add sequence to BLAST database because
54    of zero-length sequence and clean up the datababase that was being created.
55 
56    Revision 6.99  2006/03/14 14:36:46  camacho
57    Logging changes
58 
59    Revision 6.98  2006/03/08 19:06:12  camacho
60    Moved definition for maximum number of volumes to readdb.[ch], fixes rt ticket 15147600
61 
62    Revision 6.97  2005/09/30 14:54:32  camacho
63    Enable recognition of the formatdb configuration file to allow users to set the
64    membership and link bits in the ASN.1 deflines.
65 
66    Revision 6.96  2005/07/28 14:52:22  coulouri
67    remove dead code
68 
69    Revision 6.95  2005/06/08 19:25:53  camacho
70    New feature to allow formatdb to add taxonomy ids to BLAST databases
71    generated from FASTA input
72    BugzID: 6
73 
74    Revision 6.94  2004/08/25 14:47:21  camacho
75    Refactorings to allow formatdb process multiple deflines
76 
77    Revision 6.93  2004/06/30 19:52:00  camacho
78    Added #include <blfmtutl.h>
79 
80    Revision 6.92  2004/01/29 14:56:44  camacho
81    Removed -A option, FORMATDB_VER_TEXT no longer supported
82 
83    Revision 6.91  2003/10/01 18:59:56  camacho
84    Fix to creation of custom databases using a gi list and alias files when the
85    source database spans multiple volumes.
86 
87    Revision 6.90  2003/09/12 20:18:55  camacho
88    This change enables the generation of alias files for multiple ASN.1 inputs.
89 
90    Revision 6.89  2003/05/30 17:31:09  coulouri
91    add rcsid
92 
93    Revision 6.88  2003/05/13 16:02:42  coulouri
94    make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
95 
96    Revision 6.87  2003/05/08 16:02:17  camacho
97    Use conditional compilation for reading .formatdbrc
98 
99    Revision 6.86  2003/04/03 19:10:59  camacho
100    Fixed typo
101 
102    Revision 6.85  2003/03/26 18:51:45  camacho
103 
104    1. Minor bug fixes.
105    2. Added eFDBCleanOpt parameter to FDBOptionsNew.
106 
107    Revision 6.84  2003/03/20 14:03:02  camacho
108    Allow users to set the membership and link bits
109 
110    Revision 6.83  2003/03/11 18:27:45  madden
111    Cast value to Int8 to prevent rollover
112 
113    Revision 6.82  2003/02/25 15:53:40  beloslyu
114    final coma is not allowed after the last element of enum (at least AIX compiler barks)
115 
116    Revision 6.81  2003/01/27 20:17:06  camacho
117    Bug fix in alias file creation
118 
119    Revision 6.80  2003/01/23 22:01:29  camacho
120    Minor change
121 
122    Revision 6.79  2003/01/22 19:44:51  camacho
123 
124    1. Added check for number of volumes greater than 100.
125    2. Fixed bug when creating alias files if multiple FASTA inputs are used.
126    3. Implemented correct creation of alias files when creating custom
127       blast databases with a gi list.
128 
129    Revision 6.78  2003/01/16 19:46:07  kans
130    changed NULL to 0 to fix Mac compiler error
131 
132    Revision 6.77  2003/01/07 17:20:26  camacho
133    Added error message when gi file is not found
134 
135    Revision 6.76  2002/12/18 15:16:17  camacho
136    Minor fixes of command-line arguments
137 
138    Revision 6.75  2002/12/16 20:22:19  camacho
139    Better error handling when creating alias files
140 
141    Revision 6.74  2002/12/13 13:44:50  camacho
142    Use FDBOptionsNew to create options structure
143 
144    Revision 6.73  2002/12/02 22:43:42  camacho
145    Added warning message when no sequences are found when creating alias files
146 
147    Revision 6.72  2002/11/06 21:27:46  ucko
148    Make 4294967295 explicitly unsigned to avoid warnings.
149 
150    Revision 6.71  2002/09/26 02:14:43  camacho
151    Allow limiting the number of sequences per volume
152 
153    Revision 6.70  2002/09/25 20:14:20  camacho
154    Fix for multivolume databases with non-parseable seqids
155 
156    Revision 6.69  2002/08/09 19:41:25  camacho
157    1) Added blast version number to command-line options
158    2) Added explanations for some default parameters
159 
160    Revision 6.68  2002/07/24 20:52:55  coulouri
161    Change database volume size parameter to megabases
162 
163    Revision 6.67  2002/04/19 13:10:43  madden
164    Make new database format the default
165 
166    Revision 6.66  2002/02/15 22:01:19  beloslyu
167    fix from HP
168 
169    Revision 6.65  2001/11/06 15:24:20  dondosha
170    Roll back previous change - it was not needed
171 
172    Revision 6.64  2001/11/05 22:14:49  dondosha
173    Allow stdin as input
174 
175    Revision 6.63  2001/11/02 19:27:45  camacho
176    Fixed problem that would corrupt the BlastDefLine structures for the new database format
177 
178    Revision 6.62  2001/07/12 19:35:51  madden
179    Set alias_file_name
180 
181    Revision 6.61  2001/07/06 19:58:57  madden
182    Add NLM_GENERATED_CODE_PROTO and include for fdlobj.h, remove unused variables
183 
184    Revision 6.60  2001/06/14 14:18:48  madden
185    Replace FD_MakeAliasFile with FD_CreateAliasFile
186 
187    Revision 6.59  2001/06/07 20:52:07  shavirin
188    Fixed problem with truncation of definition lines when TEXT mode is
189    chosen for dump.
190 
191    Revision 6.58  2001/06/07 13:13:02  madden
192    Set limits for -v arg, set bases_in_volume to UINT4_MAX if not set
193 
194    Revision 6.57  2001/05/17 20:21:46  dondosha
195    Do not add .00 extension when only one volume created
196 
197    Revision 6.56  2001/05/11 20:02:35  madden
198    Add oidlist and gifile to FD_CreateAliasFileEx
199 
200    Revision 6.55  2001/05/11 18:19:02  madden
201    Add option to output binary gifile
202 
203    Revision 6.54  2001/05/10 17:28:33  madden
204    Make gifile optional
205 
206    Revision 6.53  2001/05/10 17:21:11  madden
207    Add options to produce alias file from a gifile
208 
209    Revision 6.52  2001/05/08 21:56:43  shavirin
210    Added possibility to generate tax_id for every definition in Blast FASTA
211    definition set in ASN.1 structured definition lines.
212 
213    Revision 6.51  2001/05/02 16:22:04  dondosha
214    Add NSEQ and LENGTH to alias files in case of multiple inputs to formatdb
215 
216    Revision 6.50  2001/04/11 21:00:53  dondosha
217    Made functions FD_CreateAliasFile(Ex) public
218 
219    Revision 6.49  2001/04/11 20:45:35  dondosha
220    Moved appending of .00 for the first volume to FormatDBInit function
221 
222    Revision 6.48  2001/04/11 20:14:41  dondosha
223    Processing of volumes moved to lower level
224 
225    Revision 6.47  2001/03/27 21:11:40  dondosha
226    Allow multiple input files for formatting
227 
228    Revision 6.46  2001/02/01 22:25:50  shavirin
229    Added (uncommented) option to create ASN.1 structured deflines in
230    BLAST databases.
231 
232    Revision 6.45  2001/01/25 21:08:09  madden
233    Fix an ABR
234 
235    Revision 6.44  2000/12/12 23:12:09  shavirin
236    Fixed some FMR error.
237 
238    Revision 6.43  2000/12/08 22:34:35  shavirin
239    Added possibility to create Taxonomy lookup database.
240 
241    Revision 6.42  2000/11/22 20:53:19  shavirin
242    Added possibility to use Taxonomy client/server for creation of ASN.1
243    structured deflines with taxonomy ID (Using define TAX_CS_LOOKUP).
244 
245    Revision 6.41  2000/11/13 21:37:23  madden
246    Use ErrPostEx
247 
248    Revision 6.40  2000/11/03 18:13:22  madden
249    Print list of bad characters in FASTA input
250 
251    Revision 6.39  2000/09/29 16:40:16  shavirin
252    Fixed problem with multivolume database creation.
253 
254    Revision 6.38  2000/09/18 20:46:36  kans
255    added back #include <sqnutils.h>, needed for UseLocalAsnloadDataAndErrMsg
256 
257    Revision 6.37  2000/09/12 15:38:35  shavirin
258    Error message level set to SEV_WARNING
259 
260    Revision 6.36  2000/07/18 19:32:28  shavirin
261    Added new option -V to enable check for non-unique string ids in the
262    FASTA database. Default is FALSE.
263 
264    Revision 6.35  2000/02/17 17:20:59  sicotte
265    Change Calling convention for FastaToSeqEntryForDb
266 
267    Revision 6.34  2000/02/04 21:52:58  madden
268    Use FastaToSeqEntryForDb
269 
270    Revision 6.33  1999/12/21 18:31:38  madden
271    Fixed bug with writing alias file.
272 
273    Revision 6.32  1999/12/17 20:48:54  egorov
274    Fix 'gcc -Wall' warnings and remove old stuff.
275 
276    Revision 6.31  1999/12/16 15:53:23  egorov
277    Typo fixed
278 
279    Revision 6.30  1999/09/10 16:30:35  shavirin
280    Fixed problems with formating proteins by formatdb
281 
282    Revision 6.29  1999/09/09 18:25:51  shavirin
283    Changed way to parse ASN.1. Added possibility to parse
284    delta sequences.
285 
286    Revision 6.28  1999/08/25 20:20:27  shavirin
287    Added -s option to create sparse indexes.
288 
289    Revision 6.27  1999/08/18 15:00:11  shavirin
290    If title missing from args *.pal file will have basename as title.
291 
292    Revision 6.26  1999/08/03 16:38:56  shavirin
293    Added function FD_CreateAliasFile() for multivolume formating.
294 
295    Revision 6.24  1999/07/23 18:59:01  shavirin
296    Added support for creation of multivolume databases.
297 
298    Revision 6.23  1999/05/13 19:34:19  shavirin
299    More changes towards dump from ID.
300 
301    Revision 6.21  1999/05/12 15:46:52  shavirin
302    Changed parameter in function FDBAddSequence().
303 
304    Revision 6.20  1999/04/26 21:06:19  shavirin
305    Fixed minor bug.
306 
307    Revision 6.19  1999/04/26 19:37:45  shavirin
308    Dumping info set to FALSE.
309 
310    Revision 6.18  1999/04/26 14:53:16  shavirin
311    Fixed memory leaks in FDBAddSequence() function.
312 
313    Revision 6.17  1999/04/21 21:44:34  shavirin
314    Many functions were moved to "readdb.c" file.
315 
316    Revision 6.16  1999/03/21 19:16:59  madden
317    Fix problem on round numbers
318 
319    Revision 6.15  1999/03/05 21:34:48  madden
320    Changes for accession.version
321 
322    Revision 6.14  1999/02/04 18:01:48  madden
323    Add -n option for basename
324 
325    Revision 6.13  1998/11/16 18:34:42  madden
326    Add return-value checks
327 
328    Revision 6.12  1998/07/13 15:32:17  egorov
329    make error message more understandable
330 
331    Revision 6.10  1998/06/19 21:05:46  egorov
332    Fix MemFree() bug
333 
334    Revision 6.9  1998/05/05 13:57:37  madden
335    Print version number to log file
336 
337    Revision 6.8  1998/04/20 19:14:05  egorov
338    Fix just one, but huge MLK
339 
340    Revision 6.7  1998/02/23 16:49:14  egorov
341    Changes to make the tofasta.c independent on readdb.h
342 
343    Revision 6.6  1998/02/18 15:29:31  madden
344    Added const to prototype for FormatdbCreateStringIndex
345 
346    Revision 6.5  1998/02/11 18:05:32  madden
347    Changed program to take ASN.1 as input
348 
349    Revision 6.3  1997/12/08 21:55:00  madden
350    Parse naked (no bars) as IDs
351 
352    Revision 6.2  1997/11/06 18:11:17  madden
353    Added indices for naked gnl|PID and backbone entries
354 
355    Revision 6.1  1997/10/30 18:15:08  madden
356    Changes to SeqIdE2Index to allow lookups by accession strings
357 
358    Revision 6.0  1997/08/25 18:20:04  madden
359    Revision changed to 6.0
360 
361    Revision 1.20  1997/07/28 18:36:55  madden
362    Replaced printf with ErrPostEx and fprintf
363 
364    Revision 1.19  1997/07/28 14:35:37  vakatov
365    Added LIBCALLBACK to the ID_Compare() proto
366 
367    Revision 1.18  1997/06/10 18:44:11  shavirin
368    Fixed return value from UpdateLookupInfo()
369 
370    Revision 1.17  1997/05/19 21:16:30  shavirin
371    Changed content of string index file due to E2Iindex API logic
372 
373    Revision 1.16  1997/05/12 19:57:38  shavirin
374    Added additional dump of Accessions/Locuses into string indexes
375 
376    Revision 1.15  1997/05/07 21:08:15  madden
377    flipped parse argument default
378 
379    Revision 1.14  1997/05/05 17:01:42  shavirin
380    Added ability to format "non-parced" seqid-deflines
381    Removed not-used d if#defs  with FASTA_ASN
382 
383  * Revision 1.13  1997/05/01  17:31:32  shavirin
384  * Added dumping of 2 more files: String ISAM SeqId index
385  *
386  * Revision 1.12  1997/02/25  22:20:39  shavirin
387  * Changes in accordance to ISAM API changes
388  *
389  * Revision 1.11  1997/02/24  21:22:57  shavirin
390  * Added dump of numeric ISAM information.
391  *
392  * Revision 1.10  1996/12/20  00:31:19  madden
393  * Protected ambiguity data against big/little endian changes.
394  *
395  * Revision 1.9  1996/12/19  16:30:36  madden
396  * Changes to eliminate ".nac" file for nucl.
397  *
398  * Revision 1.8  1996/11/27  16:40:19  madden
399  * Save build date, Make "o" argument FALSE by default.
400  *
401  * Revision 1.7  1996/11/26  20:08:08  madden
402  * BioseqRawConvert(bsp, Seq_code_ncbistdaa); only called for protein alphabets.
403  *
404  * Revision 1.6  1996/11/26  19:52:10  madden
405  * Removed FORMATDB_VER and added readdb.h (which contains same);
406  * Changed phd or nhd to phr or nhr
407  *
408  * Revision 1.5  1996/11/18  20:53:58  shavirin
409  * Forced output protein code to Seq_code_ncbistdaa.
410  *
411  * Revision 1.4  1996/11/06  23:15:34  shavirin
412  * Removed bug with reallocation of index tables
413  *
414 
415 *****************************************************************************/
416 #define NLM_GENERATED_CODE_PROTO
417 #include <ncbi.h>
418 #include <tofasta.h>
419 #include <sequtil.h>
420 #include <readdb.h>
421 #include <sqnutils.h>
422 #include <taxblast.h>
423 #include <blastdef.h>
424 #include <mblast.h>
425 #include <fdlobj.h>
426 #include <blfmtutl.h>
427 
428 /* program's arguments */
429 
430 #define NUMARG (sizeof(dump_args)/sizeof(dump_args[0]))
431 
432 Args dump_args[] = {
433     { "Title for database file",
434       NULL, NULL, NULL, TRUE, 't', ARG_STRING, 0.0, 0, NULL},
435     {"Input file(s) for formatting",
436      NULL, NULL,NULL,TRUE,'i',ARG_FILE_IN, 0.0,0,NULL},
437     {"Logfile name:",
438      "formatdb.log", NULL,NULL,TRUE,'l',ARG_FILE_OUT, 0.0,0,NULL},
439     {"Type of file\n"
440      "         T - protein   \n"
441      "         F - nucleotide",
442      "T", NULL,NULL,TRUE,'p',ARG_BOOLEAN,0.0,0,NULL},
443     {"Parse options\n"
444      "         T - True: Parse SeqId and create indexes.\n"
445      "         F - False: Do not parse SeqId. Do not create indexes.\n",
446      "F", NULL,NULL,TRUE,'o',ARG_BOOLEAN,0.0,0,NULL},
447     {"Input file is database in ASN.1 format (otherwise FASTA is expected)\n"
448      "         T - True, \n"
449      "         F - False.\n",
450      "F", NULL,NULL,TRUE,'a',ARG_BOOLEAN,0.0,0,NULL},
451     {"ASN.1 database in binary mode\n"
452      "         T - binary, \n"
453      "         F - text mode.\n",
454      "F", NULL,NULL,TRUE,'b',ARG_BOOLEAN,0.0,0,NULL},
455     {"Input is a Seq-entry",
456      "F", NULL ,NULL ,TRUE,'e',ARG_BOOLEAN,0.0,0,NULL},
457     { "Base name for BLAST files",
458       NULL, NULL, NULL, TRUE, 'n', ARG_STRING, 0.0, 0, NULL},
459     { "Database volume size in millions of letters",
460       "4000", NULL, NULL, TRUE, 'v', ARG_INT, 0.0, 0, NULL},
461     { "Create indexes limited only to accessions - sparse",
462       "F", NULL, NULL, TRUE, 's', ARG_BOOLEAN, 0.0, 0, NULL},
463     { "Verbose: check for non-unique string ids in the database",
464       "F", NULL, NULL, TRUE, 'V', ARG_BOOLEAN, 0.0, 0, NULL},
465     { "Create an alias file with this name\n"
466       "        use the gifile arg (below) if set to calculate db size\n"
467       "        use the BLAST db specified with -i (above)",
468       NULL, NULL, NULL, TRUE, 'L', ARG_FILE_OUT, 0.0, 0, NULL},
469     {"Gifile (file containing list of gi's)",
470      NULL, NULL,NULL,TRUE,'F',ARG_FILE_IN, 0.0,0,NULL},
471     {"Binary Gifile produced from the Gifile specified above",
472      NULL, NULL,NULL,TRUE,'B',ARG_FILE_OUT, 0.0,0,NULL},
473     {"Taxid file to set the taxonomy ids in ASN.1 deflines",
474      NULL, NULL,NULL,TRUE,'T',ARG_FILE_IN, 0.0,0,NULL},
475 #if 0
476      /* disabled for this release of the NCBI C toolkit */
477     {"Clean up options for new blast database generation\n"
478      "         0 - Never: Do not clean up any 'basename.*' blast db files.\n"
479      "         1 - Always: Remove all 'basename.*' blast db files.\n"
480      "         2 - Prompt: If any 'basename.*' blast db files are found,\n"
481      "                     prompt user.\n",
482      "0", "0","2",TRUE,'c',ARG_INT,0.0,0,NULL},
483 #endif
484 };
485 
/* Symbolic indices into dump_args[]; the order must mirror the table.
 * No comma follows the last enumerator: the file's changelog records that
 * at least one compiler rejects it.
 */
enum {
    title_arg = 0,          /* -t */
    input_arg,              /* -i */
    logfile_arg,            /* -l */
    is_prot_arg,            /* -p */
    parse_arg,              /* -o */
    asn_arg,                /* -a */
    asnbin_arg,             /* -b */
    seqentry_arg,           /* -e */
    basename_arg,           /* -n */
    dbsize_arg,             /* -v */
    sparse_arg,             /* -s */
    nonunique_arg,          /* -V */
    alias_fn_arg,           /* -L */
    gifile_arg,             /* -F */
    bin_gifile_arg,         /* -B */
    seqid_taxid_file_arg,   /* -T */
    cleanup_arg             /* -c; its dump_args[] entry is currently
                               compiled out with #if 0, so this index has
                               no matching table slot */
};
505 
/* Separator between file names when several inputs are given in one
 * argument string (a single space). */
#define DELIM " "
508 
FDBCheckFastaInputs(CharPtr fasta_files,Int4 is_prot,Int8 bases_per_vol,Int4Ptr num_inputs)509 static Boolean FDBCheckFastaInputs(CharPtr fasta_files, Int4 is_prot, Int8
510         bases_per_vol, Int4Ptr num_inputs)
511 {
512     Int8 predicted_dblength = 0;
513     Char *next_file;
514 
515     next_file = StringTokMT(fasta_files, DELIM, &fasta_files);
516     predicted_dblength = FileLength(next_file);
517     *num_inputs = 1;
518 
519     while ((next_file = StringTokMT(fasta_files, DELIM, &fasta_files))) {
520         predicted_dblength += FileLength(next_file);
521         (*num_inputs)++;
522     }
523 
524     if (bases_per_vol == 0)
525         return TRUE;
526 
527     if (!is_prot)
528         predicted_dblength /= READDB_COMPRESSION_RATIO;
529 
530     if ((predicted_dblength/bases_per_vol) > (kFDBMaxNumVolumes - 10)) {
531         ErrPostEx(SEV_ERROR, 0, 0, "Using %s bases per volume will exceed "
532                 "the maximum number\nof volumes formatdb can create.\n"
533                 "Please increase this value or do not set it at all.\n",
534                 Nlm_Int8tostr(bases_per_vol, 0));
535         return FALSE;
536     }
537 
538     return TRUE;
539 }
540 
/* Adds the length of the Bioseq reached from sep to the Int8 accumulator
 * that data points to.
 *
 * If sep is a Bioseq its length is added directly.  Otherwise sep is
 * treated as a Bioseq-set and the search descends into the entry stored in
 * its seq_set field, repeating until a Bioseq is found.  Only that one
 * entry is followed at each level.
 * NOTE(review): presumably the remaining members of a set are visited by
 * whatever traversal invokes this function -- confirm against the caller.
 *
 * sep     entry to inspect; NULL is ignored
 * data    pointer to the Int8 running total; NULL is ignored
 * index   unused; present to keep the existing signature
 * indent  unused; present to keep the existing signature
 *
 * An entry without a payload, or a set whose seq_set is NULL, leaves the
 * total unchanged instead of dereferencing a NULL pointer.
 */
static void SeqEntryGetLength(SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
{
    Int8* length = (Int8*) data;

    (void) index;
    (void) indent;

    if (length == NULL)
        return;

    /* Iterative form of the original tail recursion */
    while (sep != NULL && sep->data.ptrvalue != NULL) {
        if (IS_Bioseq(sep)) {
            BioseqPtr bsp = (BioseqPtr) sep->data.ptrvalue;
            *length += (Int8)bsp->length;
            return;
        } else {
            BioseqSetPtr bssp = (BioseqSetPtr) sep->data.ptrvalue;
            sep = bssp->seq_set;
        }
    }
}
555 
556 /** This function ensures that the path listed in the alias file's DBLIST
557  * contains any (relative) paths specified by the user and also do any
558  * necessary dereferences of the alias file contents so that they refer to the
559  * database's underlying name
560  */
561 static void
s_SetDBListNameMultiVolDereference(const ReadDBFILE * rdfp,CharPtr basename_requested_by_user,Int4 rdfp_ctr,Char base_fn[])562 s_SetDBListNameMultiVolDereference(const ReadDBFILE* rdfp,
563                                    CharPtr basename_requested_by_user,
564                                    Int4 rdfp_ctr,
565                                    Char base_fn[])
566 {
567     char* basename_of_volume = FileNameFind(rdfp->filename);
568     char* basename_predicted = NULL;
569 
570     /* Calculate the 'predicted' name of the database to go in the DBLIST of
571      * the alias file */
572     sprintf(base_fn, "%s.%02d", basename_requested_by_user, rdfp_ctr);
573     basename_predicted = FileNameFind(base_fn); /* basename thereof */
574 
575     /* If the basename of the volume is not the same as the 'predicted'
576      * basename, save the path prefix of the database name provided by the
577      * user and append the volume's basename */
578     if (StringCmp(basename_predicted, basename_of_volume)) {
579         char* dirname = FilePathFind(basename_requested_by_user);
580         if (StringLen(dirname)) {
581             sprintf(base_fn, "%s%c%s", dirname, DIRDELIMCHR,
582                     basename_of_volume);
583         } else {
584             StringCpy(base_fn, basename_of_volume);
585         }
586         MemFree(dirname);
587     }
588 }
589 
Main(void)590 Int2 Main(void)
591 {
592     SeqEntryPtr sep = NULL;
593     FormatDBPtr    fdbp = NULL;
594     FDB_optionsPtr options = NULL;
595     BioseqPtr bsp = NULL;
596     BlastDefLinePtr bdp = NULL;
597     Int2 id_ctr=1;
598     Int4 sequence_count=0;
599     Int4 input_ctr = 0, num_inputfiles = 0;
600     Int8 total_length, *lengths = NULL;
601     CharPtr error_msg=NULL;
602     FILE *fd = NULL;
603     CharPtr next_db = NULL, file_inputs = NULL, orig_ptr = NULL, tmp = NULL;
604     Boolean multiple_inputs = FALSE;
605     Char buf[256] = { '\0' };
606     Int4Ptr last_oid = NULL;
607     CharPtr *inputs = NULL;
608     Char tmpbuf1[PATH_MAX], tmpbuf2[PATH_MAX];
609     FDBTaxidDeflineTable* taxid_tbl = NULL;
610     CharPtr seqid_taxid_file = NULL;
611 
612     /* get arguments */
613     StringCpy(buf, "formatdb ");
614     StringNCat(buf, BlastGetVersionNumber(), sizeof(buf)-StringLen(buf));
615     if (!GetArgs(buf, NUMARG, dump_args))
616         return 1;
617 
618     if (!SeqEntryLoad())
619         return 1;
620 
621     if (!ErrSetLog(dump_args[logfile_arg].strvalue))
622         ErrShow();
623     else
624         ErrSetOpts(ERR_CONTINUE, ERR_LOG_ON);
625     UseLocalAsnloadDataAndErrMsg();
626     ErrSetMessageLevel(SEV_WARNING);
627     ErrSetLogLevel(SEV_WARNING);
628 
629     /* Ensure that volume size is within acceptable limits */
630     if (dump_args[dbsize_arg].intvalue > 16000) {
631         ErrPostEx(SEV_FATAL, 1, 0, "Volume size may not exceed 16 gigabases.\n");
632         return 1;
633     }
634 
635     /* Parse input string for multiple inputs */
636     file_inputs = StringSave(dump_args[input_arg].strvalue);
637     tmp = StringTokMT(file_inputs, DELIM, &next_db);
638     if (next_db) {
639        if (!dump_args[basename_arg].strvalue) {
640           ErrPostEx(SEV_FATAL, 1, 0, "Database base name must be provided "
641                   "with multiple input files\n");
642           return 1;
643        }
644        multiple_inputs = TRUE;
645        /* When formatdb takes multiple inputs and writes one blast database,
646         * it also writes alias files for each of the inputs passed in. Make
647         * sure that none of these inputs has the same name as the basename for
648         * the new database */
649        do {
650            if (!StringCmp(dump_args[basename_arg].strvalue, tmp)) {
651                ErrPostEx(SEV_FATAL, 1, 0, "Database base name cannot have the "
652                        "same name as one of the input files\n");
653                return 1;
654            }
655        } while ((tmp = StringTokMT(next_db, DELIM, &next_db)));
656     }
657     MemFree(file_inputs);
658 
659     options = FDBOptionsNew(dump_args[input_arg].strvalue,
660                             dump_args[is_prot_arg].intvalue,
661                             dump_args[title_arg].strvalue,
662                             dump_args[asn_arg].intvalue,
663                             dump_args[asnbin_arg].intvalue,
664                             dump_args[seqentry_arg].intvalue,
665                             dump_args[sparse_arg].intvalue,
666                             dump_args[nonunique_arg].intvalue,
667                             dump_args[parse_arg].intvalue,
668                             dump_args[basename_arg].strvalue,
669                             dump_args[alias_fn_arg].strvalue,
670                             ((Int8)dump_args[dbsize_arg].intvalue)*1000000, 0,
671                             FORMATDB_VER, FALSE, (EFDBCleanOpt) 0);
672     if (options == NULL)
673         return 1;
674 
675     options->gi_file = StringSave(dump_args[gifile_arg].strvalue);
676     options->gi_file_bin = StringSave(dump_args[bin_gifile_arg].strvalue);
677     orig_ptr = options->db_file;
678     options->db_file = StringTokMT(options->db_file, DELIM, &next_db);
679 
680     if (options->gi_file && dump_args[alias_fn_arg].strvalue == NULL &&
681             options->gi_file_bin == NULL) {
682         ErrPostEx(SEV_FATAL, 1,0,"The -F option must be used with either "
683                 "the -L or -B option\n");
684         FDBOptionsFree(options);
685         return 1;
686     } else if (dump_args[alias_fn_arg].strvalue && options->gi_file_bin) {
687         ErrPostEx(SEV_FATAL, 1, 0, "The -L and -B options may not be "
688                 "specified together");
689         FDBOptionsFree(options);
690         return 1;
691     } else if (options->gi_file_bin && options->gi_file == NULL) {
692         ErrPostEx(SEV_FATAL, 1, 0, "The -B option may not be specified "
693                 "without the -F option");
694         FDBOptionsFree(options);
695         return 1;
696     } else if (dump_args[alias_fn_arg].strvalue && options->gi_file == NULL) {
697         ErrPostEx(SEV_FATAL, 1, 0, "The -L option must be specified "
698                 "with the -F option\n");
699         FDBOptionsFree(options);
700         return 1;
701     }
702 
703 
704     /*** Write alias file using a gilist ***/
705     if (options->alias_file_name && options->gi_file) {
706         Int8 nletters=0, nletters_tot=0;
707         Int4 nseqs=0, nseqs_tot=0;
708         CharPtr gifile;
709         Int4 i, gi_list_total, ordinal_id, rdfp_ctr = 0;
710         BlastDoubleInt4Ptr gi_list;
711         ReadDBFILEPtr rdfp, rdfp_tmp;
712         Char alias_fn[PATH_MAX], base_fn[PATH_MAX];
713 
714         rdfp = rdfp_tmp = readdb_new(options->db_file, options->is_protein);
715         if (rdfp == NULL) {
716             ErrPostEx(SEV_FATAL, 1, 0, "Unable to open BLAST db %s\n",
717                     options->db_file);
718             FDBOptionsFree(options);
719             return 1;
720         }
721         if ((gifile = FindBlastDBFile(options->gi_file)) == NULL) {
722             ErrPostEx(SEV_FATAL, 1, 0, "Unable to find %s\n", options->gi_file);
723             rdfp = readdb_destruct(rdfp);
724             FDBOptionsFree(options);
725             return 1;
726         }
727         gi_list = GetGisFromFile(gifile, &gi_list_total);
728         gifile = MemFree(gifile);
729 
730         /* Iterate through the rdfp's (there might be many) */
731         for (; rdfp; rdfp = rdfp->next, rdfp_ctr++) {
732 
733             /* Isolate the current rdfp so that we restrict the search for gis
734              * to this volume only! */
735             ReadDBFILEPtr next = rdfp->next;
736             rdfp->next = NULL;
737 
738             nseqs = nletters = 0;
739             for (i = 0; i < gi_list_total; i++) {
740                 ordinal_id = readdb_gi2seq(rdfp, gi_list[i].gi, NULL);
741                 if (ordinal_id >= 0) {
742                     nseqs++;
743                     nletters += readdb_get_sequence_length(rdfp, ordinal_id);
744                 }
745             }
746             rdfp->next = next;  /* restore the next rdfp */
747 
748             sprintf(alias_fn, "%s.%02d", options->alias_file_name, rdfp_ctr);
749             /* For the base name (DBLIST field in alias file) append the
750              * volume number if there are multiple volumes in this database */
751             if (rdfp->next || rdfp_ctr != 0) {
752                 s_SetDBListNameMultiVolDereference(rdfp, options->base_name,
753                                                    rdfp_ctr, base_fn);
754             } else {
755                 StringCpy(base_fn, options->base_name);
756             }
757             FD_CreateAliasFileEx(options->db_title, alias_fn, 0,
758                     options->is_protein, base_fn, 0, 0,
759                     nletters, nseqs, NULL, options->gi_file);
760             ErrLogPrintf("Created %s alias file with %ld sequences, %s %s\n",
761                     alias_fn, nseqs, Nlm_Int8tostr(nletters,0),
762                     options->is_protein ? "residues" : "bases");
763             nseqs_tot += nseqs; nletters_tot += nletters;
764 
765         }
766         /* Sanity check: Don't write 'ghost' alias files */
767         if (nletters_tot == 0 || nseqs_tot == 0) {
768             ErrPostEx(SEV_FATAL, 1, 0, "No gis from %s were found in the %s "
769                 "database", options->gi_file, options->db_file);
770             for (i = 0; i <= rdfp_ctr; i++) {
771                 sprintf(alias_fn,"%s.%02d.%cal", options->alias_file_name, i,
772                         options->is_protein ? 'p' : 'n');
773                 FileRemove(alias_fn);
774             }
775             FDBOptionsFree(options);
776             gi_list = MemFree(gi_list);
777             rdfp_tmp = readdb_destruct(rdfp_tmp);
778             return 1;
779         }
780         /* Adjust alias files if necessary */
781         if (rdfp_ctr == 1) { /* single volume database */
782             sprintf(tmpbuf1, "%s.%cal", alias_fn,
783                     options->is_protein ? 'p' : 'n');
784             sprintf(tmpbuf2, "%s.%cal", options->alias_file_name,
785                     options->is_protein ? 'p' : 'n');
786             FileRename(tmpbuf1, tmpbuf2);
787             ErrLogPrintf("SUCCESS: Renamed %s to %s\n", tmpbuf1, tmpbuf2);
788         } else { /* multi-volume database */
789             Char *p = FD_ConstructMultivolumeDBList(options->alias_file_name,
790                     rdfp_ctr);
791             /* Create wrapper alias file
792              * Note that the total number of sequences and letters is not needed
793              * because these will be calculated by readdb when reading the alias
794              * files.
795              */
796             FD_CreateAliasFileEx(options->db_title, options->alias_file_name,
797                     0, options->is_protein, p, 0, 0, 0, 0, NULL, NULL);
798             ErrLogPrintf("SUCCESS: Created wrapper alias file %s for %s\n",
799                     options->alias_file_name, p);
800             p = MemFree(p);
801         }
802 
803         gi_list = MemFree(gi_list);
804         FDBOptionsFree(options);
805         rdfp_tmp = readdb_destruct(rdfp_tmp);
806         return 0;
807     } else if (options->gi_file_bin && options->gi_file) {
808         /*** Convert text gi list to binary format ***/
809         Int4 ngis;
810         ngis = readdb_MakeGiFileBinary(options->gi_file, options->gi_file_bin);
811         ErrLogPrintf("SUCCESS: Converted %ld gi(s) to binary format on %s\n",
812                 ngis, options->gi_file_bin);
813         FDBOptionsFree(options);
814         return 0;
815     }
816 
817 #ifdef TAX_CS_LOOKUP
818     if(dump_args[12].intvalue && options->parse_mode) {
819         /* These functions will create taxonomy lookup database */
820         options->tax_lookup = RDTaxLookupInit();
821         options->tax_callback = FDBTaxCallback;
822     }
823 #endif
824 
825     /*** Make sure that the inputs will not create too many volumes ***/
826     if (!FDBCheckFastaInputs(dump_args[input_arg].strvalue,
827                              options->is_protein,
828                              options->bases_in_volume, &num_inputfiles))
829         return 1;
830 
831     /* Allocate last_oid to keep track of the last ordinal ids that each
832      * of the input files had */
833     if (multiple_inputs) {
834         ASSERT(num_inputfiles > 0);
835         last_oid = (Int4Ptr) MemNew(num_inputfiles*sizeof(Int4));
836         inputs = (CharPtr *) MemNew(num_inputfiles*sizeof(CharPtr));
837         lengths = (Int8Ptr) MemNew(num_inputfiles*sizeof(Int8));
838         if (!last_oid || !inputs || !lengths) {
839             ErrPostEx(SEV_ERROR, 0, 0, "Out of memory");
840             FDBOptionsFree(options);
841             return 1;
842         }
843     }
844 
845     /* Initialize formatdb structure */
846     if ((fdbp = FormatDBInit(options)) == NULL)
847         return 2;
848 
849     /* Allow users to set their own membership and link bits using a
850      * .formatdbrc file. Useful for formatting purposes */
851     if (options->version >= FORMATDB_VER) {
852         options->linkbit_listp = FDBLoadLinksTable();
853         options->memb_tblp = FDBLoadMembershipsTable();
854     }
855 
856     /* Process the optional seqid/taxid pair input file */
857     seqid_taxid_file = dump_args[seqid_taxid_file_arg].strvalue;
858     taxid_tbl = FDBTaxidDeflineTableNew(seqid_taxid_file);
859     if ( !taxid_tbl && !StringHasNoText(seqid_taxid_file) ) {
860         ErrPostEx(SEV_ERROR, 0, 0, "Failed to read taxonomy data from %s",
861                   seqid_taxid_file);
862         FDBOptionsFree(options);
863         return 1;
864     }
865 
866     /* Loop on input files */
867     while (options->db_file) {
868        total_length = 0;
869        /* Input database file maybe either in ASN.1 or in FASTA format */
870        if (!options->isASN) {
871           /* FASTA format of input database */
872 
873           if((fd = FileOpen(options->db_file, "r")) == NULL) {
874              ErrPostEx(SEV_ERROR, 0, 0, "Could not open %s\n", options->db_file);
875              return 3;
876           }
877 
878           /* Get sequences */
879           while ((sep = FastaToSeqEntryForDb(fd,
880                                              (Boolean)!options->is_protein,
881                                              &error_msg, options->parse_mode, options->base_name, &id_ctr,NULL)) != NULL) {
882 
883              if(!IS_Bioseq(sep)) { /* Not Bioseq - failure */
884                 ErrLogPrintf("Error in readind Bioseq Formating failed.\n");
885                 return 4;
886              }
887 
888              SeqEntrySetScope(sep);
889              bsp = (BioseqPtr) sep->data.ptrvalue;
890 
891              total_length += bsp->length;
892              sequence_count++;
893 
894              if (error_msg) {
895                  Char buffer[42];
896                  SeqIdWrite(bsp->id, buffer, PRINTID_FASTA_LONG, 41);
897                  ErrPostEx(SEV_WARNING, 0, 0, "Sequence number %ld (%s), %s\n",
898                            sequence_count, buffer, error_msg);
899                  error_msg = MemFree(error_msg);
900              }
901 
902              bdp = FDBGetDefAsnFromBioseq(bsp, taxid_tbl);
903              if ( FDBAddBioseq(fdbp, bsp, bdp) ) {
904                  options->clean_opt = eCleanAlways;
905                  FDBCleanUp(options);
906                  ErrPostEx(SEV_FATAL, 1, 0,
907                    "Fatal error when adding sequence to BLAST database.");
908                  return 1;
909              }
910              bdp = BlastDefLineSetFree(bdp);
911              SeqEntryFree(sep);
912           }
913 
914           FileClose(fd);
915 
916           /* Writing multi-volume pointer file */
917       FD_MakeAliasFile(options);
918 
919        } else {
920           /* ASN.1 format of input database */
921           AsnTypePtr atp, atp2;
922           AsnModulePtr amp;
923 
924           if (! SeqEntryLoad())
925              ErrShow();
926 
927           /* get pointer to all loaded ASN.1 modules */
928           amp = AsnAllModPtr();
929 
930           if (amp == NULL) {
931              ErrLogPrintf("Could not load ASN.1 modules.\n");
932              return 5;
933           }
934 
935           /* get the initial type pointers */
936 
937           atp = AsnFind("Bioseq-set");
938           if (atp == NULL) {
939              ErrLogPrintf("Could not get type pointer for Bioseq-set.\n");
940              return 6;
941           }
942 
943           atp2 = AsnFind("Bioseq-set.seq-set.E");
944           if (atp2 == NULL) {
945              ErrLogPrintf("Could not get type pointer for Bioseq-set.seq-set.E\n");
946              return 7;
947           }
948 
949           if ((fdbp->aip = AsnIoOpen (options->db_file,
950                                       options->asnbin ? "rb":"r")) == NULL) {
951              ErrLogPrintf("Cannot open input database file. Formating failed...\n");
952              return 8;
953           }
954 
955           if (options->is_seqentry) {
956              Int8 len = 0;
957              /* Seq entry */
958              sep = SeqEntryAsnRead(fdbp->aip, NULL);
959              FDBAddSeqEntry(fdbp, sep);
960              SeqEntryExplore(sep, (Pointer)&len, SeqEntryGetLength);
961              SeqEntryFree(sep);
962              sequence_count++;
963              total_length += len;
964           } else {
965              /* Bioseq-set */
966 
967              while ((atp = AsnReadId(fdbp->aip, amp, atp)) != NULL) {
968                 if (atp == atp2) {   /* top level Seq-entry */
969                    Int8 len = 0;
970                    sep = SeqEntryAsnRead(fdbp->aip, atp);
971 
972                    FDBAddSeqEntry(fdbp, sep);
973                    SeqEntryExplore(sep, (Pointer)&len, SeqEntryGetLength);
974                    SeqEntryFree(sep);
975                    sequence_count++;
976                    total_length += len;
977                 } else {
978                    AsnReadVal(fdbp->aip, atp, NULL);
979                 }
980              }
981           } /* end "if Bioseq or Bioseq-set */
982 
983 
984        } /* end "if FASTA or ASN.1" */
985 
986        if (multiple_inputs) {
987            /* record the ordinal ids, input file names and input file lengths
988             * in the following arrays. This will be used later to
989             * create the multiple alias files in case that the multiple input
990             * results in a multi-volume database */
991            inputs[input_ctr] = options->db_file;
992            lengths[input_ctr] = total_length;
993            last_oid[input_ctr++] = sequence_count;
994        }
995        options->db_file = StringTokMT(next_db, DELIM, &next_db);
996 
997     } /* Loop on input files */
998 
999     /* Dump indexes, deallocate structure, arrays, etc. */
1000 
1001     if(FormatDBClose(fdbp))
1002         return 9;
1003     options->db_file = orig_ptr;
1004 
1005     /* If multiple inputs were given, create an alias file for each of the
1006      * fasta file inputs */
1007     if (multiple_inputs) {
1008         ReadDBFILEPtr rdfp, rdfp_tmp;
1009         Boolean span_multiple_rdfp = FALSE, first_time = TRUE;
1010         Int4 start_oid = 1, stop_oid, vol_ctr = 0;
1011         Char basename[PATH_MAX]; /* save the name of individual alias files */
1012         Char dblist[PATH_MAX]; /* save the name(s) of all alias files when an
1013                                   input spans multiple volumes */
1014 
1015         ASSERT(input_ctr == num_inputfiles);
1016         rdfp = rdfp_tmp = readdb_new(dump_args[basename_arg].strvalue,
1017                                      options->is_protein);
1018         if (rdfp == NULL) {
1019             ErrPostEx(SEV_FATAL, 1, 0,
1020                     "Cannot create alias files for multiple inputs");
1021             FDBOptionsFree(options);
1022             return 1;
1023         }
1024 
1025         /* For each rdfp write the corresponding alias file(s) */
1026         ErrLogPrintf("\nCreating alias files for multiple FASTA inputs...\n");
1027         input_ctr = 0;
1028         MemSet(dblist, 0, sizeof(dblist));
1029         MemSet(basename, 0, sizeof(basename));
1030 
1031         for (; input_ctr < num_inputfiles; input_ctr++) {
1032             while (rdfp) {
1033 
1034                 sprintf(basename, "%s.%02d", inputs[input_ctr], vol_ctr);
1035 
1036                 if (first_time || span_multiple_rdfp) {
1037                     start_oid = 1;
1038                     first_time = FALSE;
1039                 } else
1040                     start_oid = last_oid[input_ctr-1] - rdfp->start+1;
1041 
1042                 if (last_oid[input_ctr] > (rdfp->stop+1)) {
1043                     stop_oid = rdfp->stop+1;
1044                     span_multiple_rdfp = TRUE;
1045                     StringCat(dblist, basename); StringCat(dblist, " ");
1046                 } else if (span_multiple_rdfp) {
1047                     StringCat(dblist, basename); StringCat(dblist, " ");
1048                     stop_oid = last_oid[input_ctr] - rdfp->start;
1049                 } else {
1050                     stop_oid = last_oid[input_ctr] - rdfp->start;
1051                 }
1052 
1053                 ErrLogPrintf("Input %s (up to %ld) alias file %s (%ld-%ld)\n",
1054                         inputs[input_ctr], last_oid[input_ctr], basename,
1055                         start_oid, stop_oid);
1056                 FD_CreateAliasFileEx(NULL, basename, 0,
1057                             options->is_protein, rdfp->filename,
1058                             start_oid, stop_oid, lengths[input_ctr],
1059                             0, NULL, NULL);
1060 
1061                 if (last_oid[input_ctr] > rdfp->stop) {
1062                     rdfp = rdfp->next;
1063                     vol_ctr++;
1064                 } else
1065                     break;
1066             }
1067 
1068             if (span_multiple_rdfp) {
1069                 /* Create wrapper alias file for the corresponding volumes */
1070                 FD_CreateAliasFileEx(NULL, inputs[input_ctr], 0,
1071                         options->is_protein, dblist, 0, 0, lengths[input_ctr],
1072                         0, NULL, NULL);
1073                 span_multiple_rdfp = FALSE;
1074                 ErrLogPrintf("Created wrapper alias file %s for %s\n",
1075                         inputs[input_ctr], dblist);
1076             } else {
1077                 /* Rename the alias file just created */
1078                 sprintf(tmpbuf1, "%s.%cal", basename,
1079                         options->is_protein ? 'p' : 'n');
1080                 sprintf(tmpbuf2, "%s.%cal", inputs[input_ctr],
1081                         options->is_protein ? 'p' : 'n');
1082                 FileRename(tmpbuf1, tmpbuf2);
1083                 ErrLogPrintf("Renamed %s to %s\n", tmpbuf1, tmpbuf2);
1084             }
1085             MemSet(dblist, 0, sizeof(dblist));
1086         }
1087         rdfp_tmp = readdb_destruct(rdfp_tmp);
1088         MemFree(last_oid);
1089         MemFree(inputs);
1090         MemFree(lengths);
1091     }
1092 
1093 #ifdef TAX_CS_LOOKUP
1094     if(dump_args[12].intvalue && options->parse_mode) {
1095         RDTaxLookupClose(options->tax_lookup);
1096     }
1097 #endif
1098 
1099     taxid_tbl = FDBTaxidDeflineTableFree(taxid_tbl);
1100 
1101     if (options->version >= FORMATDB_VER) {
1102         options->linkbit_listp = FDBDestroyLinksTable(options->linkbit_listp);
1103         options->memb_tblp = FDBDestroyMembershipsTable(options->memb_tblp);
1104     }
1105 
1106     ErrLogPrintf("SUCCESS: formatted database %s\n",
1107             options->alias_file_name ?
1108             options->alias_file_name : options->base_name);
1109 
1110     options = FDBOptionsFree(options);
1111 
1112     return 0;
1113 
1114 } /* main()*/
1115 
1116 
1117 
1118