1 static char const rcsid[] = "$Id: formatdb.c,v 6.104 2007/08/21 20:07:49 kans Exp $";
2
3 /*****************************************************************************
4
5
6 PUBLIC DOMAIN NOTICE
7 National Center for Biotechnology Information
8
9 This software/database is a "United States Government Work" under the
10 terms of the United States Copyright Act. It was written as part of
11 the author's official duties as a United States Government employee
12 and thus cannot be copyrighted. This software/database is freely
13 available to the public for use. The National Library of Medicine and
14 the U.S. Government have not placed any restriction on its use or
15 reproduction.
16
17 Although all reasonable efforts have been taken to ensure the accuracy
18 and reliability of the software and data, the NLM and the U.S.
19 Government do not and cannot warrant the performance or results that
20 may be obtained by using this software or data. The NLM and the U.S.
21 Government disclaim all warranties, express or implied, including
22 warranties of performance, merchantability or fitness for any
23 particular purpose.
24
25 Please cite the author in any work or product based on this material.
26
27 ***************************************************************************
28
29 File Name: formatdb.c
30
31 Author: Sergei B. Shavirin
32
33 Version Creation Date: 10/01/96
34
35 $Revision: 6.104 $
36
37 File Description: formats FASTA databases for use by BLAST
38
39 $Log: formatdb.c,v $
40 Revision 6.104 2007/08/21 20:07:49 kans
41 made static functions static, added cast of EFDBCleanOpt to fix CodeWarrior complaints
42
43 Revision 6.103 2007/04/13 13:21:11 madden
44 Add call to ErrSetLogLevel
45
46 Revision 6.102 2006/09/25 19:56:05 camacho
47 Added s_SetDBListNameMultiVolDereference to fix bug when creating alias files over est
48
49 Revision 6.101 2006/06/19 17:20:14 coulouri
50 Extend 1GB default volume size to all platforms and impose a hard limit of 4G. rt#15171398
51
52 Revision 6.100 2006/05/04 20:07:27 camacho
Report fatal error in case of failure to add sequence to BLAST database because
of zero-length sequence and clean up the database that was being created.
55
56 Revision 6.99 2006/03/14 14:36:46 camacho
57 Logging changes
58
59 Revision 6.98 2006/03/08 19:06:12 camacho
60 Moved definition for maximum number of volumes to readdb.[ch], fixes rt ticket 15147600
61
62 Revision 6.97 2005/09/30 14:54:32 camacho
63 Enable recognition of the formatdb configuration file to allow users to set the
64 membership and link bits in the ASN.1 deflines.
65
66 Revision 6.96 2005/07/28 14:52:22 coulouri
67 remove dead code
68
69 Revision 6.95 2005/06/08 19:25:53 camacho
70 New feature to allow formatdb to add taxonomy ids to BLAST databases
71 generated from FASTA input
72 BugzID: 6
73
74 Revision 6.94 2004/08/25 14:47:21 camacho
75 Refactorings to allow formatdb process multiple deflines
76
77 Revision 6.93 2004/06/30 19:52:00 camacho
78 Added #include <blfmtutl.h>
79
80 Revision 6.92 2004/01/29 14:56:44 camacho
81 Removed -A option, FORMATDB_VER_TEXT no longer supported
82
83 Revision 6.91 2003/10/01 18:59:56 camacho
84 Fix to creation of custom databases using a gi list and alias files when the
85 source database spans multiple volumes.
86
87 Revision 6.90 2003/09/12 20:18:55 camacho
88 This change enables the generation of alias files for multiple ASN.1 inputs.
89
90 Revision 6.89 2003/05/30 17:31:09 coulouri
91 add rcsid
92
93 Revision 6.88 2003/05/13 16:02:42 coulouri
94 make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
95
96 Revision 6.87 2003/05/08 16:02:17 camacho
97 Use conditional compilation for reading .formatdbrc
98
99 Revision 6.86 2003/04/03 19:10:59 camacho
100 Fixed typo
101
102 Revision 6.85 2003/03/26 18:51:45 camacho
103
104 1. Minor bug fixes.
105 2. Added eFDBCleanOpt parameter to FDBOptionsNew.
106
107 Revision 6.84 2003/03/20 14:03:02 camacho
108 Allow users to set the membership and link bits
109
110 Revision 6.83 2003/03/11 18:27:45 madden
111 Cast value to Int8 to prevent rollover
112
113 Revision 6.82 2003/02/25 15:53:40 beloslyu
final comma is not allowed after the last element of enum (at least AIX compiler barks)
115
116 Revision 6.81 2003/01/27 20:17:06 camacho
117 Bug fix in alias file creation
118
119 Revision 6.80 2003/01/23 22:01:29 camacho
120 Minor change
121
122 Revision 6.79 2003/01/22 19:44:51 camacho
123
124 1. Added check for number of volumes greater than 100.
125 2. Fixed bug when creating alias files if multiple FASTA inputs are used.
126 3. Implemented correct creation of alias files when creating custom
127 blast databases with a gi list.
128
129 Revision 6.78 2003/01/16 19:46:07 kans
130 changed NULL to 0 to fix Mac compiler error
131
132 Revision 6.77 2003/01/07 17:20:26 camacho
133 Added error message when gi file is not found
134
135 Revision 6.76 2002/12/18 15:16:17 camacho
136 Minor fixes of command-line arguments
137
138 Revision 6.75 2002/12/16 20:22:19 camacho
139 Better error handling when creating alias files
140
141 Revision 6.74 2002/12/13 13:44:50 camacho
142 Use FDBOptionsNew to create options structure
143
144 Revision 6.73 2002/12/02 22:43:42 camacho
145 Added warning message when no sequences are found when creating alias files
146
147 Revision 6.72 2002/11/06 21:27:46 ucko
148 Make 4294967295 explicitly unsigned to avoid warnings.
149
150 Revision 6.71 2002/09/26 02:14:43 camacho
151 Allow limiting the number of sequences per volume
152
153 Revision 6.70 2002/09/25 20:14:20 camacho
154 Fix for multivolume databases with non-parseable seqids
155
156 Revision 6.69 2002/08/09 19:41:25 camacho
157 1) Added blast version number to command-line options
158 2) Added explanations for some default parameters
159
160 Revision 6.68 2002/07/24 20:52:55 coulouri
161 Change database volume size parameter to megabases
162
163 Revision 6.67 2002/04/19 13:10:43 madden
164 Make new database format the default
165
166 Revision 6.66 2002/02/15 22:01:19 beloslyu
167 fix from HP
168
169 Revision 6.65 2001/11/06 15:24:20 dondosha
170 Roll back previous change - it was not needed
171
172 Revision 6.64 2001/11/05 22:14:49 dondosha
173 Allow stdin as input
174
175 Revision 6.63 2001/11/02 19:27:45 camacho
176 Fixed problem that would corrupt the BlastDefLine structures for the new database format
177
178 Revision 6.62 2001/07/12 19:35:51 madden
179 Set alias_file_name
180
181 Revision 6.61 2001/07/06 19:58:57 madden
182 Add NLM_GENERATED_CODE_PROTO and include for fdlobj.h, remove unused variables
183
184 Revision 6.60 2001/06/14 14:18:48 madden
185 Replace FD_MakeAliasFile with FD_CreateAliasFile
186
187 Revision 6.59 2001/06/07 20:52:07 shavirin
188 Fixed problem with truncation of definition lines when TEXT mode is
189 chosen for dump.
190
191 Revision 6.58 2001/06/07 13:13:02 madden
192 Set limits for -v arg, set bases_in_volume to UINT4_MAX if not set
193
194 Revision 6.57 2001/05/17 20:21:46 dondosha
195 Do not add .00 extension when only one volume created
196
197 Revision 6.56 2001/05/11 20:02:35 madden
198 Add oidlist and gifile to FD_CreateAliasFileEx
199
200 Revision 6.55 2001/05/11 18:19:02 madden
201 Add option to output binary gifile
202
203 Revision 6.54 2001/05/10 17:28:33 madden
204 Make gifile optional
205
206 Revision 6.53 2001/05/10 17:21:11 madden
207 Add options to produce alias file from a gifile
208
209 Revision 6.52 2001/05/08 21:56:43 shavirin
210 Added possibility to generate tax_id for every definition in Blast FASTA
211 definition set in ASN.1 structured definition lines.
212
213 Revision 6.51 2001/05/02 16:22:04 dondosha
214 Add NSEQ and LENGTH to alias files in case of multiple inputs to formatdb
215
216 Revision 6.50 2001/04/11 21:00:53 dondosha
217 Made functions FD_CreateAliasFile(Ex) public
218
219 Revision 6.49 2001/04/11 20:45:35 dondosha
220 Moved appending of .00 for the first volume to FormatDBInit function
221
222 Revision 6.48 2001/04/11 20:14:41 dondosha
223 Processing of volumes moved to lower level
224
225 Revision 6.47 2001/03/27 21:11:40 dondosha
226 Allow multiple input files for formatting
227
228 Revision 6.46 2001/02/01 22:25:50 shavirin
229 Added (uncommented) option to create ASN.1 structured deflines in
230 BLAST databases.
231
232 Revision 6.45 2001/01/25 21:08:09 madden
233 Fix an ABR
234
235 Revision 6.44 2000/12/12 23:12:09 shavirin
236 Fixed some FMR error.
237
238 Revision 6.43 2000/12/08 22:34:35 shavirin
239 Added possibility to create Taxonomy lookup database.
240
241 Revision 6.42 2000/11/22 20:53:19 shavirin
242 Added possibility to use Taxonomy client/server for creation of ASN.1
243 structured deflines with taxonomy ID (Using define TAX_CS_LOOKUP).
244
245 Revision 6.41 2000/11/13 21:37:23 madden
246 Use ErrPostEx
247
248 Revision 6.40 2000/11/03 18:13:22 madden
249 Print list of bad characters in FASTA input
250
251 Revision 6.39 2000/09/29 16:40:16 shavirin
252 Fixed problem with multivolume database creation.
253
254 Revision 6.38 2000/09/18 20:46:36 kans
255 added back #include <sqnutils.h>, needed for UseLocalAsnloadDataAndErrMsg
256
257 Revision 6.37 2000/09/12 15:38:35 shavirin
258 Error message level set to SEV_WARNING
259
260 Revision 6.36 2000/07/18 19:32:28 shavirin
261 Added new option -V to enable check for non-unique string ids in the
262 FASTA database. Default is FALSE.
263
264 Revision 6.35 2000/02/17 17:20:59 sicotte
265 Change Calling convention for FastaToSeqEntryForDb
266
267 Revision 6.34 2000/02/04 21:52:58 madden
268 Use FastaToSeqEntryForDb
269
270 Revision 6.33 1999/12/21 18:31:38 madden
271 Fixed bug with writing alias file.
272
273 Revision 6.32 1999/12/17 20:48:54 egorov
274 Fix 'gcc -Wall' warnings and remove old stuff.
275
276 Revision 6.31 1999/12/16 15:53:23 egorov
277 Typo fixed
278
279 Revision 6.30 1999/09/10 16:30:35 shavirin
280 Fixed problems with formating proteins by formatdb
281
282 Revision 6.29 1999/09/09 18:25:51 shavirin
283 Changed way to parse ASN.1. Added possibility to parse
284 delta sequences.
285
286 Revision 6.28 1999/08/25 20:20:27 shavirin
287 Added -s option to create sparse indexes.
288
289 Revision 6.27 1999/08/18 15:00:11 shavirin
290 If title missing from args *.pal file will have basename as title.
291
292 Revision 6.26 1999/08/03 16:38:56 shavirin
293 Added function FD_CreateAliasFile() for multivolume formating.
294
295 Revision 6.24 1999/07/23 18:59:01 shavirin
296 Added support for creation of multivolume databases.
297
298 Revision 6.23 1999/05/13 19:34:19 shavirin
299 More changes towards dump from ID.
300
301 Revision 6.21 1999/05/12 15:46:52 shavirin
302 Changed parameter in function FDBAddSequence().
303
304 Revision 6.20 1999/04/26 21:06:19 shavirin
305 Fixed minor bug.
306
307 Revision 6.19 1999/04/26 19:37:45 shavirin
308 Dumping info set to FALSE.
309
310 Revision 6.18 1999/04/26 14:53:16 shavirin
311 Fixed memory leaks in FDBAddSequence() function.
312
313 Revision 6.17 1999/04/21 21:44:34 shavirin
314 Many functions were moved to "readdb.c" file.
315
316 Revision 6.16 1999/03/21 19:16:59 madden
317 Fix problem on round numbers
318
319 Revision 6.15 1999/03/05 21:34:48 madden
320 Changes for accession.version
321
322 Revision 6.14 1999/02/04 18:01:48 madden
323 Add -n option for basename
324
325 Revision 6.13 1998/11/16 18:34:42 madden
326 Add return-value checks
327
328 Revision 6.12 1998/07/13 15:32:17 egorov
329 make error message more understandable
330
331 Revision 6.10 1998/06/19 21:05:46 egorov
332 Fix MemFree() bug
333
334 Revision 6.9 1998/05/05 13:57:37 madden
335 Print version number to log file
336
337 Revision 6.8 1998/04/20 19:14:05 egorov
338 Fix just one, but huge MLK
339
340 Revision 6.7 1998/02/23 16:49:14 egorov
341 Changes to make the tofasta.c independent on readdb.h
342
343 Revision 6.6 1998/02/18 15:29:31 madden
344 Added const to prototype for FormatdbCreateStringIndex
345
346 Revision 6.5 1998/02/11 18:05:32 madden
347 Changed program to take ASN.1 as input
348
349 Revision 6.3 1997/12/08 21:55:00 madden
350 Parse naked (no bars) as IDs
351
352 Revision 6.2 1997/11/06 18:11:17 madden
353 Added indices for naked gnl|PID and backbone entries
354
355 Revision 6.1 1997/10/30 18:15:08 madden
356 Changes to SeqIdE2Index to allow lookups by accession strings
357
358 Revision 6.0 1997/08/25 18:20:04 madden
359 Revision changed to 6.0
360
361 Revision 1.20 1997/07/28 18:36:55 madden
362 Replaced printf with ErrPostEx and fprintf
363
364 Revision 1.19 1997/07/28 14:35:37 vakatov
365 Added LIBCALLBACK to the ID_Compare() proto
366
367 Revision 1.18 1997/06/10 18:44:11 shavirin
368 Fixed return value from UpdateLookupInfo()
369
370 Revision 1.17 1997/05/19 21:16:30 shavirin
371 Changed content of string index file due to E2Iindex API logic
372
373 Revision 1.16 1997/05/12 19:57:38 shavirin
374 Added additional dump of Accessions/Locuses into string indexes
375
376 Revision 1.15 1997/05/07 21:08:15 madden
377 flipped parse argument default
378
379 Revision 1.14 1997/05/05 17:01:42 shavirin
Added ability to format "non-parsed" seqid-deflines
Removed unused #ifdefs with FASTA_ASN
382
383 * Revision 1.13 1997/05/01 17:31:32 shavirin
384 * Added dumping of 2 more files: String ISAM SeqId index
385 *
386 * Revision 1.12 1997/02/25 22:20:39 shavirin
387 * Changes in accordance to ISAM API changes
388 *
389 * Revision 1.11 1997/02/24 21:22:57 shavirin
390 * Added dump of numeric ISAM information.
391 *
392 * Revision 1.10 1996/12/20 00:31:19 madden
393 * Protected ambiguity data against big/little endian changes.
394 *
395 * Revision 1.9 1996/12/19 16:30:36 madden
396 * Changes to eliminate ".nac" file for nucl.
397 *
398 * Revision 1.8 1996/11/27 16:40:19 madden
399 * Save build date, Make "o" argument FALSE by default.
400 *
401 * Revision 1.7 1996/11/26 20:08:08 madden
402 * BioseqRawConvert(bsp, Seq_code_ncbistdaa); only called for protein alphabets.
403 *
404 * Revision 1.6 1996/11/26 19:52:10 madden
405 * Removed FORMATDB_VER and added readdb.h (which contains same);
406 * Changed phd or nhd to phr or nhr
407 *
408 * Revision 1.5 1996/11/18 20:53:58 shavirin
409 * Forced output protein code to Seq_code_ncbistdaa.
410 *
411 * Revision 1.4 1996/11/06 23:15:34 shavirin
412 * Removed bug with reallocation of index tables
413 *
414
415 *****************************************************************************/
416 #define NLM_GENERATED_CODE_PROTO
417 #include <ncbi.h>
418 #include <tofasta.h>
419 #include <sequtil.h>
420 #include <readdb.h>
421 #include <sqnutils.h>
422 #include <taxblast.h>
423 #include <blastdef.h>
424 #include <mblast.h>
425 #include <fdlobj.h>
426 #include <blfmtutl.h>
427
/* program's arguments */

/* Number of entries in dump_args; handed to GetArgs() in Main together with
 * the table itself. */
#define NUMARG (sizeof(dump_args)/sizeof(dump_args[0]))

/* Command-line argument table.
 *
 * Entries are read by position through the anonymous enum declared right
 * after this table (title_arg, input_arg, ...), so the order of the entries
 * here and the order of the enumerators must be kept in sync.
 *
 * NOTE(review): the Args structure is declared outside this file; judging by
 * the values used here the first string is the help text, the second the
 * default value and the character is the option letter -- confirm against
 * the toolkit header before relying on this.
 */
Args dump_args[] = {
/* title_arg: -t */
{ "Title for database file",
NULL, NULL, NULL, TRUE, 't', ARG_STRING, 0.0, 0, NULL},
/* input_arg: -i (may name several files, see DELIM) */
{"Input file(s) for formatting",
NULL, NULL,NULL,TRUE,'i',ARG_FILE_IN, 0.0,0,NULL},
/* logfile_arg: -l */
{"Logfile name:",
"formatdb.log", NULL,NULL,TRUE,'l',ARG_FILE_OUT, 0.0,0,NULL},
/* is_prot_arg: -p */
{"Type of file\n"
" T - protein \n"
" F - nucleotide",
"T", NULL,NULL,TRUE,'p',ARG_BOOLEAN,0.0,0,NULL},
/* parse_arg: -o */
{"Parse options\n"
" T - True: Parse SeqId and create indexes.\n"
" F - False: Do not parse SeqId. Do not create indexes.\n",
"F", NULL,NULL,TRUE,'o',ARG_BOOLEAN,0.0,0,NULL},
/* asn_arg: -a */
{"Input file is database in ASN.1 format (otherwise FASTA is expected)\n"
" T - True, \n"
" F - False.\n",
"F", NULL,NULL,TRUE,'a',ARG_BOOLEAN,0.0,0,NULL},
/* asnbin_arg: -b */
{"ASN.1 database in binary mode\n"
" T - binary, \n"
" F - text mode.\n",
"F", NULL,NULL,TRUE,'b',ARG_BOOLEAN,0.0,0,NULL},
/* seqentry_arg: -e */
{"Input is a Seq-entry",
"F", NULL ,NULL ,TRUE,'e',ARG_BOOLEAN,0.0,0,NULL},
/* basename_arg: -n */
{ "Base name for BLAST files",
NULL, NULL, NULL, TRUE, 'n', ARG_STRING, 0.0, 0, NULL},
/* dbsize_arg: -v (Main multiplies this value by 1000000 and rejects
 * values above 16000) */
{ "Database volume size in millions of letters",
"4000", NULL, NULL, TRUE, 'v', ARG_INT, 0.0, 0, NULL},
/* sparse_arg: -s */
{ "Create indexes limited only to accessions - sparse",
"F", NULL, NULL, TRUE, 's', ARG_BOOLEAN, 0.0, 0, NULL},
/* nonunique_arg: -V */
{ "Verbose: check for non-unique string ids in the database",
"F", NULL, NULL, TRUE, 'V', ARG_BOOLEAN, 0.0, 0, NULL},
/* alias_fn_arg: -L */
{ "Create an alias file with this name\n"
" use the gifile arg (below) if set to calculate db size\n"
" use the BLAST db specified with -i (above)",
NULL, NULL, NULL, TRUE, 'L', ARG_FILE_OUT, 0.0, 0, NULL},
/* gifile_arg: -F */
{"Gifile (file containing list of gi's)",
NULL, NULL,NULL,TRUE,'F',ARG_FILE_IN, 0.0,0,NULL},
/* bin_gifile_arg: -B */
{"Binary Gifile produced from the Gifile specified above",
NULL, NULL,NULL,TRUE,'B',ARG_FILE_OUT, 0.0,0,NULL},
/* seqid_taxid_file_arg: -T */
{"Taxid file to set the taxonomy ids in ASN.1 deflines",
NULL, NULL,NULL,TRUE,'T',ARG_FILE_IN, 0.0,0,NULL},
#if 0
/* disabled for this release of the NCBI C toolkit */
/* cleanup_arg: -c. While this entry is compiled out, the cleanup_arg
 * enumerator indexes one past the end of the table and must not be used. */
{"Clean up options for new blast database generation\n"
" 0 - Never: Do not clean up any 'basename.*' blast db files.\n"
" 1 - Always: Remove all 'basename.*' blast db files.\n"
" 2 - Prompt: If any 'basename.*' blast db files are found,\n"
" prompt user.\n",
"0", "0","2",TRUE,'c',ARG_INT,0.0,0,NULL},
#endif
};
485
/* Positional indices into dump_args[]. The enumerators must appear in the
 * same order as the entries of that table.
 *
 * cleanup_arg is the 17th enumerator (value 16) but the table currently has
 * only 16 active entries because the matching "-c" entry is inside an
 * "#if 0" block; dump_args[cleanup_arg] is therefore out of bounds until
 * that entry is re-enabled. */
enum {
title_arg,              /* -t */
input_arg,              /* -i */
logfile_arg,            /* -l */
is_prot_arg,            /* -p */
parse_arg,              /* -o */
asn_arg,                /* -a */
asnbin_arg,             /* -b */
seqentry_arg,           /* -e */
basename_arg,           /* -n */
dbsize_arg,             /* -v */
sparse_arg,             /* -s */
nonunique_arg,          /* -V */
alias_fn_arg,           /* -L */
gifile_arg,             /* -F */
bin_gifile_arg,         /* -B */
seqid_taxid_file_arg,   /* -T */
cleanup_arg             /* -c (table entry currently compiled out) */
};
505
/* Delimiter set handed to StringTokMT() when splitting the list of input
 * file names (the -i argument) into individual files. */
#define DELIM " "
508
FDBCheckFastaInputs(CharPtr fasta_files,Int4 is_prot,Int8 bases_per_vol,Int4Ptr num_inputs)509 static Boolean FDBCheckFastaInputs(CharPtr fasta_files, Int4 is_prot, Int8
510 bases_per_vol, Int4Ptr num_inputs)
511 {
512 Int8 predicted_dblength = 0;
513 Char *next_file;
514
515 next_file = StringTokMT(fasta_files, DELIM, &fasta_files);
516 predicted_dblength = FileLength(next_file);
517 *num_inputs = 1;
518
519 while ((next_file = StringTokMT(fasta_files, DELIM, &fasta_files))) {
520 predicted_dblength += FileLength(next_file);
521 (*num_inputs)++;
522 }
523
524 if (bases_per_vol == 0)
525 return TRUE;
526
527 if (!is_prot)
528 predicted_dblength /= READDB_COMPRESSION_RATIO;
529
530 if ((predicted_dblength/bases_per_vol) > (kFDBMaxNumVolumes - 10)) {
531 ErrPostEx(SEV_ERROR, 0, 0, "Using %s bases per volume will exceed "
532 "the maximum number\nof volumes formatdb can create.\n"
533 "Please increase this value or do not set it at all.\n",
534 Nlm_Int8tostr(bases_per_vol, 0));
535 return FALSE;
536 }
537
538 return TRUE;
539 }
540
/** SeqEntryExplore callback that accumulates the lengths of all Bioseqs in a
 * Seq-entry tree.
 *
 * SeqEntryExplore (the toolkit's SeqEntryList traversal) invokes the callback
 * on every node of the tree, Bioseq-sets as well as each of their members.
 * A Bioseq-set node therefore contributes nothing here: its members are
 * visited on their own. Descending into the set from this callback, as an
 * earlier version did, counted the first member of every set once per
 * enclosing level and dereferenced a NULL pointer for an empty set.
 *
 * @param sep    node currently being visited
 * @param data   pointer to the Int8 running total
 * @param index  traversal index supplied by SeqEntryExplore (unused)
 * @param indent nesting depth supplied by SeqEntryExplore (unused)
 */
static void SeqEntryGetLength(SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
{
    Int8* length = (Int8*) data;
    BioseqPtr bsp = NULL;

    if (sep == NULL || length == NULL)
        return;

    /* Only leaf Bioseq nodes carry a sequence length. */
    if (!IS_Bioseq(sep))
        return;

    bsp = (BioseqPtr) sep->data.ptrvalue;
    if (bsp != NULL)
        *length += (Int8)bsp->length;
}
555
556 /** This function ensures that the path listed in the alias file's DBLIST
557 * contains any (relative) paths specified by the user and also do any
558 * necessary dereferences of the alias file contents so that they refer to the
559 * database's underlying name
560 */
561 static void
s_SetDBListNameMultiVolDereference(const ReadDBFILE * rdfp,CharPtr basename_requested_by_user,Int4 rdfp_ctr,Char base_fn[])562 s_SetDBListNameMultiVolDereference(const ReadDBFILE* rdfp,
563 CharPtr basename_requested_by_user,
564 Int4 rdfp_ctr,
565 Char base_fn[])
566 {
567 char* basename_of_volume = FileNameFind(rdfp->filename);
568 char* basename_predicted = NULL;
569
570 /* Calculate the 'predicted' name of the database to go in the DBLIST of
571 * the alias file */
572 sprintf(base_fn, "%s.%02d", basename_requested_by_user, rdfp_ctr);
573 basename_predicted = FileNameFind(base_fn); /* basename thereof */
574
575 /* If the basename of the volume is not the same as the 'predicted'
576 * basename, save the path prefix of the database name provided by the
577 * user and append the volume's basename */
578 if (StringCmp(basename_predicted, basename_of_volume)) {
579 char* dirname = FilePathFind(basename_requested_by_user);
580 if (StringLen(dirname)) {
581 sprintf(base_fn, "%s%c%s", dirname, DIRDELIMCHR,
582 basename_of_volume);
583 } else {
584 StringCpy(base_fn, basename_of_volume);
585 }
586 MemFree(dirname);
587 }
588 }
589
Main(void)590 Int2 Main(void)
591 {
592 SeqEntryPtr sep = NULL;
593 FormatDBPtr fdbp = NULL;
594 FDB_optionsPtr options = NULL;
595 BioseqPtr bsp = NULL;
596 BlastDefLinePtr bdp = NULL;
597 Int2 id_ctr=1;
598 Int4 sequence_count=0;
599 Int4 input_ctr = 0, num_inputfiles = 0;
600 Int8 total_length, *lengths = NULL;
601 CharPtr error_msg=NULL;
602 FILE *fd = NULL;
603 CharPtr next_db = NULL, file_inputs = NULL, orig_ptr = NULL, tmp = NULL;
604 Boolean multiple_inputs = FALSE;
605 Char buf[256] = { '\0' };
606 Int4Ptr last_oid = NULL;
607 CharPtr *inputs = NULL;
608 Char tmpbuf1[PATH_MAX], tmpbuf2[PATH_MAX];
609 FDBTaxidDeflineTable* taxid_tbl = NULL;
610 CharPtr seqid_taxid_file = NULL;
611
612 /* get arguments */
613 StringCpy(buf, "formatdb ");
614 StringNCat(buf, BlastGetVersionNumber(), sizeof(buf)-StringLen(buf));
615 if (!GetArgs(buf, NUMARG, dump_args))
616 return 1;
617
618 if (!SeqEntryLoad())
619 return 1;
620
621 if (!ErrSetLog(dump_args[logfile_arg].strvalue))
622 ErrShow();
623 else
624 ErrSetOpts(ERR_CONTINUE, ERR_LOG_ON);
625 UseLocalAsnloadDataAndErrMsg();
626 ErrSetMessageLevel(SEV_WARNING);
627 ErrSetLogLevel(SEV_WARNING);
628
629 /* Ensure that volume size is within acceptable limits */
630 if (dump_args[dbsize_arg].intvalue > 16000) {
631 ErrPostEx(SEV_FATAL, 1, 0, "Volume size may not exceed 16 gigabases.\n");
632 return 1;
633 }
634
635 /* Parse input string for multiple inputs */
636 file_inputs = StringSave(dump_args[input_arg].strvalue);
637 tmp = StringTokMT(file_inputs, DELIM, &next_db);
638 if (next_db) {
639 if (!dump_args[basename_arg].strvalue) {
640 ErrPostEx(SEV_FATAL, 1, 0, "Database base name must be provided "
641 "with multiple input files\n");
642 return 1;
643 }
644 multiple_inputs = TRUE;
645 /* When formatdb takes multiple inputs and writes one blast database,
646 * it also writes alias files for each of the inputs passed in. Make
647 * sure that none of these inputs has the same name as the basename for
648 * the new database */
649 do {
650 if (!StringCmp(dump_args[basename_arg].strvalue, tmp)) {
651 ErrPostEx(SEV_FATAL, 1, 0, "Database base name cannot have the "
652 "same name as one of the input files\n");
653 return 1;
654 }
655 } while ((tmp = StringTokMT(next_db, DELIM, &next_db)));
656 }
657 MemFree(file_inputs);
658
659 options = FDBOptionsNew(dump_args[input_arg].strvalue,
660 dump_args[is_prot_arg].intvalue,
661 dump_args[title_arg].strvalue,
662 dump_args[asn_arg].intvalue,
663 dump_args[asnbin_arg].intvalue,
664 dump_args[seqentry_arg].intvalue,
665 dump_args[sparse_arg].intvalue,
666 dump_args[nonunique_arg].intvalue,
667 dump_args[parse_arg].intvalue,
668 dump_args[basename_arg].strvalue,
669 dump_args[alias_fn_arg].strvalue,
670 ((Int8)dump_args[dbsize_arg].intvalue)*1000000, 0,
671 FORMATDB_VER, FALSE, (EFDBCleanOpt) 0);
672 if (options == NULL)
673 return 1;
674
675 options->gi_file = StringSave(dump_args[gifile_arg].strvalue);
676 options->gi_file_bin = StringSave(dump_args[bin_gifile_arg].strvalue);
677 orig_ptr = options->db_file;
678 options->db_file = StringTokMT(options->db_file, DELIM, &next_db);
679
680 if (options->gi_file && dump_args[alias_fn_arg].strvalue == NULL &&
681 options->gi_file_bin == NULL) {
682 ErrPostEx(SEV_FATAL, 1,0,"The -F option must be used with either "
683 "the -L or -B option\n");
684 FDBOptionsFree(options);
685 return 1;
686 } else if (dump_args[alias_fn_arg].strvalue && options->gi_file_bin) {
687 ErrPostEx(SEV_FATAL, 1, 0, "The -L and -B options may not be "
688 "specified together");
689 FDBOptionsFree(options);
690 return 1;
691 } else if (options->gi_file_bin && options->gi_file == NULL) {
692 ErrPostEx(SEV_FATAL, 1, 0, "The -B option may not be specified "
693 "without the -F option");
694 FDBOptionsFree(options);
695 return 1;
696 } else if (dump_args[alias_fn_arg].strvalue && options->gi_file == NULL) {
697 ErrPostEx(SEV_FATAL, 1, 0, "The -L option must be specified "
698 "with the -F option\n");
699 FDBOptionsFree(options);
700 return 1;
701 }
702
703
704 /*** Write alias file using a gilist ***/
705 if (options->alias_file_name && options->gi_file) {
706 Int8 nletters=0, nletters_tot=0;
707 Int4 nseqs=0, nseqs_tot=0;
708 CharPtr gifile;
709 Int4 i, gi_list_total, ordinal_id, rdfp_ctr = 0;
710 BlastDoubleInt4Ptr gi_list;
711 ReadDBFILEPtr rdfp, rdfp_tmp;
712 Char alias_fn[PATH_MAX], base_fn[PATH_MAX];
713
714 rdfp = rdfp_tmp = readdb_new(options->db_file, options->is_protein);
715 if (rdfp == NULL) {
716 ErrPostEx(SEV_FATAL, 1, 0, "Unable to open BLAST db %s\n",
717 options->db_file);
718 FDBOptionsFree(options);
719 return 1;
720 }
721 if ((gifile = FindBlastDBFile(options->gi_file)) == NULL) {
722 ErrPostEx(SEV_FATAL, 1, 0, "Unable to find %s\n", options->gi_file);
723 rdfp = readdb_destruct(rdfp);
724 FDBOptionsFree(options);
725 return 1;
726 }
727 gi_list = GetGisFromFile(gifile, &gi_list_total);
728 gifile = MemFree(gifile);
729
730 /* Iterate through the rdfp's (there might be many) */
731 for (; rdfp; rdfp = rdfp->next, rdfp_ctr++) {
732
733 /* Isolate the current rdfp so that we restrict the search for gis
734 * to this volume only! */
735 ReadDBFILEPtr next = rdfp->next;
736 rdfp->next = NULL;
737
738 nseqs = nletters = 0;
739 for (i = 0; i < gi_list_total; i++) {
740 ordinal_id = readdb_gi2seq(rdfp, gi_list[i].gi, NULL);
741 if (ordinal_id >= 0) {
742 nseqs++;
743 nletters += readdb_get_sequence_length(rdfp, ordinal_id);
744 }
745 }
746 rdfp->next = next; /* restore the next rdfp */
747
748 sprintf(alias_fn, "%s.%02d", options->alias_file_name, rdfp_ctr);
749 /* For the base name (DBLIST field in alias file) append the
750 * volume number if there are multiple volumes in this database */
751 if (rdfp->next || rdfp_ctr != 0) {
752 s_SetDBListNameMultiVolDereference(rdfp, options->base_name,
753 rdfp_ctr, base_fn);
754 } else {
755 StringCpy(base_fn, options->base_name);
756 }
757 FD_CreateAliasFileEx(options->db_title, alias_fn, 0,
758 options->is_protein, base_fn, 0, 0,
759 nletters, nseqs, NULL, options->gi_file);
760 ErrLogPrintf("Created %s alias file with %ld sequences, %s %s\n",
761 alias_fn, nseqs, Nlm_Int8tostr(nletters,0),
762 options->is_protein ? "residues" : "bases");
763 nseqs_tot += nseqs; nletters_tot += nletters;
764
765 }
766 /* Sanity check: Don't write 'ghost' alias files */
767 if (nletters_tot == 0 || nseqs_tot == 0) {
768 ErrPostEx(SEV_FATAL, 1, 0, "No gis from %s were found in the %s "
769 "database", options->gi_file, options->db_file);
770 for (i = 0; i <= rdfp_ctr; i++) {
771 sprintf(alias_fn,"%s.%02d.%cal", options->alias_file_name, i,
772 options->is_protein ? 'p' : 'n');
773 FileRemove(alias_fn);
774 }
775 FDBOptionsFree(options);
776 gi_list = MemFree(gi_list);
777 rdfp_tmp = readdb_destruct(rdfp_tmp);
778 return 1;
779 }
780 /* Adjust alias files if necessary */
781 if (rdfp_ctr == 1) { /* single volume database */
782 sprintf(tmpbuf1, "%s.%cal", alias_fn,
783 options->is_protein ? 'p' : 'n');
784 sprintf(tmpbuf2, "%s.%cal", options->alias_file_name,
785 options->is_protein ? 'p' : 'n');
786 FileRename(tmpbuf1, tmpbuf2);
787 ErrLogPrintf("SUCCESS: Renamed %s to %s\n", tmpbuf1, tmpbuf2);
788 } else { /* multi-volume database */
789 Char *p = FD_ConstructMultivolumeDBList(options->alias_file_name,
790 rdfp_ctr);
791 /* Create wrapper alias file
792 * Note that the total number of sequences and letters is not needed
793 * because these will be calculated by readdb when reading the alias
794 * files.
795 */
796 FD_CreateAliasFileEx(options->db_title, options->alias_file_name,
797 0, options->is_protein, p, 0, 0, 0, 0, NULL, NULL);
798 ErrLogPrintf("SUCCESS: Created wrapper alias file %s for %s\n",
799 options->alias_file_name, p);
800 p = MemFree(p);
801 }
802
803 gi_list = MemFree(gi_list);
804 FDBOptionsFree(options);
805 rdfp_tmp = readdb_destruct(rdfp_tmp);
806 return 0;
807 } else if (options->gi_file_bin && options->gi_file) {
808 /*** Convert text gi list to binary format ***/
809 Int4 ngis;
810 ngis = readdb_MakeGiFileBinary(options->gi_file, options->gi_file_bin);
811 ErrLogPrintf("SUCCESS: Converted %ld gi(s) to binary format on %s\n",
812 ngis, options->gi_file_bin);
813 FDBOptionsFree(options);
814 return 0;
815 }
816
817 #ifdef TAX_CS_LOOKUP
818 if(dump_args[12].intvalue && options->parse_mode) {
819 /* These functions will create taxonomy lookup database */
820 options->tax_lookup = RDTaxLookupInit();
821 options->tax_callback = FDBTaxCallback;
822 }
823 #endif
824
825 /*** Make sure that the inputs will not create too many volumes ***/
826 if (!FDBCheckFastaInputs(dump_args[input_arg].strvalue,
827 options->is_protein,
828 options->bases_in_volume, &num_inputfiles))
829 return 1;
830
831 /* Allocate last_oid to keep track of the last ordinal ids that each
832 * of the input files had */
833 if (multiple_inputs) {
834 ASSERT(num_inputfiles > 0);
835 last_oid = (Int4Ptr) MemNew(num_inputfiles*sizeof(Int4));
836 inputs = (CharPtr *) MemNew(num_inputfiles*sizeof(CharPtr));
837 lengths = (Int8Ptr) MemNew(num_inputfiles*sizeof(Int8));
838 if (!last_oid || !inputs || !lengths) {
839 ErrPostEx(SEV_ERROR, 0, 0, "Out of memory");
840 FDBOptionsFree(options);
841 return 1;
842 }
843 }
844
845 /* Initialize formatdb structure */
846 if ((fdbp = FormatDBInit(options)) == NULL)
847 return 2;
848
849 /* Allow users to set their own membership and link bits using a
850 * .formatdbrc file. Useful for formatting purposes */
851 if (options->version >= FORMATDB_VER) {
852 options->linkbit_listp = FDBLoadLinksTable();
853 options->memb_tblp = FDBLoadMembershipsTable();
854 }
855
856 /* Process the optional seqid/taxid pair input file */
857 seqid_taxid_file = dump_args[seqid_taxid_file_arg].strvalue;
858 taxid_tbl = FDBTaxidDeflineTableNew(seqid_taxid_file);
859 if ( !taxid_tbl && !StringHasNoText(seqid_taxid_file) ) {
860 ErrPostEx(SEV_ERROR, 0, 0, "Failed to read taxonomy data from %s",
861 seqid_taxid_file);
862 FDBOptionsFree(options);
863 return 1;
864 }
865
866 /* Loop on input files */
867 while (options->db_file) {
868 total_length = 0;
869 /* Input database file maybe either in ASN.1 or in FASTA format */
870 if (!options->isASN) {
871 /* FASTA format of input database */
872
873 if((fd = FileOpen(options->db_file, "r")) == NULL) {
874 ErrPostEx(SEV_ERROR, 0, 0, "Could not open %s\n", options->db_file);
875 return 3;
876 }
877
878 /* Get sequences */
879 while ((sep = FastaToSeqEntryForDb(fd,
880 (Boolean)!options->is_protein,
881 &error_msg, options->parse_mode, options->base_name, &id_ctr,NULL)) != NULL) {
882
883 if(!IS_Bioseq(sep)) { /* Not Bioseq - failure */
884 ErrLogPrintf("Error in readind Bioseq Formating failed.\n");
885 return 4;
886 }
887
888 SeqEntrySetScope(sep);
889 bsp = (BioseqPtr) sep->data.ptrvalue;
890
891 total_length += bsp->length;
892 sequence_count++;
893
894 if (error_msg) {
895 Char buffer[42];
896 SeqIdWrite(bsp->id, buffer, PRINTID_FASTA_LONG, 41);
897 ErrPostEx(SEV_WARNING, 0, 0, "Sequence number %ld (%s), %s\n",
898 sequence_count, buffer, error_msg);
899 error_msg = MemFree(error_msg);
900 }
901
902 bdp = FDBGetDefAsnFromBioseq(bsp, taxid_tbl);
903 if ( FDBAddBioseq(fdbp, bsp, bdp) ) {
904 options->clean_opt = eCleanAlways;
905 FDBCleanUp(options);
906 ErrPostEx(SEV_FATAL, 1, 0,
907 "Fatal error when adding sequence to BLAST database.");
908 return 1;
909 }
910 bdp = BlastDefLineSetFree(bdp);
911 SeqEntryFree(sep);
912 }
913
914 FileClose(fd);
915
916 /* Writing multi-volume pointer file */
917 FD_MakeAliasFile(options);
918
919 } else {
920 /* ASN.1 format of input database */
921 AsnTypePtr atp, atp2;
922 AsnModulePtr amp;
923
924 if (! SeqEntryLoad())
925 ErrShow();
926
927 /* get pointer to all loaded ASN.1 modules */
928 amp = AsnAllModPtr();
929
930 if (amp == NULL) {
931 ErrLogPrintf("Could not load ASN.1 modules.\n");
932 return 5;
933 }
934
935 /* get the initial type pointers */
936
937 atp = AsnFind("Bioseq-set");
938 if (atp == NULL) {
939 ErrLogPrintf("Could not get type pointer for Bioseq-set.\n");
940 return 6;
941 }
942
943 atp2 = AsnFind("Bioseq-set.seq-set.E");
944 if (atp2 == NULL) {
945 ErrLogPrintf("Could not get type pointer for Bioseq-set.seq-set.E\n");
946 return 7;
947 }
948
949 if ((fdbp->aip = AsnIoOpen (options->db_file,
950 options->asnbin ? "rb":"r")) == NULL) {
951 ErrLogPrintf("Cannot open input database file. Formating failed...\n");
952 return 8;
953 }
954
955 if (options->is_seqentry) {
956 Int8 len = 0;
957 /* Seq entry */
958 sep = SeqEntryAsnRead(fdbp->aip, NULL);
959 FDBAddSeqEntry(fdbp, sep);
960 SeqEntryExplore(sep, (Pointer)&len, SeqEntryGetLength);
961 SeqEntryFree(sep);
962 sequence_count++;
963 total_length += len;
964 } else {
965 /* Bioseq-set */
966
967 while ((atp = AsnReadId(fdbp->aip, amp, atp)) != NULL) {
968 if (atp == atp2) { /* top level Seq-entry */
969 Int8 len = 0;
970 sep = SeqEntryAsnRead(fdbp->aip, atp);
971
972 FDBAddSeqEntry(fdbp, sep);
973 SeqEntryExplore(sep, (Pointer)&len, SeqEntryGetLength);
974 SeqEntryFree(sep);
975 sequence_count++;
976 total_length += len;
977 } else {
978 AsnReadVal(fdbp->aip, atp, NULL);
979 }
980 }
981 } /* end "if Bioseq or Bioseq-set */
982
983
984 } /* end "if FASTA or ASN.1" */
985
986 if (multiple_inputs) {
987 /* record the ordinal ids, input file names and input file lengths
988 * in the following arrays. This will be used later to
989 * create the multiple alias files in case that the multiple input
990 * results in a multi-volume database */
991 inputs[input_ctr] = options->db_file;
992 lengths[input_ctr] = total_length;
993 last_oid[input_ctr++] = sequence_count;
994 }
995 options->db_file = StringTokMT(next_db, DELIM, &next_db);
996
997 } /* Loop on input files */
998
999 /* Dump indexes, deallocate structure, arrays, etc. */
1000
1001 if(FormatDBClose(fdbp))
1002 return 9;
1003 options->db_file = orig_ptr;
1004
1005 /* If multiple inputs were given, create an alias file for each of the
1006 * fasta file inputs */
1007 if (multiple_inputs) {
1008 ReadDBFILEPtr rdfp, rdfp_tmp;
1009 Boolean span_multiple_rdfp = FALSE, first_time = TRUE;
1010 Int4 start_oid = 1, stop_oid, vol_ctr = 0;
1011 Char basename[PATH_MAX]; /* save the name of individual alias files */
1012 Char dblist[PATH_MAX]; /* save the name(s) of all alias files when an
1013 input spans multiple volumes */
1014
1015 ASSERT(input_ctr == num_inputfiles);
1016 rdfp = rdfp_tmp = readdb_new(dump_args[basename_arg].strvalue,
1017 options->is_protein);
1018 if (rdfp == NULL) {
1019 ErrPostEx(SEV_FATAL, 1, 0,
1020 "Cannot create alias files for multiple inputs");
1021 FDBOptionsFree(options);
1022 return 1;
1023 }
1024
1025 /* For each rdfp write the corresponding alias file(s) */
1026 ErrLogPrintf("\nCreating alias files for multiple FASTA inputs...\n");
1027 input_ctr = 0;
1028 MemSet(dblist, 0, sizeof(dblist));
1029 MemSet(basename, 0, sizeof(basename));
1030
1031 for (; input_ctr < num_inputfiles; input_ctr++) {
1032 while (rdfp) {
1033
1034 sprintf(basename, "%s.%02d", inputs[input_ctr], vol_ctr);
1035
1036 if (first_time || span_multiple_rdfp) {
1037 start_oid = 1;
1038 first_time = FALSE;
1039 } else
1040 start_oid = last_oid[input_ctr-1] - rdfp->start+1;
1041
1042 if (last_oid[input_ctr] > (rdfp->stop+1)) {
1043 stop_oid = rdfp->stop+1;
1044 span_multiple_rdfp = TRUE;
1045 StringCat(dblist, basename); StringCat(dblist, " ");
1046 } else if (span_multiple_rdfp) {
1047 StringCat(dblist, basename); StringCat(dblist, " ");
1048 stop_oid = last_oid[input_ctr] - rdfp->start;
1049 } else {
1050 stop_oid = last_oid[input_ctr] - rdfp->start;
1051 }
1052
1053 ErrLogPrintf("Input %s (up to %ld) alias file %s (%ld-%ld)\n",
1054 inputs[input_ctr], last_oid[input_ctr], basename,
1055 start_oid, stop_oid);
1056 FD_CreateAliasFileEx(NULL, basename, 0,
1057 options->is_protein, rdfp->filename,
1058 start_oid, stop_oid, lengths[input_ctr],
1059 0, NULL, NULL);
1060
1061 if (last_oid[input_ctr] > rdfp->stop) {
1062 rdfp = rdfp->next;
1063 vol_ctr++;
1064 } else
1065 break;
1066 }
1067
1068 if (span_multiple_rdfp) {
1069 /* Create wrapper alias file for the corresponding volumes */
1070 FD_CreateAliasFileEx(NULL, inputs[input_ctr], 0,
1071 options->is_protein, dblist, 0, 0, lengths[input_ctr],
1072 0, NULL, NULL);
1073 span_multiple_rdfp = FALSE;
1074 ErrLogPrintf("Created wrapper alias file %s for %s\n",
1075 inputs[input_ctr], dblist);
1076 } else {
1077 /* Rename the alias file just created */
1078 sprintf(tmpbuf1, "%s.%cal", basename,
1079 options->is_protein ? 'p' : 'n');
1080 sprintf(tmpbuf2, "%s.%cal", inputs[input_ctr],
1081 options->is_protein ? 'p' : 'n');
1082 FileRename(tmpbuf1, tmpbuf2);
1083 ErrLogPrintf("Renamed %s to %s\n", tmpbuf1, tmpbuf2);
1084 }
1085 MemSet(dblist, 0, sizeof(dblist));
1086 }
1087 rdfp_tmp = readdb_destruct(rdfp_tmp);
1088 MemFree(last_oid);
1089 MemFree(inputs);
1090 MemFree(lengths);
1091 }
1092
1093 #ifdef TAX_CS_LOOKUP
1094 if(dump_args[12].intvalue && options->parse_mode) {
1095 RDTaxLookupClose(options->tax_lookup);
1096 }
1097 #endif
1098
1099 taxid_tbl = FDBTaxidDeflineTableFree(taxid_tbl);
1100
1101 if (options->version >= FORMATDB_VER) {
1102 options->linkbit_listp = FDBDestroyLinksTable(options->linkbit_listp);
1103 options->memb_tblp = FDBDestroyMembershipsTable(options->memb_tblp);
1104 }
1105
1106 ErrLogPrintf("SUCCESS: formatted database %s\n",
1107 options->alias_file_name ?
1108 options->alias_file_name : options->base_name);
1109
1110 options = FDBOptionsFree(options);
1111
1112 return 0;
1113
1114 } /* main()*/
1115
1116
1117
1118