1 /*
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 */
26 
27 /*****************************************************************************
28 
29 File name: readdb.h
30 
31 Author: Tom Madden
32 
33 Contents: defines and prototypes used by readdb.c and formatdb.c.
34 
35 ******************************************************************************/
36 
37 /*
38 * File Name: readdb.h
39 *
40 * Author: Tom Madden
41 *
42 * Version Creation Date:   3/21/95
43 *
44 * $Revision: 6.180 $
45 *
46 * File Description:
47 *       Functions to rapidly read databases from files produced by formatdb.
48 *
49 * Modifications:
50 * --------------------------------------------------------------------------
51 * Date     Name        Description of modification
52 * -------  ----------  -----------------------------------------------------
53 *
54 * ==========================================================================
55 *
56 *
57 * RCS Modification History:
58 * $Log: readdb.h,v $
59 * Revision 6.180  2009/01/15 15:20:35  madden
60 * Add prototype for readdb_check_oid
61 *
62 * Revision 6.179  2007/09/27 17:20:54  madden
63 * Add readdb_get_full_filename
64 *
65 * Revision 6.178  2007/08/21 20:06:07  kans
66 * added prototype for FDBCleanUp
67 *
68 * Revision 6.177  2007/05/08 13:09:39  madden
69 * Add ability to read STATS_NSEQ and STATS_TOTLEN from alias file with funciton readdb_get_stats_numbers
70 *
71 * Revision 6.176  2006/08/07 15:03:57  camacho
72 * +is_REFSEQ_GENOMIC
73 *
74 * Revision 6.175  2006/07/03 18:27:22  coulouri
75 * correct volume size defaults for protein databases
76 *
77 * Revision 6.174  2006/06/27 15:34:18  coulouri
78 * Correct comment and default volume size
79 *
80 * Revision 6.173  2006/06/19 18:37:08  coulouri
81 * improve default handling for non-formatdb clients
82 *
83 * Revision 6.172  2006/06/19 17:20:14  coulouri
84 * Extend 1GB default volume size to all platforms and impose a hard limit of 4G. rt#15171398
85 *
86 * Revision 6.171  2006/05/10 22:00:28  kans
87 * new function prototypes were erroneously inside ifdef HAVE_MADVISE block, moved outside
88 *
89 * Revision 6.170  2006/05/10 20:48:57  camacho
90 * From Ilya Dondoshansky: 1. Several FDB functions made public - needed for incremental dump efficiency; 2. mol field added to SI_Record and DI_Record
91 *
92 * Revision 6.169  2006/04/24 15:50:19  camacho
93 * + is_REFSEQ_RNA
94 *
95 * Revision 6.168  2006/03/16 14:14:24  camacho
96 * Fix parsing of locations for fastacmd command line argument (rt # 15151399)
97 *
98 * Revision 6.167  2006/03/09 21:56:02  camacho
99 * Refactored sequence hash function
100 *
101 * Revision 6.166  2006/03/08 19:06:14  camacho
102 * Added definition for maximum number of volumes, fixes rt ticket 15147600
103 *
104 * Revision 6.165  2006/02/15 21:07:29  camacho
105 * Add validation to fastacmd to reject mixed protein/nucleotide databases
106 *
107 * Revision 6.164  2005/10/04 20:40:50  madden
108 * Make PrintDbInformationBasicEx public
109 *
110 * Revision 6.163  2005/10/04 15:44:54  madden
111 * Workaround to time-out problem of PrintDbInformationWithRID
112 *
113 * Revision 6.162  2005/07/28 14:57:10  coulouri
114 * remove dead code
115 *
116 * Revision 6.161  2005/07/27 21:30:02  camacho
117 * 1) Replaces is_REFSEQ_* functions by a single function (is_REFSEQ), to be
118 * used by genmask and ID1 group's BLAST database dumper.
119 * 2) Removed out-of-date is_WGS* functions.
120 *
121 * Revision 6.160  2005/07/27 17:48:57  coulouri
122 * remove hardcoded paths
123 *
124 * Revision 6.159  2005/06/22 13:55:22  coulouri
125 * add support for dumping accessions
126 *
127 * Revision 6.158  2005/06/08 19:25:36  camacho
128 * New feature to allow formatdb to add taxonomy ids to BLAST databases
129 * generated from FASTA input
130 * BugzID: 6
131 *
132 * Revision 6.157  2005/02/22 14:15:48  camacho
133 * Pass bioseq data type by reference to FDBAddBioseq
134 *
135 * Revision 6.156  2004/12/04 03:41:09  camacho
136 * Add extra enum for fastacmd -D option for error checking
137 *
138 * Revision 6.155  2004/12/03 04:57:57  camacho
139 * Fix name conflict in enumeration for fastacmd dump types
140 *
141 * Revision 6.154  2004/12/02 20:37:31  camacho
142 * + fastacmd feature to dump list of gis
143 *
144 * Revision 6.153  2004/09/27 16:29:34  madden
145 * Make title on SI_Record dynamically allocated
146 *
147 * Revision 6.152  2004/08/25 14:45:23  camacho
148 * Refactorings to allow formatdb process multiple deflines
149 *
150 * Revision 6.151  2004/07/14 18:35:12  camacho
151 * Added comments for readdb_get_header_ex
152 *
153 * Revision 6.150  2004/07/09 17:09:12  camacho
154 * Updated documentation for last_oid_assigned
155 *
156 * Revision 6.149  2004/07/08 19:49:03  camacho
157 * Contributions from ID1 Group:
158 * 1) SI_Record structure.
159 * 2) Refactoring of FDBAddSequence2 to allow addition of non-redundant sequences
160 * when creating BLAST databases.
161 *
162 * Revision 6.148  2004/06/29 20:59:23  camacho
163 * Added last_oid_assigned to ReadDBSharedInfo structure
164 *
165 * Revision 6.147  2004/04/16 18:14:50  camacho
166 * Made division field in DI_Record larger
167 *
168 * Revision 6.146  2004/02/24 14:06:01  camacho
169 * Added support for approximate sequence length calculation for nucleotide
170 * sequences.
171 *
172 * Revision 6.145  2004/02/04 15:35:05  camacho
173 * Rollback to fix problems in release 2.2.7
174 *
175 * Revision 6.143  2003/07/08 18:42:40  camacho
176 * Elaborated fastacmd return values
177 *
178 * Revision 6.142  2003/06/13 19:56:48  dondosha
179 * Removed unneeded argument in FastaToBlastDB
180 *
181 * Revision 6.141  2003/04/25 18:55:27  camacho
182 * 1. Added readdb_merge_gifiles to deal with Microbial blast database issues.
183 * 2. Minor fixes to Int4List functions.
184 *
185 * Revision 6.140  2003/04/22 21:30:14  camacho
186 * Added Int4 list utilities
187 *
188 * Revision 6.139  2003/04/22 19:04:57  camacho
189 * Moved GiList structure to generic list of 4-byte integers
190 *
191 * Revision 6.138  2003/04/16 15:39:37  coulouri
192 * fix compiler warning
193 *
194 * Revision 6.137  2003/04/15 19:09:13  camacho
195 * Completed implementation of PIG interface
196 *
197 * Revision 6.136  2003/04/10 15:11:37  camacho
198 * Include PIG interface in __cplusplus
199 *
200 * Revision 6.135  2003/04/09 21:46:00  camacho
201 * Added basic PIG interface
202 *
203 * Revision 6.134  2003/04/08 19:45:35  camacho
204 * Defined invalid PIG
205 *
206 * Revision 6.133  2003/04/08 15:37:15  camacho
207 * Extended FDBAddSequence2 to take pig
208 *
209 * Revision 6.132  2003/04/01 21:51:36  camacho
210 * Made fastacmd functions & structure non-static
211 *
212 * Revision 6.131  2003/03/27 22:26:04  camacho
213 * Add error messages and non-zero return value on error for fastacmd
214 *
215 * Revision 6.130  2003/03/26 19:11:22  camacho
216 * Minor change to previous commit
217 *
218 * Revision 6.129  2003/03/26 18:50:07  camacho
219 * Added eFDBCleanOpt to formatdb API
220 *
221 * Revision 6.128  2003/01/30 21:57:28  camacho
222 * Added more detailed comment to readdb_new_ex2
223 *
224 * Revision 6.127  2003/01/22 19:41:21  camacho
225 * Added function to build multi-volume db list for creating alias files
226 *
227 * Revision 6.126  2002/12/20 14:37:34  coulouri
228 * Fix prototype for RDBTaxInfoInit()
229 *
230 * Revision 6.125  2002/12/17 20:33:25  camacho
231 * Removed unnecessary function attribute
232 *
233 * Revision 6.124  2002/12/16 20:22:48  camacho
234 * Removed unused options in formatdb options structure
235 *
236 * Revision 6.123  2002/12/16 05:01:55  camacho
237 * Fixes to previous commit
238 *
239 * Revision 6.122  2002/12/13 13:43:25  camacho
240 * Changes to set links and membership bits in formatdb API
241 *
242 * Revision 6.121  2002/11/25 17:23:28  camacho
243 * 1) Changed file access to blast taxonomy databases: only 2 files are loaded
244 *    for an entire chain of rdfp's.
245 * 2) Fixed memory leak in FindBlastDBFile.
246 * 3) Protect NlmOpenMFILE against NULL argument.
247 *
248 * Revision 6.120  2002/10/25 16:49:45  camacho
249 * Added Michael Kimelman's FDBAddSequence2
250 *
251 * Revision 6.119  2002/10/03 14:13:44  camacho
252 * Added support for gilist field in alias file in multivolume databases
253 *
254 * Revision 6.118  2002/09/26 02:14:42  camacho
255 * Allow limiting the number of sequences per volume
256 *
257 * Revision 6.117  2002/09/25 20:14:20  camacho
258 * Fix for multivolume databases with non-parseable seqids
259 *
260 * Revision 6.116  2002/07/30 15:28:50  camacho
261 * Added fastacmd function to parse SeqLocs
262 *
263 * Revision 6.115  2002/07/29 15:45:19  camacho
264 * Made readdb_get_taxnames a LIBCALL function
265 *
266 * Revision 6.114  2002/07/24 19:31:48  raytseli
267 * much simpler and more efficient approach to using madvise()
268 * .
269 *
270 * Revision 6.113  2002/07/22 13:06:42  raytseli
271 * explicitly allow setting of the advice type for madvise()
272 * .
273 *
274 * Revision 6.112  2002/07/18 17:39:54  raytseli
275 * changed ifdef OS_UNIX_SUN to ifdef OS_UNIX_SOL for madvise()
276 * .
277 *
278 * Revision 6.111  2002/07/18 15:54:26  raytseli
279 * added function to explicitly set madvise() block size, and madvise() sync mode.
280 *
281 * Revision 6.110  2002/07/18 15:01:54  raytseli
282 * correct problem with pointer format "%p" ErrPostEx() handling on linux.
283 * Add extern func to allow explicit madvise() functionality activation.
284 *
285 * Revision 6.109  2002/07/17 17:15:06  raytseli
286 * only allow madvise()-related stuff on SUN or Linux.
287 * .
288 *
289 * Revision 6.108  2002/07/17 16:54:54  raytseli
290 * additional #ifdefs to allow compilation.
291 *
292 * Revision 6.106  2002/07/17 14:36:54  raytseli
293 * incorporated madvise into readdb
294 * .
295 *
296 * Revision 6.105  2002/07/14 21:02:08  camacho
297 * Added extra features to fastacmd
298 *
299 * Revision 6.104  2002/07/09 16:41:52  camacho
300 * Made taxonomy databases multi-thread safe
301 *
302 * Revision 6.103  2002/06/26 00:45:37  camacho
303 *
304 * Added readdb_get_totals_ex2 to allow recalculation of database length as
305 * well as total number of sequences after the virtual oidlist has been
306 * created.
307 *
308 * Revision 6.102  2002/06/04 21:45:39  dondosha
309 * Corrected the readdb_get_sequence_number function in case of multiple-volume databases
310 *
311 * Revision 6.101  2002/06/04 20:22:56  camacho
312 * Fixed taxonomy databases to work w/o mmap
313 *
314 * Revision 6.100  2002/05/15 20:23:47  camacho
315 * Added wgs_{mouse,anthrax} criteria functions
316 *
317 * Revision 6.99  2002/05/02 21:52:06  camacho
318 * Support for genmask's new month/subset mask combinations
319 *
320 * Revision 6.98  2002/04/18 19:35:07  camacho
321 * 1. Added fdfilter/genmask callbacks for wgs subsets
322 * 2. Modified fdfilter/genmask refseq_protein callback function
323 * 3. Fixed problem in readdb_read_alias_file to read multiple oidlists
324 *
325 * Revision 6.97  2002/03/08 16:58:50  camacho
326 * Added accessions to dump info files *.[pn]di
327 *
328 * Revision 6.96  2002/01/25 17:06:57  camacho
329 * Added new criteria to create new refseq databases
330 *
331 * Revision 6.95  2002/01/24 18:47:48  camacho
332 * Moved RDBTaxNamesFree from readdb.[ch] to txalign.[ch]
333 *
334 * Revision 6.94  2002/01/11 19:22:26  camacho
335 * 1. Added preferred_gi field to ReadDBFILE structure.
336 * 2. Modified FDReadDeflineAsn to return the preferred gi as the
337 *    first element of the list of BlastDefLine structures (if set).
338 *
339 * Revision 6.93  2001/12/18 13:01:51  camacho
340 * Added new flag -D to dump blast database in FASTA format
341 *
342 * Revision 6.92  2001/12/10 19:17:13  camacho
343 * Added option to allow fastacmd to use Ctrl-As as defline separators.
344 *
345 * Revision 6.91  2001/11/09 19:05:35  dondosha
346 * ReadDBFreeSharedInfo and ReadDBOpenMHdrAndSeqFiles made static in readdb.c
347 *
348 * Revision 6.90  2001/11/02 18:30:12  dondosha
349 * Added prototypes for readdb_get_sequence_number, PrintDbInformationWithRID
350 *
351 * Revision 6.89  2001/10/19 13:40:31  camacho
352 * Updated the DI_Record structure and moved some function prototypes to allow their use by fdfilter
353 *
354 * Revision 6.88  2001/10/01 18:43:37  camacho
355 * Added BlastDBToFasta function
356 * Added readdb_get_header_ex function
357 *
358 * Revision 6.87  2001/10/01 18:37:32  camacho
359 * readdb.h
360 *
361 * Revision 6.86  2001/07/12 19:27:45  madden
362 * Add alias_file_name to Options
363 *
364 * Revision 6.85  2001/06/21 18:27:28  shavirin
365 * Moved into files txalign.[c,h] functions returning taxonomy names
366 * from Bioseq created from Blast database.
367 *
368 * Revision 6.84  2001/06/14 16:22:46  madden
369 * Add prototype for FD_MakeAliasFile
370 *
371 * Revision 6.83  2001/05/23 21:17:24  shavirin
372 * Added definitions for bits related to sequence-to-database affiliation.
373 *
374 * Revision 6.82  2001/05/11 19:59:41  madden
375 * Add gi_file_bin to FDOptions, oidlist and gifile to FD_CreateAliasFileEx
376 *
377 * Revision 6.81  2001/05/10 17:19:53  madden
378 * Add number_seqs arg to FD_CreateAliasFileEx
379 *
380 * Revision 6.80  2001/05/08 21:58:28  shavirin
381 * Added possibility to generate tax_id for every definition in Blast FASTA
382 * definition set in ASN.1 structured definition lines.
383 *
384 * Revision 6.79  2001/05/02 16:22:05  dondosha
385 * Add NSEQ and LENGTH to alias files in case of multiple inputs to formatdb
386 *
387 * Revision 6.78  2001/04/11 21:00:53  dondosha
388 * Made functions FD_CreateAliasFile(Ex) public
389 *
390 * Revision 6.77  2001/04/11 20:14:06  dondosha
391 * Added volume information to FDB_options structure
392 *
393 * Revision 6.76  2001/03/29 20:15:59  madden
394 * Removed unneeded #define
395 *
396 * Revision 6.75  2001/03/23 17:23:54  madden
397 * Move FDGetDeflineAsnFromBioseq to txalign.[ch]
398 *
399 * Revision 6.74  2001/02/05 18:52:01  shavirin
400 * Blast database size was changed from Uint4 to Uint8 - this corrected
401 * invalidly printed database size for large databases.
402 *
403 * Revision 6.73  2000/12/12 23:14:42  shavirin
404 * Added functions to initialize taxonomy names database and search functions
405 * to get all taxonomy names given tax_id using this database.
406 *
407 * Revision 6.72  2000/12/08 22:25:01  shavirin
408 * Added code for creation Taxonomy lookup database using formatdb API.
409 *
410 * Revision 6.71  2000/11/28 18:20:40  madden
411 * Comments from Sergei on FDB_options
412 *
413 * Revision 6.70  2000/11/24 15:41:58  shavirin
414 * Added parameter tax_id into function FDBAddBioseq().
415 *
416 * Revision 6.69  2000/11/22 19:52:44  shavirin
417 * Added definition of the new function FDGetDeflineAsnFromBioseq()
418 *
419 * Revision 6.68  2000/10/26 18:30:50  dondosha
420 * Added gifile member to ReadDBFILE structure
421 *
422 * Revision 6.67  2000/10/13 17:31:52  shavirin
423 * Adjusted calls to readdb_get_header for ASN.1 structured deflines.
424 *
425 * Revision 6.66  2000/09/29 16:38:30  shavirin
426 * Added new function FDB_FreeCLOptions(FDB_optionsPtr options).
427 *
428 * Revision 6.65  2000/09/16 15:20:17  shavirin
429 * Added AsnIoPtr structure for ASN.1 structured deflines.
430 *
431 * Revision 6.64  2000/09/07 20:49:58  shavirin
432 * Added parameters to support ASN.1 defline dump for blast db. FORMATDB_VER 3->4
433 * Added parameter FORMATDB_VER_TEXT for backward compatibility.
434 *
435 * Revision 6.63  2000/07/18 19:29:29  shavirin
436 * Added new parameter test_non_unique to suppress check for non-unique
437 * strings ids in the database - default - TRUE.
438 *
439 * Revision 6.62  2000/07/07 21:20:08  vakatov
440 * Get all "#include" out of the 'extern "C" { }' scope!
441 *
442 * Revision 6.61  2000/06/28 16:55:50  madden
443 * Add function Fastacmd_Search_ex, gi_target to ReadDBFILEPtr
444 *
445 * Revision 6.60  2000/06/19 20:06:43  madden
446 * Add ready Boolean to readdb_get_sequence_ex, for nucl. sequence the data is then in blastna format with sentinel bytes
447 *
448 * Revision 6.59  2000/05/22 18:46:23  dondosha
449 * Merged all Boolean members in ReadDBFILE structure into a single Int4
450 *
451 * Revision 6.58  2000/05/09 15:54:20  shavirin
452 * Added function ReadDBBioseqSetDbGeneticCode().
453 *
454 * Revision 6.57  2000/05/03 16:18:34  dondosha
455 * Added prototype for FastaToBlastDB
456 *
457 * Revision 6.56  2000/03/13 18:36:38  madden
458 * Added insert_ctrlA Boolean to readdb_get_bioseq_ex
459 *
460 * Revision 6.55  2000/03/10 18:52:11  madden
461 * Add prototype for readdb_get_filebits
462 *
463 * Revision 6.54  2000/02/09 19:35:52  madden
464 * Added readdb_MakeGiFileBinary
465 *
466 * Revision 6.53  2000/01/12 21:03:52  egorov
467 * 1. Introduce Fastacmd API function - Fastacmd_Search
468 * 2. Rearrange order of functions to have Fastacmd, ID1, and CommonIndex stuff separate.
469 *
470 * Revision 6.52  2000/01/07 16:00:25  madden
471 * Alias db length is Int8 instead of Uint4
472 *
473 * Revision 6.51  2000/01/03 15:46:16  lewisg
474 * add prototype for readdb_get_num_entries_total_real
475 *
476 * Revision 6.50  1999/12/31 14:23:21  egorov
477 * Add support for using mixture of real and maks database with gi-list files:
478 * 1. Change logic of creating rdfp list.
479 * 2. BlastGetDbChunk gets real databases first, then masks.
480 * 3. Propoper calculation of database sizes using alias files.
481 * 4. Change to CommonIndex to support using of mask databases.
482 * 5. Use correct gis in formated output (BlastGetAllowedGis()).
483 * 6. Other small changes
484 *
485 * Revision 6.49  1999/12/22 20:34:34  dondosha
486 * Add full_filename and shared_info to ReadDBFile structure, plus prototypes of related routines
487 *
488 * Revision 6.48  1999/12/21 20:00:27  egorov
489 * Add new parameter into readdb_gi2seq()
490 *
491 * Revision 6.47  1999/12/17 21:33:01  egorov
492 * Add support for the 'month' subset.
493 *
494 * Revision 6.46  1999/12/15 17:34:32  egorov
495 * 1. Introduce MASK_WORD_SIZE constant variable.
496 * 2. Introduce DI_Record structure for fileld of DI index file.
497 * 3. Introduce UpdateIndexStruct which is used in callback for UpdateCommonIndexFile.
498 * 4. Add new field to ReadDbFile structure - aliasfilename, which used
499 *    while deciding which gi to use.
500 *
501 * Revision 6.45  1999/11/26 22:06:59  madden
502 * Added READDB_UNPACK_BASE_N macro
503 *
504 * Revision 6.44  1999/11/23 22:02:27  madden
505 * Added readdb_get_totals_ex that may use alias file values
506 *
507 * Revision 6.43  1999/11/23 21:51:24  madden
508 * Changes for freeing OIDlist
509 *
510 * Revision 6.42  1999/11/12 14:16:14  madden
511 * Allow other initialization states in readdb_new_ex2
512 *
513 * Revision 6.41  1999/09/24 18:59:16  egorov
514 * Add functions prototypes
515 *
516 * Revision 6.40  1999/09/23 15:02:53  egorov
517 * Use more descriptive name
518 *
519 * Revision 6.39  1999/09/22 21:50:57  egorov
520 * Add mask DB stuff
521 *
522 * Revision 6.38  1999/09/13 16:18:40  shavirin
523 * Added function readdb_get_bioseq_ex, which has possibility
524 * to bypass ObjMgr registration.
525 *
526 * Revision 6.37  1999/09/10 16:30:18  shavirin
527 * Fixed problems with formating proteins by formatdb
528 *
529 * Revision 6.36  1999/09/09 18:25:05  shavirin
530 * Added functions to parse ASN.1 with formatdb
531 *
532 * Revision 6.35  1999/08/25 20:17:39  shavirin
533 * Added option to create and retrieve from sparse indexes.
534 *
535 * Revision 6.34  1999/08/02 13:33:58  shavirin
536 * Rolled back last changes.
537 *
538 * Revision 6.32  1999/05/27 15:51:29  shavirin
539 * Added function readdb_get_defline ()
540 *
541 * Revision 6.31  1999/05/18 20:35:31  madden
542 * Changes to read an alias file for multiple db searches and ordinal ID lists
543 *
544 * Revision 6.30  1999/05/13 19:31:14  shavirin
545 * More changes toward dump from ID.
546 *
547 * Revision 6.29  1999/05/12 15:48:03  shavirin
548 * Changed parameter in function FDBAddSequence().
549 *
550 * Revision 6.28  1999/05/06 15:25:27  egorov
551 * Remove static function declaration
552 *
553 * Revision 6.27  1999/04/26 14:36:29  shavirin
554 * Added ability to dump statistics.
555 *
556 * Revision 6.26  1999/04/21 22:55:39  kans
557 * was not checked in
558 *
559 * Revision 6.25  1999/02/22 21:48:03  egorov
560 * Optimize GIs2OIDs not reinitializing ISAM indicies for non-exclisive databases, but use already initialized rdfp's field for that.
561 *
562 * Revision 6.24  1999/02/05 13:47:05  madden
563 * Add basename for formatdb
564 *
565 * Revision 6.23  1998/12/14 21:49:23  egorov
566 * new max gi number memeber in CommonIndexHead structure and therefore no need for COMMON_INDEX_TABLE_SIZE
567 *
568 * Revision 6.22  1998/12/14 16:05:36  egorov
569 * *** empty log message ***
570 *
571 * Revision 6.21  1998/09/14 15:11:19  egorov
572 * Add support for Int8 length databases; remove unused variables
573 *
574 * Revision 6.20  1998/08/27 15:02:37  madden
575 * Added LIBCALL for readdb_get_sequence_ex
576 *
577 * Revision 6.19  1998/08/24 14:59:57  madden
578 * readdb_get_sequence_ex function
579 *
580 * Revision 6.18  1998/08/11 17:49:48  madden
581 * is_na becomes is_aa
582 *
583 * Revision 6.17  1998/07/01 14:03:07  egorov
584 * Fix bug with a thread freeing CommonIndex: add new flag to rdfp
585 *
586 * Revision 6.16  1998/06/26 16:51:15  egorov
587 * Fix CommonIndex bugs
588 *
589 * Revision 6.15  1998/06/24 21:03:40  egorov
590 * Remove memory leaks
591 *
592 * Revision 6.12  1998/05/22 20:19:54  madden
593 * Changes to fix multi-db search bug
594 *
595 * Revision 6.11  1998/02/26 22:34:24  madden
596 * Changes for 16 bit windows
597 *
598 * Revision 6.10  1998/02/11 17:49:38  madden
599 * Added structures and prototypes for formatdb to take ASN.1 as input
600 *
601 * Revision 6.9  1998/01/16 22:03:00  madden
602 * Added init_indices Boolean
603 *
604 * Revision 6.8  1997/11/26 22:48:38  madden
605 * Added readdb_parse_db_names for multiple db searches
606 *
607 * Revision 6.7  1997/11/07 16:16:36  shavirin
608 * Added definition of new function readdb_acc2fastaEx()
609 *
610 * Revision 6.6  1997/10/24 19:08:16  madden
611 * Added ReadDBGetDb and ReadDBGetDbId
612 *
613 * Revision 6.5  1997/09/24 22:37:06  madden
614 * Added readdb_destruct_element
615 *
616 * Revision 6.4  1997/09/16 16:31:40  madden
617 * More changes for multiple db runs
618 *
619 * Revision 6.3  1997/09/12 19:55:38  madden
620 * Added readdb_compare
621 *
622 * Revision 6.2  1997/09/11 18:49:40  madden
623 * Changes to enable searches against multiple databases.
624 *
625 * Revision 6.1  1997/08/27 14:46:59  madden
626 * Changes to enable multiple DB searches
627 *
628 * Revision 6.0  1997/08/25 18:53:59  madden
629 * Revision changed to 6.0
630 *
631 * Revision 1.26  1997/05/12 21:34:05  madden
632 * readdb_new allows indeterminate database type
633 *
634 * Revision 1.25  1997/05/12 21:11:42  shavirin
635 * Added definition for function readdb_acc2fasta()
636 *
637 * Revision 1.23  1997/05/07 21:04:02  madden
638 * Added prototype for SeqId2OrdinalId and changed FORMATDB_VER 2->3
639 *
640 * Revision 1.22  1997/05/01 17:26:58  shavirin
641 * Added definition for the function readdb_seqid2fasta()
642 *
643  * Revision 1.21  1997/02/25  22:16:32  shavirin
644  * Changes in accordance to ISAM API changes
645  *
646  * Revision 1.20  1997/02/25  16:28:38  shavirin
647  * Added new entries in ReadDBFILEPtr structure to do search by gi
648  * number.
649  *
650  * Revision 1.19  1996/12/19  16:29:56  madden
651  * Changes to eliminate ".nac" file for nucl.
652  *
653  * Revision 1.18  1996/12/17  21:34:46  madden
654  * Changes to allow deflines for inidividual entries to be retrieved.
655  *
656  * Revision 1.17  1996/12/11  18:42:36  madden
657  * Added prototypes for BioseqFetch functions.
658  *
659  * Revision 1.16  1996/11/27  16:39:11  madden
660  * Added functions to return filename and date. FORMATDB_VER 1->2
661  *
662  * Revision 1.15  1996/11/26  19:54:27  madden
663  * Added check for database in standard places.
664  *
665  * Revision 1.14  1996/11/22  19:05:48  madden
666  * removed ifdef for OLD_BIT_ORDER.
667  *
668  * Revision 1.13  1996/11/08  21:45:03  madden
669  * Removed function readdb_get_partial_unpacked_sequence.
670  *
671  * Revision 1.12  1996/11/07  22:33:00  madden
672  * Added prototype for readdb_ambchar_present.
673  *
674  * Revision 1.11  1996/11/04  18:50:20  shavirin
675  * Added definitions for ambiguity information pointers
676  *
677  * Revision 1.10  1996/10/31  16:29:55  shavirin
678  * Changed definitions due to reverce of residues in BLAST database
679  * for nucleotide sequences from (4321) to (1234)
680  * New dumper now required to create BLAST databases.
681  *
682  * Revision 1.9  1996/09/27  19:12:17  madden
683  * Added function readdb_get_bioseq to obtain a BioseqPtr from the BLAST databases.
684  *
685  * Revision 1.8  1996/09/26  15:09:21  madden
686  * Corrected misplaced comment.
687  *
688  * Revision 1.7  1996/09/23  14:37:35  madden
689  * Replaced CharPtr (for sequence) with Uint1Ptr.
690  *
691  * Revision 1.6  1996/09/20  21:59:16  madden
692  * *** empty log message ***
693  *
694  * Revision 1.5  1996/09/13  20:01:52  madden
695  * defined READDB_COMPRESSION_RATIO
696  *
697  * Revision 1.4  1996/09/13  18:55:04  madden
698  * Added function readdb_get_partial_unpacked_sequence.
699  *
700  * Revision 1.3  1996/08/29  20:42:01  madden
701  * memory mapping moved to the corelib (in ncbimem.[ch]).
702  *
703  * Revision 1.2  1996/08/07  18:32:05  madden
704  * Moved define of MMAP_AVAIL from readdb.h to readdb.c
705  *
706  * Revision 1.1  1996/08/05  19:48:21  madden
707  * Initial revision
708  *
709  * Revision 1.12  1996/08/02  14:20:06  madden
710  * Added readdb_attach function.
711  *
712  * Revision 1.11  1996/07/31  13:09:17  madden
713  * Changes for partial copy of ReadDB structure.
714  *
715  * Revision 1.10  1996/07/25  20:45:20  madden
716  * Change to arguments of readdb_get_sequence.
717  *
718  * Revision 1.9  1996/07/25  12:56:15  madden
719  * readdb_get_sequence changed to allow for systems w/o mmap.
720  *
721  * Revision 1.8  1996/06/20  17:00:11  madden
722  * Added "__cplusplus" define.
723  *
724  * Revision 1.7  1996/06/20  16:16:36  madden
725  * Replaced int's with Int4's.
726  *
727  * Revision 1.6  1996/05/16  19:50:15  madden
728  * Added documentation block.
729  *
730  * Revision 1.5  1996/04/22  21:42:07  madden
731  * New prototype for readdb_get_sequence
732  *
733  * Revision 1.4  1996/04/11  14:30:06  madden
734  * Memory-mapping added.
735  *
736  * Revision 1.3  1996/03/29  21:28:30  madden
737  * Added function readdb_get_sequence_length.
738  *
739  * Revision 1.2  1996/03/28  20:42:36  madden
740  * Added functions readdb_get_title, readdb_is_prot and
741  * readdb_get_formatdb_version.
742  *
743  * Revision 1.1  1996/03/26  19:38:08  madden
744  * Initial revision
745  *
746  *
747 */
748 
749 #ifndef _READDB_
750 #define _READDB_
751 
752 
753 /****************************************************************************/
754 /* INCLUDES */
755 /****************************************************************************/
756 
757 #include <ncbi.h>
758 #include <objloc.h>
759 #include <sequtil.h>
760 #include <ncbisam.h>
761 #include <tofasta.h>
762 #include <txalign.h>
763 
764 /* This define should be added here to pacify NT build */
765 #ifndef NLM_GENERATED_CODE_PROTO
766 #define NLM_GENERATED_CODE_PROTO
767 #endif
768 
769 
770 #include <fdlobj.h>
771 
772 #ifdef __cplusplus
773 extern "C" {
774 #endif
775 
776 /****************************************************************************/
777 /* Structure of index file header - old version */
778 /****************************************************************************/
779 
780 /*
781    4 bytes  4 bytes      4 bytes    title_len bytes      n bytes
782   <version><is_protein?><title_len><the_database_title><date_stamp>
783 
784 (title_len+n)%8 bytes
785   <ex_bytes><num_of_seqs><total_len><max_seq_len>
786 
787   num_of_seqs*4bytes   num_of_seqs*4bytes     num_of_seqs*4bytes
788 <defline_offset_table><sequence_offset_table><ambig_offset_table>
789 */
790 
791 /****************************************************************************/
792 /* DEFINES */
793 /****************************************************************************/
794 
795 /* Defines used to retrieve one 2-bit base out of a packed byte (four bases per byte). */
796 /* x should be unsigned (Uint1) to avoid sign extension problems; READDB_UNPACK_BASE_1 applies no mask, so it also relies on x being no wider than 8 bits. */
797 /* BASE_1 reads the two most significant bits of the byte and BASE_4 the two least significant.  In READDB_UNPACK_BASE_N, N counts 2-bit fields from the least significant end: N=0 selects the same bits as BASE_4 and N=3 the same bits as BASE_1. */
798 #define READDB_UNPACK_BASE_1(x) ((x)>>6)
799 #define READDB_UNPACK_BASE_2(x) (((x)>>4) & 0x03)
800 #define READDB_UNPACK_BASE_3(x) (((x)>>2) & 0x03)
801 #define READDB_UNPACK_BASE_4(x) ((x) & 0x03)
802 #define READDB_UNPACK_BASE_N(x, N) (((x)>>(2*(N))) & 0x03)
803 
804 /* Packing ratio: four bases (one byte each when unpacked) are compressed into one byte. */
805 #define READDB_COMPRESSION_RATIO 4
806 
807 /* Character (Ctrl-A, octal 001) used to separate deflines from different entries that all
808 belong to the same sequence. */
809 #define READDB_DEF_SEPARATOR '\001'
810 
811 /* Choices for whether it's a protein db or not; READDB_DB_UNKNOWN is presumably for a database whose type is not known in advance (revision 1.26 mentions an indeterminate database type) -- TODO confirm. */
812 #define READDB_DB_IS_NUC 0
813 #define READDB_DB_IS_PROT 1
814 #define READDB_DB_UNKNOWN 2
815 /* Bit flags: each value below is a distinct single bit, so they can be OR'ed together.  Presumably these are the Boolean members of the ReadDBFILE structure that revision 6.59 merged into a single Int4 -- TODO confirm against the structure definition. */
816 #define READDB_CONTENTS_ALLOCATED  0x00000001
817 #define READDB_IS_PROT             0x00000002
818 #define READDB_HANDLE_COMMON_INDEX 0x00000004
819 #define READDB_NOT_FIRST_TIME      0x00000008
820 #define READDB_NO_SEQ_FILE         0x00000010
821 #define READDB_KEEP_HDR_AND_SEQ    0x00000020
822 
823 /*** Choices for how much to initialize on startup in readdb_new_internal.  Each choice is a distinct bit of a Uint1. ***/
824 
825 /* Attempt to memory map all files. */
826 #define READDB_NEW_DO_ALL               ((Uint1) (1<<0))
827 /* Only open the nin or pin files for a database report. */
828 #define READDB_NEW_DO_REPORT            ((Uint1) (1<<1))
829 /* Only open the nin (or pin) and nsq (or psq) files for a search. */
830 #define READDB_NEW_DO_SEARCH            ((Uint1) (1<<2))
831 /* Open only index (nin or pin) files for memory mapping. */
832 #define READDB_NEW_INDEX                ((Uint1) (1<<3))
833 /* Same as above (presumably READDB_NEW_INDEX -- TODO confirm) and memory map blast taxonomy db files. */
834 #define READDB_NEW_DO_TAXDB             ((Uint1) (1<<4))
835 
836 /* The following defines are shared by formatdb and readdb. */
837 /* Version of formatdb.
838 
839    Explanation: the last version that stored deflines as text in the blast
840    database was 3 - all subsequent versions use ASN.1 for defline storage.
841    For backward compatibility, a database of version 3 is still handled OK
842    by a newer program. If the database version is > 3, an exact match of
843    the version is needed to proceed.
844 
845 */
846 
847 #define FORMATDB_VER_TEXT 3
848 #define FORMATDB_VER      4
849 
850 /* 'Magic' number at the beginning of a binary gi list that indicates it is binary. */
851 #define READDB_MAGIC_NUMBER UINT4_MAX
852 
/* Maximum volume size, in bytes */
#define SEQFILE_SIZE_MAX 4000000000UL

/* Default volume size; 4*10^9 bases, or 1*10^9 residues.
   Both figures come to 10^9 when counted in bytes: bases are packed
   READDB_COMPRESSION_RATIO (4) per byte, residues one per byte.  The default
   must therefore be 10^9, not the 4*10^9 maximum. */
#define SEQFILE_SIZE_DFL 1000000000UL
858 
859 /****************************************************************************/
860 /* TYPEDEFS */
861 /****************************************************************************/
862 
863 typedef struct nlm_mfile {
864 	Nlm_MemMapPtr mem_mapp;	/* structure containing mem-map info,
865 				produced by Nlm_MemMapInit. */
866 	FILE PNTR fp;		/* FILE pointer. */
867 	Uint1Ptr  mmp_begin,	/* beginning of mmap'ed are. */
868 		  mmp,		/* present position of mmap'ed pointer. */
869 		  mmp_end;	/* end of mmap'ed area. */
870 	Int4	  file_size;	/* size of file that is mmap'ed. */
871 	Boolean   mfile_true;	/* If TRUE then mmap succeeded. */
872 	Boolean   contents_allocated; /* If TRUE, the contents have been allocated
873 					and are not merely a copy. */
874 	Uint1Ptr mmp_madvise_end; /* madvise() file offset */
875 } NlmMFILE, PNTR NlmMFILEPtr;
876 
877 /*
878 Open the file and initialze the memory mapping.
879 */
880 NlmMFILEPtr LIBCALL NlmOpenMFILE PROTO((CharPtr name));
881 
882 /*
883 Undo the memory mapping.
884 */
885 NlmMFILEPtr LIBCALL NlmCloseMFILE PROTO((NlmMFILEPtr mfp));
886 
887 /*
888 Read "nitems" of size "size" from a memory mapped file into "buffer"
889 usig the memory-mapped file given by "mfp".
890 */
891 Int4 LIBCALL NlmReadMFILE PROTO((Uint1Ptr buffer, size_t size, Int4 nitems, NlmMFILEPtr mfp));
892 
893 /*
894 	"fseek" to a point in the memory mapped file.
895 */
896 Int4 LIBCALL NlmSeekInMFILE PROTO((NlmMFILEPtr mfp, long offset, Int4 ptrname));
897 
898 /*
899         What is the offset (in bytes) to the beginning of the file.
900         Analog to ftell.
901 */
902 Int4 LIBCALL NlmTellMFILE PROTO((NlmMFILEPtr mfp));
903 
904 /* Generic 4-byte integer list */
905 typedef struct _gilist {
906     Int4    count, allocated;
907     Int4Ptr i;
908 } Int4List, *Int4ListPtr;
909 
910 /* Creates a new list of 4-byte integers */
911 Int4ListPtr LIBCALL
912 Int4ListNew PROTO((void));
913 
914 /* Creates a new list of 4-byte integers of size s */
915 Int4ListPtr LIBCALL
916 Int4ListNewEx PROTO((Int4 s));
917 
918 /* Deallocates the list of 4-byte integers */
919 Int4ListPtr LIBCALL
920 Int4ListFree PROTO((Int4ListPtr lp));
921 
922 /* Reads a list of newline separated 4-byte integers.
923  * Caller is responsible for deallocating the return value */
924 Int4ListPtr LIBCALL
925 Int4ListReadFromFile PROTO((CharPtr filename));
926 
927 /* Appends i to the end of the list, reallocating memory if necessary. Returns
928  * FALSE if it cannot allocate more memory */
929 Boolean LIBCALL
930 Int4ListAdd PROTO((Int4ListPtr lp, Int4 i));
931 
932 /* Returns the concatenation of list1 and list2, freeing both parameters. It
933  * returns NULL if both lists are empty and if it cannot allocate more memory */
934 Int4ListPtr LIBCALL
935 Int4ListConcat PROTO((Int4ListPtr *list1, Int4ListPtr *list2));
936 
937 /* Attempts to reallocate new_size elements to the list. Returns NULL on
938  * incorrect arguments or if it cannot allocate more memory */
939 Int4ListPtr LIBCALL
940 Int4ListResize PROTO((Int4ListPtr listp, Int4 new_size));
941 
942 /* Performs a binary search for key on lp.
943    Returns the index into lp->i where key is located or -1 if key is not found
944  */
945 Int4 LIBCALL
946 Int4ListBSearch PROTO((Int4ListPtr lp, Int4 key));
947 
948 /* Ascendingly sorts the list and removes repeated entries */
949 Int4ListPtr LIBCALL
950 Int4ListMakeUnique PROTO((Int4ListPtr list));
951 
952 /* Returns the ascending sorted intersection of list1 and list2, freeing the
953  * both parameters */
954 Int4ListPtr LIBCALL
955 Int4ListIntersect PROTO((Int4ListPtr *list1, Int4ListPtr *list2));
956 
957 
/*
	Common index structures.
	The two macros below are the file names used for the common index
	(COMMONINDEX_FN) and the database configuration list (DB_CONFIG_FN).
 */

#define COMMONINDEX_FN	"comindex.mm"
#define DB_CONFIG_FN	"dblist.txt"
964 
965 typedef struct  CommonIndex{
966     Int4        dbmask; 	/* mask to define which db contains the GI */
967     Int4        oftenOID;       /* ordinal ID for the GI in most often DB */
968 } CommonIndex, *CommonIndexPtr;
969 
970 typedef	struct	CommonIndexResult {
971     Int4	gi;	/* GI */
972     Int4	oid;	/* OID */
973     Int2	dbid;	/* database ID */
974     struct CommonIndexResult *next;	/* make a list */
975 } CommonIndexResult, *CommonIndexResultPtr;
976 
977 /* Data bases */
978 
979 typedef struct	DataBaseID {
980     CharPtr	name;	/* database name like gss, nr, etc */
981     Char	id;	/* integer ID, value from 0, to 32, used for bitmasks */
982     Boolean	isprot;	/* says TRUE if database contains proteins, FALSE otherwise */
983 } DataBaseID, *DataBaseIDPtr;
984 
985 typedef struct	CommonIndexHead {
986     CommonIndexPtr	ci;
987     Nlm_MemMapPtr	memmap;
988     Int2		num_of_DBs;
989     DataBaseIDPtr	dbids;
990     Int4		maxgi; /* maximum GI number permitted */
991 } CommonIndexHead, *CommonIndexHeadPtr;
992 
993 typedef	struct	OIDList {
994     CharPtr	filename;	/* name of the file containing OID list */
995     Uint4Ptr	list;		/* array of OID's */
996     Uint4Ptr	memory;		/* memory to keep the OID's (element list).
997 				if this is NULL, then list is memory mapped. */
998     Int4	total;		/* number of elements in the array */
999     NlmMFILEPtr mfp;		/* Used for memory-mapped file. */
1000 } OIDList, *OIDListPtr;
1001 
1002 OIDListPtr OIDListFree (OIDListPtr oidlist);
1003 
1004 typedef struct read_db_shared_info {
1005    Int2 nthreads;
1006    NlmMFILEPtr headerfp, sequencefp;
1007 
1008    /* This is the ordinal id of the last chunk assigned to a thread when
1009     * iterating over a database via the BlastSeqSrc interface with multiple
1010     * threads. It should not be used in other contexts. It is analogous to
1011     * the db_chunk_last field of the BlastThrInfo structure.
1012     * Please note that in case of a linked list of ReadDBFILE structures, only
1013     * the first shared_info->last_oid_assigned field is significant when
1014     * performing an iteration with multiple threads.
1015     */
1016    Uint4 last_oid_assigned;
1017 } ReadDBSharedInfo, *ReadDBSharedInfoPtr;
1018 
1019 /* ---------------------------------------------------------------------*/
1020 /* -- Here is set of definitions used with taxonomy info database ----- */
1021 /* ---------------------------------------------------------------------*/
1022 
1023 /* The following #define allows for the creation of taxonomy databases along
1024  * with the blast databases. Please note that the code to create the blast
1025  * databases is NOT thread-safe! */
1026 /*#define FDB_TAXONOMYDB*/
1027 
1028 typedef	struct _RDBTaxId {
1029     Uint4 taxid;
1030     Uint4 offset;
1031 } RDBTaxId, PNTR RDBTaxIdPtr;
1032 
1033 typedef	struct _RDBTaxInfo {
1034     Int4        all_taxid_count; /* Total number of taxids in the database */
1035     Int4        reserved[4];     /* reserved */
1036     NlmMFILEPtr taxfp;           /* Memory mapped index file */
1037     RDBTaxIdPtr taxdata;         /* Index tax_id/file offset */
1038     Boolean     taxdata_alloc;   /* true if taxdata was allocated */
1039     NlmMFILEPtr name_fd;         /* Pointer to the file with taxonomy names */
1040     Boolean     taxinfo_alloc;   /* Flag to determine structure ptr ownership */
1041 } RDBTaxInfo, *RDBTaxInfoPtr;
1042 
1043 typedef	struct _RDBTaxLookup {
1044     Int4 all_taxid_count; /* Total number of taxids in the database */
1045     Int4 taxids_in_db;
1046     RDBTaxNamesPtr *tax_array; /* This array's index correspond to tax_id and
1047                                   value of the cell corresponds to tax names
1048                                   if  any */
1049     VoidPtr tax_data;      /* This data may be set and used by the callback */
1050 } RDBTaxLookup, *RDBTaxLookupPtr;
1051 
1052 typedef Boolean (*TaxCallbackFunc) (RDBTaxLookupPtr tax_lookup, Int4 tax_id);
1053 
1054 
1055 /*
1056  * sequence info record (SI_Record):
1057  * > contains information about given gi
1058  * > most of it will be dumped to *[np]di files
1059  * > form a linked list for identical gis
1060  * > used for transferring data into AddSequence interface
1061  *
1062  * Contribution from Michael Kimelman/Olga Cherenkov from
1063  * NCBI's ID1 group.
1064  */
1065 
1066 typedef struct si_record {
1067     struct  si_record PNTR next;
1068     Int4    gi;
1069     char    seqid[256]; /* seqid in FASTA format */
1070     char*   title;      /* defline */
1071     Int4    taxid;
1072     Int4    owner;
1073     char    div[4];
1074     Int4    ent;  /* entity (sat_key) */
1075     Uint1   mol;  /* Molecule type, as in Seq-inst::mol */
1076 } SI_Record, PNTR SI_RecordPtr;
1077 
1078 /** Allocates a single node in the SI_Record linked list structure */
1079 SI_Record* SI_RecordNew(void);
1080 /** Deallocates the linked list of SI_Record structures in srp
1081  * @return NULL
1082  */
1083 SI_Record* SI_RecordFree(SI_Record* srp);
1084 
/*    ----
      Here are functions for run-time blast in relation to the
      Taxonomy blast database
      ----  */

/* FTP location of the taxonomy database archive, and the base file name of
   the blast taxonomy database. */
#define TAXDB_ON_FTP "ftp://ftp.ncbi.nih.gov/blast/db/taxdb.tar.gz"
#define BLAST_TAXDB_FILENAME "taxdb"

/* Initialize the taxonomy lookup database. Returns NULL on failure or if
   this database does not exist. */
RDBTaxInfoPtr  RDBTaxInfoInit(void);

/* Free memory, unmap files etc. related to the taxonomy database */
void RDBTaxInfoClose(RDBTaxInfoPtr tip);

/* Main function to get the taxonomy names for a given tax_id from the
   blast taxonomy database. Returns NULL if tax_id is not in the database */
RDBTaxNamesPtr RDBGetTaxNames(RDBTaxInfoPtr tip, Int4 tax_id);

/* Magic number associated with the taxonomy database
   (NOTE(review): presumably stored at the start of its index file - confirm). */
#define TAX_DB_MAGIC_NUMBER 0x8739
1105 
1106 typedef struct read_db_file {
1107 	struct read_db_file PNTR next;
1108         Int4 parameters; /* All boolean parameters */
1109    /* Bits: 0 - contents allocated
1110             1 - is protein
1111 	    2 - handle common index
1112 	    3 - not first time
1113 	    4 - do not open sequence files
1114 	    5 - do not close header and sequence files in readdb_get_link
1115    */
1116    /* 0: Are contents of this struct allocated, or not?  Does NOT include
1117       the actual structure and buffer, below. */
1118    /* 1: If TRUE, sequence is protein, otherwise dna. */
1119    /* 2: TRUE only for the initial thread;  needed for proper freeing of the CommonIndex */
1120    /* 3: For recursive calls to readdb_new_ex2. */
1121 	CharPtr filename;	/* name of the input (w/o extensions). */
1122 	CharPtr aliasfilename;	/* name of the alias of input */
1123 /* The files pointers for "file" (above), the index file, the file
1124 containing the headers, and the sequence file. */
1125         NlmMFILEPtr indexfp, headerfp, sequencefp;
1126 	Int4	header_index_offset;	/* offset to beginning of header index in indexfp. */
1127 	CharPtr title,	/* Database Title. */
1128 		date;	/* Date and time database was prepared. */
1129 	Int4 num_seqs, /* Number of sequences in the database. */
1130 	      formatdb_ver;	/* Version of formatdb used. */
1131 	BlastDefLinePtr blast_deflinep;  /* when not NULL, points to the first defline of the seq*/
1132 	Int4 	start,	/* 1st ordinal id in this file. */
1133 		stop;	/* last ordinal id in this file. */
1134 	Int8 totlen;	/* Total length of database. */
1135 	Int8 totlen_stats; /* Total length of database used for expect value and search space. */
1136 	Uint4 maxlen;	/* Length of longest sequence in database. */
1137 	Int8 aliaslen;	/* Length of the database as read from alias file */
1138 	Uint4 aliasnseq;/* Number of seqs of the database as read from alias file */
1139 	Uint4 nseq_stats; /* Number of seqs to be used for search space and expect value. */
1140 /* The "index" arrays specify the offsets (in files) of the header and
1141 sequence information. */
1142 	Uint4Ptr header_index,	sequence_index, ambchar_index;
1143 	Uint4Ptr header_index_start,	sequence_index_start, ambchar_index_start;
1144 /* Buffer and allocated amount of this buffer.  These should always be
1145 NULL (i.e., NOT USED) if mem-mapping is used; only used to store sequence
1146 if there is no mem-mapping or it failed. */
1147 
1148     ISAMObjectPtr nisam_opt;  /* Object for numeric search */
1149     ISAMObjectPtr sisam_opt;  /* Object for string search */
1150     ISAMObjectPtr isam_pig;   /* Object for PIG search */
1151     RDBTaxInfoPtr taxinfo;    /* This object if not NULL - pointer to
1152                                      the taxonomy names database */
1153 	Uint1Ptr buffer;
1154 	Int4 allocated_length;
1155 	CommonIndexHeadPtr  cih;       /* head of the common index */
1156 	Int2	            filebit;   /* bit corresponding to the DB file */
1157 	Int2		    aliasfilebit;/* bit corresponding to the DB alias file */
1158 	OIDListPtr	    oidlist;   /* structure containing a list of ordinal ID's. */
1159     Int4            membership_bit; /* membership bit read from .[pn]al file for structured asn deflines */
1160 	Int4		    sparse_idx;/* Sparse indexes indicator */
1161         Char                full_filename[PATH_MAX]; /* Full path for the file */
1162         ReadDBSharedInfoPtr shared_info;
1163 	Int4 	            gi_target; /* only this gi should be retrieved */
1164                                        /* if non-zero. */
1165     CharPtr             gifile;   /* Path to a file with the gi list, should
1166 									 always be NULL after readdb_new* calls */
1167     Int4ListPtr     gilist; 	  /* storage for the above file in memory */
1168     Int4		    preferred_gi; /* this gi should be listed first */
1169                                   /* in the bioseq if non-zero */
1170 	Int4    last_preloaded; /* starting ordinal id of the last preloaded file block */
1171 } ReadDBFILE, PNTR ReadDBFILEPtr;
1172 
1173 /* Function prototypes */
1174 Int4    GI2OID(CommonIndexHeadPtr cih, Int4 gi, Int4 dbmask, Int4 alias_dbmask,
1175 	Int2Ptr dbid, Int2Ptr alias_dbid, ReadDBFILEPtr rdfp);
1176 Int2	DBShift(Int2 num_of_DBs, DataBaseIDPtr dbids, CharPtr dbname, Boolean is_prot);
1177 CharPtr	DBName(Int2 num_of_DBs, DataBaseIDPtr dbids, Int2 shift);
1178 Boolean	DBisProt(Int2 num_of_DBs, DataBaseIDPtr dbids, Int2 shift);
1179 CommonIndexResultPtr	GIs2OIDs(CommonIndexHeadPtr cih,
1180 			Int4Ptr gis, Int4 number_of_gis, Int4 dbshift, ReadDBFILEPtr rdfp);
1181 Int2	SeniorBit(Int4	bitmask);
1182 CommonIndexHeadPtr	CommonIndexInit(CharPtr indexfilename);
1183 void	CommonIndexDestruct(CommonIndexHeadPtr cihp);
1184 Int2	bit_engine_firstbit (Int4 word);
1185 Int2Ptr	bit_engine_arr(Int4 word);
1186 Int2	bit_engine_numofbits(Int4 word);
1187 Int2	ParseDBConfigFile(DataBaseIDPtr *dbidsp, CharPtr path);
1188 CharPtr	FindBlastDBFile (CharPtr filename);
1189 CharPtr	FindDBbyGI(CommonIndexHeadPtr cih, Int4 gi, Uint1 *is_prot);
1190 RDBTaxNamesPtr LIBCALL readdb_get_taxnames PROTO((
1191             ReadDBFILEPtr rdfp, Int4 tax_id));
1192 
1193 /* mmap's */
1194 
1195 NLM_EXTERN Nlm_MemMapPtr EA_MemMapInit(const Nlm_Char PNTR name, Boolean readonly);
1196 
1197 /****************************************************************************/
/* FUNCTION DEFINITIONS */
1199 /****************************************************************************/
1200 /* Deallocate the memory mapping of header and sequence files */
1201 ReadDBFILEPtr ReadDBCloseMHdrAndSeqFiles PROTO((ReadDBFILEPtr rdfp));
1202 
1203 /*
1204 Intitialize the readdb structure using the database "filename".
1205 If no database is used, set filename to NULL.
1206 */
1207 ReadDBFILEPtr LIBCALL readdb_new PROTO((CharPtr filename, Uint1 is_prot));
1208 
1209 /*
1210 	init_indices should be TRUE if entire database is to be searched, otherwise
1211 	it can be FALSE.
1212 */
1213 ReadDBFILEPtr LIBCALL readdb_new_ex PROTO((CharPtr filename, Uint1 is_prot, Boolean init_indices));
1214 
1215 /*
1216  * Initializes the blast database specified in the argument list.
1217  * filename: blast database to initialize
1218  * is_prot: is this database protein ?
1219  * init_state: bitwise-OR of the READDB_NEW_* values (selectively mmap certain
1220  *             files)
1221  * oidlist: Path to the ordinal id list to use (this is mmap'd)
1222  * gilist: Path to the gi list to use (this is not resolved until the search
1223  *         is conducted (see BlastProcessGiLists)
1224 */
1225 ReadDBFILEPtr LIBCALL readdb_new_ex2 PROTO((CharPtr filename, Uint1 is_prot,
1226             Uint1 init_state, CharPtr oidlist, CharPtr gilist));
1227 
1228 
1229 /*
1230 Deallocate the ReadDBFILEPtr.
1231 */
1232 ReadDBFILEPtr LIBCALL readdb_destruct PROTO((ReadDBFILEPtr readdb));
1233 
1234 ReadDBFILEPtr LIBCALL readdb_destruct_element PROTO((ReadDBFILEPtr rdfp));
1235 
1236 
1237 /*
1238         Attach to an already open ReadDBFILEPtr.  Duplicate the
1239         indexfp, sequencefp, and headerfp structures as the pointers
1240         there (i.e., mmp) will need to be manipulated.  Do not
1241         change the FILE PNTR fp.
1242 */
1243 ReadDBFILEPtr LIBCALL readdb_attach PROTO((ReadDBFILEPtr rdfp));
1244 
1245 /*
1246         Checks whether a ReadDBFILEPtr is the original, or just attaced.
1247         It does this by checking the rdfp->contents_allocated flag.
1248 */
1249 Boolean LIBCALL readdb_copy PROTO((ReadDBFILEPtr rdfp));
1250 
1251 /*
1252 	Checks two ReadDBFILEPtr to see if they refer to the same
1253 	database.
1254 */
1255 Boolean LIBCALL readdb_compare PROTO((ReadDBFILEPtr rdfp1, ReadDBFILEPtr rdfp2));
1256 
1257 
1258 /*
1259         Get total length and number of sequences in multiple databases.
1260 */
1261 
1262 Boolean LIBCALL readdb_get_totals PROTO((ReadDBFILEPtr rdfp_list, Int8Ptr total_len, Int4Ptr total_num));
1263 
1264 /*
1265         Get total length and number of sequences in multiple databases.
1266         if 'use_alias' is TRUE, values from the alias file will be used
1267         if non-zero.
1268 */
1269 
1270 Boolean LIBCALL
1271 readdb_get_totals_ex PROTO((ReadDBFILEPtr rdfp_list, Int8Ptr total_len, Int4Ptr total_num, Boolean use_alias));
1272 
1273 /* retrieves the total number of sequences and database length in the
1274  * rdfp_list. use_alias and use_virtual_oidlist are mutually exclusive
1275  * options (both of them cannot be true at the same time). If
1276  * use_virtual_oidlist is TRUE, this function assumes that this rdfp_list has
1277  * been processed by BlastProcessGiLists */
1278 Boolean LIBCALL
1279 readdb_get_totals_ex2 PROTO ((ReadDBFILEPtr rdfp_list, Int8Ptr dblen,
1280         Int4Ptr nseq, Boolean use_alias, Boolean use_virtual_oidlist));
1281 
/* Selects whether readdb_get_totals_ex3 calculates the database length
 * (number of bases/residues) exactly or only approximates it. */
typedef enum {
    eExact = 0,
    eApproximate = 1
} EAccountingMode;
1289 
1290 /* This function is identical to readdb_get_totals_ex2 but it uses its last
1291  * argument to determine if in the case of nucleotide databases the exact
1292  * database length is required. If eExact is used, the exact database size is
1293  * calculated, if eApproximate is used, an approximation is returned. This is
1294  * done to avoid having to touch every last byte of each sequence to determine
1295  * the exact length of the database when it is restricted by a virtual oidlist.
1296  * The EAccountingMode argument is irrelevant for protein databases, where this
1297  * function always return the exact database length. Same assumption about
1298  * BlastProcessGiLists as in readdb_get_totals_ex2 applies.
1299  */
1300 Boolean LIBCALL
1301 readdb_get_totals_ex3 PROTO ((ReadDBFILEPtr rdfp_list, Int8Ptr dblen,
1302         Int4Ptr nseq, Boolean use_alias, Boolean use_virtual_oidlist,
1303         EAccountingMode acc_mode));
1304 
1305 /*
1306         Gets the number to be used for statistical purposes.  Should be set in
1307         alias file as STATS_NSEQ and STATS_TOTLEN.
1308 */
1309 Boolean LIBCALL
1310 readdb_get_stats_numbers(ReadDBFILEPtr rdfp_list, Int4* num_seqs_stats, Int8* tot_len_stats);
1311 
1312 /*
1313 Get the sequence with sequence_number and put it in buffer.  No memory
1314 is allocated for this if memory-mapped files are used, otherwise it is.
1315 Return the length of the sequence.
1316 */
1317 Int4 LIBCALL readdb_get_sequence PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number, Uint1Ptr PNTR buffer));
1318 
1319 /*
1320 	Gets the sequence number "sequence_number".  The sequence returned includes
1321 	all ambiguity information.  THis funciton should only be used for nucleic
1322 	acid sequences, for proteins use readdb_get_sequence.
1323 
1324 	buffer contains the sequence and is reallocated if *buffer_length is not long enough.
1325 
1326 	The length of the sequence requested is the return value.
1327 	protein sequences are always returned as Seq_code_ncbistdaa,
1328 	nucleotide sequences as Seq_code_ncbi4na.
1329 */
1330 Int4 LIBCALL readdb_get_sequence_ex PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number, Uint1Ptr PNTR buffer, Int4 *buffer_length, Boolean ready));
1331 
1332 /* Gets sequence number by gi number. Returnes -1 if gi not found or
1333    other negative value if NISAM library faults. Non-negative value
1334    means success. Use numeric ISAM indexes.
1335 */
1336 Int4 LIBCALL readdb_gi2seq(ReadDBFILEPtr rdfp, Int4 gi, Int4Ptr start);
1337 
1338 /* Gets sequence number by SeqId number. Returnes -1 if gi not found or
1339    other negative value if SISAM library faults. Non-negative value
1340    means success. Use string ISAM indexes.
1341 */
1342 Int4 LIBCALL readdb_seqid2fasta(ReadDBFILEPtr rdfp, SeqIdPtr sip);
1343 
1344 /* Gets sequence number by Accession/Locus string. Returnes -1
1345    if accession not found or
1346    other negative value if SISAM library faults. Non-negative value
1347    means success. Use string ISAM indexes.
1348 */
1349 Int4 LIBCALL readdb_acc2fasta(ReadDBFILEPtr rdfp, CharPtr string);
1350 
1351 /* Gets array of sequence numbers by Accession/Locus string. Returnes -1
1352    if accession not found or
1353    other negative value if SISAM library faults. Non-negative value
1354    means success. Use string ISAM indexes.
1355 */
1356 Int4 LIBCALL readdb_acc2fastaEx(ReadDBFILEPtr rdfp, CharPtr string,
1357                                 Int4Ptr PNTR ids, Int4Ptr count);
1358 
1359 /*
1360 Gets a BioseqPtr containing the sequence in sequence_number.
1361 */
1362 BioseqPtr LIBCALL readdb_get_bioseq PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number));
1363 BioseqPtr LIBCALL readdb_get_bioseq_ex PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number, Boolean use_objmgr, Boolean insert_ctrlA));
1364 
1365 /*
1366    Gets the exact sequence length for protein sequences, but for nucleotide
1367    sequences it gets the length of the sequence +/- at most 3 bases (last byte
1368    is not examined, therefore the return value is an approximation).
1369  */
1370 Int4 LIBCALL readdb_get_sequence_length_approx PROTO((ReadDBFILEPtr rdfp,
1371                                                        Int4 sequence_number));
1372 
1373 /*
1374 Get the length of the sequence.
1375 */
1376 Int4 LIBCALL readdb_get_sequence_length PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number));
1377 
1378 /*
1379 Get the ID and definition for the sequence with sequence_number.
1380 It is the caller's RESPONSIBILITY to DEALLOCATE "id" and "description".
1381 */
1382 Boolean LIBCALL readdb_get_descriptor PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number, SeqIdPtr PNTR id, CharPtr PNTR description));
1383 Boolean
1384 readdb_get_defline (ReadDBFILEPtr rdfp, Int4 sequence_number, CharPtr PNTR description);
1385 
1386 /*
1387 Get the ID's and headers for a sequence.
1388 */
1389 Boolean LIBCALL
1390 readdb_get_header PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number, Uint4Ptr header_index , SeqIdPtr PNTR id, CharPtr PNTR description));
1391 
1392 /*
1393 Get the ID's, headers, taxid, memberships, and links for a sequence.
1394 Returns FALSE if the sequence_number is not applicable in the context of the
1395 database in rdfp (i.e.: masked databases), otherwise it will return TRUE until
1396 there are sequences associated with this sequence_number (then it returns
1397 FALSE).
1398 */
1399 Boolean LIBCALL
1400 readdb_get_header_ex PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number,
1401                            Uint4Ptr header_index, SeqIdPtr PNTR id,
1402                            CharPtr PNTR description, Int4 PNTR taxid,
1403                            ValNodePtr PNTR memberships, ValNodePtr PNTR links));
1404 
1405 /*
1406  Get the Int4Ptr to ambiguity buffer
1407 */
1408 Boolean  LIBCALL readdb_get_ambchar PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number, Uint4Ptr PNTR ambchar_return));
1409 
1410 /*
1411 	Check whether ambiguity characters are present in the sequence.
1412 */
1413 Boolean LIBCALL readdb_ambchar_present PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number));
1414 
1415 /*
1416 Get the total length (in bp or residues) of the database.
1417 */
1418 Int8 LIBCALL readdb_get_dblen PROTO((ReadDBFILEPtr rdfp));
1419 
1420 /*
1421 Get the number of entries in the database.
1422 */
1423 Int4 LIBCALL readdb_get_num_entries PROTO((ReadDBFILEPtr rdfp));
1424 
1425 /*
1426 Get the total number of entries in all the files.
1427 */
1428 Int4 LIBCALL readdb_get_num_entries_total PROTO((ReadDBFILEPtr rdfp));
1429 
1430 /*
1431 Obtains the total number of real database sequences from all the ReadDBFILE structures.
1432 */
1433 
1434 Int4 LIBCALL readdb_get_num_entries_total_real PROTO((ReadDBFILEPtr rdfp));
1435 
1436 /* Check whether an OID is actually in the database according to the mask file. */
1437 Boolean readdb_check_oid(ReadDBFILEPtr rdfp_head, Int4 oid);
1438 
1439 /*
1440 Get the length of the longest sequence in the database.
1441 */
1442 Int4 LIBCALL readdb_get_maxlen PROTO((ReadDBFILEPtr rdfp));
1443 
1444 /*
1445 Get the title (i.e., name) of the database.
1446 NOTE: the CharPtr returned is not owned by the caller!
1447 */
1448 CharPtr LIBCALL readdb_get_title PROTO((ReadDBFILEPtr rdfp));
1449 
1450 /*
1451 Get the name of the file used for formatting.
1452 NOTE: the CharPtr returned is not owned by the caller!
1453 */
1454 CharPtr LIBCALL readdb_get_filename PROTO((ReadDBFILEPtr rdfp));
1455 
1456 /* For use by the seq-src to get the alias file name with full path. */
1457 CharPtr LIBCALL readdb_get_full_filename PROTO((ReadDBFILEPtr rdfp));
1458 
1459 /*
1460 Get the date the database was formatted.
1461 NOTE: the CharPtr returned is not owned by the caller!
1462 */
1463 CharPtr LIBCALL readdb_get_date PROTO((ReadDBFILEPtr rdfp));
1464 
1465 /*
1466 Is this a protein database?
1467 */
1468 Boolean LIBCALL readdb_is_prot PROTO((ReadDBFILEPtr rdfp));
1469 
1470 /*
1471         Parses the databases names (if more than one) from
1472         'filenames' into buffer.  buffer should already be
1473         long enough and allocated.  The funciton should be
1474         repeatedly called until TRUE is returned.
1475 */
1476 Boolean LIBCALL readdb_parse_db_names PROTO((CharPtr PNTR filenames, CharPtr buffer));
1477 
1478 /*
1479 Get the version of formatdb used on this database.
1480 */
1481 Int4 LIBCALL readdb_get_formatdb_version PROTO((ReadDBFILEPtr rdfp));
1482 
1483 /*
1484 	returns the 'filebits' associated with a certain ordinal number.
1485 	This is done by going to the rdfp for that ordinal id and
1486 	gathering the filebits.
1487 */
1488 Boolean LIBCALL readdb_get_filebits PROTO((ReadDBFILEPtr rdfp, Int4 ordinal_id, Uint2Ptr filebit, Uint2Ptr aliasfilebit));
1489 
/* Possible return values for readdb_validate.  The negative values are
   parenthesized so that each macro expands to a single primary expression
   wherever it is used. */

#define READDB_VALID 0
#define READDB_INVALID_NULL_ARG (-1)
#define READDB_INVALID_MIXED_DBS (-2)
1495 
1496 /* Validate the linked list of rdfp structures passed as an argument to this
1497  * function.
1498  * Return
1499  * list or if the argument is NULL, otherwise returns TRUE
1500  */
1501 Int4 LIBCALL readdb_validate PROTO((ReadDBFILEPtr rdfp));
1502 
1503 /** Calculate a hash value for a given sequence data
1504  * @param sequence containing sequence data (must not be NULL) [in]
1505  * @param sequence_length length of the buffer above populated with data [in]
1506  */
1507 Uint4 readdb_sequence_hash(const char* sequence, int sequence_length);
1508 
1509 /* For the BioseqFetch functions. */
1510 
1511 Boolean LIBCALL ReadDBBioseqFetchEnable PROTO((CharPtr program, CharPtr dbname, Boolean is_na, Boolean now));
1512 
1513 Boolean LIBCALL ReadDBBioseqSetDbGeneticCode PROTO((Int4 db_genetic_code));
1514 
1515 void LIBCALL ReadDBBioseqFetchDisable PROTO((void));
1516 
1517 /* Converts a SeqIdPtr to an ordinal_id, which readdb can use to look
1518 up sequences etc.  Negative numbers are returned if the SeqIdPtr
1519 cannot be converted. */
1520 Int4 SeqId2OrdinalId PROTO((ReadDBFILEPtr rdfp, SeqIdPtr sip));
1521 
1522 /*
1523 	Returns the ReadDBFILEPtr by the database ID.
1524 */
1525 ReadDBFILEPtr ReadDBGetDb PROTO((ReadDBFILEPtr list, Int2 db_id));
1526 
1527 /*
1528 	Returns the Database ID.
1529 */
1530 Int2 ReadDBGetDbId PROTO((ReadDBFILEPtr list, ReadDBFILEPtr target));
1531 
1532 
1533 /********************/
1534 /*     formatdb     */
1535 
1536     /* Type definitions */
1537 
1538 typedef struct FASTALookup {
1539     Int4Ptr table;          /* Main buffer for gi/fasta_id pairs */
1540     Int4    allocated;      /* Nunber of Uint4 allocated */
1541     Int4    used;           /* Number of Uint4 used      */
1542 } FASTALookup, PNTR FASTALookupPtr;
1543 
1544 /* Structure that holds the link information as read from the file */
1545 typedef struct _linkinfo {
1546     Int4 bit_number;      /* indicates the position in links bit array */
1547     Int4ListPtr gi_list;  /* update links bit array for gis in this list */
1548 } LinkInfo, *LinkInfoPtr;
1549 
1550 /* Structure that holds the membership information */
1551 typedef Boolean (*GMCriteriaFunc) (VoidPtr direc);
1552 
1553 typedef struct _membinfo {
1554     Int4 bit_number;    /* indicates the position in the membership bit array */
1555     GMCriteriaFunc criteria; /* function pointer that is invoked to
1556                                  determine wheather certain sequence
1557                                  belongs to the membership represented by
1558                                  this bit_number */
1559 } MembInfo, *MembInfoPtr;
1560 
/* Options to clean up blast database files left by a previous instance of
 * the database with the same name as the one about to be created. This was
 * added to prevent the case in which an alias file might have precedence
 * over a single-volume blast database. */
typedef enum EFDBCleanOpt {
    /* Don't remove older files of the db to be created; just overwrite them
       or ignore alias files. */
    eCleanNever = 0,
    /* Clean up all older files of the db to be created. */
    eCleanAlways = 1,
    /* Assumes an interactive program. */
    eCleanPrompt = 2,
    /* One past the last option. */
    eCleanOptMax = 3
} EFDBCleanOpt;
1572 
/*** PIG (Protein Identifier Group) interface ***/

/* No protein identifier group.  Parenthesized so the negative value expands
   to a single primary expression wherever it is used. */
#define PIG_NONE        (-1)
1576 
1577 /* PIG table structure
1578  * From this information the formatdb API creates a pair of ISAM files to map
1579  * PIGs to ordinal ids */
1580 typedef struct FDBPigTable {
1581     Int4Ptr     pop;                /* list of pig/ordinal id pairs */
1582     Int4        count, allocated;   /* keep track of table size */
1583 } FDBPigTable, * FDBPigTablePtr;
1584 
1585 /* Allocate a PIG table structure */
1586 FDBPigTablePtr LIBCALL
1587 FDBPigTableNew PROTO((void));
1588 
1589 /* Deallocate a PIG table structure */
1590 FDBPigTablePtr LIBCALL
1591 FDBPigTableFree PROTO((FDBPigTablePtr fptp));
1592 
1593 /* Add a PIG to the PIG table structure, return FALSE on error */
1594 Boolean LIBCALL
1595 FDBAddPig PROTO((FDBPigTablePtr fptp, Int4 pig, Int4 oid));
1596 
1597 /* Retrieve the PIG for a given ordinal id */
1598 Int4 LIBCALL
1599 readdb_get_pig PROTO((ReadDBFILEPtr rdfp, Int4 oid));
1600 
1601 /* Retrieve the ordinal id corresponding to a given PIG (analogous to
1602  * readdb_gi2seq) */
1603 Int4 LIBCALL
1604 readdb_pig2oid PROTO((ReadDBFILEPtr rdfp, Int4 pig, Int4Ptr start));
1605 
1606 /************************************************/
1607 /*** TaxidDeflineTable interface ***/
1608 
/* Forward declaration of the main structure and of its pointer type; the
 * members of the structure are not defined at this point. */
typedef struct FDBTaxidDeflineTable FDBTaxidDeflineTable,
                                    *FDBTaxidDeflineTablePtr;
1612 
1613 /** Allocate a TaxidDefline table structure from a file
1614  * It attempts to read a list of gi/taxid pairs first, then a list of seqid
1615  * strings/taxid pairs.
1616  */
1617 FDBTaxidDeflineTablePtr LIBCALL
1618 FDBTaxidDeflineTableNew PROTO((const Char* filename));
1619 
1620 /** Deallocate a TaxidDefline table structure */
1621 FDBTaxidDeflineTablePtr LIBCALL
1622 FDBTaxidDeflineTableFree PROTO((FDBTaxidDeflineTablePtr taxid_tbl));
1623 
/* Value returned by the FDBTaxidDeflineTableSearch* functions when the gi or
 * seqid searched for is not found; only declared here, defined elsewhere. */
extern const Int4 kTaxidDeflineSearch_NotFound;
1625 
1626 /** Searches the gi provided as argument in the taxid_tbl argument. If not
1627  * found it returns kTaxidDeflineSearch_NotFound, otherwise it returns the
1628  * taxonomy id */
1629 Int4 LIBCALL
1630 FDBTaxidDeflineTableSearchGi PROTO((const FDBTaxidDeflineTablePtr taxid_tbl,
1631                                     Int4 gi));
1632 
1633 /** Searches the seqid provided as argument in the taxid_tbl argument. If not
1634  * found it returns kTaxidDeflineSearch_NotFound, otherwise it returns the
1635  * taxonomy id */
1636 Int4 LIBCALL
1637 FDBTaxidDeflineTableSearchSeqid PROTO((const FDBTaxidDeflineTablePtr taxid_tbl,
1638                                        const Char* seqid));
1639 
1640 /************************************************/
1641 
1642 typedef struct _FDB_options {
1643     Int4  version;   /* Version of the database created by formatdb program
1644 	    	 	currently supported are 3 - FORMATDB_VER_TEXT and
1645 	    	 	4 - FORMATDB_VER - for ASN.1 structured deflines */
1646     CharPtr db_title;    /* Title for the database to be created */
1647     CharPtr db_file;     /* Name for input data file - 'IN' name */
1648     Int4 is_protein;     /* Is this protein database ? */
1649     Int4 parse_mode;     /* Do we assume, that deflines are started from
1650                              valid SeqIds ? */
1651     Int4 isASN;          /* read from file or ASN - used only in formatdb.c */
1652     Int4 asnbin;         /* What is this type of ASN? used only
1653                             in formatdb.c */
1654     Int4 is_seqentry;    /* What is this type of ASN? used only
1655                              in formatdb.c */
1656     CharPtr base_name;   /* Name for db files to be created 'OUT' name */
1657     CharPtr	alias_file_name; /* name to be used for BLAST alias-file. */
1658     Int4  dump_info;     /* To printout file with information about tax_id,
1659                              owner, hash etc. - used for dump from ID */
1660 
1661     Int4  sparse_idx;    /* To use only limited set of text ids to dump for
1662                              usage in indexes */
1663     Int4  test_non_unique;    /* Print messages if FASTA database has
1664                                  non-unique string ids - accessions, locuses*/
1665 
1666     RDBTaxLookupPtr tax_lookup; /* taxonomy lookup table - should be initialized in the main program to be used for creating of taxonomy information*/
1667 
1668     TaxCallbackFunc tax_callback; /* Function to retrieve taxonomy names from
1669                                      Taxonomy server */
1670    Int8 bases_in_volume;  /* The maximal number of bases that can be stored in
1671                              one volume of the database */
1672    Int4 sequences_in_volume; /* Maximum number of sequences to be stored in a
1673                                 volume */
1674    Int2 volume;      /* Largest volume */
1675    Int4 total_num_of_seqs; /* total number of sequences for this database */
1676    CharPtr	gi_file;        /* Gi file to be used in processing. */
1677    CharPtr  gi_file_bin;	/* Gi file to be used in processing. */
1678 
1679    ValNodePtr    linkbit_listp; /* list of gis and the bits to set */
1680    ValNodePtr    memb_tblp;     /* Linked list of MembInfo structures */
1681    VoidPtr       memb_argp;     /* Argument to criteria function in MembInfo
1682                                    structure */
1683    EFDBCleanOpt clean_opt;      /* clean up option */
1684 
1685 } FDB_options, PNTR FDB_optionsPtr;
1686 
/** Maximum number of volumes constructed by formatdb.
 * Only declared here; the value is defined outside this header. */
extern const Uint4 kFDBMaxNumVolumes;
1689 
1690 typedef struct formatdb
1691 {
1692     /* CharPtr	dbname;	(db_file)  name of input database */
1693     /* CharPtr	DbTitle; (db_title) database title */
1694 
1695     /* file handlers */
1696 
1697     FILE *fd,
1698         *fd_ind,
1699         *fd_seq,
1700         *fd_def,
1701         *fd_sdi,  /* This is file for misc. info data */
1702         *fd_stmp;
1703 
1704     /* ASN.1 input, if the "-a" specified */
1705     AsnIoPtr	aip;
1706 
1707     /* ASN.1 defline output if structured defline */
1708     AsnIoPtr aip_def;
1709 
1710     Int4 num_of_seqs;  /* number of parsed sequences in this volume */
1711     Int8 TotalLen;
1712     Int4 MaxSeqLen;
1713 
1714     /* offset tables */
1715     Int4Ptr	DefOffsetTable,	/* definitions */
1716         	SeqOffsetTable,	/* sequences */
1717         	AmbOffsetTable;	/* ambiguities */
1718 
1719     /* lookup table */
1720 
1721     FASTALookupPtr	lookup;
1722 
1723     /* Table to map PIGs to ordinal ids */
1724     FDBPigTablePtr   ptable;
1725 
1726     /* General formatdb options */
1727 
1728     FDB_optionsPtr options;
1729 
1730     Uint4Ptr	AmbCharPtr;	/* ambiguity characters while
1731                                  * convert from ncbi2na->ncbi4na */
1732 
1733     Int4 OffsetAllocated; /* storage for allocation size */
1734 
1735 } FormatDB, *FormatDBPtr;
1736 
1737 
/* NOTE(review): presumably the number of bits in one word of a mask --
 * confirm against the code that uses it */
#define MASK_WORD_SIZE 32
1739 
1740 /* Function prototypes for formatdb library*/
1741 
1742 /* --------------------- FDBOptionsNew ----------------------------
1743    Purpose: Creates formatdb options structure with parameters from
1744             the argument list.
1745    Returns: Pointer to initialized structure.
1746    Notes: If alias_file_name is provided, the function FDB_MakeAlias
1747           should be called after FDBClose. (FIXME)
1748    ---------------------------------------------------------------- */
1749 FDB_optionsPtr FDBOptionsNew(
1750         CharPtr input, /* [in] name of input file */
1751         Boolean is_prot, /* [in] input contains protein sequences? */
1752         CharPtr title,  /* [in] title to give this database */
1753         Boolean is_asn, /* [in] true if input is in ASN.1 */
1754         Boolean is_asn_bin, /* [in] true if ASN.1 input is binary */
1755         Boolean is_seqentry, /* [in] true of input is a seqentry */
1756         Boolean sparse_idx, /* [in] should sparce ISAM indices be used? */
1757         Boolean test_non_unique, /* [in] test for repeated string identifiers
1758                                     in database */
1759         Boolean parse_deflines, /* [in] input contains parseable deflines? */
1760         CharPtr basename, /* [in] name for the database to create */
1761         CharPtr alias_file_name, /* [in] name for the alias file to create */
1762         Int8 bases_per_volume, /* [in] max num of residues/bases per volume */
1763         Int4 seqs_per_volume, /* [in] max num of sequences per volume */
1764         Int4 version, /* [in] database version */
1765         Boolean dump_info, /* [in] should basename.[pn]di be created? */
1766         EFDBCleanOpt clean_opt);/* [in] should basename.* files be removed ? */
1767 
1768 /* --------------------- FDBOptionsFree ---------------------------
1769    Purpose: Frees the memory allocated for the formatdb options structure.
1770    Returns: NULL
1771    ---------------------------------------------------------------- */
1772 FDB_optionsPtr FDBOptionsFree(FDB_optionsPtr options);
/* NOTE(review): undocumented in this header; presumably removes files of a
 * previous database instance according to options->clean_opt (see
 * EFDBCleanOpt) -- confirm in the implementation */
Boolean FDBCleanUp(FDB_optionsPtr options);
1774 
/* The next 4 functions are for production database dump ({id,rs}dump_blast).
 * Each Load function takes no arguments and returns a ValNode list; each
 * Destroy function takes such a list and returns a ValNodePtr.
 * NOTE(review): presumably the links list holds LinkInfo entries and the
 * memberships list holds MembInfo entries, and the Destroy functions return
 * NULL -- confirm in the implementation. */
ValNodePtr FDBLoadLinksTable(void);
ValNodePtr FDBDestroyLinksTable(ValNodePtr list);
ValNodePtr FDBLoadMembershipsTable(void);
ValNodePtr FDBDestroyMembershipsTable(ValNodePtr tbl);
1780 
1781 /* Constructs BlastDefLine structures from Bioseq */
1782 BlastDefLinePtr FDBGetDefAsnFromBioseq(BioseqPtr bsp,
1783                                        const FDBTaxidDeflineTablePtr gttp);
1784 
1785 FormatDBPtr	FormatDBInit(FDB_optionsPtr options);
1786 
1787 /* For database version FORMATDB_VER (or greater), only the first 5 parameters
1788  * are used, the latter are kept for the FORMATDB_VER_TEXT version of the BLAST
1789  * databases. Please note that the seq_data and seq_data_type will be changed
1790  * if the data passed in doesn't match the format that is required for the
1791  * BLAST database format (ncbistdaa for proteins, ncbi2na for nucleotides) */
1792 Int2 FDBAddSequence (FormatDBPtr fdbp,  BlastDefLinePtr bdp,
1793                      Uint1* seq_data_type, ByteStorePtr *seq_data,
1794                      Int4 SequenceLen,
1795                      CharPtr seq_id, CharPtr title,
1796                      Int4 gi, Int4 tax_id, CharPtr div, Int4 owner, Int4 date);
1797 
1798 /**
1799  * FDBAddSequence2: is an interface to add "non-redundant sequence", i.e
1800  * common sequence data and multiple sequence information block (1 per gi)
1801  * This function will NOT alter the seq_data field, it assumes that the data is
1802  * already provided in the required format
1803  * @param fdbp target blast db [in]
1804  * @param srp linked list of sequence information for each gi [in]
1805  * @param seq_data_type type of the parameter below [in]
1806  * @param seq_data sequence data itself [in]
1807  * @param SequenceLen length of the sequence in seq_data [in]
1808  * @param AmbCharPtr pointer to ambiguity sequence data (nucl only) [in]
1809  * @param pig_id stable protein group identifier [in]
1810  * @param hash sequence hash - to allow  resuse of hahs calculated in ID [in]
1811  * @return 1 on failure, 0 on success
1812  */
1813 Int2 FDBAddSequence2 (FormatDBPtr  fdbp,
1814                       SI_RecordPtr srp,
1815                       Uint1 seq_data_type,
1816                       const ByteStorePtr *seq_data,
1817                       Int4 SequenceLen,
1818                       Uint4Ptr  AmbCharPtr,
1819                       Int4 pig_id,
1820                       Uint4 hash
1821                       );
1822 
1823 /* For database version FORMATDB_VER (or greater), the bdp parameter must
1824  * be provided. This could be populated from the bsp parameter by calling
1825  * FDBGetDefAsnFromBioseq */
1826 Int2 FDBAddBioseq(FormatDBPtr fdbp, BioseqPtr bsp, BlastDefLinePtr bdp);
/* NOTE(review): undocumented in this header; presumably finishes writing the
 * database and releases fdbp -- confirm in the implementation */
Int2 FormatDBClose(FormatDBPtr fdbp);
1828 
1829 Boolean FDBAddLinksInformation(BlastDefLinePtr bdp, ValNodePtr links_tblp);
1830 Boolean FDBAddMembershipInformation(BlastDefLinePtr bdp, ValNodePtr memb_tblp,
1831                                     VoidPtr criteria_arg);
1832 
1833 Int2 process_sep (SeqEntryPtr sep, FormatDBPtr fdbp);
1834 
1835 NLM_EXTERN Boolean SeqEntrysToBLAST (SeqEntryPtr sep, FormatDBPtr fdbp,
1836                                      Boolean is_na, Uint1 group_segs);
1837 
1838 NLM_EXTERN Boolean BLASTFileFunc (BioseqPtr bsp, Int2 key, CharPtr buf,
1839                                   Uint4 buflen, Pointer data);
1840 
1841 /*
1842 Print a summary of the database used.
1843 */
1844 Boolean LIBCALL PrintDbInformation PROTO((CharPtr database, Boolean is_aa, Int4 line_length, FILE *outfp, Boolean html));
1845 Boolean LIBCALL PrintDbInformationWithRID PROTO((CharPtr database, Boolean is_aa, Int4 line_length, FILE *outfp, Boolean html, CharPtr rid, Boolean query_is_aa));
1846 Boolean LIBCALL PrintDbInformationBasicEx PROTO((Boolean is_aa, Int4 line_length,
1847                            CharPtr definition, Int4 number_seqs,
1848                            Int8 total_length, FILE *outfp, Boolean html,
1849                            Boolean with_links));
1850 
1851 Boolean LIBCALL PrintDbInformationBasic PROTO((CharPtr database, Boolean is_aa, Int4 line_length, CharPtr definition, Int4 number_seqs, Int8 total_length, FILE *outfp, Boolean html));
1852 
/* NOTE(review): undocumented in this header; presumably adds the contents of
 * the SeqEntry sep to the database held by fdbp -- confirm in the
 * implementation */
Boolean FDBAddSeqEntry(FormatDBPtr fdbp, SeqEntryPtr sep);
1854 
1855 /* ID1 dump stuff */
1856 
1857 typedef	struct di_record {
1858     Int4    oid;
1859     Int4    gi;
1860     Int4    taxid;
1861     Int4    owner;
1862     Char    div[4]; /* 3-letter division */
1863     Int4    len;  /* Length of sequence */
1864     Int4    hash; /* Hash value for sequence data */
1865     Int4    date; /* NB: name is misleading; this is actually sat_key */
1866     CharPtr acc; /* accession should not exceed this size */
1867     Uint1   mol; /* Molecule type, as in Seq-inst::mol */
1868     Int4    gi_threshold;   /* for 'month' subset */
1869 
1870 } DI_Record, *DI_RecordPtr;
1871 
1872 /******** genmask structures and functions *********/
1873 
/* genmask scans the *.[pn]di files and sets membership bits according to the
   criteria specified by a GMCriteriaFunc callback (see typedef above). This
   is one example of how to set the membership bits in the new database
   format. Note that the MembInfo structure has a criteria function pointer
   that returns a boolean value and takes a void pointer as an argument, to
   allow flexibility in specifying the criteria for belonging to a particular
   membership. */
1881 
1882 typedef struct {
1883     Int4           count, allocated;
1884     CharPtr        *subset_name;
1885     GMCriteriaFunc *criteria;
1886     Int4           *membership_bit;
1887 } GMSubsetData, * GMSubsetDataPtr;
1888 
1889 Boolean	ScanDIFile(CharPtr difilename, GMSubsetDataPtr gmsubsetdp,
1890 	Boolean(*callback)(DI_RecordPtr direc, VoidPtr data), VoidPtr data,
1891 	FILE *out, Int4 gi_threshold);
1892 
/* NOTE(review): undocumented in this header; judging by its name it returns
 * an accession string taken from the given chain of SeqIds -- confirm the
 * selection rule and who owns the returned string */
CharPtr FDFGetAccessionFromSeqIdChain(SeqIdPtr seqid_list);
1894 
/* These functions determine the criteria for the membership bits for genmask.
   Only protein sequences have memberships because they are in non-redundant
   databases.
   Each one has the shape of a GMCriteriaFunc: it takes an opaque pointer and
   returns a Boolean.
   NOTE(review): the argument is presumably a DI_RecordPtr; also, several
   names below (EST, REFSEQ_RNA, REFSEQ_GENOMIC, CONTIG) sound like
   nucleotide subsets, which seems at odds with the statement above --
   confirm. */
Boolean is_EST_HUMAN(VoidPtr di_record);
Boolean is_EST_MOUSE(VoidPtr di_record);
Boolean is_EST_OTHERS(VoidPtr di_record);
Boolean is_SWISSPROT(VoidPtr di_record);
Boolean is_MONTH(VoidPtr di_record);
Boolean is_PDB(VoidPtr di_record);
Boolean is_REFSEQ(VoidPtr di_record);
Boolean is_REFSEQ_RNA(VoidPtr di_record);
Boolean is_REFSEQ_GENOMIC(VoidPtr ptr);
Boolean is_CONTIG(VoidPtr di_record);
1908 
1909 /************************************************************************/
1910 /*        Fastacmd API                                           */
1911 /************************************************************************/
1912 
1913 typedef struct FCMDAccList {
1914     CharPtr acc;
1915     Int4 gi;
1916     struct FCMDAccList *next;
1917 } FCMDAccList, PNTR FCMDAccListPtr;
1918 
/* NOTE(review): undocumented in this header; GetAccList presumably builds an
 * FCMDAccList from 'file' and stores the number of items through TotalItems,
 * and FCMDAccListFree presumably releases such a list -- confirm in the
 * implementation */
FCMDAccListPtr LIBCALL GetAccList(CharPtr file, Int4Ptr TotalItems);
void LIBCALL FCMDAccListFree(FCMDAccListPtr falp);
1921 
/* Name of the database fastacmd uses by default */
#define FASTACMD_DEFAULT_DB     "nr"
1923 
/* Fastacmd status codes; zero is success, every other code is non-zero */
#define FASTACMD_SUCCESS        0
#define FASTACMD_ERROR          1
#define FASTACMD_DB_NOT_FOUND   2
#define FASTACMD_FAILED_SEARCH  3
#define FASTACMD_NO_TAXDB       4
1929 
1930 /* Fastacmd_Search and Fastacmd_Search_ex return non-zero on failure */
1931 Int2 Fastacmd_Search (CharPtr searchstr, CharPtr database,
1932 	CharPtr batchfile, Boolean dupl, Int4 linelen, FILE *out);
1933 
/* Selects which kind of data fastacmd dumps from a database */
typedef enum EBlastDbDumpType {
    /* Don't dump any data from the database; the default for fastacmd */
    eNoDump      = 0,
    /* Dump contents of the database as FASTA */
    eFasta       = 1,
    /* List of gis in the database */
    eGi          = 2,
    /* List of accessions in the database */
    eAccession   = 3,
    /* Not really a dump type; needed for error checking */
    eDumpTypeMax = 4
} EBlastDbDumpType;
1943 
1944 Int2 Fastacmd_Search_ex (CharPtr searchstr, CharPtr database, Uint1 is_prot,
1945 	CharPtr batchfile, Boolean dupl, Int4 linelen, FILE *out,
1946 	Boolean use_target, Boolean use_ctrlAs, EBlastDbDumpType dump_db,
1947     CharPtr seqlocstr, Uint1 strand, Boolean taxonomy_info_only,
1948     Boolean dbinfo_only, Int4 pig);
1949 
1950 /** Parses the string passed as its first argument, which should contain a pair
1951  * of positive integers separated by ' ', ',', or ';' and returns the integers
1952  * in the second argument. This function is non-static so that unit tests can
1953  * be written for it.
1954  */
1955 void Fastacmd_ParseLocations(const char* str, Int4 locations[2]);
1956 
1957 /**
1958  * @param rdfp Blast database handle [in]
1959  * @param fp output FILE pointer [in]
1960  * @param linelen number of characters to print per line [in]
1961  * @param use_ctrlAs use Ctrl-A to separate non-redundant deflines? [in]
1962  * @param dump_type type of information to dump [in]
1963  */
1964 Int2 DumpBlastDB(const ReadDBFILEPtr rdfp, FILE *fp, Int4 line_length,
1965 		         Boolean use_ctrlAs, EBlastDbDumpType dump_type);
1966 
1967 /**
1968  * @param rdfp Blast database handle [in]
1969  * @param fp output FILE pointer [in]
1970  * @param linelen number of characters to print per line [in]
1971  * @param use_ctrlAs use Ctrl-A to separate non-redundant deflines? [in]
1972  * @param dump_type type of information to dump [in]
1973  * @param i ordinal id of sequence to dump [in]
1974  */
1975 Int2 DumpOneSequence(const ReadDBFILEPtr rdfp, FILE *fp, Int4 line_length,
1976                      Boolean use_ctrlAs, EBlastDbDumpType dump_type, Int4 i);
1977 
1978 Int4 LIBCALL readdb_MakeGiFileBinary PROTO((CharPtr input_file, CharPtr
1979 					    output_file));
1980 
/* NOTE(review): undocumented in this header; presumably builds a BLAST
 * database from FASTA input described by options -- confirm, including how
 * Bases_In_Volume relates to options->bases_in_volume */
Int4 FastaToBlastDB PROTO((FDB_optionsPtr options, Int4 Bases_In_Volume));

/* NOTE(review): undocumented in this header; presumably reads the ASN.1
 * defline(s) of sequence sequence_number from the database -- confirm */
BlastDefLinePtr FDReadDeflineAsn(ReadDBFILEPtr rdfp, Int4 sequence_number);

/* NOTE(review): undocumented in this header; presumably returns a list of
 * the volume names for a database split into vols volumes -- confirm the
 * format and who owns the returned string */
CharPtr FD_ConstructMultivolumeDBList(CharPtr basename, Int4 vols);
1986 
1987 Boolean FD_CreateAliasFileEx PROTO((CharPtr title, CharPtr basename,
1988             Int4 volumes, Boolean is_protein, CharPtr parent,
1989             Int4 first_oid, Int4 last_oid, Int8 total_length, Int4 number_seqs,
1990 	    CharPtr oidlist, CharPtr gifile));
1991 
1992 Boolean FD_CreateAliasFile PROTO((CharPtr title, CharPtr basename,
1993                                     Int4 volumes, Boolean is_protein));
1994 
1995 /* simple function to make alias file give FDB_optionsPtr, alias file is only made if appropriate. */
1996 Boolean FD_MakeAliasFile PROTO((FDB_optionsPtr options));
1997 Int4 LIBCALL
1998 readdb_get_sequence_number PROTO((ReadDBFILEPtr rdfp, Int4 first_seq, Int8 offset));
1999 
/* NOTE(review): undocumented in this header; presumably writes the deflines
 * in bdp_in to the ASN.1 defline output of fdbp -- confirm */
Boolean FDBDumpDeflineAsn(FormatDBPtr fdbp, BlastDefLinePtr bdp_in);

/* NOTE(review): undocumented in this header; presumably updates the offset
 * tables of fdbp for a sequence of length seq_length -- confirm */
Int4 FDBFillIndexTables(FormatDBPtr fdbp, Int4 seq_length);
2003 
2004 BlastDefLinePtr FDLCreateAsnDF(FormatDBPtr fdbp, CharPtr seq_id,
2005                                CharPtr title, Int4 taxid);
/* NOTE(review): undocumented in this header; presumably sets bit number
 * bit_no in the bit list reached through retval -- confirm */
void FDBBlastDefLineSetBit(Int2 bit_no, ValNodePtr PNTR retval);
2007 
/* madvise support: only declared on Solaris or Linux builds that also define
 * HAVE_MADVISE */
#if (defined(OS_UNIX_SOL) || defined(OS_UNIX_LINUX)) && defined(HAVE_MADVISE)

/* Enable/disable madvise functionality -- disabled by default */
void LIBCALL readdb_madvise_enable PROTO((Boolean enable));

/* Set madvise type -- default eMMA_Normal */
void LIBCALL readdb_madvise_type PROTO((EMemMapAdvise advice));

/* Explicitly set madvise sync mode:
 * default is sync on Solaris, async on Linux
 */
void LIBCALL readdb_madvise_sync_mode PROTO((Boolean mode));

/* Explicitly set madvise block size, which is the number of sequences
 * preloaded in a single madvise operation; default is 65536
 */
void LIBCALL readdb_madvise_block PROTO((Int4 nSeqs));

/* Call preload directly -- run madvise on a chunk of the memory mapped file */
void LIBCALL readdb_preload PROTO((ReadDBFILEPtr rdfp,
                                   Int4 first_db_seq,
                                   Int4 final_db_seq,
                                   EMemMapAdvise advice,
                                   Boolean sync));

#endif /* (SOL || LINUX) && HAVE_MADVISE */
2039 
2040 #ifdef __cplusplus
2041 }
2042 #endif
2043 
2044 #endif /* _READDB_ */
2045