1 /* 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 */ 26 27 /***************************************************************************** 28 29 File name: readdb.h 30 31 Author: Tom Madden 32 33 Contents: defines and prototypes used by readdb.c and formatdb.c. 34 35 ******************************************************************************/ 36 37 /* 38 * File Name: readdb.h 39 * 40 * Author: Tom Madden 41 * 42 * Version Creation Date: 3/21/95 43 * 44 * $Revision: 6.180 $ 45 * 46 * File Description: 47 * Functions to rapidly read databases from files produced by formatdb. 
48 * 49 * Modifications: 50 * -------------------------------------------------------------------------- 51 * Date Name Description of modification 52 * ------- ---------- ----------------------------------------------------- 53 * 54 * ========================================================================== 55 * 56 * 57 * RCS Modification History: 58 * $Log: readdb.h,v $ 59 * Revision 6.180 2009/01/15 15:20:35 madden 60 * Add prototype for readdb_check_oid 61 * 62 * Revision 6.179 2007/09/27 17:20:54 madden 63 * Add readdb_get_full_filename 64 * 65 * Revision 6.178 2007/08/21 20:06:07 kans 66 * added prototype for FDBCleanUp 67 * 68 * Revision 6.177 2007/05/08 13:09:39 madden 69 * Add ability to read STATS_NSEQ and STATS_TOTLEN from alias file with funciton readdb_get_stats_numbers 70 * 71 * Revision 6.176 2006/08/07 15:03:57 camacho 72 * +is_REFSEQ_GENOMIC 73 * 74 * Revision 6.175 2006/07/03 18:27:22 coulouri 75 * correct volume size defaults for protein databases 76 * 77 * Revision 6.174 2006/06/27 15:34:18 coulouri 78 * Correct comment and default volume size 79 * 80 * Revision 6.173 2006/06/19 18:37:08 coulouri 81 * improve default handling for non-formatdb clients 82 * 83 * Revision 6.172 2006/06/19 17:20:14 coulouri 84 * Extend 1GB default volume size to all platforms and impose a hard limit of 4G. rt#15171398 85 * 86 * Revision 6.171 2006/05/10 22:00:28 kans 87 * new function prototypes were erroneously inside ifdef HAVE_MADVISE block, moved outside 88 * 89 * Revision 6.170 2006/05/10 20:48:57 camacho 90 * From Ilya Dondoshansky: 1. Several FDB functions made public - needed for incremental dump efficiency; 2. 
mol field added to SI_Record and DI_Record 91 * 92 * Revision 6.169 2006/04/24 15:50:19 camacho 93 * + is_REFSEQ_RNA 94 * 95 * Revision 6.168 2006/03/16 14:14:24 camacho 96 * Fix parsing of locations for fastacmd command line argument (rt # 15151399) 97 * 98 * Revision 6.167 2006/03/09 21:56:02 camacho 99 * Refactored sequence hash function 100 * 101 * Revision 6.166 2006/03/08 19:06:14 camacho 102 * Added definition for maximum number of volumes, fixes rt ticket 15147600 103 * 104 * Revision 6.165 2006/02/15 21:07:29 camacho 105 * Add validation to fastacmd to reject mixed protein/nucleotide databases 106 * 107 * Revision 6.164 2005/10/04 20:40:50 madden 108 * Make PrintDbInformationBasicEx public 109 * 110 * Revision 6.163 2005/10/04 15:44:54 madden 111 * Workaround to time-out problem of PrintDbInformationWithRID 112 * 113 * Revision 6.162 2005/07/28 14:57:10 coulouri 114 * remove dead code 115 * 116 * Revision 6.161 2005/07/27 21:30:02 camacho 117 * 1) Replaces is_REFSEQ_* functions by a single function (is_REFSEQ), to be 118 * used by genmask and ID1 group's BLAST database dumper. 119 * 2) Removed out-of-date is_WGS* functions. 
120 * 121 * Revision 6.160 2005/07/27 17:48:57 coulouri 122 * remove hardcoded paths 123 * 124 * Revision 6.159 2005/06/22 13:55:22 coulouri 125 * add support for dumping accessions 126 * 127 * Revision 6.158 2005/06/08 19:25:36 camacho 128 * New feature to allow formatdb to add taxonomy ids to BLAST databases 129 * generated from FASTA input 130 * BugzID: 6 131 * 132 * Revision 6.157 2005/02/22 14:15:48 camacho 133 * Pass bioseq data type by reference to FDBAddBioseq 134 * 135 * Revision 6.156 2004/12/04 03:41:09 camacho 136 * Add extra enum for fastacmd -D option for error checking 137 * 138 * Revision 6.155 2004/12/03 04:57:57 camacho 139 * Fix name conflict in enumeration for fastacmd dump types 140 * 141 * Revision 6.154 2004/12/02 20:37:31 camacho 142 * + fastacmd feature to dump list of gis 143 * 144 * Revision 6.153 2004/09/27 16:29:34 madden 145 * Make title on SI_Record dynamically allocated 146 * 147 * Revision 6.152 2004/08/25 14:45:23 camacho 148 * Refactorings to allow formatdb process multiple deflines 149 * 150 * Revision 6.151 2004/07/14 18:35:12 camacho 151 * Added comments for readdb_get_header_ex 152 * 153 * Revision 6.150 2004/07/09 17:09:12 camacho 154 * Updated documentation for last_oid_assigned 155 * 156 * Revision 6.149 2004/07/08 19:49:03 camacho 157 * Contributions from ID1 Group: 158 * 1) SI_Record structure. 159 * 2) Refactoring of FDBAddSequence2 to allow addition of non-redundant sequences 160 * when creating BLAST databases. 161 * 162 * Revision 6.148 2004/06/29 20:59:23 camacho 163 * Added last_oid_assigned to ReadDBSharedInfo structure 164 * 165 * Revision 6.147 2004/04/16 18:14:50 camacho 166 * Made division field in DI_Record larger 167 * 168 * Revision 6.146 2004/02/24 14:06:01 camacho 169 * Added support for approximate sequence length calculation for nucleotide 170 * sequences. 
171 * 172 * Revision 6.145 2004/02/04 15:35:05 camacho 173 * Rollback to fix problems in release 2.2.7 174 * 175 * Revision 6.143 2003/07/08 18:42:40 camacho 176 * Elaborated fastacmd return values 177 * 178 * Revision 6.142 2003/06/13 19:56:48 dondosha 179 * Removed unneeded argument in FastaToBlastDB 180 * 181 * Revision 6.141 2003/04/25 18:55:27 camacho 182 * 1. Added readdb_merge_gifiles to deal with Microbial blast database issues. 183 * 2. Minor fixes to Int4List functions. 184 * 185 * Revision 6.140 2003/04/22 21:30:14 camacho 186 * Added Int4 list utilities 187 * 188 * Revision 6.139 2003/04/22 19:04:57 camacho 189 * Moved GiList structure to generic list of 4-byte integers 190 * 191 * Revision 6.138 2003/04/16 15:39:37 coulouri 192 * fix compiler warning 193 * 194 * Revision 6.137 2003/04/15 19:09:13 camacho 195 * Completed implementation of PIG interface 196 * 197 * Revision 6.136 2003/04/10 15:11:37 camacho 198 * Include PIG interface in __cplusplus 199 * 200 * Revision 6.135 2003/04/09 21:46:00 camacho 201 * Added basic PIG interface 202 * 203 * Revision 6.134 2003/04/08 19:45:35 camacho 204 * Defined invalid PIG 205 * 206 * Revision 6.133 2003/04/08 15:37:15 camacho 207 * Extended FDBAddSequence2 to take pig 208 * 209 * Revision 6.132 2003/04/01 21:51:36 camacho 210 * Made fastacmd functions & structure non-static 211 * 212 * Revision 6.131 2003/03/27 22:26:04 camacho 213 * Add error messages and non-zero return value on error for fastacmd 214 * 215 * Revision 6.130 2003/03/26 19:11:22 camacho 216 * Minor change to previous commit 217 * 218 * Revision 6.129 2003/03/26 18:50:07 camacho 219 * Added eFDBCleanOpt to formatdb API 220 * 221 * Revision 6.128 2003/01/30 21:57:28 camacho 222 * Added more detailed comment to readdb_new_ex2 223 * 224 * Revision 6.127 2003/01/22 19:41:21 camacho 225 * Added function to build multi-volume db list for creating alias files 226 * 227 * Revision 6.126 2002/12/20 14:37:34 coulouri 228 * Fix prototype for 
RDBTaxInfoInit() 229 * 230 * Revision 6.125 2002/12/17 20:33:25 camacho 231 * Removed unnecessary function attribute 232 * 233 * Revision 6.124 2002/12/16 20:22:48 camacho 234 * Removed unused options in formatdb options structure 235 * 236 * Revision 6.123 2002/12/16 05:01:55 camacho 237 * Fixes to previous commit 238 * 239 * Revision 6.122 2002/12/13 13:43:25 camacho 240 * Changes to set links and membership bits in formatdb API 241 * 242 * Revision 6.121 2002/11/25 17:23:28 camacho 243 * 1) Changed file access to blast taxonomy databases: only 2 files are loaded 244 * for an entire chain of rdfp's. 245 * 2) Fixed memory leak in FindBlastDBFile. 246 * 3) Protect NlmOpenMFILE against NULL argument. 247 * 248 * Revision 6.120 2002/10/25 16:49:45 camacho 249 * Added Michael Kimelman's FDBAddSequence2 250 * 251 * Revision 6.119 2002/10/03 14:13:44 camacho 252 * Added support for gilist field in alias file in multivolume databases 253 * 254 * Revision 6.118 2002/09/26 02:14:42 camacho 255 * Allow limiting the number of sequences per volume 256 * 257 * Revision 6.117 2002/09/25 20:14:20 camacho 258 * Fix for multivolume databases with non-parseable seqids 259 * 260 * Revision 6.116 2002/07/30 15:28:50 camacho 261 * Added fastacmd function to parse SeqLocs 262 * 263 * Revision 6.115 2002/07/29 15:45:19 camacho 264 * Made readdb_get_taxnames a LIBCALL function 265 * 266 * Revision 6.114 2002/07/24 19:31:48 raytseli 267 * much simpler and more efficient approach to using madvise() 268 * . 269 * 270 * Revision 6.113 2002/07/22 13:06:42 raytseli 271 * explicitly allow setting of the advice type for madvise() 272 * . 273 * 274 * Revision 6.112 2002/07/18 17:39:54 raytseli 275 * changed ifdef OS_UNIX_SUN to ifdef OS_UNIX_SOL for madvise() 276 * . 277 * 278 * Revision 6.111 2002/07/18 15:54:26 raytseli 279 * added function to explicitly set madvise() block size, and madvise() sync mode. 
280 * 281 * Revision 6.110 2002/07/18 15:01:54 raytseli 282 * correct problem with pointer format "%p" ErrPostEx() handling on linux. 283 * Add extern func to allow explicit madvise() functionality activation. 284 * 285 * Revision 6.109 2002/07/17 17:15:06 raytseli 286 * only allow madvise()-related stuff on SUN or Linux. 287 * . 288 * 289 * Revision 6.108 2002/07/17 16:54:54 raytseli 290 * additional #ifdefs to allow compilation. 291 * 292 * Revision 6.106 2002/07/17 14:36:54 raytseli 293 * incorporated madvise into readdb 294 * . 295 * 296 * Revision 6.105 2002/07/14 21:02:08 camacho 297 * Added extra features to fastacmd 298 * 299 * Revision 6.104 2002/07/09 16:41:52 camacho 300 * Made taxonomy databases multi-thread safe 301 * 302 * Revision 6.103 2002/06/26 00:45:37 camacho 303 * 304 * Added readdb_get_totals_ex2 to allow recalculation of database length as 305 * well as total number of sequences after the virtual oidlist has been 306 * created. 307 * 308 * Revision 6.102 2002/06/04 21:45:39 dondosha 309 * Corrected the readdb_get_sequence_number function in case of multiple-volume databases 310 * 311 * Revision 6.101 2002/06/04 20:22:56 camacho 312 * Fixed taxonomy databases to work w/o mmap 313 * 314 * Revision 6.100 2002/05/15 20:23:47 camacho 315 * Added wgs_{mouse,anthrax} criteria functions 316 * 317 * Revision 6.99 2002/05/02 21:52:06 camacho 318 * Support for genmask's new month/subset mask combinations 319 * 320 * Revision 6.98 2002/04/18 19:35:07 camacho 321 * 1. Added fdfilter/genmask callbacks for wgs subsets 322 * 2. Modified fdfilter/genmask refseq_protein callback function 323 * 3. 
Fixed problem in readdb_read_alias_file to read multiple oidlists 324 * 325 * Revision 6.97 2002/03/08 16:58:50 camacho 326 * Added accessions to dump info files *.[pn]di 327 * 328 * Revision 6.96 2002/01/25 17:06:57 camacho 329 * Added new criteria to create new refseq databases 330 * 331 * Revision 6.95 2002/01/24 18:47:48 camacho 332 * Moved RDBTaxNamesFree from readdb.[ch] to txalign.[ch] 333 * 334 * Revision 6.94 2002/01/11 19:22:26 camacho 335 * 1. Added preferred_gi field to ReadDBFILE structure. 336 * 2. Modified FDReadDeflineAsn to return the preferred gi as the 337 * first element of the list of BlastDefLine structures (if set). 338 * 339 * Revision 6.93 2001/12/18 13:01:51 camacho 340 * Added new flag -D to dump blast database in FASTA format 341 * 342 * Revision 6.92 2001/12/10 19:17:13 camacho 343 * Added option to allow fastacmd to use Ctrl-As as defline separators. 344 * 345 * Revision 6.91 2001/11/09 19:05:35 dondosha 346 * ReadDBFreeSharedInfo and ReadDBOpenMHdrAndSeqFiles made static in readdb.c 347 * 348 * Revision 6.90 2001/11/02 18:30:12 dondosha 349 * Added prototypes for readdb_get_sequence_number, PrintDbInformationWithRID 350 * 351 * Revision 6.89 2001/10/19 13:40:31 camacho 352 * Updated the DI_Record structure and moved some function prototypes to allow their use by fdfilter 353 * 354 * Revision 6.88 2001/10/01 18:43:37 camacho 355 * Added BlastDBToFasta function 356 * Added readdb_get_header_ex function 357 * 358 * Revision 6.87 2001/10/01 18:37:32 camacho 359 * readdb.h 360 * 361 * Revision 6.86 2001/07/12 19:27:45 madden 362 * Add alias_file_name to Options 363 * 364 * Revision 6.85 2001/06/21 18:27:28 shavirin 365 * Moved into files txalign.[c,h] functions returning taxonomy names 366 * from Bioseq created from Blast database. 
367 * 368 * Revision 6.84 2001/06/14 16:22:46 madden 369 * Add prototype for FD_MakeAliasFile 370 * 371 * Revision 6.83 2001/05/23 21:17:24 shavirin 372 * Added definitions for bits related to sequence-to-database affiliation. 373 * 374 * Revision 6.82 2001/05/11 19:59:41 madden 375 * Add gi_file_bin to FDOptions, oidlist and gifile to FD_CreateAliasFileEx 376 * 377 * Revision 6.81 2001/05/10 17:19:53 madden 378 * Add number_seqs arg to FD_CreateAliasFileEx 379 * 380 * Revision 6.80 2001/05/08 21:58:28 shavirin 381 * Added possibility to generate tax_id for every definition in Blast FASTA 382 * definition set in ASN.1 structured definition lines. 383 * 384 * Revision 6.79 2001/05/02 16:22:05 dondosha 385 * Add NSEQ and LENGTH to alias files in case of multiple inputs to formatdb 386 * 387 * Revision 6.78 2001/04/11 21:00:53 dondosha 388 * Made functions FD_CreateAliasFile(Ex) public 389 * 390 * Revision 6.77 2001/04/11 20:14:06 dondosha 391 * Added volume information to FDB_options structure 392 * 393 * Revision 6.76 2001/03/29 20:15:59 madden 394 * Removed unneeded #define 395 * 396 * Revision 6.75 2001/03/23 17:23:54 madden 397 * Move FDGetDeflineAsnFromBioseq to txalign.[ch] 398 * 399 * Revision 6.74 2001/02/05 18:52:01 shavirin 400 * Blast database size was changed from Uint4 to Uint8 - this corrected 401 * invalidly printed database size for large databases. 402 * 403 * Revision 6.73 2000/12/12 23:14:42 shavirin 404 * Added functions to initialize taxonomy names database and search functions 405 * to get all taxonomy names given tax_id using this database. 406 * 407 * Revision 6.72 2000/12/08 22:25:01 shavirin 408 * Added code for creation Taxonomy lookup database using formatdb API. 409 * 410 * Revision 6.71 2000/11/28 18:20:40 madden 411 * Comments from Sergei on FDB_options 412 * 413 * Revision 6.70 2000/11/24 15:41:58 shavirin 414 * Added parameter tax_id into function FDBAddBioseq(). 
415 * 416 * Revision 6.69 2000/11/22 19:52:44 shavirin 417 * Added definition of the new function FDGetDeflineAsnFromBioseq() 418 * 419 * Revision 6.68 2000/10/26 18:30:50 dondosha 420 * Added gifile member to ReadDBFILE structure 421 * 422 * Revision 6.67 2000/10/13 17:31:52 shavirin 423 * Adjusted calls to readdb_get_header for ASN.1 structured deflines. 424 * 425 * Revision 6.66 2000/09/29 16:38:30 shavirin 426 * Added new function FDB_FreeCLOptions(FDB_optionsPtr options). 427 * 428 * Revision 6.65 2000/09/16 15:20:17 shavirin 429 * Added AsnIoPtr structure for ASN.1 structured deflines. 430 * 431 * Revision 6.64 2000/09/07 20:49:58 shavirin 432 * Added parameters to support ASN.1 defline dump for blast db. FORMATDB_VER 3->4 433 * Added parameter FORMATDB_VER_TEXT for backward compatibility. 434 * 435 * Revision 6.63 2000/07/18 19:29:29 shavirin 436 * Added new parameter test_non_unique to suppress check for non-unique 437 * strings ids in the database - default - TRUE. 438 * 439 * Revision 6.62 2000/07/07 21:20:08 vakatov 440 * Get all "#include" out of the 'extern "C" { }' scope! 441 * 442 * Revision 6.61 2000/06/28 16:55:50 madden 443 * Add function Fastacmd_Search_ex, gi_target to ReadDBFILEPtr 444 * 445 * Revision 6.60 2000/06/19 20:06:43 madden 446 * Add ready Boolean to readdb_get_sequence_ex, for nucl. sequence the data is then in blastna format with sentinel bytes 447 * 448 * Revision 6.59 2000/05/22 18:46:23 dondosha 449 * Merged all Boolean members in ReadDBFILE structure into a single Int4 450 * 451 * Revision 6.58 2000/05/09 15:54:20 shavirin 452 * Added function ReadDBBioseqSetDbGeneticCode(). 
453 * 454 * Revision 6.57 2000/05/03 16:18:34 dondosha 455 * Added prototype for FastaToBlastDB 456 * 457 * Revision 6.56 2000/03/13 18:36:38 madden 458 * Added insert_ctrlA Boolean to readdb_get_bioseq_ex 459 * 460 * Revision 6.55 2000/03/10 18:52:11 madden 461 * Add prototype for readdb_get_filebits 462 * 463 * Revision 6.54 2000/02/09 19:35:52 madden 464 * Added readdb_MakeGiFileBinary 465 * 466 * Revision 6.53 2000/01/12 21:03:52 egorov 467 * 1. Introduce Fastacmd API function - Fastacmd_Search 468 * 2. Rearrange order of functions to have Fastacmd, ID1, and CommonIndex stuff separate. 469 * 470 * Revision 6.52 2000/01/07 16:00:25 madden 471 * Alias db length is Int8 instead of Uint4 472 * 473 * Revision 6.51 2000/01/03 15:46:16 lewisg 474 * add prototype for readdb_get_num_entries_total_real 475 * 476 * Revision 6.50 1999/12/31 14:23:21 egorov 477 * Add support for using mixture of real and maks database with gi-list files: 478 * 1. Change logic of creating rdfp list. 479 * 2. BlastGetDbChunk gets real databases first, then masks. 480 * 3. Propoper calculation of database sizes using alias files. 481 * 4. Change to CommonIndex to support using of mask databases. 482 * 5. Use correct gis in formated output (BlastGetAllowedGis()). 483 * 6. Other small changes 484 * 485 * Revision 6.49 1999/12/22 20:34:34 dondosha 486 * Add full_filename and shared_info to ReadDBFile structure, plus prototypes of related routines 487 * 488 * Revision 6.48 1999/12/21 20:00:27 egorov 489 * Add new parameter into readdb_gi2seq() 490 * 491 * Revision 6.47 1999/12/17 21:33:01 egorov 492 * Add support for the 'month' subset. 493 * 494 * Revision 6.46 1999/12/15 17:34:32 egorov 495 * 1. Introduce MASK_WORD_SIZE constant variable. 496 * 2. Introduce DI_Record structure for fileld of DI index file. 497 * 3. Introduce UpdateIndexStruct which is used in callback for UpdateCommonIndexFile. 498 * 4. 
Add new field to ReadDbFile structure - aliasfilename, which used 499 * while deciding which gi to use. 500 * 501 * Revision 6.45 1999/11/26 22:06:59 madden 502 * Added READDB_UNPACK_BASE_N macro 503 * 504 * Revision 6.44 1999/11/23 22:02:27 madden 505 * Added readdb_get_totals_ex that may use alias file values 506 * 507 * Revision 6.43 1999/11/23 21:51:24 madden 508 * Changes for freeing OIDlist 509 * 510 * Revision 6.42 1999/11/12 14:16:14 madden 511 * Allow other initialization states in readdb_new_ex2 512 * 513 * Revision 6.41 1999/09/24 18:59:16 egorov 514 * Add functions prototypes 515 * 516 * Revision 6.40 1999/09/23 15:02:53 egorov 517 * Use more descriptive name 518 * 519 * Revision 6.39 1999/09/22 21:50:57 egorov 520 * Add mask DB stuff 521 * 522 * Revision 6.38 1999/09/13 16:18:40 shavirin 523 * Added function readdb_get_bioseq_ex, which has possibility 524 * to bypass ObjMgr registration. 525 * 526 * Revision 6.37 1999/09/10 16:30:18 shavirin 527 * Fixed problems with formating proteins by formatdb 528 * 529 * Revision 6.36 1999/09/09 18:25:05 shavirin 530 * Added functions to parse ASN.1 with formatdb 531 * 532 * Revision 6.35 1999/08/25 20:17:39 shavirin 533 * Added option to create and retrieve from sparse indexes. 534 * 535 * Revision 6.34 1999/08/02 13:33:58 shavirin 536 * Rolled back last changes. 537 * 538 * Revision 6.32 1999/05/27 15:51:29 shavirin 539 * Added function readdb_get_defline () 540 * 541 * Revision 6.31 1999/05/18 20:35:31 madden 542 * Changes to read an alias file for multiple db searches and ordinal ID lists 543 * 544 * Revision 6.30 1999/05/13 19:31:14 shavirin 545 * More changes toward dump from ID. 546 * 547 * Revision 6.29 1999/05/12 15:48:03 shavirin 548 * Changed parameter in function FDBAddSequence(). 549 * 550 * Revision 6.28 1999/05/06 15:25:27 egorov 551 * Remove static function declaration 552 * 553 * Revision 6.27 1999/04/26 14:36:29 shavirin 554 * Added ability to dump statistics. 
555 * 556 * Revision 6.26 1999/04/21 22:55:39 kans 557 * was not checked in 558 * 559 * Revision 6.25 1999/02/22 21:48:03 egorov 560 * Optimize GIs2OIDs not reinitializing ISAM indicies for non-exclisive databases, but use already initialized rdfp's field for that. 561 * 562 * Revision 6.24 1999/02/05 13:47:05 madden 563 * Add basename for formatdb 564 * 565 * Revision 6.23 1998/12/14 21:49:23 egorov 566 * new max gi number memeber in CommonIndexHead structure and therefore no need for COMMON_INDEX_TABLE_SIZE 567 * 568 * Revision 6.22 1998/12/14 16:05:36 egorov 569 * *** empty log message *** 570 * 571 * Revision 6.21 1998/09/14 15:11:19 egorov 572 * Add support for Int8 length databases; remove unused variables 573 * 574 * Revision 6.20 1998/08/27 15:02:37 madden 575 * Added LIBCALL for readdb_get_sequence_ex 576 * 577 * Revision 6.19 1998/08/24 14:59:57 madden 578 * readdb_get_sequence_ex function 579 * 580 * Revision 6.18 1998/08/11 17:49:48 madden 581 * is_na becomes is_aa 582 * 583 * Revision 6.17 1998/07/01 14:03:07 egorov 584 * Fix bug with a thread freeing CommonIndex: add new flag to rdfp 585 * 586 * Revision 6.16 1998/06/26 16:51:15 egorov 587 * Fix CommonIndex bugs 588 * 589 * Revision 6.15 1998/06/24 21:03:40 egorov 590 * Remove memory leaks 591 * 592 * Revision 6.12 1998/05/22 20:19:54 madden 593 * Changes to fix multi-db search bug 594 * 595 * Revision 6.11 1998/02/26 22:34:24 madden 596 * Changes for 16 bit windows 597 * 598 * Revision 6.10 1998/02/11 17:49:38 madden 599 * Added structures and prototypes for formatdb to take ASN.1 as input 600 * 601 * Revision 6.9 1998/01/16 22:03:00 madden 602 * Added init_indices Boolean 603 * 604 * Revision 6.8 1997/11/26 22:48:38 madden 605 * Added readdb_parse_db_names for multiple db searches 606 * 607 * Revision 6.7 1997/11/07 16:16:36 shavirin 608 * Added definition of new function readdb_acc2fastaEx() 609 * 610 * Revision 6.6 1997/10/24 19:08:16 madden 611 * Added ReadDBGetDb and ReadDBGetDbId 612 * 613 * 
Revision 6.5 1997/09/24 22:37:06 madden 614 * Added readdb_destruct_element 615 * 616 * Revision 6.4 1997/09/16 16:31:40 madden 617 * More changes for multiple db runs 618 * 619 * Revision 6.3 1997/09/12 19:55:38 madden 620 * Added readdb_compare 621 * 622 * Revision 6.2 1997/09/11 18:49:40 madden 623 * Changes to enable searches against multiple databases. 624 * 625 * Revision 6.1 1997/08/27 14:46:59 madden 626 * Changes to enable multiple DB searches 627 * 628 * Revision 6.0 1997/08/25 18:53:59 madden 629 * Revision changed to 6.0 630 * 631 * Revision 1.26 1997/05/12 21:34:05 madden 632 * readdb_new allows indeterminate database type 633 * 634 * Revision 1.25 1997/05/12 21:11:42 shavirin 635 * Added definition for function readdb_acc2fasta() 636 * 637 * Revision 1.23 1997/05/07 21:04:02 madden 638 * Added prototype for SeqId2OrdinalId and changed FORMATDB_VER 2->3 639 * 640 * Revision 1.22 1997/05/01 17:26:58 shavirin 641 * Added definition for the function readdb_seqid2fasta() 642 * 643 * Revision 1.21 1997/02/25 22:16:32 shavirin 644 * Changes in accordance to ISAM API changes 645 * 646 * Revision 1.20 1997/02/25 16:28:38 shavirin 647 * Added new entries in ReadDBFILEPtr structure to do search by gi 648 * number. 649 * 650 * Revision 1.19 1996/12/19 16:29:56 madden 651 * Changes to eliminate ".nac" file for nucl. 652 * 653 * Revision 1.18 1996/12/17 21:34:46 madden 654 * Changes to allow deflines for inidividual entries to be retrieved. 655 * 656 * Revision 1.17 1996/12/11 18:42:36 madden 657 * Added prototypes for BioseqFetch functions. 658 * 659 * Revision 1.16 1996/11/27 16:39:11 madden 660 * Added functions to return filename and date. FORMATDB_VER 1->2 661 * 662 * Revision 1.15 1996/11/26 19:54:27 madden 663 * Added check for database in standard places. 664 * 665 * Revision 1.14 1996/11/22 19:05:48 madden 666 * removed ifdef for OLD_BIT_ORDER. 667 * 668 * Revision 1.13 1996/11/08 21:45:03 madden 669 * Removed function readdb_get_partial_unpacked_sequence. 
670 * 671 * Revision 1.12 1996/11/07 22:33:00 madden 672 * Added prototype for readdb_ambchar_present. 673 * 674 * Revision 1.11 1996/11/04 18:50:20 shavirin 675 * Added definitions for ambiguity information pointers 676 * 677 * Revision 1.10 1996/10/31 16:29:55 shavirin 678 * Changed definitions due to reverce of residues in BLAST database 679 * for nucleotide sequences from (4321) to (1234) 680 * New dumper now required to create BLAST databases. 681 * 682 * Revision 1.9 1996/09/27 19:12:17 madden 683 * Added function readdb_get_bioseq to obtain a BioseqPtr from the BLAST databases. 684 * 685 * Revision 1.8 1996/09/26 15:09:21 madden 686 * Corrected misplaced comment. 687 * 688 * Revision 1.7 1996/09/23 14:37:35 madden 689 * Replaced CharPtr (for sequence) with Uint1Ptr. 690 * 691 * Revision 1.6 1996/09/20 21:59:16 madden 692 * *** empty log message *** 693 * 694 * Revision 1.5 1996/09/13 20:01:52 madden 695 * defined READDB_COMPRESSION_RATIO 696 * 697 * Revision 1.4 1996/09/13 18:55:04 madden 698 * Added function readdb_get_partial_unpacked_sequence. 699 * 700 * Revision 1.3 1996/08/29 20:42:01 madden 701 * memory mapping moved to the corelib (in ncbimem.[ch]). 702 * 703 * Revision 1.2 1996/08/07 18:32:05 madden 704 * Moved define of MMAP_AVAIL from readdb.h to readdb.c 705 * 706 * Revision 1.1 1996/08/05 19:48:21 madden 707 * Initial revision 708 * 709 * Revision 1.12 1996/08/02 14:20:06 madden 710 * Added readdb_attach function. 711 * 712 * Revision 1.11 1996/07/31 13:09:17 madden 713 * Changes for partial copy of ReadDB structure. 714 * 715 * Revision 1.10 1996/07/25 20:45:20 madden 716 * Change to arguments of readdb_get_sequence. 717 * 718 * Revision 1.9 1996/07/25 12:56:15 madden 719 * readdb_get_sequence changed to allow for systems w/o mmap. 720 * 721 * Revision 1.8 1996/06/20 17:00:11 madden 722 * Added "__cplusplus" define. 723 * 724 * Revision 1.7 1996/06/20 16:16:36 madden 725 * Replaced int's with Int4's. 
726 * 727 * Revision 1.6 1996/05/16 19:50:15 madden 728 * Added documentation block. 729 * 730 * Revision 1.5 1996/04/22 21:42:07 madden 731 * New prototype for readdb_get_sequence 732 * 733 * Revision 1.4 1996/04/11 14:30:06 madden 734 * Memory-mapping added. 735 * 736 * Revision 1.3 1996/03/29 21:28:30 madden 737 * Added function readdb_get_sequence_length. 738 * 739 * Revision 1.2 1996/03/28 20:42:36 madden 740 * Added functions readdb_get_title, readdb_is_prot and 741 * readdb_get_formatdb_version. 742 * 743 * Revision 1.1 1996/03/26 19:38:08 madden 744 * Initial revision 745 * 746 * 747 */ 748 749 #ifndef _READDB_ 750 #define _READDB_ 751 752 753 /****************************************************************************/ 754 /* INCLUDES */ 755 /****************************************************************************/ 756 757 #include <ncbi.h> 758 #include <objloc.h> 759 #include <sequtil.h> 760 #include <ncbisam.h> 761 #include <tofasta.h> 762 #include <txalign.h> 763 764 /* This define should be added here to pacify NT build */ 765 #ifndef NLM_GENERATED_CODE_PROTO 766 #define NLM_GENERATED_CODE_PROTO 767 #endif 768 769 770 #include <fdlobj.h> 771 772 #ifdef __cplusplus 773 extern "C" { 774 #endif 775 776 /****************************************************************************/ 777 /* Structure of index file header - old version */ 778 /****************************************************************************/ 779 780 /* 781 4 bytes 4 bytes 4 bytes title_len bytes n bytes 782 <version><is_protein?><title_len><the_database_title><date_stamp> 783 784 (title_len+n)%8 bytes 785 <ex_bytes><num_of_seqs><total_len><max_seq_len> 786 787 num_of_seqs*4bytes num_of_seqs*4bytes num_of_seqs*4bytes 788 <defline_offset_table><sequence_offset_table><ambig_offset_table> 789 */ 790 791 /****************************************************************************/ 792 /* DEFINES */ 793 /****************************************************************************/ 
794 795 /* Defines used to retrieve a base out of a packed byte. */ 796 /* x should be unsigned (Uint1) to avoid sign extension problems. */ 797 798 #define READDB_UNPACK_BASE_1(x) ((x)>>6) 799 #define READDB_UNPACK_BASE_2(x) (((x)>>4) & 0x03) 800 #define READDB_UNPACK_BASE_3(x) (((x)>>2) & 0x03) 801 #define READDB_UNPACK_BASE_4(x) ((x) & 0x03) 802 #define READDB_UNPACK_BASE_N(x, N) (((x)>>(2*(N))) & 0x03) 803 804 /* Compress 4 bytes to one. */ 805 #define READDB_COMPRESSION_RATIO 4 806 807 /* Character used to separate deflines from different entries that all 808 belong to the same sequence. */ 809 #define READDB_DEF_SEPARATOR '\001' 810 811 /* Choices for whether it's a protein db or not. */ 812 #define READDB_DB_IS_NUC 0 813 #define READDB_DB_IS_PROT 1 814 #define READDB_DB_UNKNOWN 2 815 816 #define READDB_CONTENTS_ALLOCATED 0x00000001 817 #define READDB_IS_PROT 0x00000002 818 #define READDB_HANDLE_COMMON_INDEX 0x00000004 819 #define READDB_NOT_FIRST_TIME 0x00000008 820 #define READDB_NO_SEQ_FILE 0x00000010 821 #define READDB_KEEP_HDR_AND_SEQ 0x00000020 822 823 /*** Choices for how much to initialize on startup in readdb_new_internal. ***/ 824 825 /* attempt to memory map all files. */ 826 #define READDB_NEW_DO_ALL ((Uint1) (1<<0)) 827 /* Only open the nin or pin files for a database report. */ 828 #define READDB_NEW_DO_REPORT ((Uint1) (1<<1)) 829 /* Only open the nin (or pin) and nsq (or psq) files for a search. */ 830 #define READDB_NEW_DO_SEARCH ((Uint1) (1<<2)) 831 /* Open only index (nin or pin) files for memory mapping */ 832 #define READDB_NEW_INDEX ((Uint1) (1<<3)) 833 /* Same as above and memory map blast taxonomy db files */ 834 #define READDB_NEW_DO_TAXDB ((Uint1) (1<<4)) 835 836 /* The following variables are shared by formatdb and readdb. */ 837 /* version of formatdb. 838 839 Explanations: last text version of defline used for blast database 840 was 3 - all subsequent versions use ASN.1 for defline storage. 
841 For backward compatibility if database version is 3 new program 842 will handle it OK. If database version > 3 - exact match of version 843 is needed to proceed. 844 845 */ 846 847 #define FORMATDB_VER_TEXT 3 848 #define FORMATDB_VER 4 849 850 /* 'Magic' number at the beginning of a binary gi list that indicates it is binary. */ 851 #define READDB_MAGIC_NUMBER UINT4_MAX 852 853 /* Maximum volume size, in bytes */ 854 #define SEQFILE_SIZE_MAX 4000000000UL 855 856 /* Default volume size; 4*10^9 bases, or 1*10^9 residues */ 857 #define SEQFILE_SIZE_DFL 4000000000UL 858 859 /****************************************************************************/ 860 /* TYPEDEFS */ 861 /****************************************************************************/ 862 863 typedef struct nlm_mfile { 864 Nlm_MemMapPtr mem_mapp; /* structure containing mem-map info, 865 produced by Nlm_MemMapInit. */ 866 FILE PNTR fp; /* FILE pointer. */ 867 Uint1Ptr mmp_begin, /* beginning of mmap'ed are. */ 868 mmp, /* present position of mmap'ed pointer. */ 869 mmp_end; /* end of mmap'ed area. */ 870 Int4 file_size; /* size of file that is mmap'ed. */ 871 Boolean mfile_true; /* If TRUE then mmap succeeded. */ 872 Boolean contents_allocated; /* If TRUE, the contents have been allocated 873 and are not merely a copy. */ 874 Uint1Ptr mmp_madvise_end; /* madvise() file offset */ 875 } NlmMFILE, PNTR NlmMFILEPtr; 876 877 /* 878 Open the file and initialze the memory mapping. 879 */ 880 NlmMFILEPtr LIBCALL NlmOpenMFILE PROTO((CharPtr name)); 881 882 /* 883 Undo the memory mapping. 884 */ 885 NlmMFILEPtr LIBCALL NlmCloseMFILE PROTO((NlmMFILEPtr mfp)); 886 887 /* 888 Read "nitems" of size "size" from a memory mapped file into "buffer" 889 usig the memory-mapped file given by "mfp". 890 */ 891 Int4 LIBCALL NlmReadMFILE PROTO((Uint1Ptr buffer, size_t size, Int4 nitems, NlmMFILEPtr mfp)); 892 893 /* 894 "fseek" to a point in the memory mapped file. 
*/
Int4 LIBCALL NlmSeekInMFILE PROTO((NlmMFILEPtr mfp, long offset, Int4 ptrname));

/*
   Returns the current offset (in bytes) from the beginning of the file.
   Analog to ftell.
*/
Int4 LIBCALL NlmTellMFILE PROTO((NlmMFILEPtr mfp));

/* Generic 4-byte integer list */
typedef struct _gilist {
    Int4 count, allocated; /* presumably elements in use / elements
                              allocated - confirm in readdb.c */
    Int4Ptr i;             /* the element array (indexed by the value that
                              Int4ListBSearch returns) */
} Int4List, *Int4ListPtr;

/* Creates a new list of 4-byte integers */
Int4ListPtr LIBCALL
Int4ListNew PROTO((void));

/* Creates a new list of 4-byte integers of size s */
Int4ListPtr LIBCALL
Int4ListNewEx PROTO((Int4 s));

/* Deallocates the list of 4-byte integers */
Int4ListPtr LIBCALL
Int4ListFree PROTO((Int4ListPtr lp));

/* Reads a list of newline separated 4-byte integers.
 * Caller is responsible for deallocating the return value */
Int4ListPtr LIBCALL
Int4ListReadFromFile PROTO((CharPtr filename));

/* Appends i to the end of the list, reallocating memory if necessary. Returns
 * FALSE if it cannot allocate more memory */
Boolean LIBCALL
Int4ListAdd PROTO((Int4ListPtr lp, Int4 i));

/* Returns the concatenation of list1 and list2, freeing both parameters. It
 * returns NULL if both lists are empty or if it cannot allocate more memory */
Int4ListPtr LIBCALL
Int4ListConcat PROTO((Int4ListPtr *list1, Int4ListPtr *list2));

/* Attempts to reallocate the list to hold new_size elements. Returns NULL on
 * incorrect arguments or if it cannot allocate more memory */
Int4ListPtr LIBCALL
Int4ListResize PROTO((Int4ListPtr listp, Int4 new_size));

/* Performs a binary search for key on lp.
   Returns the index into lp->i where key is located or -1 if key is not found
*/
Int4 LIBCALL
Int4ListBSearch PROTO((Int4ListPtr lp, Int4 key));

/* Sorts the list in ascending order and removes repeated entries */
Int4ListPtr LIBCALL
Int4ListMakeUnique PROTO((Int4ListPtr list));

/* Returns the ascending sorted intersection of list1 and list2, freeing
 * both parameters */
Int4ListPtr LIBCALL
Int4ListIntersect PROTO((Int4ListPtr *list1, Int4ListPtr *list2));


/*
   Common index structures
*/

#define COMMONINDEX_FN "comindex.mm"
#define DB_CONFIG_FN "dblist.txt"

typedef struct CommonIndex{
    Int4 dbmask;   /* mask to define which db contains the GI */
    Int4 oftenOID; /* ordinal ID for the GI in most often DB */
} CommonIndex, *CommonIndexPtr;

typedef struct CommonIndexResult {
    Int4 gi;                        /* GI */
    Int4 oid;                       /* OID */
    Int2 dbid;                      /* database ID */
    struct CommonIndexResult *next; /* make a list */
} CommonIndexResult, *CommonIndexResultPtr;

/* Data bases */

typedef struct DataBaseID {
    CharPtr name;   /* database name like gss, nr, etc */
    Char id;        /* integer ID, value from 0, to 32, used for bitmasks */
    Boolean isprot; /* says TRUE if database contains proteins, FALSE otherwise */
} DataBaseID, *DataBaseIDPtr;

typedef struct CommonIndexHead {
    CommonIndexPtr ci;
    Nlm_MemMapPtr memmap;
    Int2 num_of_DBs;
    DataBaseIDPtr dbids;
    Int4 maxgi; /* maximum GI number permitted */
} CommonIndexHead, *CommonIndexHeadPtr;

typedef struct OIDList {
    CharPtr filename; /* name of the file containing OID list */
    Uint4Ptr list;    /* array of OID's */
    Uint4Ptr memory;  /* memory to keep the OID's (element list).
                         if this is NULL, then list is memory mapped. */
    Int4 total;       /* number of elements in the array */
    NlmMFILEPtr mfp;  /* Used for memory-mapped file.
                       */
} OIDList, *OIDListPtr;

OIDListPtr OIDListFree (OIDListPtr oidlist);

typedef struct read_db_shared_info {
    Int2 nthreads;
    NlmMFILEPtr headerfp, sequencefp;

    /* This is the ordinal id of the last chunk assigned to a thread when
     * iterating over a database via the BlastSeqSrc interface with multiple
     * threads. It should not be used in other contexts. It is analogous to
     * the db_chunk_last field of the BlastThrInfo structure.
     * Please note that in case of a linked list of ReadDBFILE structures, only
     * the first shared_info->last_oid_assigned field is significant when
     * performing an iteration with multiple threads.
     */
    Uint4 last_oid_assigned;
} ReadDBSharedInfo, *ReadDBSharedInfoPtr;

/* ---------------------------------------------------------------------*/
/* -- Here is the set of definitions used with taxonomy info database -- */
/* ---------------------------------------------------------------------*/

/* The following #define allows for the creation of taxonomy databases along
 * with the blast databases. Please note that the code to create the blast
 * databases is NOT thread-safe!
 */
/*#define FDB_TAXONOMYDB*/

/* One entry of the taxonomy index: a tax_id and its file offset. */
typedef struct _RDBTaxId {
    Uint4 taxid;
    Uint4 offset;
} RDBTaxId, PNTR RDBTaxIdPtr;

typedef struct _RDBTaxInfo {
    Int4 all_taxid_count;  /* Total number of taxids in the database */
    Int4 reserved[4];      /* reserved */
    NlmMFILEPtr taxfp;     /* Memory mapped index file */
    RDBTaxIdPtr taxdata;   /* Index tax_id/file offset */
    Boolean taxdata_alloc; /* true if taxdata was allocated */
    NlmMFILEPtr name_fd;   /* Pointer to the file with taxonomy names */
    Boolean taxinfo_alloc; /* Flag to determine structure ptr ownership */
} RDBTaxInfo, *RDBTaxInfoPtr;

typedef struct _RDBTaxLookup {
    Int4 all_taxid_count;      /* Total number of taxids in the database */
    Int4 taxids_in_db;
    RDBTaxNamesPtr *tax_array; /* This array's index corresponds to tax_id and
                                  the value of the cell corresponds to tax
                                  names, if any */
    VoidPtr tax_data;          /* This data may be set and used by the callback */
} RDBTaxLookup, *RDBTaxLookupPtr;

/* Callback type stored in FDB_options.tax_callback. */
typedef Boolean (*TaxCallbackFunc) (RDBTaxLookupPtr tax_lookup, Int4 tax_id);


/*
 * sequence info record (SI_Record):
 * > contains information about given gi
 * > most of it will be dumped to *[np]di files
 * > form a linked list for identical gis
 * > used for transferring data into AddSequence interface
 *
 * Contribution from Michael Kimelman/Olga Cherenkov from
 * NCBI's ID1 group.
 */

typedef struct si_record {
    struct si_record PNTR next;
    Int4 gi;
    char seqid[256]; /* seqid in FASTA format */
    char* title;     /* defline */
    Int4 taxid;
    Int4 owner;
    char div[4];
    Int4 ent;        /* entity (sat_key) */
    Uint1 mol;       /* Molecule type, as in Seq-inst::mol */
} SI_Record, PNTR SI_RecordPtr;

/** Allocates a single node in the SI_Record linked list structure */
SI_Record* SI_RecordNew(void);
/** Deallocates the linked list of SI_Record structures in srp
 * @return NULL
 */
SI_Record* SI_RecordFree(SI_Record* srp);

/* ----
   Here are functions for run-time blast in relation to the
   Taxonomy blast database
   ---- */

#define TAXDB_ON_FTP "ftp://ftp.ncbi.nih.gov/blast/db/taxdb.tar.gz"
#define BLAST_TAXDB_FILENAME "taxdb"

/* Initialize taxonomy lookup database. Returns NULL on failure or if
   this database does not exist */
RDBTaxInfoPtr RDBTaxInfoInit(void);

/* Free memory, unmap files etc. related to the taxonomy database */
void RDBTaxInfoClose(RDBTaxInfoPtr tip);

/* Main function to get taxonomy names for given tax_id from
   blast taxonomy database. Returns NULL if tax_id is not in the database */
RDBTaxNamesPtr RDBGetTaxNames(RDBTaxInfoPtr tip, Int4 tax_id);

#define TAX_DB_MAGIC_NUMBER 0x8739

typedef struct read_db_file {
    struct read_db_file PNTR next;
    Int4 parameters; /* All boolean parameters */
    /* Bits: 0 - contents allocated
             1 - is protein
             2 - handle common index
             3 - not first time
             4 - do not open sequence files
             5 - do not close header and sequence files in readdb_get_link
       NOTE(review): these positions line up with the READDB_CONTENTS_ALLOCATED
       (0x01) ... READDB_KEEP_HDR_AND_SEQ (0x20) masks defined earlier in this
       header - confirm against readdb.c.
    */
    /* 0: Are contents of this struct allocated, or not? Does NOT include
          the actual structure and buffer, below. */
    /* 1: If TRUE, sequence is protein, otherwise dna.
     */
    /* 2: TRUE only for the initial thread; needed for proper freeing of the CommonIndex */
    /* 3: For recursive calls to readdb_new_ex2. */
    CharPtr filename;      /* name of the input (w/o extensions). */
    CharPtr aliasfilename; /* name of the alias of input */
    /* The file pointers for "file" (above), the index file, the file
       containing the headers, and the sequence file. */
    NlmMFILEPtr indexfp, headerfp, sequencefp;
    Int4 header_index_offset; /* offset to beginning of header index in indexfp. */
    CharPtr title, /* Database Title. */
            date;  /* Date and time database was prepared. */
    Int4 num_seqs,     /* Number of sequences in the database. */
         formatdb_ver; /* Version of formatdb used. */
    BlastDefLinePtr blast_deflinep; /* when not NULL, points to the first defline of the seq */
    Int4 start, /* 1st ordinal id in this file. */
         stop;  /* last ordinal id in this file. */
    Int8 totlen;       /* Total length of database. */
    Int8 totlen_stats; /* Total length of database used for expect value and search space. */
    Uint4 maxlen;      /* Length of longest sequence in database. */
    Int8 aliaslen;     /* Length of the database as read from alias file */
    Uint4 aliasnseq;   /* Number of seqs of the database as read from alias file */
    Uint4 nseq_stats;  /* Number of seqs to be used for search space and expect value. */
    /* The "index" arrays specify the offsets (in files) of the header and
       sequence information. */
    Uint4Ptr header_index, sequence_index, ambchar_index;
    Uint4Ptr header_index_start, sequence_index_start, ambchar_index_start;
    /* Buffer and allocated amount of this buffer (the "buffer" and
       "allocated_length" members declared a few lines below). These should
       always be NULL (i.e., NOT USED) if mem-mapping is used; only used to
       store sequence if there is no mem-mapping or it failed.
     */

    ISAMObjectPtr nisam_opt; /* Object for numeric search */
    ISAMObjectPtr sisam_opt; /* Object for string search */
    ISAMObjectPtr isam_pig;  /* Object for PIG search */
    RDBTaxInfoPtr taxinfo;   /* This object if not NULL - pointer to
                                the taxonomy names database */
    Uint1Ptr buffer;
    Int4 allocated_length;
    CommonIndexHeadPtr cih;  /* head of the common index */
    Int2 filebit;            /* bit corresponding to the DB file */
    Int2 aliasfilebit;       /* bit corresponding to the DB alias file */
    OIDListPtr oidlist;      /* structure containing a list of ordinal ID's. */
    Int4 membership_bit;     /* membership bit read from .[pn]al file for structured asn deflines */
    Int4 sparse_idx;         /* Sparse indexes indicator */
    Char full_filename[PATH_MAX]; /* Full path for the file */
    ReadDBSharedInfoPtr shared_info;
    Int4 gi_target;          /* only this gi should be retrieved */
                             /* if non-zero. */
    CharPtr gifile;          /* Path to a file with the gi list, should
                                always be NULL after readdb_new* calls */
    Int4ListPtr gilist;      /* storage for the above file in memory */
    Int4 preferred_gi;       /* this gi should be listed first */
                             /* in the bioseq if non-zero */
    Int4 last_preloaded;     /* starting ordinal id of the last preloaded file block */
} ReadDBFILE, PNTR ReadDBFILEPtr;

/* Function prototypes */
Int4 GI2OID(CommonIndexHeadPtr cih, Int4 gi, Int4 dbmask, Int4 alias_dbmask,
        Int2Ptr dbid, Int2Ptr alias_dbid, ReadDBFILEPtr rdfp);
Int2 DBShift(Int2 num_of_DBs, DataBaseIDPtr dbids, CharPtr dbname, Boolean is_prot);
CharPtr DBName(Int2 num_of_DBs, DataBaseIDPtr dbids, Int2 shift);
Boolean DBisProt(Int2 num_of_DBs, DataBaseIDPtr dbids, Int2 shift);
CommonIndexResultPtr GIs2OIDs(CommonIndexHeadPtr cih,
        Int4Ptr gis, Int4 number_of_gis, Int4 dbshift, ReadDBFILEPtr rdfp);
Int2 SeniorBit(Int4 bitmask);
CommonIndexHeadPtr CommonIndexInit(CharPtr indexfilename);
void
CommonIndexDestruct(CommonIndexHeadPtr cihp);
Int2 bit_engine_firstbit (Int4 word);
Int2Ptr bit_engine_arr(Int4 word);
Int2 bit_engine_numofbits(Int4 word);
Int2 ParseDBConfigFile(DataBaseIDPtr *dbidsp, CharPtr path);
CharPtr FindBlastDBFile (CharPtr filename);
CharPtr FindDBbyGI(CommonIndexHeadPtr cih, Int4 gi, Uint1 *is_prot);
RDBTaxNamesPtr LIBCALL readdb_get_taxnames PROTO((
        ReadDBFILEPtr rdfp, Int4 tax_id));

/* mmap's */

NLM_EXTERN Nlm_MemMapPtr EA_MemMapInit(const Nlm_Char PNTR name, Boolean readonly);

/****************************************************************************/
/* FUNCTION DEFINITIONS */
/****************************************************************************/
/* Deallocate the memory mapping of header and sequence files */
ReadDBFILEPtr ReadDBCloseMHdrAndSeqFiles PROTO((ReadDBFILEPtr rdfp));

/*
   Initialize the readdb structure using the database "filename".
   If no database is used, set filename to NULL.
*/
ReadDBFILEPtr LIBCALL readdb_new PROTO((CharPtr filename, Uint1 is_prot));

/*
   init_indices should be TRUE if entire database is to be searched, otherwise
   it can be FALSE.
*/
ReadDBFILEPtr LIBCALL readdb_new_ex PROTO((CharPtr filename, Uint1 is_prot, Boolean init_indices));

/*
 * Initializes the blast database specified in the argument list.
 * filename: blast database to initialize
 * is_prot: is this database protein ?
 * init_state: bitwise-OR of the READDB_NEW_* values (selectively mmap certain
 *             files)
 * oidlist: Path to the ordinal id list to use (this is mmap'd)
 * gilist: Path to the gi list to use (this is not resolved until the search
 *         is conducted (see BlastProcessGiLists))
 */
ReadDBFILEPtr LIBCALL readdb_new_ex2 PROTO((CharPtr filename, Uint1 is_prot,
        Uint1 init_state, CharPtr oidlist, CharPtr gilist));


/*
   Deallocate the ReadDBFILEPtr.
*/
ReadDBFILEPtr LIBCALL readdb_destruct PROTO((ReadDBFILEPtr readdb));

ReadDBFILEPtr LIBCALL readdb_destruct_element PROTO((ReadDBFILEPtr rdfp));


/*
   Attach to an already open ReadDBFILEPtr. Duplicate the
   indexfp, sequencefp, and headerfp structures as the pointers
   there (i.e., mmp) will need to be manipulated. Do not
   change the FILE PNTR fp.
*/
ReadDBFILEPtr LIBCALL readdb_attach PROTO((ReadDBFILEPtr rdfp));

/*
   Checks whether a ReadDBFILEPtr is the original, or just attached.
   It does this by checking the rdfp->contents_allocated flag.
   NOTE(review): ReadDBFILE has no separate contents_allocated member; this
   presumably refers to the READDB_CONTENTS_ALLOCATED bit of
   rdfp->parameters - confirm in readdb.c.
*/
Boolean LIBCALL readdb_copy PROTO((ReadDBFILEPtr rdfp));

/*
   Checks two ReadDBFILEPtr to see if they refer to the same
   database.
*/
Boolean LIBCALL readdb_compare PROTO((ReadDBFILEPtr rdfp1, ReadDBFILEPtr rdfp2));


/*
   Get total length and number of sequences in multiple databases.
*/

Boolean LIBCALL readdb_get_totals PROTO((ReadDBFILEPtr rdfp_list, Int8Ptr total_len, Int4Ptr total_num));

/*
   Get total length and number of sequences in multiple databases.
   if 'use_alias' is TRUE, values from the alias file will be used
   if non-zero.
*/

Boolean LIBCALL
readdb_get_totals_ex PROTO((ReadDBFILEPtr rdfp_list, Int8Ptr total_len, Int4Ptr total_num, Boolean use_alias));

/* Retrieves the total number of sequences and database length in the
 * rdfp_list. use_alias and use_virtual_oidlist are mutually exclusive
 * options (both of them cannot be true at the same time). If
 * use_virtual_oidlist is TRUE, this function assumes that this rdfp_list has
 * been processed by BlastProcessGiLists */
Boolean LIBCALL
readdb_get_totals_ex2 PROTO ((ReadDBFILEPtr rdfp_list, Int8Ptr dblen,
        Int4Ptr nseq, Boolean use_alias, Boolean use_virtual_oidlist));

/* Enumerated type to determine if the database length (number of
 * bases/residues) should be approximated or calculated exactly by
 * readdb_get_totals_ex3 */
typedef enum {
    eExact,
    eApproximate
} EAccountingMode;

/* This function is identical to readdb_get_totals_ex2 but it uses its last
 * argument to determine if in the case of nucleotide databases the exact
 * database length is required. If eExact is used, the exact database size is
 * calculated, if eApproximate is used, an approximation is returned. This is
 * done to avoid having to touch every last byte of each sequence to determine
 * the exact length of the database when it is restricted by a virtual oidlist.
 * The EAccountingMode argument is irrelevant for protein databases, where this
 * function always returns the exact database length. Same assumption about
 * BlastProcessGiLists as in readdb_get_totals_ex2 applies.
 */
Boolean LIBCALL
readdb_get_totals_ex3 PROTO ((ReadDBFILEPtr rdfp_list, Int8Ptr dblen,
        Int4Ptr nseq, Boolean use_alias, Boolean use_virtual_oidlist,
        EAccountingMode acc_mode));

/*
   Gets the numbers to be used for statistical purposes. Should be set in
   alias file as STATS_NSEQ and STATS_TOTLEN.
*/
Boolean LIBCALL
readdb_get_stats_numbers(ReadDBFILEPtr rdfp_list, Int4* num_seqs_stats, Int8* tot_len_stats);

/*
   Get the sequence with sequence_number and put it in buffer. No memory
   is allocated for this if memory-mapped files are used, otherwise it is.
   Return the length of the sequence.
*/
Int4 LIBCALL readdb_get_sequence PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number, Uint1Ptr PNTR buffer));

/*
   Gets the sequence number "sequence_number". The sequence returned includes
   all ambiguity information. This function should only be used for nucleic
   acid sequences, for proteins use readdb_get_sequence.

   buffer contains the sequence and is reallocated if *buffer_length is not long enough.

   The length of the sequence requested is the return value.
   protein sequences are always returned as Seq_code_ncbistdaa,
   nucleotide sequences as Seq_code_ncbi4na.
*/
Int4 LIBCALL readdb_get_sequence_ex PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number, Uint1Ptr PNTR buffer, Int4 *buffer_length, Boolean ready));

/* Gets sequence number by gi number. Returns -1 if gi not found or
   other negative value if NISAM library faults. Non-negative value
   means success. Use numeric ISAM indexes.
*/
Int4 LIBCALL readdb_gi2seq(ReadDBFILEPtr rdfp, Int4 gi, Int4Ptr start);

/* Gets sequence number by SeqId. Returns -1 if the SeqId is not found or
   other negative value if SISAM library faults. Non-negative value
   means success. Use string ISAM indexes.
*/
Int4 LIBCALL readdb_seqid2fasta(ReadDBFILEPtr rdfp, SeqIdPtr sip);

/* Gets sequence number by Accession/Locus string. Returns -1
   if accession not found or
   other negative value if SISAM library faults. Non-negative value
   means success. Use string ISAM indexes.
*/
Int4 LIBCALL readdb_acc2fasta(ReadDBFILEPtr rdfp, CharPtr string);

/* Gets array of sequence numbers by Accession/Locus string. Returns -1
   if accession not found or
   other negative value if SISAM library faults. Non-negative value
   means success. Use string ISAM indexes.
*/
Int4 LIBCALL readdb_acc2fastaEx(ReadDBFILEPtr rdfp, CharPtr string,
        Int4Ptr PNTR ids, Int4Ptr count);

/*
   Gets a BioseqPtr containing the sequence in sequence_number.
*/
BioseqPtr LIBCALL readdb_get_bioseq PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number));
BioseqPtr LIBCALL readdb_get_bioseq_ex PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number, Boolean use_objmgr, Boolean insert_ctrlA));

/*
   Gets the exact sequence length for protein sequences, but for nucleotide
   sequences it gets the length of the sequence +/- at most 3 bases (last byte
   is not examined, therefore the return value is an approximation).
*/
Int4 LIBCALL readdb_get_sequence_length_approx PROTO((ReadDBFILEPtr rdfp,
        Int4 sequence_number));

/*
   Get the length of the sequence.
*/
Int4 LIBCALL readdb_get_sequence_length PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number));

/*
   Get the ID and definition for the sequence with sequence_number.
   It is the caller's RESPONSIBILITY to DEALLOCATE "id" and "description".
*/
Boolean LIBCALL readdb_get_descriptor PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number, SeqIdPtr PNTR id, CharPtr PNTR description));
Boolean
readdb_get_defline (ReadDBFILEPtr rdfp, Int4 sequence_number, CharPtr PNTR description);

/*
   Get the ID's and headers for a sequence.
*/
Boolean LIBCALL
readdb_get_header PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number, Uint4Ptr header_index , SeqIdPtr PNTR id, CharPtr PNTR description));

/*
   Get the ID's, headers, taxid, memberships, and links for a sequence.
   Returns FALSE if the sequence_number is not applicable in the context of the
   database in rdfp (i.e.: masked databases), otherwise it will return TRUE until
   there are no more sequences associated with this sequence_number (then it
   returns FALSE).
*/
Boolean LIBCALL
readdb_get_header_ex PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number,
        Uint4Ptr header_index, SeqIdPtr PNTR id,
        CharPtr PNTR description, Int4 PNTR taxid,
        ValNodePtr PNTR memberships, ValNodePtr PNTR links));

/*
   Get the pointer (Uint4Ptr) to the ambiguity buffer
*/
Boolean LIBCALL readdb_get_ambchar PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number, Uint4Ptr PNTR ambchar_return));

/*
   Check whether ambiguity characters are present in the sequence.
*/
Boolean LIBCALL readdb_ambchar_present PROTO((ReadDBFILEPtr rdfp, Int4 sequence_number));

/*
   Get the total length (in bp or residues) of the database.
*/
Int8 LIBCALL readdb_get_dblen PROTO((ReadDBFILEPtr rdfp));

/*
   Get the number of entries in the database.
*/
Int4 LIBCALL readdb_get_num_entries PROTO((ReadDBFILEPtr rdfp));

/*
   Get the total number of entries in all the files.
*/
Int4 LIBCALL readdb_get_num_entries_total PROTO((ReadDBFILEPtr rdfp));

/*
   Obtains the total number of real database sequences from all the ReadDBFILE structures.
*/

Int4 LIBCALL readdb_get_num_entries_total_real PROTO((ReadDBFILEPtr rdfp));

/* Check whether an OID is actually in the database according to the mask file. */
Boolean readdb_check_oid(ReadDBFILEPtr rdfp_head, Int4 oid);

/*
   Get the length of the longest sequence in the database.
*/
Int4 LIBCALL readdb_get_maxlen PROTO((ReadDBFILEPtr rdfp));

/*
   Get the title (i.e., name) of the database.
   NOTE: the CharPtr returned is not owned by the caller!
1447 */ 1448 CharPtr LIBCALL readdb_get_title PROTO((ReadDBFILEPtr rdfp)); 1449 1450 /* 1451 Get the name of the file used for formatting. 1452 NOTE: the CharPtr returned is not owned by the caller! 1453 */ 1454 CharPtr LIBCALL readdb_get_filename PROTO((ReadDBFILEPtr rdfp)); 1455 1456 /* For use by the seq-src to get the alias file name with full path. */ 1457 CharPtr LIBCALL readdb_get_full_filename PROTO((ReadDBFILEPtr rdfp)); 1458 1459 /* 1460 Get the date the database was formatted. 1461 NOTE: the CharPtr returned is not owned by the caller! 1462 */ 1463 CharPtr LIBCALL readdb_get_date PROTO((ReadDBFILEPtr rdfp)); 1464 1465 /* 1466 Is this a protein database? 1467 */ 1468 Boolean LIBCALL readdb_is_prot PROTO((ReadDBFILEPtr rdfp)); 1469 1470 /* 1471 Parses the databases names (if more than one) from 1472 'filenames' into buffer. buffer should already be 1473 long enough and allocated. The funciton should be 1474 repeatedly called until TRUE is returned. 1475 */ 1476 Boolean LIBCALL readdb_parse_db_names PROTO((CharPtr PNTR filenames, CharPtr buffer)); 1477 1478 /* 1479 Get the version of formatdb used on this database. 1480 */ 1481 Int4 LIBCALL readdb_get_formatdb_version PROTO((ReadDBFILEPtr rdfp)); 1482 1483 /* 1484 returns the 'filebits' associated with a certain ordinal number. 1485 This is done by going to the rdfp for that ordinal id and 1486 gathering the filebits. 1487 */ 1488 Boolean LIBCALL readdb_get_filebits PROTO((ReadDBFILEPtr rdfp, Int4 ordinal_id, Uint2Ptr filebit, Uint2Ptr aliasfilebit)); 1489 1490 /* Possible return values for readdb_validate */ 1491 1492 #define READDB_VALID 0 1493 #define READDB_INVALID_NULL_ARG -1 1494 #define READDB_INVALID_MIXED_DBS -2 1495 1496 /* Validate the linked list of rdfp structures passed as an argument to this 1497 * function. 
 * Returns one of the READDB_* codes listed under "Possible return values for
 * readdb_validate": READDB_INVALID_NULL_ARG if the argument is NULL,
 * READDB_INVALID_MIXED_DBS for an invalid list (presumably one that mixes
 * protein and nucleotide databases - confirm in readdb.c), otherwise
 * READDB_VALID.
 */
Int4 LIBCALL readdb_validate PROTO((ReadDBFILEPtr rdfp));

/** Calculate a hash value for a given sequence data
 * @param sequence containing sequence data (must not be NULL) [in]
 * @param sequence_length length of the buffer above populated with data [in]
 */
Uint4 readdb_sequence_hash(const char* sequence, int sequence_length);

/* For the BioseqFetch functions. */

Boolean LIBCALL ReadDBBioseqFetchEnable PROTO((CharPtr program, CharPtr dbname, Boolean is_na, Boolean now));

Boolean LIBCALL ReadDBBioseqSetDbGeneticCode PROTO((Int4 db_genetic_code));

void LIBCALL ReadDBBioseqFetchDisable PROTO((void));

/* Converts a SeqIdPtr to an ordinal_id, which readdb can use to look
   up sequences etc. Negative numbers are returned if the SeqIdPtr
   cannot be converted. */
Int4 SeqId2OrdinalId PROTO((ReadDBFILEPtr rdfp, SeqIdPtr sip));

/*
   Returns the ReadDBFILEPtr by the database ID.
*/
ReadDBFILEPtr ReadDBGetDb PROTO((ReadDBFILEPtr list, Int2 db_id));

/*
   Returns the Database ID.
*/
Int2 ReadDBGetDbId PROTO((ReadDBFILEPtr list, ReadDBFILEPtr target));


/********************/
/* formatdb */

/* Type definitions */

typedef struct FASTALookup {
    Int4Ptr table;  /* Main buffer for gi/fasta_id pairs */
    Int4 allocated; /* Number of Uint4 allocated */
    Int4 used;      /* Number of Uint4 used */
} FASTALookup, PNTR FASTALookupPtr;

/* Structure that holds the link information as read from the file */
typedef struct _linkinfo {
    Int4 bit_number;     /* indicates the position in links bit array */
    Int4ListPtr gi_list; /* update links bit array for gis in this list */
} LinkInfo, *LinkInfoPtr;

/* Structure that holds the membership information */
typedef Boolean (*GMCriteriaFunc) (VoidPtr direc);

typedef struct _membinfo {
    Int4 bit_number;         /* indicates the position in the membership bit array */
    GMCriteriaFunc criteria; /* function pointer that is invoked to
                                determine whether a certain sequence
                                belongs to the membership represented by
                                this bit_number */
} MembInfo, *MembInfoPtr;

/* Options to clean up blast database files from a previous instance of the
 * database with the same name as the one about to be created. This has been
 * added to prevent the case in which an alias file might have precedence over
 * a single-volume blast database.
*/ 1565 typedef enum EFDBCleanOpt { 1566 eCleanNever = 0, /* don't remove older files of the db to be created, 1567 just overwrite them or ignore alias files */ 1568 eCleanAlways, /* clean up all older files of the db to be created */ 1569 eCleanPrompt, /* Assumes interactive program */ 1570 eCleanOptMax 1571 } EFDBCleanOpt; 1572 1573 /*** PIG (Protein Identifier Group) interface ***/ 1574 1575 #define PIG_NONE -1 /* No protein identifier group */ 1576 1577 /* PIG table structure 1578 * From this information the formatdb API creates a pair of ISAM files to map 1579 * PIGs to ordinal ids */ 1580 typedef struct FDBPigTable { 1581 Int4Ptr pop; /* list of pig/ordinal id pairs */ 1582 Int4 count, allocated; /* keep track of table size */ 1583 } FDBPigTable, * FDBPigTablePtr; 1584 1585 /* Allocate a PIG table structure */ 1586 FDBPigTablePtr LIBCALL 1587 FDBPigTableNew PROTO((void)); 1588 1589 /* Deallocate a PIG table structure */ 1590 FDBPigTablePtr LIBCALL 1591 FDBPigTableFree PROTO((FDBPigTablePtr fptp)); 1592 1593 /* Add a PIG to the PIG table structure, return FALSE on error */ 1594 Boolean LIBCALL 1595 FDBAddPig PROTO((FDBPigTablePtr fptp, Int4 pig, Int4 oid)); 1596 1597 /* Retrieve the PIG for a given ordinal id */ 1598 Int4 LIBCALL 1599 readdb_get_pig PROTO((ReadDBFILEPtr rdfp, Int4 oid)); 1600 1601 /* Retrieve the ordinal id corresponding to a given PIG (analogous to 1602 * readdb_gi2seq) */ 1603 Int4 LIBCALL 1604 readdb_pig2oid PROTO((ReadDBFILEPtr rdfp, Int4 pig, Int4Ptr start)); 1605 1606 /************************************************/ 1607 /*** TaxidDeflineTable interface ***/ 1608 1609 /* forward declaration of main structure */ 1610 typedef struct FDBTaxidDeflineTable FDBTaxidDeflineTable; 1611 typedef struct FDBTaxidDeflineTable* FDBTaxidDeflineTablePtr; 1612 1613 /** Allocate a TaxidDefline table structure from a file 1614 * It attempts to read a list of gi/taxid pairs first, then a list of seqid 1615 * strings/taxid pairs. 
 */
FDBTaxidDeflineTablePtr LIBCALL
FDBTaxidDeflineTableNew PROTO((const Char* filename));

/** Deallocate a TaxidDefline table structure */
FDBTaxidDeflineTablePtr LIBCALL
FDBTaxidDeflineTableFree PROTO((FDBTaxidDeflineTablePtr taxid_tbl));

extern const Int4 kTaxidDeflineSearch_NotFound;

/** Searches the gi provided as argument in the taxid_tbl argument. If not
 * found it returns kTaxidDeflineSearch_NotFound, otherwise it returns the
 * taxonomy id.
 * Note: "const" applied to the pointer typedef qualifies the pointer itself,
 * not the table it points to. */
Int4 LIBCALL
FDBTaxidDeflineTableSearchGi PROTO((const FDBTaxidDeflineTablePtr taxid_tbl,
        Int4 gi));

/** Searches the seqid provided as argument in the taxid_tbl argument. If not
 * found it returns kTaxidDeflineSearch_NotFound, otherwise it returns the
 * taxonomy id */
Int4 LIBCALL
FDBTaxidDeflineTableSearchSeqid PROTO((const FDBTaxidDeflineTablePtr taxid_tbl,
        const Char* seqid));

/************************************************/

typedef struct _FDB_options {
    Int4 version;       /* Version of the database created by formatdb program
                           currently supported are 3 - FORMATDB_VER_TEXT and
                           4 - FORMATDB_VER - for ASN.1 structured deflines */
    CharPtr db_title;   /* Title for the database to be created */
    CharPtr db_file;    /* Name for input data file - 'IN' name */
    Int4 is_protein;    /* Is this protein database ? */
    Int4 parse_mode;    /* Do we assume that deflines start with
                           valid SeqIds ? */
    Int4 isASN;         /* read from file or ASN - used only in formatdb.c */
    Int4 asnbin;        /* What is this type of ASN? used only
                           in formatdb.c */
    Int4 is_seqentry;   /* What is this type of ASN? used only
                           in formatdb.c */
    CharPtr base_name;  /* Name for db files to be created 'OUT' name */
    CharPtr alias_file_name; /* name to be used for BLAST alias-file. */
    Int4 dump_info;     /* To printout file with information about tax_id,
                           owner, hash etc.
                           - used for dump from ID */

    Int4 sparse_idx;    /* To use only limited set of text ids to dump for
                           usage in indexes */
    Int4 test_non_unique; /* Print messages if FASTA database has
                             non-unique string ids - accessions, locuses */

    RDBTaxLookupPtr tax_lookup; /* taxonomy lookup table - should be initialized
                                   in the main program to be used for creating
                                   of taxonomy information */

    TaxCallbackFunc tax_callback; /* Function to retrieve taxonomy names from
                                     Taxonomy server */
    Int8 bases_in_volume; /* The maximal number of bases that can be stored in
                             one volume of the database */
    Int4 sequences_in_volume; /* Maximum number of sequences to be stored in a
                                 volume */
    Int2 volume;            /* Largest volume */
    Int4 total_num_of_seqs; /* total number of sequences for this database */
    CharPtr gi_file;        /* Gi file to be used in processing. */
    CharPtr gi_file_bin;    /* Gi file to be used in processing.
                               NOTE(review): name suggests the binary form of
                               the gi list - confirm in formatdb.c */

    ValNodePtr linkbit_listp; /* list of gis and the bits to set */
    ValNodePtr memb_tblp;     /* Linked list of MembInfo structures */
    VoidPtr memb_argp;        /* Argument to criteria function in MembInfo
                                 structure */
    EFDBCleanOpt clean_opt;   /* clean up option */

} FDB_options, PNTR FDB_optionsPtr;

/** Maximum number of volumes constructed by formatdb */
extern const Uint4 kFDBMaxNumVolumes;

typedef struct formatdb
{
    /* CharPtr dbname; (db_file) name of input database */
    /* CharPtr DbTitle; (db_title) database title */

    /* file handlers */

    FILE *fd,
         *fd_ind,
         *fd_seq,
         *fd_def,
         *fd_sdi, /* This is file for misc.
                     info data */
         *fd_stmp;

    /* ASN.1 input, if the "-a" specified */
    AsnIoPtr aip;

    /* ASN.1 defline output if structured defline */
    AsnIoPtr aip_def;

    Int4 num_of_seqs; /* number of parsed sequences in this volume */
    Int8 TotalLen;
    Int4 MaxSeqLen;

    /* offset tables */
    Int4Ptr DefOffsetTable, /* definitions */
            SeqOffsetTable, /* sequences */
            AmbOffsetTable; /* ambiguities */

    /* lookup table */

    FASTALookupPtr lookup;

    /* Table to map PIGs to ordinal ids */
    FDBPigTablePtr ptable;

    /* General formatdb options */

    FDB_optionsPtr options;

    Uint4Ptr AmbCharPtr; /* ambiguity characters while
                          * converting from ncbi2na->ncbi4na */

    Int4 OffsetAllocated; /* storage for allocation size */

} FormatDB, *FormatDBPtr;


#define MASK_WORD_SIZE 32

/* Function prototypes for formatdb library */

/* --------------------- FDBOptionsNew ----------------------------
   Purpose: Creates formatdb options structure with parameters from
            the argument list.
   Returns: Pointer to initialized structure.
   Notes:   If alias_file_name is provided, the function FDB_MakeAlias
            should be called after FDBClose. (FIXME)
   ---------------------------------------------------------------- */
FDB_optionsPtr FDBOptionsNew(
        CharPtr input,           /* [in] name of input file */
        Boolean is_prot,         /* [in] input contains protein sequences? */
        CharPtr title,           /* [in] title to give this database */
        Boolean is_asn,          /* [in] true if input is in ASN.1 */
        Boolean is_asn_bin,      /* [in] true if ASN.1 input is binary */
        Boolean is_seqentry,     /* [in] true if input is a seqentry */
        Boolean sparse_idx,      /* [in] should sparse ISAM indices be used?
                                  */
        Boolean test_non_unique, /* [in] test for repeated string identifiers
                                    in database */
        Boolean parse_deflines,  /* [in] input contains parseable deflines? */
        CharPtr basename,        /* [in] name for the database to create */
        CharPtr alias_file_name, /* [in] name for the alias file to create */
        Int8 bases_per_volume,   /* [in] max num of residues/bases per volume */
        Int4 seqs_per_volume,    /* [in] max num of sequences per volume */
        Int4 version,            /* [in] database version */
        Boolean dump_info,       /* [in] should basename.[pn]di be created? */
        EFDBCleanOpt clean_opt); /* [in] should basename.* files be removed ? */

/* --------------------- FDBOptionsFree ---------------------------
   Purpose: Frees the memory allocated for the formatdb options structure.
   Returns: NULL
   ---------------------------------------------------------------- */
FDB_optionsPtr FDBOptionsFree(FDB_optionsPtr options);
Boolean FDBCleanUp(FDB_optionsPtr options);

/* The next 4 functions are for production database dump ({id,rs}dump_blast) */
ValNodePtr FDBLoadLinksTable(void);
ValNodePtr FDBDestroyLinksTable(ValNodePtr list);
ValNodePtr FDBLoadMembershipsTable(void);
ValNodePtr FDBDestroyMembershipsTable(ValNodePtr tbl);

/* Constructs BlastDefLine structures from Bioseq */
BlastDefLinePtr FDBGetDefAsnFromBioseq(BioseqPtr bsp,
        const FDBTaxidDeflineTablePtr gttp);

FormatDBPtr FormatDBInit(FDB_optionsPtr options);

/* For database version FORMATDB_VER (or greater), only the first 5 parameters
 * are used, the latter are kept for the FORMATDB_VER_TEXT version of the BLAST
 * databases.
Please note that the seq_data and seq_data_type will be changed 1790 * if the data passed in doesn't match the format that is required for the 1791 * BLAST database format (ncbistdaa for proteins, ncbi2na for nucleotides) */ 1792 Int2 FDBAddSequence (FormatDBPtr fdbp, BlastDefLinePtr bdp, 1793 Uint1* seq_data_type, ByteStorePtr *seq_data, 1794 Int4 SequenceLen, 1795 CharPtr seq_id, CharPtr title, 1796 Int4 gi, Int4 tax_id, CharPtr div, Int4 owner, Int4 date); 1797 1798 /** 1799 * FDBAddSequence2: is an interface to add "non-redundant sequence", i.e 1800 * common sequence data and multiple sequence information block (1 per gi) 1801 * This function will NOT alter the seq_data field, it assumes that the data is 1802 * already provided in the required format 1803 * @param fdbp target blast db [in] 1804 * @param srp linked list of sequence information for each gi [in] 1805 * @param seq_data_type type of the parameter below [in] 1806 * @param seq_data sequence data itself [in] 1807 * @param SequenceLen length of the sequence in seq_data [in] 1808 * @param AmbCharPtr pointer to ambiguity sequence data (nucl only) [in] 1809 * @param pig_id stable protein group identifier [in] 1810 * @param hash sequence hash - to allow resuse of hahs calculated in ID [in] 1811 * @return 1 on failure, 0 on success 1812 */ 1813 Int2 FDBAddSequence2 (FormatDBPtr fdbp, 1814 SI_RecordPtr srp, 1815 Uint1 seq_data_type, 1816 const ByteStorePtr *seq_data, 1817 Int4 SequenceLen, 1818 Uint4Ptr AmbCharPtr, 1819 Int4 pig_id, 1820 Uint4 hash 1821 ); 1822 1823 /* For database version FORMATDB_VER (or greater), the bdp parameter must 1824 * be provided. 
This could be populated from the bsp parameter by calling 1825 * FDBGetDefAsnFromBioseq */ 1826 Int2 FDBAddBioseq(FormatDBPtr fdbp, BioseqPtr bsp, BlastDefLinePtr bdp); 1827 Int2 FormatDBClose(FormatDBPtr fdbp); 1828 1829 Boolean FDBAddLinksInformation(BlastDefLinePtr bdp, ValNodePtr links_tblp); 1830 Boolean FDBAddMembershipInformation(BlastDefLinePtr bdp, ValNodePtr memb_tblp, 1831 VoidPtr criteria_arg); 1832 1833 Int2 process_sep (SeqEntryPtr sep, FormatDBPtr fdbp); 1834 1835 NLM_EXTERN Boolean SeqEntrysToBLAST (SeqEntryPtr sep, FormatDBPtr fdbp, 1836 Boolean is_na, Uint1 group_segs); 1837 1838 NLM_EXTERN Boolean BLASTFileFunc (BioseqPtr bsp, Int2 key, CharPtr buf, 1839 Uint4 buflen, Pointer data); 1840 1841 /* 1842 Print a summary of the database used. 1843 */ 1844 Boolean LIBCALL PrintDbInformation PROTO((CharPtr database, Boolean is_aa, Int4 line_length, FILE *outfp, Boolean html)); 1845 Boolean LIBCALL PrintDbInformationWithRID PROTO((CharPtr database, Boolean is_aa, Int4 line_length, FILE *outfp, Boolean html, CharPtr rid, Boolean query_is_aa)); 1846 Boolean LIBCALL PrintDbInformationBasicEx PROTO((Boolean is_aa, Int4 line_length, 1847 CharPtr definition, Int4 number_seqs, 1848 Int8 total_length, FILE *outfp, Boolean html, 1849 Boolean with_links)); 1850 1851 Boolean LIBCALL PrintDbInformationBasic PROTO((CharPtr database, Boolean is_aa, Int4 line_length, CharPtr definition, Int4 number_seqs, Int8 total_length, FILE *outfp, Boolean html)); 1852 1853 Boolean FDBAddSeqEntry(FormatDBPtr fdbp, SeqEntryPtr sep); 1854 1855 /* ID1 dump stuff */ 1856 1857 typedef struct di_record { 1858 Int4 oid; 1859 Int4 gi; 1860 Int4 taxid; 1861 Int4 owner; 1862 Char div[4]; /* 3-letter division */ 1863 Int4 len; /* Length of sequence */ 1864 Int4 hash; /* Hash value for sequence data */ 1865 Int4 date; /* NB: name is misleading; this is actually sat_key */ 1866 CharPtr acc; /* accession should not exceed this size */ 1867 Uint1 mol; /* Molecule type, as in Seq-inst::mol */ 1868 
Int4 gi_threshold; /* for 'month' subset */ 1869 1870 } DI_Record, *DI_RecordPtr; 1871 1872 /******** genmask structures and functions *********/ 1873 1874 /* genmask scans the *.[pn]di files and sets membership bits according to the 1875 criteria specified by the GMCriteria function (see typedef above). This is 1876 one example of how to set the membership bits in the new database format. 1877 Note that the MembInfo structure has a criteria function pointer that 1878 returns a boolean value and takes a void ptr as an argument to allow 1879 flexibility in specifying the criteria to belong to a particular 1880 membership. */ 1881 1882 typedef struct { 1883 Int4 count, allocated; 1884 CharPtr *subset_name; 1885 GMCriteriaFunc *criteria; 1886 Int4 *membership_bit; 1887 } GMSubsetData, * GMSubsetDataPtr; 1888 1889 Boolean ScanDIFile(CharPtr difilename, GMSubsetDataPtr gmsubsetdp, 1890 Boolean(*callback)(DI_RecordPtr direc, VoidPtr data), VoidPtr data, 1891 FILE *out, Int4 gi_threshold); 1892 1893 CharPtr FDFGetAccessionFromSeqIdChain(SeqIdPtr seqid_list); 1894 1895 /* These functions determine the criteria for the membership bits for genmask. 
1896 Only protein sequences have memberships because they are in non-redundant 1897 databases */ 1898 Boolean is_EST_HUMAN(VoidPtr di_record); 1899 Boolean is_EST_MOUSE(VoidPtr di_record); 1900 Boolean is_EST_OTHERS(VoidPtr di_record); 1901 Boolean is_SWISSPROT(VoidPtr di_record); 1902 Boolean is_MONTH(VoidPtr di_record); 1903 Boolean is_PDB(VoidPtr di_record); 1904 Boolean is_REFSEQ(VoidPtr di_record); 1905 Boolean is_REFSEQ_RNA(VoidPtr di_record); 1906 Boolean is_REFSEQ_GENOMIC(VoidPtr ptr); 1907 Boolean is_CONTIG(VoidPtr di_record); 1908 1909 /************************************************************************/ 1910 /* Fastacmd API */ 1911 /************************************************************************/ 1912 1913 typedef struct FCMDAccList { 1914 CharPtr acc; 1915 Int4 gi; 1916 struct FCMDAccList *next; 1917 } FCMDAccList, PNTR FCMDAccListPtr; 1918 1919 FCMDAccListPtr LIBCALL GetAccList(CharPtr file, Int4Ptr TotalItems); 1920 void LIBCALL FCMDAccListFree(FCMDAccListPtr falp); 1921 1922 #define FASTACMD_DEFAULT_DB "nr" 1923 1924 #define FASTACMD_SUCCESS 0 1925 #define FASTACMD_ERROR 1 1926 #define FASTACMD_DB_NOT_FOUND 2 1927 #define FASTACMD_FAILED_SEARCH 3 1928 #define FASTACMD_NO_TAXDB 4 1929 1930 /* Fastacmd_Search and Fastacmd_Search_ex return non-zero on failure */ 1931 Int2 Fastacmd_Search (CharPtr searchstr, CharPtr database, 1932 CharPtr batchfile, Boolean dupl, Int4 linelen, FILE *out); 1933 1934 /* Used to specify which kind of data to dump using fastacmd */ 1935 typedef enum EBlastDbDumpType { 1936 eNoDump = 0, /* Don't dump any data from the database, the default for 1937 fastacmd */ 1938 eFasta, /* dump contents of database as FASTA */ 1939 eGi, /* List of gis in the database */ 1940 eAccession, /* List of accessions in the database */ 1941 eDumpTypeMax /* not really a dump type, needed for error checking */ 1942 } EBlastDbDumpType; 1943 1944 Int2 Fastacmd_Search_ex (CharPtr searchstr, CharPtr database, Uint1 is_prot, 1945 CharPtr 
batchfile, Boolean dupl, Int4 linelen, FILE *out, 1946 Boolean use_target, Boolean use_ctrlAs, EBlastDbDumpType dump_db, 1947 CharPtr seqlocstr, Uint1 strand, Boolean taxonomy_info_only, 1948 Boolean dbinfo_only, Int4 pig); 1949 1950 /** Parses the string passed as its first argument, which should contain a pair 1951 * of positive integers separated by ' ', ',', or ';' and returns the integers 1952 * in the second argument. This function is non-static so that unit tests can 1953 * be written for it. 1954 */ 1955 void Fastacmd_ParseLocations(const char* str, Int4 locations[2]); 1956 1957 /** 1958 * @param rdfp Blast database handle [in] 1959 * @param fp output FILE pointer [in] 1960 * @param linelen number of characters to print per line [in] 1961 * @param use_ctrlAs use Ctrl-A to separate non-redundant deflines? [in] 1962 * @param dump_type type of information to dump [in] 1963 */ 1964 Int2 DumpBlastDB(const ReadDBFILEPtr rdfp, FILE *fp, Int4 line_length, 1965 Boolean use_ctrlAs, EBlastDbDumpType dump_type); 1966 1967 /** 1968 * @param rdfp Blast database handle [in] 1969 * @param fp output FILE pointer [in] 1970 * @param linelen number of characters to print per line [in] 1971 * @param use_ctrlAs use Ctrl-A to separate non-redundant deflines? 
[in] 1972 * @param dump_type type of information to dump [in] 1973 * @param i ordinal id of sequence to dump [in] 1974 */ 1975 Int2 DumpOneSequence(const ReadDBFILEPtr rdfp, FILE *fp, Int4 line_length, 1976 Boolean use_ctrlAs, EBlastDbDumpType dump_type, Int4 i); 1977 1978 Int4 LIBCALL readdb_MakeGiFileBinary PROTO((CharPtr input_file, CharPtr 1979 output_file)); 1980 1981 Int4 FastaToBlastDB PROTO((FDB_optionsPtr options, Int4 Bases_In_Volume)); 1982 1983 BlastDefLinePtr FDReadDeflineAsn(ReadDBFILEPtr rdfp, Int4 sequence_number); 1984 1985 CharPtr FD_ConstructMultivolumeDBList(CharPtr basename, Int4 vols); 1986 1987 Boolean FD_CreateAliasFileEx PROTO((CharPtr title, CharPtr basename, 1988 Int4 volumes, Boolean is_protein, CharPtr parent, 1989 Int4 first_oid, Int4 last_oid, Int8 total_length, Int4 number_seqs, 1990 CharPtr oidlist, CharPtr gifile)); 1991 1992 Boolean FD_CreateAliasFile PROTO((CharPtr title, CharPtr basename, 1993 Int4 volumes, Boolean is_protein)); 1994 1995 /* simple function to make alias file give FDB_optionsPtr, alias file is only made if appropriate. 
*/ 1996 Boolean FD_MakeAliasFile PROTO((FDB_optionsPtr options)); 1997 Int4 LIBCALL 1998 readdb_get_sequence_number PROTO((ReadDBFILEPtr rdfp, Int4 first_seq, Int8 offset)); 1999 2000 Boolean FDBDumpDeflineAsn(FormatDBPtr fdbp, BlastDefLinePtr bdp_in); 2001 2002 Int4 FDBFillIndexTables(FormatDBPtr fdbp, Int4 seq_length); 2003 2004 BlastDefLinePtr FDLCreateAsnDF(FormatDBPtr fdbp, CharPtr seq_id, 2005 CharPtr title, Int4 taxid); 2006 void FDBBlastDefLineSetBit(Int2 bit_no, ValNodePtr PNTR retval); 2007 2008 #if defined(OS_UNIX_SOL) || defined(OS_UNIX_LINUX) 2009 #ifdef HAVE_MADVISE 2010 2011 /* enable/disable madvise functionality, -- disabled by default */ 2012 void LIBCALL 2013 readdb_madvise_enable PROTO((Boolean enable)); 2014 2015 /* set madvise type, -- default eMMA_Normal */ 2016 void LIBCALL 2017 readdb_madvise_type PROTO((EMemMapAdvise advice)); 2018 2019 /* explicitly set madvise sync mode: 2020 * default is sync on Solaris, async on Linux 2021 */ 2022 void LIBCALL 2023 readdb_madvise_sync_mode PROTO((Boolean mode)); 2024 2025 /* explicitly set madvise block size, which is the 2026 * number of sequences preloaded in a single madvise 2027 * operation, default is 65536 2028 */ 2029 void LIBCALL 2030 readdb_madvise_block PROTO((Int4 nSeqs)); 2031 2032 /* call preload directly -- run madvise on a chunk of memory mapped file */ 2033 void LIBCALL 2034 readdb_preload PROTO((ReadDBFILEPtr rdfp, Int4 first_db_seq, 2035 Int4 final_db_seq, EMemMapAdvise advice, Boolean sync)); 2036 2037 #endif /* HAVE_MADVISE */ 2038 #endif /* SOL || LINUX */ 2039 2040 #ifdef __cplusplus 2041 } 2042 #endif 2043 2044 #endif /* _READDB_ */ 2045