1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBVOL_HPP 2 #define OBJTOOLS_READERS_SEQDB__SEQDBVOL_HPP 3 4 /* $Id: seqdbvol.hpp 631537 2021-05-19 13:50:49Z ivanov $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Kevin Bealer 30 * 31 */ 32 33 /// @file seqdbvol.hpp 34 /// Defines database volume access classes. 35 /// 36 /// Defines classes: 37 /// CSeqDBVol 38 /// 39 /// Implemented for: UNIX, MS-Windows 40 41 #include <objtools/blast/seqdb_reader/impl/seqdbatlas.hpp> 42 #include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp> 43 #include <objtools/blast/seqdb_reader/impl/seqdbtax.hpp> 44 #include "seqdbcol.hpp" 45 #include <objects/seq/seq__.hpp> 46 47 BEGIN_NCBI_SCOPE 48 49 /// Import definitions from the objects namespace. 
50 USING_SCOPE(objects); 51 52 /// CSeqDBGiIndex 53 /// 54 /// This class maintains the OID->GI translation 55 class CSeqDBGiIndex : public CObject { 56 public: 57 typedef CSeqDBAtlas::TIndx TIndx; 58 typedef int TOid; 59 // typedef int TGi; 60 CSeqDBGiIndex(CSeqDBAtlas & atlas,const string & dbname,char prot_nucl)61 CSeqDBGiIndex(CSeqDBAtlas & atlas, 62 const string & dbname, 63 char prot_nucl) 64 : m_Lease (atlas), 65 //m_Fname (dbname + '.' + prot_nucl + "og"), 66 m_NumOIDs (0) { 67 m_Lease.Init(dbname + '.' + prot_nucl + "og"); 68 } 69 ~CSeqDBGiIndex()70 ~CSeqDBGiIndex() 71 { 72 m_Lease.Clear(); 73 } 74 IndexExists(const string & name,const char prot_nucl)75 static bool IndexExists(const string & name, 76 const char prot_nucl) 77 { 78 string fn(name + '.' + prot_nucl + "og"); 79 return CFile(fn).Exists(); 80 } 81 82 TGi GetSeqGI(TOid oid, CSeqDBLockHold & locked); 83 84 private: 85 CSeqDBFileMemMap m_Lease; 86 //string m_Fname; 87 Int4 m_Size; 88 Int4 m_NumOIDs; 89 }; 90 91 92 /// CSeqDBRangeList 93 /// 94 /// This class maintains a list of ranges of sequence offsets that are 95 /// desired for performance optimization. For large sequences that 96 /// need to be unpacked, this class describes the subsets of those 97 /// sequences that will actually be used. Each instance of this class 98 /// corresponds to sequence data for one OID. 99 100 class CSeqDBRangeList : public CObject { 101 public: 102 /// Constructor. 103 /// @param atlas The SeqDB memory management layer. [in] CSeqDBRangeList()104 CSeqDBRangeList() 105 : m_CacheData (false) 106 { 107 // Sequence caching is not implemented yet. It would increase 108 // performance further, but requires some consideration of the 109 // design with respect to locking and correctness. 110 } 111 112 /// Destructor. ~CSeqDBRangeList()113 ~CSeqDBRangeList() 114 { 115 FlushSequence(); 116 } 117 118 /// Returns true if the sequence data is cached. 
IsCached()119 bool IsCached() 120 { 121 return false; 122 } 123 124 /// List of sequence offset ranges. 125 typedef set< pair<int, int> > TRangeList; 126 127 /// Set ranges of the sequence that will be used. 128 /// @param ranges Offset ranges of the sequence that are needed. [in] 129 /// @param append_ranges If true, combine new ranges with old. [in] 130 /// @param cache_data If true, SeqDB is allowed to cache data. [in] 131 void SetRanges(const TRangeList & ranges, 132 bool append_ranges, 133 bool cache_data); 134 135 /// Get ranges of sequence offsets that will be used. GetRanges()136 const TRangeList & GetRanges() 137 { 138 return m_Ranges; 139 } 140 141 /// Flush cached sequence data (if any). FlushSequence()142 void FlushSequence() 143 { 144 } 145 146 /// Sequences shorter than this will not use ranges in any case. ImmediateLength()147 static int ImmediateLength() 148 { 149 return 10240; 150 } 151 152 private: 153 /// Range of offsets needed for this sequence. 154 TRangeList m_Ranges; 155 156 /// True if caching of sequence data is required for this sequence. 157 bool m_CacheData; 158 }; 159 160 /// CSeqDBVol class. 161 /// 162 /// This object defines access to one database volume. It aggregates 163 /// file objects associated with the sequence and header data, and 164 /// ISAM objects used for translation of GIs and PIGs for data in this 165 /// volume. The extensions managed here include those with file 166 /// extensions (pin, phr, psq, nin, nhr, and nsq), plus the optional 167 /// ISAM objects via the CSeqDBIsam class. 168 169 class CSeqDBVol { 170 public: 171 /// Import TIndx definition from the CSeqDBAtlas class. 172 typedef CSeqDBAtlas::TIndx TIndx; 173 174 /// Constructor. 175 /// 176 /// All files connected with the database volume will be opened, 177 /// metadata about the volume will be read from the index file, 178 /// and identifier translation indices will be opened. 
The name 179 /// of these files is the specified name of the volume plus an 180 /// extension. 181 /// 182 /// @param atlas 183 /// The memory management layer object. [in] 184 /// @param name 185 /// The base name of the volumes files. [in] 186 /// @param prot_nucl 187 /// The sequence type, kSeqTypeProt, or kSeqTypeNucl. [in] 188 /// @param user_list 189 /// Specifies GIs or TIs of sequences to include. [in] 190 /// @param neg_list 191 /// Specifies GIs or TIs of sequences to exclude. [in] 192 /// @param vol_start 193 /// The volume's starting OID. [in] 194 /// @param locked 195 /// The lock holder object for this thread. [in] 196 CSeqDBVol(CSeqDBAtlas & atlas, 197 const string & name, 198 char prot_nucl, 199 CSeqDBGiList * user_list, 200 CSeqDBNegativeList * neg_list, 201 int vol_start, 202 CSeqDBLockHold & locked); 203 204 /// Open sequence file 205 /// 206 /// By default, sequence file is opened on a "lazy" schedule. 207 /// This method will force the sequence file to be opened. 208 /// 209 /// @param locked 210 /// The lock holder object for this thread. [in] 211 void OpenSeqFile(CSeqDBLockHold &locked) const; 212 213 /// Sequence length for protein databases. 214 /// 215 /// This method returns the length of the sequence in bases, and 216 /// should only be called for protein sequences. It does not 217 /// require synchronization via the atlas object's lock. 218 /// 219 /// @param oid 220 /// The OID of the sequence. [in] 221 /// @param locked 222 /// The lock holder object for this thread. [in] 223 /// @return 224 /// The length in bases of the sequence. 225 int GetSeqLengthProt(int oid) const; 226 227 /// Approximate sequence length for nucleotide databases. 228 /// 229 /// This method returns the length of the sequence using a fast 230 /// method that may be off by as much as 4 bases. The method is 231 /// designed to be unbiased, meaning that the total length of 232 /// large numbers of sequences will approximate what the exact 233 /// length would be. 
The approximate lengths will change if the 234 /// database is regenerated. It does not require synchronization. 235 /// 236 /// @param oid 237 /// The OID of the sequence. [in] 238 /// @param locked 239 /// The lock holder object for this thread. [in] 240 /// @return 241 /// The approximate length in bases of the sequence. 242 int GetSeqLengthApprox(int oid) const; 243 244 /// Exact sequence length for nucleotide databases. 245 /// 246 /// This method returns the length of the sequence in bases, and 247 /// should only be called for nucleotide sequences. It requires 248 /// synchronization via the atlas object's lock, which must be 249 /// done in the calling code. 250 /// 251 /// @param oid 252 /// The OID of the sequence. [in] 253 /// @param locked 254 /// The lock holder object for this thread. [in] 255 /// @return 256 /// The length in bases of the sequence. 257 int GetSeqLengthExact(int oid) const; 258 259 /// Get filtered sequence header information. 260 /// 261 /// This method returns the set of Blast-def-line objects stored 262 /// for each sequence. These contain descriptive information 263 /// related to the sequence. If OID filtering is enabled and a 264 /// membership bit is used, only deflines with that membership bit 265 /// set will be returned. The OID list existence and membership 266 /// bit are contained in filt_info. This field may be NULL, in 267 /// which case OID list bit filtering is not done (in this case 268 /// the deflines are not cached). 269 /// 270 /// @param oid 271 /// The OID of the sequence. [in] 272 /// @param locked 273 /// The lock holder object for this thread. [in] 274 /// @return 275 /// The set of blast-def-lines describing this sequence. 276 CRef<CBlast_def_line_set> 277 GetFilteredHeader(int oid, 278 CSeqDBLockHold & locked) const; 279 280 /// Get the sequence type stored in this database. 
281 /// 282 /// This method returns the type of sequences stored in this 283 /// database, either kSeqTypeProt for protein, or kSeqTypeNucl for 284 /// nucleotide. 285 /// 286 /// @return 287 /// Either kSeqTypeProt for protein, or kSeqTypeNucl for nucleotide. 288 char GetSeqType() const; 289 290 /// Get a CBioseq object for this sequence. 291 /// 292 /// This method builds and returns a Bioseq for this sequence. 293 /// The taxonomy information is cached in this volume, so it 294 /// should not be modified directly, or other Bioseqs from this 295 /// SeqDB object may be affected. If the CBioseq has an OID list, 296 /// and it uses a membership bit, the deflines included in the 297 /// CBioseq will be filtered based on the membership bit. Zero 298 /// for the membership bit means no filtering. Filtering can also 299 /// be done by a GI, in which case, only the defline matching that 300 /// GI will be returned. The seqdata parameter can be specified 301 /// as false to indicate that sequence data should not be included 302 /// in this object; in this case the CSeq_inst object attached to 303 /// the bioseq will be configured to a "not set" state. This is 304 /// used to allow Bioseq summary data to be provided without the 305 /// performance penalty of loading (possibly very large) sequence 306 /// data from disk. 307 /// 308 /// @param oid 309 /// The OID of the sequence. [in] 310 /// @param pref_gi 311 /// If specified, only return deflines containing this GI. [in] 312 /// @param pref_seq_id 313 /// If specified, only return deflines containing this Seq_id. [in] 314 /// @param tax_info 315 /// The taxonomy database object. [in] 316 /// @param seqdata 317 /// Include sequence data in the returned Bioseq. [in] 318 /// @param locked 319 /// The lock holder object for this thread. [in] 320 /// @return 321 /// A CBioseq describing this sequence. 
322 CRef<CBioseq> 323 GetBioseq(int oid, 324 TGi pref_gi, 325 const CSeq_id * pref_seq_id, 326 bool seqdata, 327 CSeqDBLockHold & locked); 328 329 /// Get the sequence data. 330 /// 331 /// This method gets the sequence data, returning a pointer and 332 /// the length of the sequence. The atlas will be locked, but the 333 /// lock may also be returned during this method. The computation 334 /// of the length of a nucleotide sequence involves a one byte 335 /// read that is likely to cause a page fault. Releasing the 336 /// atlas lock before this (potential) page fault can help the 337 /// average performance in the multithreaded case. It is safe to 338 /// release the lock because the sequence data is pinned down by 339 /// the reference count we have acquired to return to the user. 340 /// The returned sequence data is intended for blast searches, and 341 /// will contain random values in any ambiguous regions. 342 /// 343 /// @param oid 344 /// The OID of the sequence. [in] 345 /// @param buffer 346 /// The returned sequence data. [out] 347 /// @param locked 348 /// The lock holder object for this thread. [in] 349 /// @param in_lease 350 /// Only perform sequence retrieval if the requested oid is 351 /// within the previous lease [in] 352 /// @return 353 /// The length of this sequence in bases. GetSequence(int oid,const char ** buffer) const354 int GetSequence(int oid, const char ** buffer) const 355 { 356 return x_GetSequence(oid, buffer); 357 } 358 359 /// Get a sequence with ambiguous regions. 360 /// 361 /// This method gets the sequence data, returning a pointer and 362 /// the length of the sequence. For nucleotide sequences, the 363 /// data can be returned in one of two encodings. Specify either 364 /// (kSeqDBNuclNcbiNA8) for NCBI/NA8, or (kSeqDBNuclBlastNA8) for 365 /// Blast/NA8. The data can also be allocated in one of three 366 /// ways, enumerated in ESeqDBAllocType. 
Specify eAtlas to use 367 /// the Atlas code, eMalloc to use the malloc() function, or eNew 368 /// to use the new operator. 369 /// 370 /// @param oid 371 /// The OID of the sequence. [in] 372 /// @param buffer 373 /// The returned sequence data. [out] 374 /// @param nucl_code 375 /// The encoding of the returned sequence data. [in] 376 /// @param alloc_type 377 /// The allocation routine used. [in] 378 /// @param region 379 /// If non-null, the offset range to get. [in] 380 /// @param locked 381 /// The lock holder object for this thread. [in] 382 /// @return 383 /// The length of this sequence in bases. 384 int GetAmbigSeq(int oid, 385 char ** buffer, 386 int nucl_code, 387 ESeqDBAllocType alloc_type, 388 SSeqDBSlice * region, 389 CSeqDB::TSequenceRanges * masks) const; 390 391 int GetAmbigPartialSeq(int oid, 392 char ** buffer, 393 int nucl_code, 394 ESeqDBAllocType alloc_type, 395 CSeqDB::TSequenceRanges * partial_ranges, 396 CSeqDB::TSequenceRanges * masks) const; 397 398 /// Get the Seq-ids associated with a sequence. 399 /// 400 /// This method returns a list containing all the CSeq_id objects 401 /// associated with a sequence. 402 /// 403 /// @param oid 404 /// The OID of the sequence. [in] 405 /// @param locked 406 /// The lock holder object for this thread. [in] 407 /// @return 408 /// The list of Seq-id objects for this sequences. 409 list< CRef<CSeq_id> > GetSeqIDs(int oid) const; 410 // same as above version with cached CObjectIStreamAsnBinary 411 list< CRef<CSeq_id> > GetSeqIDs(int oid, CObjectIStreamAsnBinary *inpstr) const; 412 413 /// Get the GI of a sequence 414 /// This method returns the gi of the sequence 415 /// 416 /// @param oid 417 /// The OID of the sequence. [in] 418 /// @return 419 /// The oid of the sequence 420 TGi GetSeqGI(int oid, CSeqDBLockHold & locked) const; 421 422 /// Get the volume title. 423 /// @return The volume's title. 
424 string GetTitle() const; 425 426 /// Get sqlite file name associated with this volume 427 /// Empty string if version 4 428 string GetLMDBFileName() const; 429 430 /// Get the formatting date of the volume. 431 /// @return The create-date of the volume. 432 string GetDate() const; 433 434 /// Get the number of OIDs for this volume. 435 /// @return The number of OIDs. 436 int GetNumOIDs() const; 437 438 /// Get the total length of this volume (in bases). 439 /// @return The total volume length. 440 Uint8 GetVolumeLength() const; 441 442 /// Get the length of the largest sequence in this volume. 443 /// @return The largest sequence's length. 444 int GetMaxLength() const; 445 446 /// Get the length of the smallest sequence in this volume. 447 /// @return The smallest sequence's length. 448 int GetMinLength() const; 449 450 /// Get the volume name. 451 /// @return The volume name. GetVolName() const452 const string & GetVolName() const 453 { 454 return m_VolName; 455 } 456 457 /// Return expendable resources held by this volume. 458 /// 459 /// This volume holds resources acquired via the atlas. This 460 /// method returns all such resources which can be automatically 461 /// reacquired (but not, for example, the index file data). 462 void UnLease(); 463 464 465 /// Find the OID given a PIG. 466 /// 467 /// A lookup is done for the PIG, and if found, the corresponding 468 /// OID is returned. 469 /// 470 /// @param pig 471 /// The pig to look up. [in] 472 /// @param oid 473 /// The returned ordinal ID. [out] 474 /// @param locked 475 /// The lock holder object for this thread. [in] 476 /// @return 477 /// True if the PIG was found. 478 bool PigToOid(int pig, int & oid) const; 479 480 /// Find the PIG given an OID. 481 /// 482 /// If this OID is associated with a PIG, the PIG is returned. 483 /// 484 /// @param oid 485 /// The oid of the sequence. [in] 486 /// @param pig 487 /// The returned PIG. 
[out] 488 /// @param locked 489 /// The lock holder object for this thread. [in] 490 /// @return 491 /// True if a PIG was returned. 492 bool GetPig(int oid, int & pig, CSeqDBLockHold & locked) const; 493 494 /// Find the OID given a TI. 495 /// 496 /// A lookup is done for the TI, and if found, the corresponding 497 /// OID is returned. 498 /// 499 /// @param ti 500 /// The ti to look up. [in] 501 /// @param oid 502 /// The returned ordinal ID. [out] 503 /// @param locked 504 /// The lock holder object for this thread. [in] 505 /// @return 506 /// True if the TI was found. 507 bool TiToOid(Int8 ti, 508 int & oid, 509 CSeqDBLockHold & locked) const; 510 511 /// Find the OID given a GI. 512 /// 513 /// A lookup is done for the GI, and if found, the corresponding 514 /// OID is returned. 515 /// 516 /// @param gi 517 /// The gi to look up. [in] 518 /// @param oid 519 /// The returned ordinal ID. [out] 520 /// @param locked 521 /// The lock holder object for this thread. [in] 522 /// @return 523 /// True if an OID was returned. 524 bool GiToOid(TGi gi, int & oid, CSeqDBLockHold & locked) const; 525 526 /// Find the GI given an OID. 527 /// 528 /// If this OID is associated with a GI, the GI is returned. 529 /// 530 /// @param oid 531 /// The oid of the sequence. [in] 532 /// @param gi 533 /// The returned GI. [out] 534 /// @param locked 535 /// The lock holder object for this thread. [in] 536 /// @return 537 /// True if a GI was returned. 538 bool GetGi(int oid, 539 TGi & gi, 540 CSeqDBLockHold & locked) const; 541 542 /// Find OIDs for the specified accession or formatted Seq-id. 543 /// 544 /// An attempt will be made to simplify the string by parsing it 545 /// into a list of Seq-ids. If this works, the best Seq-id (for 546 /// lookup purposes) will be formatted and the resulting string 547 /// will be looked up in the string ISAM file. The resulting set 548 /// of OIDs will be returned. If the string is not found, the 549 /// array will be left empty. 
Most matches only produce one OID. 550 /// 551 /// @param acc 552 /// An accession or formatted Seq-id for which to search. [in] 553 /// @param oids 554 /// A set of OIDs found for this sequence. [out] 555 /// @param locked 556 /// The lock holder object for this thread. [in] 557 void AccessionToOids(const string & acc, 558 vector<int> & oids, 559 CSeqDBLockHold & locked) const; 560 561 /// Find OIDs for the specified Seq-id. 562 /// 563 /// The Seq-id will be formatted and the resulting string will be 564 /// looked up in the string ISAM file. The resulting set of OIDs 565 /// will be returned. If the string is not found, the array will 566 /// be left empty. Most matches only produce one OID. 567 /// 568 /// @param seqid 569 /// A Seq-id for which to search. [in] 570 /// @param oids 571 /// A set of OIDs found for this sequence. [out] 572 /// @param locked 573 /// The lock holder object for this thread. [in] 574 void SeqidToOids(CSeq_id & seqid, 575 vector<int> & oids, 576 CSeqDBLockHold & locked) const; 577 578 /// Find the OID at a given index into the database. 579 /// 580 /// This method considers the database as one long array of bases, 581 /// and finds the base at an offset into that array. The sequence 582 /// nearest that base is determined, and the sequence's OID is 583 /// returned. The OIDs are assigned to volumes in a different 584 /// order than with the readdb library, which can be an issue when 585 /// splitting the database for load balancing purposes. When 586 /// computing the OID range, be sure to use GetNumOIDs(), not 587 /// GetNumSeqs(). 588 /// 589 /// @param first_seq 590 /// This OID or later is always returned. [in] 591 /// @param residue 592 /// The position to find relative to the total length. [in] 593 /// @param locked 594 /// The lock holder object for this thread. [in] 595 /// @return 596 /// The OID of the sequence nearest the specified residue. 
597 int GetOidAtOffset(int first_seq, 598 Uint8 residue, 599 CSeqDBLockHold & locked) const; 600 601 /// Translate Gis to Oids for the given vector of Gi/Oid pairs. 602 /// 603 /// This method iterates over a vector of Gi/Oid pairs. For each 604 /// pair where OID is -1, the GI will be looked up in the ISAM 605 /// file, and (if found) the correct OID will be stored (otherwise 606 /// the -1 will remain). This method will normally be called once 607 /// for each volume. 608 /// 609 /// @param gis 610 /// The set of GI/OID, TI/OID, and Seq-id/OID pairs. [in|out] 611 /// @param locked 612 /// The lock holder object for this thread. [in] 613 void IdsToOids(CSeqDBGiList & gis, 614 CSeqDBLockHold & locked) const; 615 616 /// Add OIDs for this volume, filtered by negative ID lists. 617 /// 618 /// This method iterates over a vector of Gis or Tis. For each 619 /// GI+OID or TI+OID line in the ISAM file, the OID's bit will be 620 /// enabled in the ID list, if the GI or TI is not found in the 621 /// negated GI or TI lists. This method will normally be called 622 /// once for each volume. 623 /// 624 /// @param gis 625 /// The set of GIs, TIs, and the OID bitmap. [in|out] 626 /// @param locked 627 /// The lock holder object for this thread. [in] 628 void IdsToOids(CSeqDBNegativeList & gis, 629 CSeqDBLockHold & locked) const; 630 631 /// Filter this volume using the specified GI list. 632 /// 633 /// A volume can be filtered by a GI list. This method attaches a 634 /// GI list to the volume, in addition to any GI lists that are 635 /// already attached. 636 /// 637 /// @param gilist 638 /// A list of GIs to use as a filter. [in] AttachVolumeGiList(CRef<CSeqDBGiList> gilist) const639 void AttachVolumeGiList(CRef<CSeqDBGiList> gilist) const 640 { 641 m_VolumeGiLists.push_back(gilist); 642 } 643 644 /// Simplify the GI list configuration. 
645 /// 646 /// When all user and volume GI lists have been attached, the user 647 /// GI list may be removed; this is only possible if neither the 648 /// user nor volume GI lists contain Seq-id data. 649 void OptimizeGiLists() const; 650 651 /// Fetch data as a CSeq_data object. 652 /// 653 /// All or part of the sequence is fetched in a CSeq_data object. 654 /// The portion of the sequence returned is specified by begin and 655 /// end. An exception will be thrown if begin is greater than or 656 /// equal to end, or if end is greater than or equal to the length 657 /// of the sequence. Begin and end should be specified in bases; 658 /// a range like (0,1) specifies 1 base, not 2. Nucleotide data 659 /// will always be returned in ncbi4na format. 660 /// 661 /// @param oid Specifies the sequence to fetch. [in] 662 /// @param begin Specifies the start of the data to get. [in] 663 /// @param end Specifies the end of the data to get. [in] 664 /// @param locked The lock holder object for this thread. [in] 665 /// @return The sequence data as a Seq-data object. 666 CRef<CSeq_data> GetSeqData(int oid, 667 TSeqPos begin, 668 TSeqPos end, 669 CSeqDBLockHold & locked) const; 670 671 /// Get Raw Sequence and Ambiguity Data. 672 /// 673 /// Get a pointer to the raw sequence and ambiguity data, and the 674 /// length of each. The encoding for these is not defined here 675 /// and should not be relied on to be compatible between different 676 /// database format versions. NULL can be supplied for parameters 677 /// that are not needed (except oid). RetSequence() must be 678 /// called with the pointer returned by 'buffer' if and only if 679 /// that pointer is supplied as non-null by the user. Protein 680 /// sequences will never have ambiguity data. Ambiguity data will 681 /// be packed in the returned buffer at offset *seq_length. 682 /// 683 /// @param oid Ordinal id of the sequence. [in] 684 /// @param buffer Buffer of raw data. 
[out] 685 /// @param seq_length Returned length of the sequence data. [out] 686 /// @param seq_length Returned length of the ambiguity data. [out] 687 /// @param locked Lock holder object for this thread. [in] 688 void GetRawSeqAndAmbig(int oid, 689 const char ** buffer, 690 int * seq_length, 691 int * ambig_length) const; 692 693 /// Get GI Bounds. 694 /// 695 /// Fetch the lowest, highest, and total number of GIs. If the 696 /// operation fails, zero will be returned for count. 697 /// 698 /// @param low_id Lowest GI value in database. [out] 699 /// @param high_id Highest GI value in database. [out] 700 /// @param count Number of GI values in database. [out] 701 /// @param locked Lock holder object for this thread. [in] 702 void GetGiBounds(TGi & low_id, 703 TGi & high_id, 704 int & count, 705 CSeqDBLockHold & locked) const; 706 707 /// Get PIG Bounds. 708 /// 709 /// Fetch the lowest, highest, and total number of PIGs. If the 710 /// operation fails, zero will be returned for count. 711 /// 712 /// @param low_id Lowest PIG value in database. [out] 713 /// @param high_id Highest PIG value in database. [out] 714 /// @param count Number of PIG values in database. [out] 715 /// @param locked Lock holder object for this thread. [in] 716 void GetPigBounds(int & low_id, 717 int & high_id, 718 int & count, 719 CSeqDBLockHold & locked) const; 720 721 /// Get String Bounds. 722 /// 723 /// Fetch the lowest, highest, and total number of string keys in 724 /// the database index. If the operation fails, zero will be 725 /// returned for count. 726 /// 727 /// @param low_id Lowest string value in database. [out] 728 /// @param high_id Highest string value in database. [out] 729 /// @param count Number of string values in database. [out] 730 /// @param locked Lock holder object for this thread. [in] 731 void GetStringBounds(string & low_id, 732 string & high_id, 733 int & count) const; 734 735 /// List of sequence offset ranges. 
736 typedef set< pair<int, int> > TRangeList; 737 738 /// Apply a range of offsets to a database sequence. 739 /// 740 /// The GetAmbigSeq() method requires an amount of work (and I/O) 741 /// which is proportional to the size of the sequence data (more 742 /// if ambiguities are present). In some cases, only certain 743 /// subranges of this data will be utilized. This method allows 744 /// the user to specify which parts of a sequence are actually 745 /// needed by the user. (Care should be taken if one SeqDB object 746 /// is shared by several program components.) (Note that offsets 747 /// above the length of the sequence will not generate an error, 748 /// and are replaced by the sequence length.) 749 /// 750 /// If ranges are specified for a sequence, data areas in 751 /// specified sequences will be accurate, but data outside the 752 /// specified ranges should not be accessed, and no guarantees are 753 /// made about what data they will contain. If the keep_current 754 /// flag is true, the range will be added to existing ranges. If 755 /// false, existing ranges will be flushed and replaced by new 756 /// ranges. To remove ranges, call this method with an empty list 757 /// of ranges; future calls will return the complete sequence. 758 /// 759 /// If the cache_data flag is provided, data for this sequence 760 /// will be kept for the duration of SeqDB's lifetime. To disable 761 /// caching (and flush cached data) for this sequence, call the 762 /// method again, but specify cache_data to be false. 763 /// 764 /// @param oid OID of the sequence. [in] 765 /// @param offset_ranges Ranges of sequence data to return. [in] 766 /// @param append_ranges Append new ranges to existing list. [in] 767 /// @param cache_data Keep sequence data for future callers. [in] 768 /// @param locked Lock holder object for this thread. 
[in] 769 void SetOffsetRanges(int oid, 770 const TRangeList & offset_ranges, 771 bool append_ranges, 772 bool cache_data) const; 773 774 /// Flush all offset ranges cached 775 /// @param locked Lock holder object for this thread. [in] 776 void FlushOffsetRangeCache(); 777 778 /// Get the sequence hash for a given OID. 779 /// 780 /// The sequence data is fetched and the sequence hash is 781 /// computed and returned. 782 /// 783 /// @param oid The sequence to compute the hash of. [in] 784 /// @return The sequence hash. 785 unsigned GetSequenceHash(int oid); 786 787 /// Get the OIDs for a given sequence hash. 788 /// 789 /// The OIDs corresponding to a hash value (if any) are found and 790 /// returned. If none are found, the vector will be empty. If 791 /// the index does not exist for this volume, an exception will be 792 /// thrown. Some false positives may be returned due to hash 793 /// value collisions. 794 /// 795 /// @param hash The sequence hash to look up. [in] 796 /// @param oids OIDs of sequences with this hash. [out] 797 /// @param locked Lock holder object for this thread. [in|out] 798 void HashToOids(unsigned hash, 799 vector<int> & oids, 800 CSeqDBLockHold & locked) const; 801 802 /// List the titles of all columns for this volume. 803 void ListColumns(set<string> & titles, 804 CSeqDBLockHold & locked); 805 806 /// Get an ID number for a given column title. 807 /// 808 /// For a given column title, this returns an ID that can be used 809 /// to access that column in the future. The returned ID number 810 /// is specific to this instance of SeqDB. If the database does 811 /// not have a column with this name, -1 will be returned. 812 /// 813 /// @param title Column title to search for. [in] 814 /// @param locked The lock holder object for this thread. [in] 815 /// @return Column ID number for this column, or -1. [in] 816 int GetColumnId(const string & title, 817 CSeqDBLockHold & locked); 818 819 /// Get all metadata for the specified column. 
820 /// 821 /// Columns may contain user-defined metadata as a list of 822 /// key-value pairs. For the specified column, this returns that 823 /// column's metadata in the provided map. If multiple volumes 824 /// are present, and they define contradictory meta data (this is 825 /// more common when multiple databases are opened at once), this 826 /// method returns the first value it finds for each metadata key. 827 /// If this is unsatisfactory, the two-argument version of this 828 /// method may be used to get more precise values for specific 829 /// volumes. 830 /// 831 /// @param col_id The column id from GetColumnId. [in] 832 /// @param locked The lock holder object for this thread. [in] 833 /// @return The map of metadata for this column. [out] 834 const map<string,string> & 835 GetColumnMetaData(int col_id, 836 CSeqDBLockHold & locked); 837 838 /// Fetch the data blob for the given column and oid. 839 /// 840 /// This method finds the blob data for this OID and column, and 841 /// stores a reference to in the provided blob. If `keep' is 842 /// true, a `lifetime' object is attached to the blob to insure 843 /// the memory is not unmapped when the atlas lock is released. 844 /// 845 /// It is important to specify `keep' correctly to avoid memory 846 /// faults and/or deadlocks. If `keep' is false, the blob must 847 /// not be returned to the user or accessed after the atlas lock 848 /// is released, since the memory it references may no longer be 849 /// mmapped. On the other hand, if `keep' is true, the blob may 850 /// be safely returned to the user, but must not be reassigned or 851 /// destructed until the atlas lock is released (or a deadlock 852 /// will occur). This includes destruction due `stack unwinding'. 853 /// 854 /// For similar reasons, the blob should be empty on input. 855 /// 856 /// @param col_id The column to fetch data from. [in] 857 /// @param oid The OID of the blob. [in] 858 /// @param blob The data will be returned here. 
[out] 859 /// @param keep If true, increment the memory region. [in] 860 /// @param locked The lock holder object for this thread. [in] 861 void GetColumnBlob(int col_id, 862 int oid, 863 CBlastDbBlob & blob, 864 bool keep, 865 CSeqDBLockHold & locked); 866 867 /// Set the MEMB_BIT fitlering for this volume. 868 /// 869 /// This method sets the MEMB_BIT for the volume. If the 870 /// MEMB_BIT has already been set, and the new bit is different, 871 /// exception will be thrown. This prevents conflicting MEMB_BIT 872 /// settings within an alias tree; nevertheless, it also prevents 873 /// aggregating the same volume with different MEMB_BIT settings, 874 /// such as "DBLIST swissprot pdb". The latter case is probably 875 /// not desired. Support for this "paralogous" case will probably 876 /// come later. 877 /// 878 /// @param mbit The bit to set [in] SetMemBit(int mbit) const879 void SetMemBit(int mbit) const { 880 if (m_MemBit && mbit != m_MemBit) { 881 NCBI_THROW(CSeqDBException, eFileErr, 882 "MEMB_BIT error: conflicting bit found."); 883 } 884 m_MemBit = mbit; 885 } 886 887 private: 888 void x_StringToOids(const string & acc, 889 ESeqDBIdType id_type, 890 Int8 ident, 891 const string & str_id, 892 bool simplified, 893 vector<int> & oids) const; 894 895 /// A set of GI lists. 896 typedef vector< CRef<CSeqDBGiList> > TGiLists; 897 898 /// Returns true if this volume has a positive ID list. x_HaveGiList(void) const899 bool x_HaveGiList(void) const 900 { 901 return ! (m_UserGiList.Empty() && m_VolumeGiLists.empty()); 902 } 903 904 /// Returns true if this volume has a negative ID list. x_HaveNegativeList(void) const905 bool x_HaveNegativeList(void) const 906 { 907 return m_NegativeList.NotEmpty(); 908 } 909 910 /// Returns true if this volume has an ID list. x_HaveIdFilter(void) const911 bool x_HaveIdFilter(void) const 912 { 913 return x_HaveGiList() || x_HaveNegativeList(); 914 } 915 916 /// Determine if a user ID list affects this ID, and how. 
917 /// 918 /// This is used to accumulate information about a Seq-id in two 919 /// boolean variables. In order for a Seq-id to be considered 920 /// `included', it must pass filtering by both the user ID list 921 /// (if one was specified) and at least one of the set of ID lists 922 /// attached to the volume (if any exist). This function will be 923 /// called repeatedly for each ID in a defline to determine if the 924 /// defline as a whole passes the filtering tests. If the 925 /// booleans are set to true, this code never sets it to false, 926 /// and can skip the associated test. This is because a defline 927 /// is included if one of its Seq-ids matches the volume ID list 928 /// but a different one matches the user ID list. For negative ID 929 /// lists this returns true if the type of ID matches the kind 930 /// used by the negative list, but the ID is not found therein. 931 /// 932 /// @param id Sequence id to check for. [in] 933 /// @param have_user Will be set if the user list has id. [in|out] 934 /// @param have_vol Will be set if the volume list has id. [in|out] x_FilterHasId(const CSeq_id & id,bool & have_user,bool & have_vol) const935 void x_FilterHasId(const CSeq_id & id, 936 bool & have_user, 937 bool & have_vol) const 938 { 939 if (! have_user) { 940 if (m_UserGiList.NotEmpty() && m_UserGiList->GetNumTaxIds() == 0 && m_UserGiList->GetNumPigs() == 0 ) { 941 have_user |= x_ListIncludesId(*m_UserGiList, id); 942 } else if (m_NegativeList.NotEmpty() && m_NegativeList->GetNumTaxIds() == 0 && m_NegativeList->GetNumPigs() == 0 ) { 943 have_user |= x_ListIncludesId(*m_NegativeList, id); 944 } else { 945 have_user = true; 946 } 947 } 948 949 if (! have_vol) { 950 if (m_VolumeGiLists.empty()) { 951 have_vol = true; 952 } else { 953 NON_CONST_ITERATE(TGiLists, gilist, m_VolumeGiLists) { 954 if (x_ListIncludesId(**gilist, id)) { 955 have_vol = true; 956 break; 957 } 958 } 959 } 960 } 961 } 962 963 /// Returns true if this volume's ID list has this Seq-id. 
964 /// @param L A GI list to test against. [in] 965 /// @param id A Seq-id to test against L. [in] 966 /// @return True if the list contains the specified Seq-id. x_ListIncludesId(CSeqDBGiList & L,const CSeq_id & id) const967 bool x_ListIncludesId(CSeqDBGiList & L, const CSeq_id & id) const 968 { 969 return L.FindId(id); 970 } 971 972 /// Returns true if this ID is not found in the negative ID list. 973 /// 974 /// This checks whether an ID is found in the negative ID list, 975 /// and whether the ID is the right type (so that it might 976 /// possibly be found). If the ID is the right type, and is not 977 /// found, this method returns true. In other cases it returns 978 /// false. This technique could be described as treating the 979 /// negative GI list as the list of all GIs not mentioned in the 980 /// vector stored in the list, and similarly for the TIs. This 981 /// means that every TI and GI in the ASN.1 for this defline must 982 /// be mentioned in the negative ID list in order to exclude the 983 /// defline. In normal practice, only one GI or TI ever exists 984 /// for a defline. 985 /// 986 /// @param L A GI list to test against. [in] 987 /// @param id A Seq-id to test against L. [in] 988 /// @return True if the list contains the specified Seq-id. x_ListIncludesId(CSeqDBNegativeList & L,const CSeq_id & id) const989 bool x_ListIncludesId(CSeqDBNegativeList & L, const CSeq_id & id) const 990 { 991 // A defline is included IFF either a GI or TI is found, and 992 // that ID is not on the list. 993 994 // I use the terms 'included' and 'mentioned' to describe the 995 // negative list processing as follows: "A negative list 996 // INCLUDES a TI or GI if that ID is not MENTIONED in the 997 // negative list." 998 999 bool match_type = false; 1000 bool found = L.FindId(id, match_type); 1001 1002 return (! found) && match_type; 1003 } 1004 1005 /// Get sequence header object. 
///
    /// This method returns the sequence header information as an
    /// ASN.1 object.  Seq-ids of type "gnl|BL_ORD_ID|#" are stored as
    /// values relative to this volume.  If they will be returned to
    /// the user in any way, specify true for adjust_oids to adjust
    /// them to the global OID range.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param adjust_oids
    ///   If true, BL_ORD_ID ids will be adjusted to the global OID
    ///   range. [in]
    /// @param changed
    ///   Indicates whether ASN.1 data needed changes (optional). [out]
    /// @return
    ///   The Blast-def-line-set describing this sequence.
    CRef<CBlast_def_line_set>
    x_GetHdrAsn1(int    oid,
                 bool   adjust_oids,
                 bool * changed) const;

    /// Get sequence header object, with a caller-supplied stream.
    ///
    /// Same leading parameters as the three-argument overload.
    /// NOTE(review): presumably inpstr is an ASN.1 binary input
    /// stream used for decoding the header -- confirm against the
    /// implementation.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param adjust_oids
    ///   If true, BL_ORD_ID ids will be adjusted to the global OID
    ///   range. [in]
    /// @param changed
    ///   Indicates whether ASN.1 data needed changes (optional). [out]
    /// @param inpstr
    ///   ASN.1 binary input stream object (usage: TODO confirm).
    /// @return
    ///   The Blast-def-line-set describing this sequence.
    CRef<CBlast_def_line_set>
    x_GetHdrAsn1(int                      oid,
                 bool                     adjust_oids,
                 bool                   * changed,
                 CObjectIStreamAsnBinary *inpstr) const;

    /// Get sequence header binary data.
    ///
    /// This method returns the sequence header information as a
    /// reference to raw ASN.1 binary data.  This reference can be
    /// used until the next access to the Atlas layer or the header
    /// data memory lease.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @return
    ///   A reference to the raw binary ASN.1 header data for this
    ///   sequence.
    CTempString x_GetHdrAsn1Binary(int oid) const;

    /// Get binary sequence header information.
    ///
    /// This method reads the sequence header information (as binary
    /// encoded ASN.1) into a supplied char vector.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param hdr_data
    ///   The returned binary ASN.1 of the Blast-def-line-set. [out]
    /// @param locked
    ///   The lock holder object for this thread.
/// [in]
    void
    x_GetFilteredBinaryHeader(int            oid,
                              vector<char> & hdr_data) const;


    /// Get sequence header information.
    ///
    /// This method returns the set of Blast-def-line objects stored
    /// for each sequence.  These contain descriptive information
    /// related to the sequence.  If OID filtering is enabled and a
    /// membership bit is used, only deflines with that membership bit
    /// set will be returned.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param changed
    ///   Indicates whether ASN.1 data needed changes (optional). [out]
    /// @return
    ///   The set of blast-def-lines describing this sequence.
    CRef<CBlast_def_line_set>
    x_GetFilteredHeader(int    oid,
                        bool * changed) const;

    /// Get sequence header information, with a caller-supplied
    /// stream.
    ///
    /// Same leading parameters as the two-argument overload.
    /// NOTE(review): presumably inpstr is an ASN.1 binary input
    /// stream used for decoding the header -- confirm against the
    /// implementation.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param changed
    ///   Indicates whether ASN.1 data needed changes (optional). [out]
    /// @param inpstr
    ///   ASN.1 binary input stream object (usage: TODO confirm).
    /// @return
    ///   The set of blast-def-lines describing this sequence.
    CRef<CBlast_def_line_set>
    x_GetFilteredHeader(int                      oid,
                        bool                   * changed,
                        CObjectIStreamAsnBinary *inpstr ) const;

    /// Get sequence header information structures.
    ///
    /// This method reads the sequence header information and returns
    /// a Seqdesc suitable for inclusion in a CBioseq.  This object
    /// will contain an opaque type, storing the sequence headers as
    /// binary ASN.1, wrapped in a C++ ASN.1 structure (CSeqdesc).
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @return
    ///   The CSeqdesc to include in the CBioseq.
    CRef<CSeqdesc> x_GetAsnDefline(int oid) const;

    /// Returns 'p' for protein databases, or 'n' for nucleotide.
    char x_GetSeqType() const;

    /// Get ambiguity information.
    ///
    /// This method is used to fetch the ambiguity data for sequences
    /// in a nucleotide database.
/// The ambiguity data describes
    /// sections of the nucleotide sequence for which more than one of
    /// 'A', 'C', 'G', or 'T' are possible.  The integers returned by
    /// this function contain a packed description of the ranges of
    /// the sequence which have such data.  This method only returns
    /// the array of integers, and does not interpret them, except for
    /// byte swapping.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param ambchars
    ///   The returned array of ambiguity descriptors. [out]
    void x_GetAmbChar(int            oid,
                      vector<Int4> & ambchars) const;

    /// Get a sequence with ambiguous regions.
    ///
    /// This method gets the sequence data, returning a pointer and
    /// the length of the sequence.  For nucleotide sequences, the
    /// data can be returned in one of two encodings.  Specify either
    /// (kSeqDBNuclNcbiNA8) for NCBI/NA8, or (kSeqDBNuclBlastNA8) for
    /// Blast/NA8.  The data can also be allocated in one of three
    /// ways, enumerated in ESeqDBAllocType.  Specify eAtlas to use
    /// the Atlas code, eMalloc to use the malloc() function, or eNew
    /// to use the new operator.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param buffer
    ///   The returned sequence data. [out]
    /// @param nucl_code
    ///   The encoding of the returned sequence data. [in]
    /// @param alloc_type
    ///   The allocation routine used. [in]
    /// @param region
    ///   If non-null, the offset range to get. [in]
    /// @param masks
    ///   Sequence ranges handed to the implementation.
    ///   NOTE(review): presumably masking ranges applied to the
    ///   returned data -- confirm against the implementation.
    /// @return
    ///   The length of this sequence in bases.
int x_GetAmbigSeq(int                        oid,
                      char                    ** buffer,
                      int                        nucl_code,
                      ESeqDBAllocType            alloc_type,
                      SSeqDBSlice              * region,
                      CSeqDB::TSequenceRanges   *masks) const;

    /// Allocate memory in one of several ways.
    ///
    /// This method provides functionality to allocate memory with the
    /// atlas layer, using malloc, or using the new [] operator.  The
    /// user is expected to return the data using the corresponding
    /// deallocation technique.
    ///
    /// @param length
    ///   The number of bytes to get. [in]
    /// @param alloc_type
    ///   The type of allocation routine to use. [in]
    /// @return
    ///   A pointer to the allocated memory.
    char * x_AllocType(size_t          length,
                       ESeqDBAllocType alloc_type) const;

    /// Get sequence data.
    ///
    /// The sequence data is found and returned for the specified
    /// sequence.  The caller owns the data and a hold on the
    /// underlying memory region.  There is a memory access in this
    /// code that tends to trigger a soft (and possibly hard) page
    /// fault in the nucleotide case.
    ///
    /// NOTE(review): this overload declares only oid and buffer; the
    /// keep / locked / can_release / in_lease parameters that used to
    /// be documented here are not part of its signature.
    ///
    /// @param oid
    ///   The ordinal ID of the sequence to get. [in]
    /// @param buffer
    ///   The returned sequence data buffer. [out]
    /// @return
    ///   The length of the sequence in bases.
int x_GetSequence(int             oid,
                      const char   ** buffer) const;

    /// Get partial sequence data.
    ///
    /// The sequence data is found and returned for the specified oid
    /// and offset range.  If the region argument is non-null, the
    /// region endpoints are verified against the sequence endpoints.
    /// Otherwise, this method is the same as x_GetSequence().  Note
    /// that the code returns the length of the region in bases, but
    /// buffer is set to a pointer to the beginning of the sequence,
    /// not the beginning of the region.
    ///
    /// @param oid
    ///   The ordinal ID of the sequence to get. [in]
    /// @param buffer
    ///   The returned sequence data buffer. [out]
    /// @param keep
    ///   Specify true if the caller wants a hold on the sequence. [in]
    /// @param locked
    ///   The lock holder object for this thread. [in]
    /// @param can_release
    ///   Specify true if the atlas lock can be released. [in]
    /// @param region
    ///   If non-null, the offset range to get. [in]
    /// @return
    ///   The length of the returned portion in bases.
    int x_GetSequence(int              oid,
                      const char    ** buffer,
                      bool             keep,
                      CSeqDBLockHold & locked,
                      bool             can_release,
                      SSeqDBSlice    * region) const;

    /// Get defline filtered by several criteria.
    ///
    /// This method returns the set of deflines for a sequence.  If
    /// there is an OID list and membership bit, these will be
    /// filtered by membership bit.  If a preferred GI is specified,
    /// the defline matching that GI (if found) will be moved to the
    /// front of the set.
    ///
    /// @param oid
    ///   The ordinal ID of the sequence to get. [in]
    /// @param preferred_gi
    ///   This GI's defline (if non-zero and found) will be put at the front of the list. [in]
    /// @param preferred_seq_id
    ///   This SeqID's defline (if non-NULL and found) will be put at the front of the list.
/// [in]
    /// @return
    ///   The defline set for the specified oid.
    CRef<CBlast_def_line_set>
    x_GetTaxDefline(int                    oid,
                    TGi                    preferred_gi,
                    const CSeq_id        * preferred_seq_id);


    /// Get taxonomic descriptions of a sequence.
    ///
    /// This method builds a set of CSeqdesc objects from taxonomic
    /// information and blast deflines.  If there is an OID list and
    /// membership bit, the deflines will be filtered by membership
    /// bit.  If a preferred GI is specified, the defline matching
    /// that GI (if found) will be moved to the front of the set.
    /// This method is called as part of the processing for building
    /// a CBioseq object.
    ///
    /// @param oid
    ///   The ordinal ID of the sequence to get. [in]
    /// @param preferred_gi
    ///   This GI's defline (if non-zero and found) will be put at the front of the list. [in]
    /// @param preferred_seq_id
    ///   This SeqID's defline (if non-NULL and found) will be put at the front of the list. [in]
    /// @return
    ///   A list of CSeqdesc objects for the specified oid.
    list< CRef<CSeqdesc> >
    x_GetTaxonomy(int                    oid,
                  TGi                    preferred_gi,
                  const CSeq_id        * preferred_seq_id);


    /// Returns the base-offset of the specified oid.
    ///
    /// This method finds the starting offset of the OID relative to
    /// the start of the volume, and returns that distance as a number
    /// of bytes.  The range of the return value should be from zero
    /// to the size of the sequence file in bytes.
/// Note that the
    /// total volume length in bytes can be found by submitting the
    /// OID count as the input oid, because the index file contains
    /// one more array element than there are sequences.
    ///
    /// @param oid
    ///   The sequence of which to get the starting offset. [in]
    /// @return
    ///   The offset in the volume of that sequence in bytes.
    Uint8 x_GetSeqResidueOffset(int oid) const;

    /// Find all columns for this volume.
    ///
    /// This method looks for and opens any columns that might be
    /// associated with this database volume.
    ///
    /// @param locked
    ///   The lock holder object for this thread. [in]
    void x_OpenAllColumns(CSeqDBLockHold & locked);

    /// Check Seq-id versions for special sparse-id support case.
    ///
    /// The BlastDB `sparse indexing' feature omits versions when
    /// emitting (string) ISAM indices.  If a search for a Seq-id with
    /// a version fails, SeqDB strips the version and tries the search
    /// again.  However, for non-sparse databases, this second search
    /// has the harmful side effect that it can find IDs with the same
    /// accession but an incorrect version.  This method scans the OID
    /// list and removes the OIDs with incorrect versions.  It should
    /// only be called in cases when the version removal needed to be
    /// done to get results.
    ///
    /// @param acc
    ///   An accession or formatted Seq-id for which to search. [in]
    /// @param oids
    ///   A set of OIDs found for this sequence. [in|out]
    /// @param locked
    ///   The lock holder object for this thread.
/// [in]
    void x_CheckVersions(const string   & acc,
                         vector<int>    & oids) const;

    // Helpers for the volume's component files.  NOTE(review):
    // presumably each x_Open* call opens its file on first use and
    // each x_Unlease* call releases the associated memory lease --
    // confirm against the implementation.
    void x_OpenSeqFile(void) const;
    void x_OpenHdrFile(void) const;
    void x_OpenPigFile(void) const;
    void x_UnleasePigFile(void) const;
    void x_OpenGiFile(void) const;
    void x_UnleaseGiFile(void) const;
    void x_OpenStrFile(void) const;
    void x_UnleaseStrFile(void) const;
    void x_OpenTiFile(void) const;
    void x_UnleaseTiFile(void) const;
    void x_OpenHashFile(void) const;
    void x_OpenOidFile(void) const;

    /// The memory management layer.
    CSeqDBAtlas & m_Atlas;

    /// True if the volume is protein, false for nucleotide.
    bool m_IsAA;

    /// The name of this volume.
    string m_VolName;

    /// Metadata plus offsets into the sequence, header, and ambiguity data.
    CRef<CSeqDBIdxFile> m_Idx;

    /// Contains sequence data for this volume.
    mutable CRef<CSeqDBSeqFile> m_Seq;

    /// Contains header (defline) information for this volume.
    mutable CRef<CSeqDBHdrFile> m_Hdr;

    // These are mutable because they defer initialization.

    /// Handles translation of PIGs to OIDs.
    mutable CRef<CSeqDBIsam> m_IsamPig;

    /// Handles translation of GIs to OIDs.
    mutable CRef<CSeqDBIsam> m_IsamGi;

    /// Handles translation of strings (accessions) to OIDs.
    mutable CRef<CSeqDBIsam> m_IsamStr;

    /// Handles translation of TI (trace ids) to OIDs.
    mutable CRef<CSeqDBIsam> m_IsamTi;

    /// Handles translation of sequence hash value to OIDs.
    mutable CRef<CSeqDBIsam> m_IsamHash;

    /// The GI index file (for fast oid->gi conversion)
    mutable CRef<CSeqDBGiIndex> m_GiIndex;

    /// This cache allows CBioseqs to share taxonomic objects.
    mutable CSeqDBIntCache< CRef<CSeqdesc> > m_TaxCache;

    /// The user ID list, if one exists.
mutable CRef<CSeqDBGiList> m_UserGiList;

    /// The negative ID list, if one exists.
    mutable CRef<CSeqDBNegativeList> m_NegativeList;

    /// The volume GI lists, if any exist.
    mutable TGiLists m_VolumeGiLists;

    /// The filtering MEMB_BIT.  SetMemBit() treats a value of zero
    /// as "not set yet".
    mutable int m_MemBit;

    /// Cached/ranged sequence info type.
    typedef map<int, CRef<CSeqDBRangeList> > TRangeCache;

    /// Cached/ranged sequence info.
    mutable TRangeCache m_RangeCache;

    /// Starting OID of this volume.
    int m_VolStart;

    /// First OID past end of this volume.
    int m_VolEnd;

    /// Filtered defline plus whether binary data needed changes.
    typedef pair<CRef<CBlast_def_line_set>, bool> TDeflineCacheItem;

    /// Cache of filtered deflines.
    mutable CSeqDBIntCache<TDeflineCacheItem> m_DeflineCache;

    /// True if we have opened the columns for this volume.
    bool m_HaveColumns;

    /// True if the volume file has been (at least tried to) opened.
    /// NOTE(review): presumably one flag per file kind (sequence,
    /// header, hash, OID) -- confirm against the x_Open* helpers.
    mutable bool m_SeqFileOpened;
    mutable bool m_HdrFileOpened;
    mutable bool m_HashFileOpened;
    mutable bool m_OidFileOpened;

    // NOTE(review): presumably each mutex guards the like-named
    // component (GI, PIG, string and TI indices, sequence file,
    // header file, cached ranges) -- confirm against the
    // implementation.
    mutable CFastMutex m_MtxGi;
    mutable CFastMutex m_MtxPig;
    mutable CFastMutex m_MtxStr;
    mutable CFastMutex m_MtxTi;
    mutable CFastMutex m_MtxSeq;
    mutable CFastMutex m_MtxHdr;
    mutable CFastMutex m_MtxCachedRange;

#if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
     (!defined(NCBI_COMPILER_MIPSPRO)) )
    /// Set of columns defined for this volume.
    vector< CRef<CSeqDBColumn> > m_Columns;
#endif
};

END_NCBI_SCOPE

#endif // OBJTOOLS_READERS_SEQDB__SEQDBVOL_HPP