1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBVOLSET_HPP 2 #define OBJTOOLS_READERS_SEQDB__SEQDBVOLSET_HPP 3 4 /* $Id: seqdbvolset.hpp 538739 2017-06-13 18:26:55Z rackerst $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Kevin Bealer 30 * 31 */ 32 33 /// @file seqdbvolset.hpp 34 /// Manages a set of database volumes. 35 /// 36 /// Defines classes: 37 /// CSeqDBVolSet 38 /// CVolEntry 39 /// 40 /// Implemented for: UNIX, MS-Windows 41 42 #include <objtools/blast/seqdb_reader/impl/seqdbvol.hpp> 43 #include "seqdbfilter.hpp" 44 #include <algo/blast/core/ncbi_std.h> 45 46 BEGIN_NCBI_SCOPE 47 48 /// Import definitions from the ncbi::objects namespace. 49 USING_SCOPE(objects); 50 51 /// CSeqDBVolEntry 52 /// 53 /// This class controls access to the CSeqDBVol class. It contains 54 /// data that is not relevant to the internal operation of a volume, 55 /// but is associated with that volume for operations over the volume 56 /// set as a whole, such as the starting OID of the volume and masking 57 /// information (GI and OID lists). 58 59 class CSeqDBVolEntry { 60 public: 61 /// Constructor 62 /// 63 /// This creates a object containing the specified volume object 64 /// pointer. Although this object owns the pointer, it uses a 65 /// vector, so it does not keep an auto pointer or CRef<>. 66 /// Instead, the destructor of the CSeqDBVolSet class deletes the 67 /// volumes by calling Free() in a destructor. Using indirect 68 /// pointers (CRef<> for example) would require slightly more 69 /// cycles in several performance critical paths. 70 /// 71 /// @param new_vol 72 /// A pointer to a volume. CSeqDBVolEntry(CSeqDBVol * new_vol)73 CSeqDBVolEntry(CSeqDBVol * new_vol) 74 : m_Vol (new_vol), 75 m_OIDStart (0), 76 m_OIDEnd (0) 77 { 78 } 79 80 /// Free the volume object 81 /// 82 /// The associated volume object is deleted. Free()83 void Free() 84 { 85 if (m_Vol) { 86 delete m_Vol; 87 m_Vol = 0; 88 } 89 } 90 91 /// Set the OID range 92 /// 93 /// The volume is queried for the number of OIDs it contains, and 94 /// the starting and ending OIDs are set. 95 /// 96 /// @param start The first OID in the range. SetStartAndEnd(int start)97 void SetStartAndEnd(int start) 98 { 99 m_OIDStart = start; 100 m_OIDEnd = start + m_Vol->GetNumOIDs(); 101 } 102 103 /// Get the starting OID in this volume's range. 104 /// 105 /// This returns the first OID in this volume's OID range. 106 /// 107 /// @return The starting OID of the range OIDStart() const108 int OIDStart() const 109 { 110 return m_OIDStart; 111 } 112 113 /// Get the ending OID in this volume's range. 114 /// 115 /// This returns the first OID past the end of this volume's OID 116 /// range. 117 /// 118 /// @return 119 /// The ending OID of the range OIDEnd() const120 int OIDEnd() const 121 { 122 return m_OIDEnd; 123 } 124 125 /// Get a pointer to the underlying volume object. Vol()126 CSeqDBVol * Vol() 127 { 128 return m_Vol; 129 } 130 131 /// Get a const pointer to the underlying volume object. Vol() const132 const CSeqDBVol * Vol() const 133 { 134 return m_Vol; 135 } 136 137 private: 138 /// The underlying volume object 139 CSeqDBVol * m_Vol; 140 141 /// The start of the OID range. 142 int m_OIDStart; 143 144 /// The end of the OID range. 145 int m_OIDEnd; 146 }; 147 148 149 /// CSeqDBVolSet 150 /// 151 /// This class stores a set of CSeqDBVol objects and defines an 152 /// interface to control usage of them. Several methods are provided 153 /// to create the set of volumes, or to get the required volumes by 154 /// different criteria. Also, certain methods perform operations over 155 /// the set of volumes. The CSeqDBVolEntry class, defined internally 156 /// to this one, provides some of this abstraction. 157 class CSeqDBVolSet { 158 public: 159 /// Standard Constructor 160 /// 161 /// An object of this class will be constructed after the alias 162 /// files have been read, and the volume names will come from that 163 /// processing step. All of the specified volumes will be opened 164 /// and the metadata will be verified during construction. 165 /// 166 /// @param atlas 167 /// The memory management object to use. 168 /// @param vol_names 169 /// The names of the volumes this object will manage. 170 /// @param prot_nucl 171 /// Whether these are protein or nucleotide sequences. 172 /// @param user_list 173 /// If specified, will be used to include deflines by GI or TI. 174 /// @param neg_list 175 /// If specified, will be used to exclude deflines by GI or TI. 176 CSeqDBVolSet(CSeqDBAtlas & atlas, 177 const vector<string> & vol_names, 178 char prot_nucl, 179 CSeqDBGiList * user_list, 180 CSeqDBNegativeList * neg_list); 181 182 /// Default Constructor 183 /// 184 /// An empty volume set will be created; this is in support of the 185 /// CSeqDBExpert class's default constructor. 186 CSeqDBVolSet(); 187 188 /// Destructor 189 /// 190 /// The destructor will release all resources still held, but some 191 /// of the resources will probably already be cleaned up via a 192 /// call to the UnLease method. 193 ~CSeqDBVolSet(); 194 195 /// Find a volume by OID. 196 /// 197 /// Many of the CSeqDB methods identify which sequence to use by 198 /// OID. That OID applies to all sequences in all volumes of the 199 /// opened database(s). This method is used to find the volume 200 /// (if any) that contains this OID, and to return both a pointer 201 /// to that volume and the OID within that volume that corresponds 202 /// to the global input OID. 203 /// 204 /// @param oid 205 /// The global OID to search for. 206 /// @param vol_oid 207 /// The returned OID within the relevant volume. 208 /// @return 209 /// A pointer to the volume containing the oid, or NULL. FindVol(int oid,int & vol_oid) const210 CSeqDBVol * FindVol(int oid, int & vol_oid) const 211 { 212 // The 'const' usage here should be cleaned up, i.e. const 213 // should be removed from most of SeqDB's methods. Since the 214 // atlas often remaps the actual file data due to seemingly 215 // read-only user requests, there are very few parts of this 216 // code that can really be considered const. "Conceptual" 217 // const is not worth the trouble, particularly for internal 218 // methods. 219 220 // A good technique would be to remove all or nearly all of 221 // the 'mutable' keywords, then remove the word 'const' from 222 // almost everything the compiler complains about. 223 224 int vol_idx(0); 225 return const_cast<CSeqDBVol*>(FindVol(oid, vol_oid, vol_idx)); 226 } 227 228 /// Find a volume by OID. 229 /// 230 /// Many of the CSeqDB methods identify which sequence to use by 231 /// OID. That OID applies to all sequences in all volumes of the 232 /// opened database(s). This method is used to find the volume 233 /// (if any) that contains this OID, and to return a pointer to 234 /// that volume, the OID within that volume that corresponds to 235 /// the global input OID, and the volume index. 236 /// 237 /// @param oid 238 /// The global OID to search for. 239 /// @param vol_oid 240 /// The returned OID within the relevant volume. 241 /// @param vol_idx 242 /// The returned index of the relevant volume. 243 /// @return 244 /// A pointer to the volume containing the oid, or NULL. FindVol(int oid,int & vol_oid,int & vol_idx) const245 const CSeqDBVol * FindVol(int oid, int & vol_oid, int & vol_idx) const 246 { 247 int rec_indx = m_RecentVol; 248 249 if (rec_indx < (int) m_VolList.size()) { 250 const CSeqDBVolEntry & rvol = m_VolList[rec_indx]; 251 252 if ((rvol.OIDStart() <= oid) && 253 (rvol.OIDEnd() > oid)) { 254 255 vol_oid = oid - rvol.OIDStart(); 256 vol_idx = rec_indx; 257 258 return rvol.Vol(); 259 } 260 } 261 262 for(int index = 0; index < (int) m_VolList.size(); index++) { 263 if ((m_VolList[index].OIDStart() <= oid) && 264 (m_VolList[index].OIDEnd() > oid)) { 265 266 m_RecentVol = index; 267 268 vol_oid = oid - m_VolList[index].OIDStart(); 269 vol_idx = index; 270 271 return m_VolList[index].Vol(); 272 } 273 } 274 275 return NULL; 276 } 277 278 /// Find a volume by OID. 279 /// 280 /// Many of the CSeqDB methods identify which sequence to use by 281 /// OID. That OID applies to all sequences in all volumes of the 282 /// opened database(s). This method is used to find the volume 283 /// (if any) that contains this OID, and to return both a pointer 284 /// to that volume and the OID within that volume that corresponds 285 /// to the global input OID. 286 /// 287 /// @param oid 288 /// The global OID to search for. 289 /// @param vol_oid 290 /// The returned OID within the relevant volume. 291 /// @return 292 /// A pointer to the volume containing the oid, or NULL. FindVol(int oid,int & vol_oid)293 CSeqDBVol * FindVol(int oid, int & vol_oid) 294 { 295 int rec_indx = m_RecentVol; 296 297 if (rec_indx < (int) m_VolList.size()) { 298 CSeqDBVolEntry & rvol = m_VolList[rec_indx]; 299 300 if ((rvol.OIDStart() <= oid) && 301 (rvol.OIDEnd() > oid)) { 302 303 vol_oid = oid - rvol.OIDStart(); 304 305 return rvol.Vol(); 306 } 307 } 308 309 for(int index = 0; index < (int) m_VolList.size(); index++) { 310 if ((m_VolList[index].OIDStart() <= oid) && 311 (m_VolList[index].OIDEnd() > oid)) { 312 313 m_RecentVol = index; 314 315 vol_oid = oid - m_VolList[index].OIDStart(); 316 317 return m_VolList[index].Vol(); 318 } 319 } 320 321 return 0; 322 } 323 324 /// Find a volume by index. 325 /// 326 /// This method returns a volume by index, so that 0 is the first 327 /// volume, and N-1 is the last volume of a set of N. 328 /// 329 /// @param i 330 /// The index of the volume to return. 331 /// @return 332 /// A pointer to the indicated volume, or NULL. GetVol(int i) const333 const CSeqDBVol * GetVol(int i) const 334 { 335 if (m_VolList.empty()) { 336 return 0; 337 } 338 339 if (i >= (int) m_VolList.size()) { 340 return 0; 341 } 342 343 m_RecentVol = i; 344 345 return m_VolList[i].Vol(); 346 } 347 348 /// Find a volume by index. 349 /// 350 /// This method returns a volume by index, so that 0 is the first 351 /// volume, and N-1 is the last volume of a set of N. 352 /// 353 /// @param i 354 /// The index of the volume to return. 355 /// @return 356 /// A pointer to the indicated volume, or NULL. GetVolNonConst(int i)357 CSeqDBVol * GetVolNonConst(int i) 358 { 359 if (m_VolList.empty()) { 360 return 0; 361 } 362 363 if (i >= (int) m_VolList.size()) { 364 return 0; 365 } 366 367 m_RecentVol = i; 368 369 return m_VolList[i].Vol(); 370 } 371 372 /// Find a volume entry by index. 373 /// 374 /// This method returns a CSeqDBVolEntry by index, so that 0 is 375 /// the first volume, and N-1 is the last volume of a set of N. 376 /// 377 /// @param i 378 /// The index of the volume entry to return. 379 /// @return 380 /// A pointer to the indicated volume entry, or NULL. GetVolEntry(int i) const381 const CSeqDBVolEntry * GetVolEntry(int i) const 382 { 383 if (m_VolList.empty()) { 384 return 0; 385 } 386 387 if (i >= (int) m_VolList.size()) { 388 return 0; 389 } 390 391 m_RecentVol = i; 392 393 return & m_VolList[i]; 394 } 395 396 /// Find a volume by name. 397 /// 398 /// Each volume has a name, which is the name of the component 399 /// files (.pin, .psq, etc), without the file extension. This 400 /// method returns a const pointer to the volume matching the 401 /// specified name. 402 /// 403 /// @param volname 404 /// The name of the volume to search for. 405 /// @return 406 /// A pointer to the volume matching the specified name, or NULL. GetVol(const string & volname) const407 const CSeqDBVol * GetVol(const string & volname) const 408 { 409 if (const CSeqDBVolEntry * v = x_FindVolName(volname)) { 410 return v->Vol(); 411 } 412 return 0; 413 } 414 415 /// Find a volume by name (non-const version). 416 /// 417 /// Each volume has a name, which is the name of the component 418 /// files (.pin, .psq, etc), without the file extension. This 419 /// method returns a non-const pointer to the volume matching the 420 /// specified name. 421 /// 422 /// @param volname 423 /// The name of the volume to search for. 424 /// @return 425 /// A pointer to the volume matching the specified name, or NULL. GetVol(const string & volname)426 CSeqDBVol * GetVol(const string & volname) 427 { 428 if (CSeqDBVolEntry * v = x_FindVolName(volname)) { 429 return v->Vol(); 430 } 431 return 0; 432 } 433 434 /// Get the number of volumes 435 /// 436 /// This returns the number of volumes available from this set. 437 /// It would be needed, for example, in order to iterate over all 438 /// volumes with the GetVol(int) method. 439 /// @return 440 /// The number of volumes available from this set. GetNumVols() const441 int GetNumVols() const 442 { 443 return (int)m_VolList.size(); 444 } 445 446 /// Get the size of the OID range. 447 /// 448 /// This method returns the total size of the combined (global) 449 /// OID range of this database. 450 /// 451 /// @return 452 /// The number of OIDs. GetNumOIDs() const453 int GetNumOIDs() const 454 { 455 return x_GetNumOIDs(); 456 } 457 458 /// Return storage held by the volumes 459 /// 460 /// This method returns any storage held by CSeqDBMemLease objects 461 /// which are part of this set of volumes. The memory leases will 462 /// be reacquired by the volumes if the data is requested again. UnLease()463 void UnLease() 464 { 465 for(int index = 0; index < (int) m_VolList.size(); index++) { 466 m_VolList[index].Vol()->UnLease(); 467 } 468 } 469 470 /// Get the first OID in a volume. 471 /// 472 /// Each volume is considered to span a range of OIDs. This 473 /// method returns the first OID in the OID range of the indicated 474 /// volume. The returned OID may not be included (ie. it may be 475 /// turned off via a filtering mechanism). 476 /// 477 /// @param i 478 /// The index of the volume. GetVolOIDStart(int i) const479 int GetVolOIDStart(int i) const 480 { 481 if (m_VolList.empty()) { 482 return 0; 483 } 484 485 if (i >= (int) m_VolList.size()) { 486 return 0; 487 } 488 489 m_RecentVol = i; 490 491 return m_VolList[i].OIDStart(); 492 } 493 494 /// Find total volume length for all volumes 495 /// 496 /// Each volume in the set has an internally stored length, which 497 /// indicates the length (in nucleotides/residues/bases) of all of 498 /// the sequences in the volume. This returns the total of these 499 /// lengths. 500 /// 501 /// @return 502 /// The sum of the lengths of all volumes. GetVolumeSetLength() const503 Uint8 GetVolumeSetLength() const 504 { 505 Uint8 vol_total = 0; 506 507 for(int index = 0; index < (int) m_VolList.size(); index++) { 508 vol_total += m_VolList[index].Vol()->GetVolumeLength(); 509 } 510 511 return vol_total; 512 } 513 GetMaxLength() const514 int GetMaxLength() const 515 { 516 int max_len = 0; 517 518 for(int index = 0; index < (int) m_VolList.size(); index++) { 519 max_len = max( max_len, m_VolList[index].Vol()->GetMaxLength()); 520 } 521 522 return max_len; 523 } 524 GetMinLength() const525 int GetMinLength() const 526 { 527 int min_len = INT4_MAX; 528 529 for(int index = 0; index < (int) m_VolList.size(); index++) { 530 min_len = min( min_len, m_VolList[index].Vol()->GetMinLength()); 531 } 532 533 return min_len; 534 } 535 536 /// Optimize the GI list configuration. 537 /// 538 /// This tells the volumes to examine and optimize their GI list 539 /// configuration. It should not be called until all GI lists 540 /// have been added to the volumes (by alias file processing). OptimizeGiLists()541 void OptimizeGiLists() 542 { 543 for(int i = 0; i< (int) m_VolList.size(); i++) { 544 m_VolList[i].Vol()->OptimizeGiLists(); 545 } 546 } 547 548 private: 549 /// Private constructor to prevent copy operation. 550 CSeqDBVolSet(const CSeqDBVolSet &); 551 552 /// Private operator to prevent assignment. 553 CSeqDBVolSet & operator=(const CSeqDBVolSet &); 554 555 /// Get the size of the entire OID range. x_GetNumOIDs() const556 int x_GetNumOIDs() const 557 { 558 if (m_VolList.empty()) 559 return 0; 560 561 return m_VolList.back().OIDEnd(); 562 } 563 564 /// Add a volume 565 /// 566 /// This method adds a volume to the set. 567 /// 568 /// @param atlas 569 /// The memory management layer object. 570 /// @param nm 571 /// The name of the volume. 572 /// @param pn 573 /// The sequence type. 574 /// @param user_list 575 /// If specified, will be used to include deflines by ID. 576 /// @param neg_list 577 /// If specified, will be used to exclude deflines by ID. 578 /// @param locked 579 /// The lock holder object for this thread. 580 void x_AddVolume(CSeqDBAtlas & atlas, 581 const string & nm, 582 char pn, 583 CSeqDBGiList * user_list, 584 CSeqDBNegativeList * neg_list, 585 CSeqDBLockHold & locked); 586 587 /// Find a volume by name 588 /// 589 /// This returns the CSeqDBVolEntry object for the volume matching 590 /// the specified name. 591 /// 592 /// @param volname 593 /// The name of the volume. 594 /// @return 595 /// A const pointer to the CSeqDBVolEntry object, or NULL. x_FindVolName(const string & volname) const596 const CSeqDBVolEntry * x_FindVolName(const string & volname) const 597 { 598 for(int i = 0; i< (int) m_VolList.size(); i++) { 599 if (volname == m_VolList[i].Vol()->GetVolName()) { 600 return & m_VolList[i]; 601 } 602 } 603 604 return 0; 605 } 606 607 /// Find a volume by name 608 /// 609 /// This returns the CSeqDBVolEntry object for the volume matching 610 /// the specified name (non const version). 611 /// 612 /// @param volname 613 /// The name of the volume. 614 /// @return 615 /// A non-const pointer to the CSeqDBVolEntry object, or NULL. x_FindVolName(const string & volname)616 CSeqDBVolEntry * x_FindVolName(const string & volname) 617 { 618 for(int i = 0; i < (int) m_VolList.size(); i++) { 619 if (volname == m_VolList[i].Vol()->GetVolName()) { 620 return & m_VolList[i]; 621 } 622 } 623 624 return 0; 625 } 626 627 /// The actual set of volumes. 628 vector<CSeqDBVolEntry> m_VolList; 629 630 /// The index of the most recently used volume 631 /// 632 /// This variable is mutable and volatile, but is not protected by 633 /// locking. Instead, the following precautions are always taken. 634 /// 635 /// 1. First, the value is copied into a local variable. 636 /// 2. Secondly, the range is always checked. 637 /// 3. It is always treated as a hint; there is always fallback 638 /// code to search for the correct volume. 639 mutable volatile int m_RecentVol; 640 }; 641 642 END_NCBI_SCOPE 643 644 #endif // OBJTOOLS_READERS_SEQDB__SEQDBVOLSET_HPP 645 646 647