1 #ifndef OBJECTS_OBJMGR___DATA_LOADER__HPP 2 #define OBJECTS_OBJMGR___DATA_LOADER__HPP 3 4 /* $Id: data_loader.hpp 610968 2020-06-26 12:55:17Z grichenk $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Aleksey Grichenko, Michael Kimelman, Eugene Vasilchenko 30 * 31 * File Description: 32 * Data loader base class for object manager 33 * 34 */ 35 36 #include <corelib/ncbiobj.hpp> 37 #include <util/range.hpp> 38 #include <objmgr/object_manager.hpp> 39 #include <objmgr/annot_name.hpp> 40 #include <objmgr/annot_type_selector.hpp> 41 #include <objmgr/impl/tse_lock.hpp> 42 #include <objmgr/blob_id.hpp> 43 44 #include <objects/seq/seq_id_handle.hpp> 45 #include <objects/seq/Seq_inst.hpp> 46 #include <corelib/plugin_manager.hpp> 47 #include <set> 48 #include <map> 49 50 BEGIN_NCBI_SCOPE 51 BEGIN_SCOPE(objects) 52 53 /** @addtogroup ObjectManagerCore 54 * 55 * @{ 56 */ 57 58 59 // fwd decl 60 class CDataSource; 61 class CTSE_Info; 62 class CTSE_Chunk_Info; 63 class CBioseq_Info; 64 class IEditSaver; 65 struct SAnnotSelector; 66 class CScope_Impl; 67 68 ///////////////////////////////////////////////////////////////////////////// 69 // structure to describe required data set 70 // 71 72 struct SRequestDetails 73 { 74 typedef CRange<TSeqPos> TRange; 75 typedef set<SAnnotTypeSelector> TAnnotTypesSet; 76 typedef map<CAnnotName, TAnnotTypesSet> TAnnotSet; 77 enum FAnnotBlobType { 78 fAnnotBlobNone = 0, 79 fAnnotBlobInternal = 1<<0, 80 fAnnotBlobExternal = 1<<1, 81 fAnnotBlobOrphan = 1<<2, 82 fAnnotBlobAll = (fAnnotBlobInternal | 83 fAnnotBlobExternal | 84 fAnnotBlobOrphan) 85 }; 86 typedef int TAnnotBlobType; 87 SRequestDetailsSRequestDetails88 SRequestDetails(void) 89 : m_NeedSeqMap(TRange::GetEmpty()), 90 m_NeedSeqData(TRange::GetEmpty()), 91 m_AnnotBlobType(fAnnotBlobNone) 92 { 93 } 94 95 TRange m_NeedSeqMap; 96 TRange m_NeedSeqData; 97 TAnnotSet m_NeedAnnots; 98 TAnnotBlobType m_AnnotBlobType; 99 }; 100 101 102 ///////////////////////////////////////////////////////////////////////////// 103 // Template for data loader construction. 104 class CLoaderMaker_Base 105 { 106 public: 107 // Virtual method for creating an instance of the data loader 108 virtual CDataLoader* CreateLoader(void) const = 0; 109 ~CLoaderMaker_Base(void)110 virtual ~CLoaderMaker_Base(void) {} 111 112 protected: 113 typedef SRegisterLoaderInfo<CDataLoader> TRegisterInfo_Base; 114 string m_Name; 115 TRegisterInfo_Base m_RegisterInfo; 116 117 friend class CObjectManager; 118 }; 119 120 121 // Construction of data loaders without arguments 122 template <class TDataLoader> 123 class CSimpleLoaderMaker : public CLoaderMaker_Base 124 { 125 public: CSimpleLoaderMaker(void)126 CSimpleLoaderMaker(void) 127 { 128 m_Name = TDataLoader::GetLoaderNameFromArgs(); 129 } 130 ~CSimpleLoaderMaker(void)131 virtual ~CSimpleLoaderMaker(void) {} 132 CreateLoader(void) const133 virtual CDataLoader* CreateLoader(void) const 134 { 135 return new TDataLoader(m_Name); 136 } 137 typedef SRegisterLoaderInfo<TDataLoader> TRegisterInfo; GetRegisterInfo(void)138 TRegisterInfo GetRegisterInfo(void) 139 { 140 TRegisterInfo info; 141 info.Set(m_RegisterInfo.GetLoader(), m_RegisterInfo.IsCreated()); 142 return info; 143 } 144 }; 145 146 147 // Construction of data loaders with an argument. A structure 148 // may be used to create loaders with multiple arguments. 149 template <class TDataLoader, class TParam> 150 class CParamLoaderMaker : public CLoaderMaker_Base 151 { 152 public: 153 typedef TParam TParamType; 154 public: 155 // TParam should have copy method. CParamLoaderMaker(TParam param)156 CParamLoaderMaker(TParam param) 157 : m_Param(param) 158 { 159 m_Name = TDataLoader::GetLoaderNameFromArgs(param); 160 } 161 ~CParamLoaderMaker(void)162 virtual ~CParamLoaderMaker(void) {} 163 CreateLoader(void) const164 virtual CDataLoader* CreateLoader(void) const 165 { 166 return new TDataLoader(m_Name, m_Param); 167 } 168 typedef SRegisterLoaderInfo<TDataLoader> TRegisterInfo; GetRegisterInfo(void)169 TRegisterInfo GetRegisterInfo(void) 170 { 171 TRegisterInfo info; 172 info.Set(m_RegisterInfo.GetLoader(), m_RegisterInfo.IsCreated()); 173 return info; 174 } 175 protected: 176 TParam m_Param; 177 }; 178 179 180 //////////////////////////////////////////////////////////////////// 181 // 182 // CDataLoader -- 183 // 184 // Load data from different sources 185 // 186 187 // There are three types of blobs (top-level Seq-entries) related to 188 // any Seq-id: 189 // 1. main (eBioseq/eBioseqCore/eSequence): 190 // Seq-entry containing Bioseq with Seq-id. 191 // 2. external (eExtAnnot): 192 // Seq-entry doesn't contain Bioseq but contains annotations on Seq-id, 193 // provided this data source contain some blob with Bioseq. 194 // 3. orphan (eOrphanAnnot): 195 // Seq-entry contains only annotations and this data source doesn't 196 // contain Bioseq with specified Seq-id at all. 197 198 class NCBI_XOBJMGR_EXPORT CDataLoader : public CObject 199 { 200 protected: 201 CDataLoader(void); 202 CDataLoader(const string& loader_name); 203 204 public: 205 virtual ~CDataLoader(void); 206 207 public: 208 /// main blob is blob with sequence 209 /// all other blobs are external and contain external annotations 210 enum EChoice { 211 eBlob, ///< whole main 212 eBioseq, ///< main blob with complete bioseq 213 eCore, ///< ?only seq-entry core? 214 eBioseqCore, ///< main blob with bioseq core (no seqdata and annots) 215 eSequence, ///< seq data 216 eFeatures, ///< features from main blob 217 eGraph, ///< graph annotations from main blob 218 eAlign, ///< aligns from main blob 219 eAnnot, ///< all annotations from main blob 220 eExtFeatures, ///< external features 221 eExtGraph, ///< external graph annotations 222 eExtAlign, ///< external aligns 223 eExtAnnot, ///< all external annotations 224 eOrphanAnnot, ///< all external annotations if no Bioseq exists 225 eAll ///< all blobs (main and external) 226 }; 227 228 typedef CTSE_Lock TTSE_Lock; 229 typedef set<TTSE_Lock> TTSE_LockSet; 230 typedef CRef<CTSE_Chunk_Info> TChunk; 231 typedef vector<TChunk> TChunkSet; 232 233 typedef set<string> TProcessedNAs; 234 static bool IsRequestedAnyNA(const SAnnotSelector* sel); 235 static bool IsRequestedNA(const string& na, const SAnnotSelector* sel); 236 static bool IsProcessedNA(const string& na, const TProcessedNAs* processed_nas); 237 static void SetProcessedNA(const string& na, TProcessedNAs* processed_nas); 238 239 /// Request from a datasource using handles and ranges instead of seq-loc 240 /// The TSEs loaded in this call will be added to the tse_set. 241 /// The GetRecords() may throw CBlobStateException if the sequence 242 /// is not available (not known or disabled), and blob state 243 /// is different from minimal fState_no_data. 244 /// The actual blob state can be read from the exception in this case. 245 virtual TTSE_LockSet GetRecords(const CSeq_id_Handle& idh, 246 EChoice choice); 247 /// The same as GetRecords() but always returns empty TSE lock set 248 /// instead of throwing CBlobStateException. 249 TTSE_LockSet GetRecordsNoBlobState(const CSeq_id_Handle& idh, 250 EChoice choice); 251 /// Request from a datasource using handles and ranges instead of seq-loc 252 /// The TSEs loaded in this call will be added to the tse_set. 253 /// Default implementation will call GetRecords(). 254 virtual TTSE_LockSet GetDetailedRecords(const CSeq_id_Handle& idh, 255 const SRequestDetails& details); 256 /// Request from a datasource set of blobs with external annotations. 257 /// CDataLoader has reasonable default implementation. 258 virtual TTSE_LockSet GetExternalRecords(const CBioseq_Info& bioseq); 259 260 /// old Get*AnnotRecords() methods 261 virtual TTSE_LockSet GetOrphanAnnotRecords(const CSeq_id_Handle& idh, 262 const SAnnotSelector* sel); 263 virtual TTSE_LockSet GetExternalAnnotRecords(const CSeq_id_Handle& idh, 264 const SAnnotSelector* sel); 265 virtual TTSE_LockSet GetExternalAnnotRecords(const CBioseq_Info& bioseq, 266 const SAnnotSelector* sel); 267 268 typedef set<CSeq_id_Handle> TSeq_idSet; 269 /// new Get*AnnotRecords() methods 270 virtual TTSE_LockSet GetOrphanAnnotRecordsNA(const CSeq_id_Handle& idh, 271 const SAnnotSelector* sel, 272 TProcessedNAs* processed_nas); 273 virtual TTSE_LockSet GetOrphanAnnotRecordsNA(const TSeq_idSet& ids, 274 const SAnnotSelector* sel, 275 TProcessedNAs* processed_nas); 276 virtual TTSE_LockSet GetExternalAnnotRecordsNA(const CSeq_id_Handle& idh, 277 const SAnnotSelector* sel, 278 TProcessedNAs* processed_nas); 279 virtual TTSE_LockSet GetExternalAnnotRecordsNA(const CBioseq_Info& bioseq, 280 const SAnnotSelector* sel, 281 TProcessedNAs* processed_nas); 282 283 typedef vector<CSeq_id_Handle> TIds; 284 /// Request for a list of all Seq-ids of a sequence. 285 /// The result container should not change if sequence with requested id 286 /// is not known. 287 /// The result must be non-empty for existing sequences 288 virtual void GetIds(const CSeq_id_Handle& idh, TIds& ids); 289 290 /// helper function to check if sequence exists, uses GetIds() 291 bool SequenceExists(const CSeq_id_Handle& idh); 292 293 /// Request for a accession.version Seq-id of a sequence. 294 /// Returns null CSeq_id_Handle if sequence with requested id is not known, 295 /// or if existing sequence doesn't have an accession 296 /// @sa GetAccVerFound() 297 virtual CSeq_id_Handle GetAccVer(const CSeq_id_Handle& idh); 298 /// Better replacement of GetAccVer(), this method should be defined in 299 /// data loaders, GetAccVer() is left for compatibility. 300 /// @sa GetAccVer() 301 struct SAccVerFound { 302 bool sequence_found; // true if the sequence is found by data loader 303 CSeq_id_Handle acc_ver; // may be null even for existing sequence SAccVerFoundCDataLoader::SAccVerFound304 SAccVerFound() : sequence_found(false) {} 305 }; 306 virtual SAccVerFound GetAccVerFound(const CSeq_id_Handle& idh); 307 308 /// Request for a gi of a sequence. 309 /// Returns zero gi if sequence with requested id is not known, 310 /// or if existing sequence doesn't have a gi 311 /// @sa GetGiFound() 312 virtual TGi GetGi(const CSeq_id_Handle& idh); 313 /// Better replacement of GetGi(), this method should be defined in 314 /// data loaders, GetGi() is left for compatibility. 315 /// @sa GetGi() 316 struct SGiFound { 317 bool sequence_found; // true if the sequence is found by data loader 318 TGi gi; // may be 0 even for existing sequence SGiFoundCDataLoader::SGiFound319 SGiFound() : sequence_found(false), gi(ZERO_GI) {} 320 }; 321 virtual SGiFound GetGiFound(const CSeq_id_Handle& idh); 322 323 /// Request for a label string of a sequence. 324 /// Returns empty string if sequence with requested id is not known. 325 /// The result must be non-empty for existing sequences 326 virtual string GetLabel(const CSeq_id_Handle& idh); 327 328 /// Request for a taxonomy id of a sequence. 329 /// Returns -1 if sequence with requested id is not known. 330 /// Returns 0 if existing sequence doesn't have TaxID 331 virtual TTaxId GetTaxId(const CSeq_id_Handle& idh); 332 333 /// Request for a length of a sequence. 334 /// Returns kInvalidSeqPos if sequence with requested id is not known. 335 /// The result must not be kInvalidSeqPos for existing sequences 336 virtual TSeqPos GetSequenceLength(const CSeq_id_Handle& idh); 337 338 /// Request for a type of a sequence 339 /// Returns CSeq_inst::eMol_not_set if sequence is not known 340 /// @sa GetSequenceTypeFound() 341 virtual CSeq_inst::TMol GetSequenceType(const CSeq_id_Handle& idh); 342 /// Better replacement of GetSequenceType(), this method should be 343 /// defined in data loaders, GetSequenceType() is left for compatibility. 344 /// @sa GetSequenceType() 345 struct STypeFound { 346 bool sequence_found; // true if the sequence is found by data loader 347 CSeq_inst::TMol type; // may be eMol_not_set even for existing sequence STypeFoundCDataLoader::STypeFound348 STypeFound() : sequence_found(false), type(CSeq_inst::eMol_not_set) {} 349 }; 350 virtual STypeFound GetSequenceTypeFound(const CSeq_id_Handle& idh); 351 352 /// Request for a state of a sequence. 353 /// Returns CBioseq_Handle::fState_not_found|fState_no_data if sequence 354 /// with requested id is not known. 355 /// Result mustn't be fState_not_found|fState_no_data if sequence exists 356 virtual int GetSequenceState(const CSeq_id_Handle& idh); 357 358 /// Request for a sequence hash. 359 /// Returns 0 if the sequence or its hash is not known. 360 /// @sa GetSequenceHashFound() 361 virtual int GetSequenceHash(const CSeq_id_Handle& idh); 362 /// Better replacement of GetSequenceHash(), this method should be 363 /// defined in data loaders, GetSequenceHash() is left for compatibility. 364 /// @sa GetSequenceHash() 365 struct SHashFound { 366 bool sequence_found; // true if the sequence is found by data loader 367 bool hash_known; // true if sequence exists and hash value is set 368 int hash; // may be 0 even for existing sequence SHashFoundCDataLoader::SHashFound369 SHashFound() 370 : sequence_found(false), 371 hash_known(false), 372 hash(0) 373 { 374 } 375 }; 376 virtual SHashFound GetSequenceHashFound(const CSeq_id_Handle& idh); 377 378 /// Bulk loading interface for a small pieces of information per id. 379 /// The 'loaded' bit set (in/out) marks ids that already processed. 380 /// If an element in 'loaded' is set on input then bulk methods 381 /// should skip corresponding id, as it's already processed. 382 /// Othewise, if the id is known and processed, the 'loaded' element 383 /// will be set to true. 384 /// Othewise, the 'loaded' element will remain false. 385 typedef vector<bool> TLoaded; 386 typedef vector<TGi> TGis; 387 typedef vector<string> TLabels; 388 typedef vector<TTaxId> TTaxIds; 389 typedef vector<TSeqPos> TSequenceLengths; 390 typedef vector<CSeq_inst::TMol> TSequenceTypes; 391 typedef vector<int> TSequenceStates; 392 typedef vector<int> TSequenceHashes; 393 typedef vector<bool> THashKnown; 394 /// Bulk request for accession.version Seq-ids of a set of sequences. 395 virtual void GetAccVers(const TIds& ids, TLoaded& loaded, TIds& ret); 396 /// Bulk request for gis of a set of sequences. 397 virtual void GetGis(const TIds& ids, TLoaded& loaded, TGis& ret); 398 /// Bulk request for label strings of a set of sequences. 399 virtual void GetLabels(const TIds& ids, TLoaded& loaded, TLabels& ret); 400 /// Bulk request for taxonomy ids of a set of sequences. 401 virtual void GetTaxIds(const TIds& ids, TLoaded& loaded, TTaxIds& ret); 402 /// Bulk request for lengths of a set of sequences. 403 virtual void GetSequenceLengths(const TIds& ids, TLoaded& loaded, 404 TSequenceLengths& ret); 405 /// Bulk request for types of a set of sequences. 406 virtual void GetSequenceTypes(const TIds& ids, TLoaded& loaded, 407 TSequenceTypes& ret); 408 /// Bulk request for states of a set of sequences. 409 virtual void GetSequenceStates(const TIds& ids, TLoaded& loaded, 410 TSequenceStates& ret); 411 /// Bulk request for hashes of a set of sequences. 412 virtual void GetSequenceHashes(const TIds& ids, TLoaded& loaded, 413 TSequenceHashes& ret, THashKnown& known); 414 415 // Load multiple seq-ids. Same as GetRecords() for multiple ids 416 // with choise set to eBlob. The map should be initialized with 417 // the id handles to be loaded. 418 typedef map<CSeq_id_Handle, TTSE_LockSet> TTSE_LockSets; 419 virtual void GetBlobs(TTSE_LockSets& tse_sets); 420 421 // blob operations 422 typedef CBlobIdKey TBlobId; 423 typedef int TBlobVersion; 424 virtual TBlobId GetBlobId(const CSeq_id_Handle& idh); 425 virtual TBlobId GetBlobIdFromString(const string& str) const; 426 virtual TBlobVersion GetBlobVersion(const TBlobId& id); 427 428 virtual bool CanGetBlobById(void) const; 429 virtual TTSE_Lock GetBlobById(const TBlobId& blob_id); 430 431 virtual SRequestDetails ChoiceToDetails(EChoice choice) const; 432 virtual EChoice DetailsToChoice(const SRequestDetails::TAnnotSet& annots) const; 433 virtual EChoice DetailsToChoice(const SRequestDetails& details) const; 434 435 virtual void GetChunk(TChunk chunk_info); 436 virtual void GetChunks(const TChunkSet& chunks); 437 438 // 439 virtual void DropTSE(CRef<CTSE_Info> tse_info); 440 441 /// Specify datasource to send loaded data to. 442 void SetTargetDataSource(CDataSource& data_source); 443 444 string GetName(void) const; 445 446 /// Resolve TSE conflict 447 /// *select the best TSE from the set of dead TSEs. 448 /// *select the live TSE from the list of live TSEs 449 /// and mark the others one as dead. 450 virtual TTSE_Lock ResolveConflict(const CSeq_id_Handle& id, 451 const TTSE_LockSet& tse_set); 452 virtual void GC(void); 453 454 typedef CRef<IEditSaver> TEditSaver; 455 virtual TEditSaver GetEditSaver() const; 456 457 virtual CObjectManager::TPriority GetDefaultPriority(void) const; 458 459 virtual Uint4 EstimateLoadBytes(const CTSE_Chunk_Info& chunk) const; 460 virtual double EstimateLoadSeconds(const CTSE_Chunk_Info& chunk, Uint4 bytes) const; 461 462 virtual unsigned GetDefaultBlobCacheSizeLimit() const; 463 464 protected: 465 /// Register the loader only if the name is not yet 466 /// registered in the object manager 467 static void RegisterInObjectManager( 468 CObjectManager& om, 469 CLoaderMaker_Base& loader_maker, 470 CObjectManager::EIsDefault is_default, 471 CObjectManager::TPriority priority); 472 473 void SetName(const string& loader_name); 474 CDataSource* GetDataSource(void) const; 475 476 friend class CGBReaderRequestResult; 477 friend class CScope_Impl; 478 479 private: 480 CDataLoader(const CDataLoader&); 481 CDataLoader& operator=(const CDataLoader&); 482 483 string m_Name; 484 CDataSource* m_DataSource; 485 486 friend class CObjectManager; 487 }; 488 489 490 /* @} */ 491 492 END_SCOPE(objects) 493 494 NCBI_DECLARE_INTERFACE_VERSION(objects::CDataLoader, "xloader", 6, 0, 0); 495 496 template<> 497 class CDllResolver_Getter<objects::CDataLoader> 498 { 499 public: operator ()(void)500 CPluginManager_DllResolver* operator()(void) 501 { 502 CPluginManager_DllResolver* resolver = 503 new CPluginManager_DllResolver 504 (CInterfaceVersion<objects::CDataLoader>::GetName(), 505 kEmptyStr, 506 CVersionInfo::kAny, 507 CDll::eAutoUnload); 508 509 resolver->SetDllNamePrefix("ncbi"); 510 return resolver; 511 } 512 }; 513 514 515 END_NCBI_SCOPE 516 517 #endif // OBJECTS_OBJMGR___DATA_LOADER__HPP 518