1 #ifndef OBJECTS_OBJMGR___DATA_LOADER__HPP
2 #define OBJECTS_OBJMGR___DATA_LOADER__HPP
3 
4 /*  $Id: data_loader.hpp 610968 2020-06-26 12:55:17Z grichenk $
5 * ===========================================================================
6 *
7 *                            PUBLIC DOMAIN NOTICE
8 *               National Center for Biotechnology Information
9 *
10 *  This software/database is a "United States Government Work" under the
11 *  terms of the United States Copyright Act.  It was written as part of
12 *  the author's official duties as a United States Government employee and
13 *  thus cannot be copyrighted.  This software/database is freely available
14 *  to the public for use. The National Library of Medicine and the U.S.
15 *  Government have not placed any restriction on its use or reproduction.
16 *
17 *  Although all reasonable efforts have been taken to ensure the accuracy
18 *  and reliability of the software and data, the NLM and the U.S.
19 *  Government do not and cannot warrant the performance or results that
20 *  may be obtained by using this software or data. The NLM and the U.S.
21 *  Government disclaim all warranties, express or implied, including
22 *  warranties of performance, merchantability or fitness for any particular
23 *  purpose.
24 *
25 *  Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Aleksey Grichenko, Michael Kimelman, Eugene Vasilchenko
30 *
31 * File Description:
32 *   Data loader base class for object manager
33 *
34 */
35 
36 #include <corelib/ncbiobj.hpp>
37 #include <util/range.hpp>
38 #include <objmgr/object_manager.hpp>
39 #include <objmgr/annot_name.hpp>
40 #include <objmgr/annot_type_selector.hpp>
41 #include <objmgr/impl/tse_lock.hpp>
42 #include <objmgr/blob_id.hpp>
43 
44 #include <objects/seq/seq_id_handle.hpp>
45 #include <objects/seq/Seq_inst.hpp>
46 #include <corelib/plugin_manager.hpp>
47 #include <set>
48 #include <map>
49 
50 BEGIN_NCBI_SCOPE
51 BEGIN_SCOPE(objects)
52 
53 /** @addtogroup ObjectManagerCore
54  *
55  * @{
56  */
57 
58 
// Forward declarations.
// These types are used in this header only through pointers, references,
// template arguments (CRef<>) or friend declarations, so complete
// definitions are not required here.
class CDataSource;
class CTSE_Info;
class CTSE_Chunk_Info;
class CBioseq_Info;
class IEditSaver;
struct SAnnotSelector;
class CScope_Impl;
67 
68 /////////////////////////////////////////////////////////////////////////////
69 // structure to describe required data set
70 //
71 
72 struct SRequestDetails
73 {
74     typedef CRange<TSeqPos> TRange;
75     typedef set<SAnnotTypeSelector> TAnnotTypesSet;
76     typedef map<CAnnotName, TAnnotTypesSet> TAnnotSet;
77     enum FAnnotBlobType {
78         fAnnotBlobNone      = 0,
79         fAnnotBlobInternal  = 1<<0,
80         fAnnotBlobExternal  = 1<<1,
81         fAnnotBlobOrphan    = 1<<2,
82         fAnnotBlobAll       = (fAnnotBlobInternal |
83                                fAnnotBlobExternal |
84                                fAnnotBlobOrphan)
85     };
86     typedef int TAnnotBlobType;
87 
SRequestDetailsSRequestDetails88     SRequestDetails(void)
89         : m_NeedSeqMap(TRange::GetEmpty()),
90           m_NeedSeqData(TRange::GetEmpty()),
91           m_AnnotBlobType(fAnnotBlobNone)
92         {
93         }
94 
95     TRange          m_NeedSeqMap;
96     TRange          m_NeedSeqData;
97     TAnnotSet       m_NeedAnnots;
98     TAnnotBlobType  m_AnnotBlobType;
99 };
100 
101 
102 /////////////////////////////////////////////////////////////////////////////
103 // Template for data loader construction.
104 class CLoaderMaker_Base
105 {
106 public:
107     // Virtual method for creating an instance of the data loader
108     virtual CDataLoader* CreateLoader(void) const = 0;
109 
~CLoaderMaker_Base(void)110     virtual ~CLoaderMaker_Base(void) {}
111 
112 protected:
113     typedef SRegisterLoaderInfo<CDataLoader> TRegisterInfo_Base;
114     string             m_Name;
115     TRegisterInfo_Base m_RegisterInfo;
116 
117     friend class CObjectManager;
118 };
119 
120 
121 // Construction of data loaders without arguments
122 template <class TDataLoader>
123 class CSimpleLoaderMaker : public CLoaderMaker_Base
124 {
125 public:
CSimpleLoaderMaker(void)126     CSimpleLoaderMaker(void)
127         {
128             m_Name = TDataLoader::GetLoaderNameFromArgs();
129         }
130 
~CSimpleLoaderMaker(void)131     virtual ~CSimpleLoaderMaker(void) {}
132 
CreateLoader(void) const133     virtual CDataLoader* CreateLoader(void) const
134         {
135             return new TDataLoader(m_Name);
136         }
137     typedef SRegisterLoaderInfo<TDataLoader> TRegisterInfo;
GetRegisterInfo(void)138     TRegisterInfo GetRegisterInfo(void)
139         {
140             TRegisterInfo info;
141             info.Set(m_RegisterInfo.GetLoader(), m_RegisterInfo.IsCreated());
142             return info;
143         }
144 };
145 
146 
147 // Construction of data loaders with an argument. A structure
148 // may be used to create loaders with multiple arguments.
149 template <class TDataLoader, class TParam>
150 class CParamLoaderMaker : public CLoaderMaker_Base
151 {
152 public:
153     typedef TParam TParamType;
154 public:
155     // TParam should have copy method.
CParamLoaderMaker(TParam param)156     CParamLoaderMaker(TParam param)
157         : m_Param(param)
158         {
159             m_Name = TDataLoader::GetLoaderNameFromArgs(param);
160         }
161 
~CParamLoaderMaker(void)162     virtual ~CParamLoaderMaker(void) {}
163 
CreateLoader(void) const164     virtual CDataLoader* CreateLoader(void) const
165         {
166             return new TDataLoader(m_Name, m_Param);
167         }
168     typedef SRegisterLoaderInfo<TDataLoader> TRegisterInfo;
GetRegisterInfo(void)169     TRegisterInfo GetRegisterInfo(void)
170         {
171             TRegisterInfo info;
172             info.Set(m_RegisterInfo.GetLoader(), m_RegisterInfo.IsCreated());
173             return info;
174         }
175 protected:
176     TParam m_Param;
177 };
178 
179 
180 ////////////////////////////////////////////////////////////////////
181 //
182 //  CDataLoader --
183 //
184 //  Load data from different sources
185 //
186 
187 // There are three types of blobs (top-level Seq-entries) related to
188 // any Seq-id:
189 //   1. main (eBioseq/eBioseqCore/eSequence):
190 //      Seq-entry containing Bioseq with Seq-id.
191 //   2. external (eExtAnnot):
192 //      Seq-entry doesn't contain Bioseq but contains annotations on Seq-id,
193 //      provided this data source contain some blob with Bioseq.
194 //   3. orphan (eOrphanAnnot):
195 //      Seq-entry contains only annotations and this data source doesn't
196 //      contain Bioseq with specified Seq-id at all.
197 
198 class NCBI_XOBJMGR_EXPORT CDataLoader : public CObject
199 {
200 protected:
201     CDataLoader(void);
202     CDataLoader(const string& loader_name);
203 
204 public:
205     virtual ~CDataLoader(void);
206 
207 public:
208     /// main blob is blob with sequence
209     /// all other blobs are external and contain external annotations
210     enum EChoice {
211         eBlob,        ///< whole main
212         eBioseq,      ///< main blob with complete bioseq
213         eCore,        ///< ?only seq-entry core?
214         eBioseqCore,  ///< main blob with bioseq core (no seqdata and annots)
215         eSequence,    ///< seq data
216         eFeatures,    ///< features from main blob
217         eGraph,       ///< graph annotations from main blob
218         eAlign,       ///< aligns from main blob
219         eAnnot,       ///< all annotations from main blob
220         eExtFeatures, ///< external features
221         eExtGraph,    ///< external graph annotations
222         eExtAlign,    ///< external aligns
223         eExtAnnot,    ///< all external annotations
224         eOrphanAnnot, ///< all external annotations if no Bioseq exists
225         eAll          ///< all blobs (main and external)
226     };
227 
228     typedef CTSE_Lock               TTSE_Lock;
229     typedef set<TTSE_Lock>          TTSE_LockSet;
230     typedef CRef<CTSE_Chunk_Info>   TChunk;
231     typedef vector<TChunk>          TChunkSet;
232 
233     typedef set<string> TProcessedNAs;
234     static bool IsRequestedAnyNA(const SAnnotSelector* sel);
235     static bool IsRequestedNA(const string& na, const SAnnotSelector* sel);
236     static bool IsProcessedNA(const string& na, const TProcessedNAs* processed_nas);
237     static void SetProcessedNA(const string& na, TProcessedNAs* processed_nas);
238 
239     /// Request from a datasource using handles and ranges instead of seq-loc
240     /// The TSEs loaded in this call will be added to the tse_set.
241     /// The GetRecords() may throw CBlobStateException if the sequence
242     /// is not available (not known or disabled), and blob state
243     /// is different from minimal fState_no_data.
244     /// The actual blob state can be read from the exception in this case.
245     virtual TTSE_LockSet GetRecords(const CSeq_id_Handle& idh,
246                                     EChoice choice);
247     /// The same as GetRecords() but always returns empty TSE lock set
248     /// instead of throwing CBlobStateException.
249     TTSE_LockSet GetRecordsNoBlobState(const CSeq_id_Handle& idh,
250                                        EChoice choice);
251     /// Request from a datasource using handles and ranges instead of seq-loc
252     /// The TSEs loaded in this call will be added to the tse_set.
253     /// Default implementation will call GetRecords().
254     virtual TTSE_LockSet GetDetailedRecords(const CSeq_id_Handle& idh,
255                                             const SRequestDetails& details);
256     /// Request from a datasource set of blobs with external annotations.
257     /// CDataLoader has reasonable default implementation.
258     virtual TTSE_LockSet GetExternalRecords(const CBioseq_Info& bioseq);
259 
260     /// old Get*AnnotRecords() methods
261     virtual TTSE_LockSet GetOrphanAnnotRecords(const CSeq_id_Handle& idh,
262                                                const SAnnotSelector* sel);
263     virtual TTSE_LockSet GetExternalAnnotRecords(const CSeq_id_Handle& idh,
264                                                  const SAnnotSelector* sel);
265     virtual TTSE_LockSet GetExternalAnnotRecords(const CBioseq_Info& bioseq,
266                                                  const SAnnotSelector* sel);
267 
268     typedef set<CSeq_id_Handle> TSeq_idSet;
269     /// new Get*AnnotRecords() methods
270     virtual TTSE_LockSet GetOrphanAnnotRecordsNA(const CSeq_id_Handle& idh,
271                                                  const SAnnotSelector* sel,
272                                                  TProcessedNAs* processed_nas);
273     virtual TTSE_LockSet GetOrphanAnnotRecordsNA(const TSeq_idSet& ids,
274                                                  const SAnnotSelector* sel,
275                                                  TProcessedNAs* processed_nas);
276     virtual TTSE_LockSet GetExternalAnnotRecordsNA(const CSeq_id_Handle& idh,
277                                                    const SAnnotSelector* sel,
278                                                    TProcessedNAs* processed_nas);
279     virtual TTSE_LockSet GetExternalAnnotRecordsNA(const CBioseq_Info& bioseq,
280                                                    const SAnnotSelector* sel,
281                                                    TProcessedNAs* processed_nas);
282 
283     typedef vector<CSeq_id_Handle> TIds;
284     /// Request for a list of all Seq-ids of a sequence.
285     /// The result container should not change if sequence with requested id
286     /// is not known.
287     /// The result must be non-empty for existing sequences
288     virtual void GetIds(const CSeq_id_Handle& idh, TIds& ids);
289 
290     /// helper function to check if sequence exists, uses GetIds()
291     bool SequenceExists(const CSeq_id_Handle& idh);
292 
293     /// Request for a accession.version Seq-id of a sequence.
294     /// Returns null CSeq_id_Handle if sequence with requested id is not known,
295     /// or if existing sequence doesn't have an accession
296     /// @sa GetAccVerFound()
297     virtual CSeq_id_Handle GetAccVer(const CSeq_id_Handle& idh);
298     /// Better replacement of GetAccVer(), this method should be defined in
299     /// data loaders, GetAccVer() is left for compatibility.
300     /// @sa GetAccVer()
301     struct SAccVerFound {
302         bool sequence_found; // true if the sequence is found by data loader
303         CSeq_id_Handle acc_ver; // may be null even for existing sequence
SAccVerFoundCDataLoader::SAccVerFound304         SAccVerFound() : sequence_found(false) {}
305     };
306     virtual SAccVerFound GetAccVerFound(const CSeq_id_Handle& idh);
307 
308     /// Request for a gi of a sequence.
309     /// Returns zero gi if sequence with requested id is not known,
310     /// or if existing sequence doesn't have a gi
311     /// @sa GetGiFound()
312     virtual TGi GetGi(const CSeq_id_Handle& idh);
313     /// Better replacement of GetGi(), this method should be defined in
314     /// data loaders, GetGi() is left for compatibility.
315     /// @sa GetGi()
316     struct SGiFound {
317         bool sequence_found; // true if the sequence is found by data loader
318         TGi gi; // may be 0 even for existing sequence
SGiFoundCDataLoader::SGiFound319         SGiFound() : sequence_found(false), gi(ZERO_GI) {}
320     };
321     virtual SGiFound GetGiFound(const CSeq_id_Handle& idh);
322 
323     /// Request for a label string of a sequence.
324     /// Returns empty string if sequence with requested id is not known.
325     /// The result must be non-empty for existing sequences
326     virtual string GetLabel(const CSeq_id_Handle& idh);
327 
328     /// Request for a taxonomy id of a sequence.
329     /// Returns -1 if sequence with requested id is not known.
330     /// Returns 0 if existing sequence doesn't have TaxID
331     virtual TTaxId GetTaxId(const CSeq_id_Handle& idh);
332 
333     /// Request for a length of a sequence.
334     /// Returns kInvalidSeqPos if sequence with requested id is not known.
335     /// The result must not be kInvalidSeqPos for existing sequences
336     virtual TSeqPos GetSequenceLength(const CSeq_id_Handle& idh);
337 
338     /// Request for a type of a sequence
339     /// Returns CSeq_inst::eMol_not_set if sequence is not known
340     /// @sa GetSequenceTypeFound()
341     virtual CSeq_inst::TMol GetSequenceType(const CSeq_id_Handle& idh);
342     /// Better replacement of GetSequenceType(), this method should be
343     /// defined in data loaders, GetSequenceType() is left for compatibility.
344     /// @sa GetSequenceType()
345     struct STypeFound {
346         bool sequence_found; // true if the sequence is found by data loader
347         CSeq_inst::TMol type; // may be eMol_not_set even for existing sequence
STypeFoundCDataLoader::STypeFound348         STypeFound() : sequence_found(false), type(CSeq_inst::eMol_not_set) {}
349     };
350     virtual STypeFound GetSequenceTypeFound(const CSeq_id_Handle& idh);
351 
352     /// Request for a state of a sequence.
353     /// Returns CBioseq_Handle::fState_not_found|fState_no_data if sequence
354     /// with requested id is not known.
355     /// Result mustn't be fState_not_found|fState_no_data if sequence exists
356     virtual int GetSequenceState(const CSeq_id_Handle& idh);
357 
358     /// Request for a sequence hash.
359     /// Returns 0 if the sequence or its hash is not known.
360     /// @sa GetSequenceHashFound()
361     virtual int GetSequenceHash(const CSeq_id_Handle& idh);
362     /// Better replacement of GetSequenceHash(), this method should be
363     /// defined in data loaders, GetSequenceHash() is left for compatibility.
364     /// @sa GetSequenceHash()
365     struct SHashFound {
366         bool sequence_found; // true if the sequence is found by data loader
367         bool hash_known; // true if sequence exists and hash value is set
368         int hash; // may be 0 even for existing sequence
SHashFoundCDataLoader::SHashFound369         SHashFound()
370             : sequence_found(false),
371               hash_known(false),
372               hash(0)
373             {
374             }
375     };
376     virtual SHashFound GetSequenceHashFound(const CSeq_id_Handle& idh);
377 
378     /// Bulk loading interface for a small pieces of information per id.
379     /// The 'loaded' bit set (in/out) marks ids that already processed.
380     /// If an element in 'loaded' is set on input then bulk methods
381     /// should skip corresponding id, as it's already processed.
382     /// Othewise, if the id is known and processed, the 'loaded' element
383     /// will be set to true.
384     /// Othewise, the 'loaded' element will remain false.
385     typedef vector<bool> TLoaded;
386     typedef vector<TGi> TGis;
387     typedef vector<string> TLabels;
388     typedef vector<TTaxId> TTaxIds;
389     typedef vector<TSeqPos> TSequenceLengths;
390     typedef vector<CSeq_inst::TMol> TSequenceTypes;
391     typedef vector<int> TSequenceStates;
392     typedef vector<int> TSequenceHashes;
393     typedef vector<bool> THashKnown;
394     /// Bulk request for accession.version Seq-ids of a set of sequences.
395     virtual void GetAccVers(const TIds& ids, TLoaded& loaded, TIds& ret);
396     /// Bulk request for gis of a set of sequences.
397     virtual void GetGis(const TIds& ids, TLoaded& loaded, TGis& ret);
398     /// Bulk request for label strings of a set of sequences.
399     virtual void GetLabels(const TIds& ids, TLoaded& loaded, TLabels& ret);
400     /// Bulk request for taxonomy ids of a set of sequences.
401     virtual void GetTaxIds(const TIds& ids, TLoaded& loaded, TTaxIds& ret);
402     /// Bulk request for lengths of a set of sequences.
403     virtual void GetSequenceLengths(const TIds& ids, TLoaded& loaded,
404                                     TSequenceLengths& ret);
405     /// Bulk request for types of a set of sequences.
406     virtual void GetSequenceTypes(const TIds& ids, TLoaded& loaded,
407                                   TSequenceTypes& ret);
408     /// Bulk request for states of a set of sequences.
409     virtual void GetSequenceStates(const TIds& ids, TLoaded& loaded,
410                                    TSequenceStates& ret);
411     /// Bulk request for hashes of a set of sequences.
412     virtual void GetSequenceHashes(const TIds& ids, TLoaded& loaded,
413                                    TSequenceHashes& ret, THashKnown& known);
414 
415     // Load multiple seq-ids. Same as GetRecords() for multiple ids
416     // with choise set to eBlob. The map should be initialized with
417     // the id handles to be loaded.
418     typedef map<CSeq_id_Handle, TTSE_LockSet> TTSE_LockSets;
419     virtual void GetBlobs(TTSE_LockSets& tse_sets);
420 
421     // blob operations
422     typedef CBlobIdKey TBlobId;
423     typedef int TBlobVersion;
424     virtual TBlobId GetBlobId(const CSeq_id_Handle& idh);
425     virtual TBlobId GetBlobIdFromString(const string& str) const;
426     virtual TBlobVersion GetBlobVersion(const TBlobId& id);
427 
428     virtual bool CanGetBlobById(void) const;
429     virtual TTSE_Lock GetBlobById(const TBlobId& blob_id);
430 
431     virtual SRequestDetails ChoiceToDetails(EChoice choice) const;
432     virtual EChoice DetailsToChoice(const SRequestDetails::TAnnotSet& annots) const;
433     virtual EChoice DetailsToChoice(const SRequestDetails& details) const;
434 
435     virtual void GetChunk(TChunk chunk_info);
436     virtual void GetChunks(const TChunkSet& chunks);
437 
438     //
439     virtual void DropTSE(CRef<CTSE_Info> tse_info);
440 
441     /// Specify datasource to send loaded data to.
442     void SetTargetDataSource(CDataSource& data_source);
443 
444     string GetName(void) const;
445 
446     /// Resolve TSE conflict
447     /// *select the best TSE from the set of dead TSEs.
448     /// *select the live TSE from the list of live TSEs
449     ///  and mark the others one as dead.
450     virtual TTSE_Lock ResolveConflict(const CSeq_id_Handle& id,
451                                       const TTSE_LockSet& tse_set);
452     virtual void GC(void);
453 
454     typedef CRef<IEditSaver> TEditSaver;
455     virtual TEditSaver GetEditSaver() const;
456 
457     virtual CObjectManager::TPriority GetDefaultPriority(void) const;
458 
459     virtual Uint4 EstimateLoadBytes(const CTSE_Chunk_Info& chunk) const;
460     virtual double EstimateLoadSeconds(const CTSE_Chunk_Info& chunk, Uint4 bytes) const;
461 
462     virtual unsigned GetDefaultBlobCacheSizeLimit() const;
463 
464 protected:
465     /// Register the loader only if the name is not yet
466     /// registered in the object manager
467     static void RegisterInObjectManager(
468         CObjectManager&            om,
469         CLoaderMaker_Base&         loader_maker,
470         CObjectManager::EIsDefault is_default,
471         CObjectManager::TPriority  priority);
472 
473     void SetName(const string& loader_name);
474     CDataSource* GetDataSource(void) const;
475 
476     friend class CGBReaderRequestResult;
477     friend class CScope_Impl;
478 
479 private:
480     CDataLoader(const CDataLoader&);
481     CDataLoader& operator=(const CDataLoader&);
482 
483     string       m_Name;
484     CDataSource* m_DataSource;
485 
486     friend class CObjectManager;
487 };
488 
489 
490 /* @} */
491 
492 END_SCOPE(objects)
493 
494 NCBI_DECLARE_INTERFACE_VERSION(objects::CDataLoader, "xloader", 6, 0, 0);
495 
496 template<>
497 class CDllResolver_Getter<objects::CDataLoader>
498 {
499 public:
operator ()(void)500     CPluginManager_DllResolver* operator()(void)
501     {
502         CPluginManager_DllResolver* resolver =
503             new CPluginManager_DllResolver
504             (CInterfaceVersion<objects::CDataLoader>::GetName(),
505              kEmptyStr,
506              CVersionInfo::kAny,
507              CDll::eAutoUnload);
508 
509         resolver->SetDllNamePrefix("ncbi");
510         return resolver;
511     }
512 };
513 
514 
515 END_NCBI_SCOPE
516 
517 #endif  // OBJECTS_OBJMGR___DATA_LOADER__HPP
518