1 /*  $Id: sraloader.cpp 592089 2019-08-26 18:56:36Z grichenk $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Eugene Vasilchenko
27  *
28  * File Description: SRA file data loader
29  *
30  * ===========================================================================
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 
36 #include <objects/general/general__.hpp>
37 #include <objects/seqloc/Seq_id.hpp>
38 #include <objects/seq/seq__.hpp>
39 #include <objects/seqres/seqres__.hpp>
40 
41 #include <objmgr/impl/data_source.hpp>
42 #include <objmgr/impl/tse_loadlock.hpp>
43 #include <objmgr/data_loader_factory.hpp>
44 #include <corelib/plugin_manager_impl.hpp>
45 #include <corelib/plugin_manager_store.hpp>
46 
47 #include <sra/data_loaders/sra/sraloader.hpp>
48 #include <sra/data_loaders/sra/impl/sraloader_impl.hpp>
49 #include <sra/readers/ncbi_traces_path.hpp>
50 
51 BEGIN_NCBI_SCOPE
52 
53 class CObject;
54 
55 NCBI_PARAM_DECL(bool, SRA_LOADER, TRIM);
56 NCBI_PARAM_DEF_EX(bool, SRA_LOADER, TRIM, false,
57                   eParam_NoThread, SRA_LOADER_TRIM);
58 
59 BEGIN_SCOPE(objects)
60 
61 class CDataLoader;
62 
63 BEGIN_LOCAL_NAMESPACE;
64 
65 class CLoaderFilter : public CObjectManager::IDataLoaderFilter {
66 public:
IsDataLoaderMatches(CDataLoader & loader) const67     bool IsDataLoaderMatches(CDataLoader& loader) const {
68         return dynamic_cast<CSRADataLoader*>(&loader) != 0;
69     }
70 };
71 
72 
73 class CRevoker {
74 public:
~CRevoker()75     ~CRevoker() {
76         CLoaderFilter filter;
77         CObjectManager::GetInstance()->RevokeDataLoaders(filter);
78     }
79 };
80 static CSafeStatic<CRevoker> s_Revoker(CSafeStaticLifeSpan(
81     CSafeStaticLifeSpan::eLifeLevel_AppMain,
82     CSafeStaticLifeSpan::eLifeSpan_Long));
83 
84 END_LOCAL_NAMESPACE;
85 
86 
87 /////////////////////////////////////////////////////////////////////////////
88 // CSRABlobId
89 /////////////////////////////////////////////////////////////////////////////
90 
91 class CSRABlobId : public CBlobId
92 {
93 public:
94     CSRABlobId(const string& acc, unsigned spot_id);
95     ~CSRABlobId(void);
96 
97     string m_Accession;
98     unsigned m_SpotId;
99 
100     string ToString(void) const;
101     bool operator<(const CBlobId& id) const;
102     bool operator==(const CBlobId& id) const;
103 };
104 
105 
CSRABlobId(const string & acc,unsigned spot_id)106 CSRABlobId::CSRABlobId(const string& acc, unsigned spot_id)
107     : m_Accession(acc), m_SpotId(spot_id)
108 {
109 }
110 
111 
~CSRABlobId(void)112 CSRABlobId::~CSRABlobId(void)
113 {
114 }
115 
116 
ToString(void) const117 string CSRABlobId::ToString(void) const
118 {
119     CNcbiOstrstream out;
120     out << m_Accession << '.' << m_SpotId;
121     return CNcbiOstrstreamToString(out);
122 }
123 
124 
operator <(const CBlobId & id) const125 bool CSRABlobId::operator<(const CBlobId& id) const
126 {
127     const CSRABlobId& sra2 = dynamic_cast<const CSRABlobId&>(id);
128     return m_Accession < sra2.m_Accession ||
129         (m_Accession == sra2.m_Accession && m_SpotId < sra2.m_SpotId);
130 }
131 
132 
operator ==(const CBlobId & id) const133 bool CSRABlobId::operator==(const CBlobId& id) const
134 {
135     const CSRABlobId& sra2 = dynamic_cast<const CSRABlobId&>(id);
136     return m_Accession == sra2.m_Accession && m_SpotId == sra2.m_SpotId;
137 }
138 
139 
140 /////////////////////////////////////////////////////////////////////////////
141 // CSRADataLoader_Impl
142 /////////////////////////////////////////////////////////////////////////////
143 
144 
GetTrimParam(void)145 static bool GetTrimParam(void)
146 {
147     static NCBI_PARAM_TYPE(SRA_LOADER, TRIM) s_Value;
148     return s_Value.Get();
149 }
150 
151 
CSRADataLoader_Impl(CSraMgr::ETrim trim)152 CSRADataLoader_Impl::CSRADataLoader_Impl(CSraMgr::ETrim trim)
153     : m_Mgr(trim)
154 {
155 }
156 
157 
~CSRADataLoader_Impl(void)158 CSRADataLoader_Impl::~CSRADataLoader_Impl(void)
159 {
160 }
161 
162 
LoadSRAEntry(const string & accession,unsigned spot_id)163 CRef<CSeq_entry> CSRADataLoader_Impl::LoadSRAEntry(const string& accession,
164                                                    unsigned spot_id)
165 {
166     CMutexGuard LOCK(m_Mutex);
167     if ( m_Run.GetAccession() != accession ) {
168         m_Run.Init(m_Mgr, accession);
169     }
170     return m_Run.GetSpotEntry(spot_id);
171 }
172 
173 
GetSequenceType(const string & accession,unsigned spot_id,unsigned read_id)174 CSeq_inst::TMol CSRADataLoader_Impl::GetSequenceType(const string& accession,
175                                                      unsigned spot_id,
176                                                      unsigned read_id)
177 {
178     CMutexGuard LOCK(m_Mutex);
179     if ( m_Run.GetAccession() != accession ) {
180         m_Run.Init(m_Mgr, accession);
181     }
182     return m_Run.GetSequenceType(spot_id, read_id);
183 }
184 
185 
GetSequenceLength(const string & accession,unsigned spot_id,unsigned read_id)186 TSeqPos CSRADataLoader_Impl::GetSequenceLength(const string& accession,
187                                                unsigned spot_id,
188                                                unsigned read_id)
189 {
190     CMutexGuard LOCK(m_Mutex);
191     if ( m_Run.GetAccession() != accession ) {
192         m_Run.Init(m_Mgr, accession);
193     }
194     return m_Run.GetSequenceLength(spot_id, read_id);
195 }
196 
197 
198 /////////////////////////////////////////////////////////////////////////////
199 // CSRADataLoader
200 /////////////////////////////////////////////////////////////////////////////
201 
SLoaderParams(void)202 CSRADataLoader::SLoaderParams::SLoaderParams(void)
203     : m_Trim(GetTrimParam())
204 {
205 }
206 
207 
SLoaderParams(bool trim)208 CSRADataLoader::SLoaderParams::SLoaderParams(bool trim)
209     : m_Trim(trim)
210 {
211 }
212 
213 
~SLoaderParams(void)214 CSRADataLoader::SLoaderParams::~SLoaderParams(void)
215 {
216 }
217 
218 
RegisterInObjectManager(CObjectManager & om,CObjectManager::EIsDefault is_default,CObjectManager::TPriority priority)219 CSRADataLoader::TRegisterLoaderInfo CSRADataLoader::RegisterInObjectManager(
220     CObjectManager& om,
221     CObjectManager::EIsDefault is_default,
222     CObjectManager::TPriority priority)
223 {
224     SLoaderParams params;
225     TMaker maker(params);
226     CDataLoader::RegisterInObjectManager(om, maker, is_default, priority);
227     return maker.GetRegisterInfo();
228 }
229 
230 
RegisterInObjectManager(CObjectManager & om,const string & rep_path,const string & vol_path,CObjectManager::EIsDefault is_default,CObjectManager::TPriority priority)231 CSRADataLoader::TRegisterLoaderInfo CSRADataLoader::RegisterInObjectManager(
232     CObjectManager& om,
233     const string& rep_path,
234     const string& vol_path,
235     CObjectManager::EIsDefault is_default,
236     CObjectManager::TPriority priority)
237 {
238     SLoaderParams params;
239     params.m_RepPath = rep_path;
240     params.m_VolPath = vol_path;
241     TMaker maker(params);
242     CDataLoader::RegisterInObjectManager(om, maker, is_default, priority);
243     return maker.GetRegisterInfo();
244 }
245 
246 
RegisterInObjectManager(CObjectManager & om,ETrim trim,CObjectManager::EIsDefault is_default,CObjectManager::TPriority priority)247 CSRADataLoader::TRegisterLoaderInfo CSRADataLoader::RegisterInObjectManager(
248     CObjectManager& om,
249     ETrim trim,
250     CObjectManager::EIsDefault is_default,
251     CObjectManager::TPriority priority)
252 {
253     SLoaderParams params(trim == eTrim);
254     TMaker maker(params);
255     CDataLoader::RegisterInObjectManager(om, maker, is_default, priority);
256     return maker.GetRegisterInfo();
257 }
258 
259 
RegisterInObjectManager(CObjectManager & om,const string & rep_path,const string & vol_path,ETrim trim,CObjectManager::EIsDefault is_default,CObjectManager::TPriority priority)260 CSRADataLoader::TRegisterLoaderInfo CSRADataLoader::RegisterInObjectManager(
261     CObjectManager& om,
262     const string& rep_path,
263     const string& vol_path,
264     ETrim trim,
265     CObjectManager::EIsDefault is_default,
266     CObjectManager::TPriority priority)
267 {
268     SLoaderParams params(trim == eTrim);
269     params.m_RepPath = rep_path;
270     params.m_VolPath = vol_path;
271     TMaker maker(params);
272     CDataLoader::RegisterInObjectManager(om, maker, is_default, priority);
273     return maker.GetRegisterInfo();
274 }
275 
276 
GetLoaderNameFromArgs(const SLoaderParams & params)277 string CSRADataLoader::GetLoaderNameFromArgs(const SLoaderParams& params)
278 {
279     string ret = "SRADataLoader";
280     if ( params.m_Trim ) {
281         ret += "Trim";
282     }
283     if ( !params.m_RepPath.empty() || !params.m_VolPath.empty() ) {
284         ret += ":";
285         ret += params.m_RepPath;
286         ret += ":";
287         ret += params.m_VolPath;
288     }
289     return ret;
290 }
291 
292 
GetLoaderNameFromArgs(void)293 string CSRADataLoader::GetLoaderNameFromArgs(void)
294 {
295     SLoaderParams params;
296     return GetLoaderNameFromArgs(params);
297 }
298 
299 
// Loader name for explicit repository/volume paths and the default trim
// flag.
string CSRADataLoader::GetLoaderNameFromArgs(const string& rep_path,
                                             const string& vol_path)
{
    SLoaderParams loader_params;
    loader_params.m_RepPath = rep_path;
    loader_params.m_VolPath = vol_path;
    return GetLoaderNameFromArgs(loader_params);
}
308 
309 
GetLoaderNameFromArgs(ETrim trim)310 string CSRADataLoader::GetLoaderNameFromArgs(ETrim trim)
311 {
312     SLoaderParams params(trim == eTrim);
313     return GetLoaderNameFromArgs(params);
314 }
315 
316 
// Loader name for explicit repository/volume paths and an explicit trim
// mode.
string CSRADataLoader::GetLoaderNameFromArgs(const string& rep_path,
                                             const string& vol_path,
                                             ETrim trim)
{
    const bool do_trim = (trim == eTrim);
    SLoaderParams loader_params(do_trim);
    loader_params.m_RepPath = rep_path;
    loader_params.m_VolPath = vol_path;
    return GetLoaderNameFromArgs(loader_params);
}
326 
327 
CSRADataLoader(const string & loader_name,const SLoaderParams & params)328 CSRADataLoader::CSRADataLoader(const string& loader_name,
329                                const SLoaderParams& params)
330     : CDataLoader(loader_name)
331 {
332     CSraMgr::ETrim trim = params.m_Trim? CSraMgr::eTrim: CSraMgr::eNoTrim;
333     m_Impl = new CSRADataLoader_Impl(trim);
334 }
335 
336 
~CSRADataLoader(void)337 CSRADataLoader::~CSRADataLoader(void)
338 {
339 }
340 
341 
342 typedef pair<CRef<CSRABlobId>, unsigned> TReadId;
343 
sx_GetReadId(const string & sra,bool with_chunk)344 static TReadId sx_GetReadId(const string& sra, bool with_chunk)
345 {
346     SIZE_TYPE dot1 = sra.find('.');
347     if ( dot1 == NPOS ) {
348         return TReadId();
349     }
350     SIZE_TYPE dot2 = with_chunk? sra.find('.', dot1+1): sra.size();
351     if ( dot2 == NPOS || dot1+1 >= dot2 || sra[dot1+1] == '0' ||
352          (with_chunk && (dot2+2 != sra.size() ||
353                          (sra[dot2+1] != '2' && sra[dot2+1] != '4') )) ) {
354         return TReadId();
355     }
356     unsigned spot_id =
357         NStr::StringToUInt(CTempString(sra.data()+dot1+1, dot2-dot1-1));
358     TReadId ret;
359     ret.first = new CSRABlobId(sra.substr(0, dot1), spot_id);
360     ret.second = sra[dot2+1] - '0';
361     return ret;
362 }
363 
364 
sx_GetReadId(const CSeq_id_Handle & idh)365 static TReadId sx_GetReadId(const CSeq_id_Handle& idh)
366 {
367     if ( idh.Which() != CSeq_id::e_General ) {
368         return TReadId();
369     }
370     CConstRef<CSeq_id> id = idh.GetSeqId();
371     const CDbtag& general = id->GetGeneral();
372     if ( general.GetDb() != "SRA") {
373         return TReadId();
374     }
375     return sx_GetReadId(general.GetTag().GetStr(), true);
376 }
377 
378 
GetBlobId(const CSeq_id_Handle & idh)379 CDataLoader::TBlobId CSRADataLoader::GetBlobId(const CSeq_id_Handle& idh)
380 {
381     return TBlobId(sx_GetReadId(idh).first);
382 }
383 
384 
385 CDataLoader::TBlobId
GetBlobIdFromString(const string & str) const386 CSRADataLoader::GetBlobIdFromString(const string& str) const
387 {
388     return TBlobId(sx_GetReadId(str, false).first);
389 }
390 
391 
CanGetBlobById(void) const392 bool CSRADataLoader::CanGetBlobById(void) const
393 {
394     return true;
395 }
396 
397 
398 CDataLoader::TTSE_LockSet
GetRecords(const CSeq_id_Handle & idh,EChoice)399 CSRADataLoader::GetRecords(const CSeq_id_Handle& idh,
400                            EChoice /* choice */)
401 {
402     TTSE_LockSet locks;
403     TBlobId blob_id = GetBlobId(idh);
404     if ( blob_id ) {
405         locks.insert(GetBlobById(blob_id));
406     }
407     return locks;
408 }
409 
410 
411 CDataLoader::TTSE_Lock
GetBlobById(const TBlobId & blob_id)412 CSRADataLoader::GetBlobById(const TBlobId& blob_id)
413 {
414     CTSE_LoadLock load_lock = GetDataSource()->GetTSE_LoadLock(blob_id);
415     if ( !load_lock.IsLoaded() ) {
416         const CSRABlobId& sra_id = dynamic_cast<const CSRABlobId&>(*blob_id);
417         CRef<CSeq_entry> entry =
418             m_Impl->LoadSRAEntry(sra_id.m_Accession, sra_id.m_SpotId);
419         if ( entry ) {
420             load_lock->SetSeq_entry(*entry);
421         }
422         load_lock.SetLoaded();
423     }
424     return load_lock;
425 }
426 
427 
GetSequenceLength(const CSeq_id_Handle & idh)428 TSeqPos CSRADataLoader::GetSequenceLength(const CSeq_id_Handle& idh)
429 {
430     TReadId read_id = sx_GetReadId(idh);
431     if ( read_id.first ) {
432         const CSRABlobId& sra_id = *read_id.first;
433         return m_Impl->GetSequenceLength(sra_id.m_Accession,
434                                          sra_id.m_SpotId,
435                                          read_id.second);
436     }
437     return kInvalidSeqPos;
438 }
439 
440 
441 CDataLoader::STypeFound
GetSequenceTypeFound(const CSeq_id_Handle & idh)442 CSRADataLoader::GetSequenceTypeFound(const CSeq_id_Handle& idh)
443 {
444     STypeFound ret;
445     TReadId read_id = sx_GetReadId(idh);
446     if ( read_id.first ) {
447         const CSRABlobId& sra_id = *read_id.first;
448         ret.sequence_found = true;
449         ret.type = m_Impl->GetSequenceType(sra_id.m_Accession,
450                                            sra_id.m_SpotId,
451                                            read_id.second);
452     }
453     return ret;
454 }
455 
456 
GetDefaultPriority(void) const457 CObjectManager::TPriority CSRADataLoader::GetDefaultPriority(void) const
458 {
459     return CObjectManager::kPriority_Replace;
460 }
461 
462 
463 END_SCOPE(objects)
464 
465 // ===========================================================================
466 
467 USING_SCOPE(objects);
468 
DataLoaders_Register_SRA(void)469 void DataLoaders_Register_SRA(void)
470 {
471     RegisterEntryPoint<CDataLoader>(NCBI_EntryPoint_DataLoader_Sra);
472 }
473 
474 
475 const string kDataLoader_Sra_DriverName("sra");
476 
477 class CSRA_DataLoaderCF : public CDataLoaderFactory
478 {
479 public:
CSRA_DataLoaderCF(void)480     CSRA_DataLoaderCF(void)
481         : CDataLoaderFactory(kDataLoader_Sra_DriverName) {}
~CSRA_DataLoaderCF(void)482     virtual ~CSRA_DataLoaderCF(void) {}
483 
484 protected:
485     virtual CDataLoader* CreateAndRegister(
486         CObjectManager& om,
487         const TPluginManagerParamTree* params) const;
488 };
489 
490 
CreateAndRegister(CObjectManager & om,const TPluginManagerParamTree * params) const491 CDataLoader* CSRA_DataLoaderCF::CreateAndRegister(
492     CObjectManager& om,
493     const TPluginManagerParamTree* params) const
494 {
495     if ( !ValidParams(params) ) {
496         // Use constructor without arguments
497         return CSRADataLoader::RegisterInObjectManager(om).GetLoader();
498     }
499     // IsDefault and Priority arguments may be specified
500     return CSRADataLoader::RegisterInObjectManager(
501         om,
502         GetIsDefault(params),
503         GetPriority(params)).GetLoader();
504 }
505 
506 
NCBI_EntryPoint_DataLoader_Sra(CPluginManager<CDataLoader>::TDriverInfoList & info_list,CPluginManager<CDataLoader>::EEntryPointRequest method)507 void NCBI_EntryPoint_DataLoader_Sra(
508     CPluginManager<CDataLoader>::TDriverInfoList&   info_list,
509     CPluginManager<CDataLoader>::EEntryPointRequest method)
510 {
511     CHostEntryPointImpl<CSRA_DataLoaderCF>::NCBI_EntryPointImpl(info_list, method);
512 }
513 
514 
NCBI_EntryPoint_xloader_sra(CPluginManager<objects::CDataLoader>::TDriverInfoList & info_list,CPluginManager<objects::CDataLoader>::EEntryPointRequest method)515 void NCBI_EntryPoint_xloader_sra(
516     CPluginManager<objects::CDataLoader>::TDriverInfoList&   info_list,
517     CPluginManager<objects::CDataLoader>::EEntryPointRequest method)
518 {
519     NCBI_EntryPoint_DataLoader_Sra(info_list, method);
520 }
521 
522 
523 END_NCBI_SCOPE
524