1 #ifndef OBJTOOLS_DATA_LOADERS_CSRA___CSRALOADER_IMPL__HPP
2 #define OBJTOOLS_DATA_LOADERS_CSRA___CSRALOADER_IMPL__HPP
3 
4 /*  $Id: csraloader_impl.hpp 610971 2020-06-26 12:57:19Z grichenk $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Eugene Vasilchenko
30  *
31  * File Description: CSRA file data loader
32  *
33  * ===========================================================================
34  */
35 
36 
37 #include <corelib/ncbistd.hpp>
38 #include <corelib/ncbimtx.hpp>
39 #include <sra/data_loaders/csra/csraloader.hpp>
40 #include <sra/readers/sra/csraread.hpp>
41 #include <objtools/readers/iidmapper.hpp>
42 #include <util/limited_size_map.hpp>
43 
44 BEGIN_NCBI_SCOPE
45 BEGIN_SCOPE(objects)
46 
47 class CDataLoader;
48 class CCSRADataLoader_Impl;
49 class CCSRARefSeqChunkInfo;
50 class CCSRARefSeqInfo;
51 class CCSRAFileInfo;
52 
53 template<class Key, class Value, class Less = less<Key> >
54 class CCacheWithLock : public CObject
55 {
56 public:
57     typedef Key key_type;
58     typedef Value mapped_type;
59 
60 protected:
61     class CSlot;
62     typedef Less TLess;
63     typedef map<key_type, CRef<CSlot>, TLess> TMap;
64     typedef typename TMap::iterator TMapIterator;
65     typedef typename TMap::const_iterator TMapConstIterator;
66     typedef list<TMapIterator> TRemoveList;
67     typedef typename TRemoveList::iterator TRemoveListIterator;
68 
69     class CSlot : public CObject {
70     public:
CSlot()71         CSlot() {
72             m_LockCounter.Set(1);
73         }
74         TMapIterator        m_MapIter;
75         TRemoveListIterator m_RemoveListIter;
76         CAtomicCounter      m_LockCounter;
77         CFastMutex          m_ValueMutex;
78         mapped_type         m_Value;
79     };
80 
81     TMap m_Map;
82     size_t m_SizeLimit;
83     size_t m_RemoveSize;
84     TRemoveList m_RemoveList;
85     CMutex m_Mutex;
86 
87 public:
88     class CLock {
89     protected:
90         CRef<CCacheWithLock> m_Cache;
91         CRef<CSlot> m_Slot;
92         friend class CCacheWithLock<key_type, mapped_type, TLess>;
93 
CLock(CCacheWithLock * cache,CSlot * slot)94         CLock(CCacheWithLock* cache, CSlot* slot)
95             : m_Cache(cache),
96               m_Slot(slot)
97             {
98                 _ASSERT(cache);
99                 _ASSERT(slot->m_LockCounter.Get() > 0);
100             }
101 
102     public:
CLock()103         CLock() {
104         }
~CLock()105         ~CLock() {
106             Reset();
107         }
CLock(const CLock & lock)108         CLock(const CLock& lock)
109             : m_Cache(lock.m_Cache),
110               m_Slot(lock.m_Slot)
111             {
112                 if ( m_Slot ) {
113                     m_Slot->m_LockCounter.Add(1);
114                 }
115             }
operator =(const CLock & lock)116         CLock& operator=(const CLock& lock)
117             {
118                 if ( m_Slot != lock.m_Slot ) {
119                     if ( m_Slot ) {
120                         m_Cache->Unlock(m_Slot);
121                     }
122                     m_Cache = lock.m_Cache;
123                     m_Slot = lock.m_Slot;
124                     if ( m_Slot ) {
125                         m_Slot->m_LockCounter.Add(1);
126                     }
127                 }
128                 return *this;
129             }
CLock(CLock && lock)130         CLock(CLock&& lock)
131             : m_Cache(move(lock.m_Cache)),
132               m_Slot(move(lock.m_Slot))
133             {
134             }
operator =(CLock && lock)135         CLock& operator=(CLock&& lock)
136             {
137                 if ( m_Slot != lock.m_Slot ) {
138                     Reset();
139                     m_Cache.Swap(lock.m_Cache);
140                     m_Slot.Swap(lock.m_Slot);
141                 }
142                 return *this;
143             }
144 
Reset()145         void Reset() {
146             if ( m_Slot ) {
147                 m_Cache->Unlock(m_Slot);
148                 m_Slot = null;
149                 m_Cache = null;
150             }
151         }
152 
GetValueMutex()153         CFastMutex& GetValueMutex() { return m_Slot.GetNCObject().m_ValueMutex; }
154 
operator *() const155         mapped_type& operator*() const { return m_Slot.GetNCObject().m_Value; }
operator ->() const156         mapped_type* operator->() const { return m_Slot.GetNCPointer().m_Value; }
157 
operator ==(CLock a) const158         bool operator==(CLock a) const {
159             return m_Slot == a.m_Slot;
160         }
operator !=(CLock a) const161         bool operator!=(CLock a) const {
162             return !(*this == a);
163         }
164     };
165 
CCacheWithLock(size_t size_limit=0)166     CCacheWithLock(size_t size_limit = 0)
167         : m_SizeLimit(size_limit),
168           m_RemoveSize(0)
169         {
170         }
171 
get_lock(const key_type & key)172     CLock get_lock(const key_type& key) {
173         CMutexGuard guard(m_Mutex);
174         TMapIterator iter = m_Map.lower_bound(key);
175         if ( iter == m_Map.end() || m_Map.key_comp()(key, iter->first) ) {
176             // insert
177             typedef typename TMap::value_type TValue;
178             iter = m_Map.insert(iter, TValue(key, Ref(new CSlot())));
179             iter->second->m_MapIter = iter;
180         }
181         else if ( iter->second->m_LockCounter.Add(1) == 1 ) {
182             // first lock from remove list
183             _ASSERT(m_RemoveSize > 0);
184             _ASSERT(m_RemoveSize == m_RemoveList.size());
185             m_RemoveList.erase(iter->second->m_RemoveListIter);
186             --m_RemoveSize;
187         }
188         return CLock(this, iter->second);
189     }
190 
get_size_limit(void) const191     size_t get_size_limit(void) const {
192         return m_SizeLimit;
193     }
set_size_limit(size_t size_limit)194     void set_size_limit(size_t size_limit) {
195         if ( size_limit != m_SizeLimit ) {
196             CMutexGuard guard(m_Mutex);
197             m_SizeLimit = size_limit;
198             x_GC();
199         }
200     }
201 
202 protected:
Unlock(CSlot * slot)203     void Unlock(CSlot* slot) {
204         CMutexGuard guard(m_Mutex);
205         _ASSERT(slot);
206         _ASSERT(slot->m_MapIter->second == slot);
207         if ( slot->m_LockCounter.Add(-1) == 0 ) {
208             // last lock removed
209             slot->m_RemoveListIter =
210                 m_RemoveList.insert(m_RemoveList.end(), slot->m_MapIter);
211             ++m_RemoveSize;
212             x_GC();
213         }
214     }
215 
x_GC()216     void x_GC() {
217         while ( m_RemoveSize > m_SizeLimit ) {
218             m_Map.erase(m_RemoveList.front());
219             m_RemoveList.pop_front();
220             --m_RemoveSize;
221         }
222     }
223 
224 public:
225 };
226 
227 
228 class CCSRABlobId : public CBlobId
229 {
230 public:
231     enum EBlobType {
232         eBlobType_annot, // refseq coverage/pileup graphs and alignments
233         eBlobType_refseq, // refseq itself
234         eBlobType_reads, // short reads
235         eBlobType_reads_align // short reads primary alignments
236     };
237     typedef CCacheWithLock<string, CRef<CCSRAFileInfo> > TSRRFiles;
238     typedef pair<CRef<CCSRAFileInfo>, TSRRFiles::CLock> TFileLock;
239     typedef pair<CRef<CCSRARefSeqInfo>, TSRRFiles::CLock> TRefLock;
240 
241     explicit CCSRABlobId(const CTempString& str);
242     CCSRABlobId(EBlobType blob_type,
243                 const TRefLock& ref);
244     CCSRABlobId(const TFileLock& file,
245                 TVDBRowId first_spot_id);
246     ~CCSRABlobId(void);
247 
248     EBlobType m_BlobType;
249     CCSraDb::ERefIdType m_RefIdType;
250     // cSRA file name or SRR accession
251     string m_File;
252     // Ref Seq-id for annot blobs
253     // First short read Seq-id for reads' blobs
254     CSeq_id_Handle m_SeqId;
255     TVDBRowId m_FirstSpotId;
256     TSRRFiles::CLock m_FileLock;
257 
258     // returns length of accession part or NPOS
259     static SIZE_TYPE ParseReadId(CTempString str,
260                                  TVDBRowId* spot_id_ptr = 0,
261                                  Uint4* read_id_ptr = 0);
262     static bool GetGeneralSRAAccLabel(const CSeq_id_Handle& idh,
263                                       string* srr_acc_ptr = 0,
264                                       string* label_ptr = 0);
265     static bool GetGeneralSRAAccReadId(const CSeq_id_Handle& idh,
266                                        string* srr_acc_ptr = 0,
267                                        TVDBRowId* spot_id_ptr = 0,
268                                        Uint4* read_id_ptr = 0);
269 
270     enum EGeneralIdType {
271         eNotGeneralIdType      = 0,
272         eGeneralIdType_refseq  = 1<<0,
273         eGeneralIdType_read    = 1<<1,
274         eGeneralIdType_both    = eGeneralIdType_refseq|eGeneralIdType_read
275     };
276     static EGeneralIdType GetGeneralIdType(const CSeq_id_Handle& idh,
277                                            EGeneralIdType allow_type,
278                                            const string* srr = 0);
GetGeneralIdType(const CSeq_id_Handle & idh,EGeneralIdType allow_type,const string & srr)279     static EGeneralIdType GetGeneralIdType(const CSeq_id_Handle& idh,
280                                            EGeneralIdType allow_type,
281                                            const string& srr)
282     {
283         return GetGeneralIdType(idh, allow_type, &srr);
284     }
285 
286     // string blob id representation:
287     // eBlobType_annot_plain_id
288     string ToString(void) const;
289     void FromString(CTempString str);
290 
291     bool operator<(const CBlobId& id) const;
292     bool operator==(const CBlobId& id) const;
293 };
294 
295 
296 class CCSRARefSeqChunkInfo
297 {
298 public:
299     typedef CRange<TSeqPos> TRange;
300 
GetRefSeqRangeStart(void) const301     const TRange& GetRefSeqRangeStart(void) const
302         {
303             return m_RefSeqRangeStart;
304         }
305 
306 protected:
307     friend class CCSRARefSeqInfo;
308 
309     TRange m_RefSeqRangeStart; // range of alignments' start positions
310 };
311 
312 
/// Kinds of annotation chunk.
/// NOTE(review): eCSRAAnnotChunk_mul equals the number of real chunk kinds;
/// presumably it is used as a multiplier when composing chunk ids -- confirm
/// against the implementation file.
enum ECSRAAnnotChunkIdType {
    eCSRAAnnotChunk_align        = 0,
    eCSRAAnnotChunk_pileup_graph = 1,
    eCSRAAnnotChunk_mul          = 2
};
318 
319 
320 class CCSRARefSeqInfo : public CObject
321 {
322 public:
323     CCSRARefSeqInfo(CCSRAFileInfo* csra_file,
324                     const CSeq_id_Handle& seq_id);
325 
GetRefSeqId(void) const326     const CSeq_id_Handle& GetRefSeqId(void) const
327         {
328             return m_RefSeqId;
329         }
330 
331     CCSraRefSeqIterator GetRefSeqIterator(void) const;
332 
333     //CRef<CCSRABlobId> GetBlobId(CCSRABlobId::EBlobType type) const;
334     int GetAnnotChunkId(TSeqPos ref_pos) const;
335 
336     void LoadRanges(void);
337 
338     void LoadAnnotBlob(CTSE_LoadLock& load_lock);
339     void LoadAnnotChunk(CTSE_Chunk_Info& chunk_info);
340 
341     void LoadAnnotMainSplit(CTSE_LoadLock& load_lock);
342     void LoadAnnotMainChunk(CTSE_Chunk_Info& chunk_info);
343     void LoadAnnotAlignChunk(CTSE_Chunk_Info& chunk_info);
344     void LoadAnnotPileupChunk(CTSE_Chunk_Info& chunk_info);
345 
346     void LoadRefSeqBlob(CTSE_LoadLock& load_lock);
347     void LoadRefSeqChunk(CTSE_Chunk_Info& chunk_info);
348 
349     void LoadRefSeqMainEntry(CTSE_LoadLock& load_lock);
350 
351 protected:
352     friend class CCSRADataLoader_Impl;
353     friend class CCSRABlobId;
354 
355     // start of chunk and number of alignments in the chunk
356     struct SChunkInfo {
357         TSeqPos start_pos;
358         unsigned align_count;
359 
operator ()CCSRARefSeqInfo::SChunkInfo360         bool operator()(TSeqPos pos, const SChunkInfo& chunk) const
361             { return pos < chunk.start_pos; }
362     };
363     typedef vector<SChunkInfo> TChunks;
364 
365     void x_LoadRangesStat(void);
366 
367     CCSRAFileInfo* m_File;
368     CSeq_id_Handle m_RefSeqId;
369     CRef<CSeq_annot> m_CovAnnot;
370     int m_MinMapQuality;
371     TChunks m_AlignChunks;
372     TChunks m_GraphChunks;
373 };
374 
375 
376 class CCSRAFileInfo : public CObject
377 {
378 public:
379     CCSRAFileInfo(CCSRADataLoader_Impl& impl,
380                   const string& csra,
381                   CCSraDb::ERefIdType ref_id_type);
382 
GetCSRAName(void) const383     const string& GetCSRAName(void) const
384         {
385             return m_CSRAName;
386         }
GetBaseAnnotName(void) const387     const string& GetBaseAnnotName(void) const
388         {
389             return m_AnnotName;
390         }
391     string GetAnnotName(const string& spot_group,
392                         ECSRAAnnotChunkIdType type) const;
393     string GetAlignAnnotName(void) const;
394     string GetAlignAnnotName(const string& spot_group) const;
395     string GetPileupAnnotName(void) const;
396     string GetPileupAnnotName(const string& spot_group) const;
397 
GetRefIdType(void) const398     CCSraDb::ERefIdType GetRefIdType(void) const
399         {
400             return m_RefIdType;
401         }
GetMinMapQuality(void) const402     int GetMinMapQuality(void) const
403         {
404             return m_MinMapQuality;
405         }
GetPileupGraphs(void) const406     bool GetPileupGraphs(void) const
407         {
408             return m_PileupGraphs;
409         }
GetQualityGraphs(void) const410     bool GetQualityGraphs(void) const
411         {
412             return m_QualityGraphs;
413         }
414 
415     bool IsValidReadId(TVDBRowId spot_id, Uint4 read_id,
416                        CRef<CCSRARefSeqInfo>* ref_ptr = 0,
417                        TSeqPos* ref_pos_ptr = 0);
418     //CRef<CCSRABlobId> GetReadsBlobId(TVDBRowId spot_id) const;
419 
420     void GetAnnotBlobId(CRef<CCSRABlobId>& ret,
421                         const CSeq_id_Handle& idh);
422 
423     CRef<CCSRARefSeqInfo> GetRefSeqInfo(const CSeq_id_Handle& seq_id);
GetRefSeqInfo(const CCSRABlobId & blob_id)424     CRef<CCSRARefSeqInfo> GetRefSeqInfo(const CCSRABlobId& blob_id)
425         {
426             return GetRefSeqInfo(blob_id.m_SeqId);
427         }
428 
GetDb(void)429     CCSraDb& GetDb(void)
430         {
431             return m_CSRADb;
432         }
operator CCSraDb&(void)433     operator CCSraDb&(void)
434         {
435             return GetDb();
436         }
437 
438     void AddRefSeq(const string& refseq_label,
439                    const CSeq_id_Handle& refseq_id);
440 
GetSeparateSpotGroups(void) const441     const vector<string>& GetSeparateSpotGroups(void) const
442         {
443             return m_SeparateSpotGroups;
444         }
445 
446     typedef CCSRADataLoader::TAnnotNames TAnnotNames;
447     void GetPossibleAnnotNames(TAnnotNames& names) const;
448 
449     void LoadReadsBlob(const CCSRABlobId& blob_id,
450                        CTSE_LoadLock& load_lock);
451 
452 protected:
453     friend class CCSRADataLoader_Impl;
454 
455     typedef map<CSeq_id_Handle, CRef<CCSRARefSeqInfo> > TRefSeqs;
456 
457     void x_Initialize(CCSRADataLoader_Impl& impl,
458                       const string& csra,
459                       CCSraDb::ERefIdType ref_id_type);
460 
461     string m_CSRAName;
462     CCSraDb::ERefIdType m_RefIdType;
463     string m_AnnotName;
464     int m_MinMapQuality;
465     bool m_PileupGraphs;
466     bool m_QualityGraphs;
467     CCSraDb m_CSRADb;
468     vector<string> m_SeparateSpotGroups;
469     TRefSeqs m_RefSeqs;
470 };
471 
472 
473 class CCSRADataLoader_Impl : public CObject
474 {
475 public:
476     explicit CCSRADataLoader_Impl(const CCSRADataLoader::SLoaderParams& params);
477     ~CCSRADataLoader_Impl(void);
478 
479     void AddSrzDef(void);
480     void AddCSRAFile(const string& csra);
481 
482     typedef CCacheWithLock<string, CRef<CCSRAFileInfo> > TSRRFiles;
483     typedef pair<CRef<CCSRAFileInfo>, TSRRFiles::CLock> TFileLock;
484     typedef pair<CRef<CCSRARefSeqInfo>, TSRRFiles::CLock> TRefLock;
485 
486     TFileLock GetSRRFile(const string& acc);
487 
GetMinMapQuality(void) const488     int GetMinMapQuality(void) const
489         {
490             return m_MinMapQuality;
491         }
GetPileupGraphs(void) const492     bool GetPileupGraphs(void) const
493         {
494             return m_PileupGraphs;
495         }
GetQualityGraphs(void) const496     bool GetQualityGraphs(void) const
497         {
498             return m_QualityGraphs;
499         }
GetSpotReadAlign(void) const500     bool GetSpotReadAlign(void) const
501         {
502             return m_SpotReadAlign;
503         }
504     void SetSpotReadAlign(bool value);
GetPathInId(void) const505     int GetPathInId(void) const
506         {
507             return m_PathInId;
508         }
GetSpotGroups(void) const509     int GetSpotGroups(void) const
510         {
511             return m_SpotGroups;
512         }
513 
514     TRefLock GetRefSeqInfo(const CSeq_id_Handle& idh);
515     TFileLock GetReadsFileInfo(const CSeq_id_Handle& idh,
516                                TVDBRowId* spot_id_ptr = 0,
517                                Uint4* read_id_ptr = 0,
518                                CRef<CCSRARefSeqInfo>* ref_ptr = 0,
519                                TSeqPos* ref_pos_ptr = 0);
520     TFileLock GetFileInfo(const CCSRABlobId& blob_id);
521     CCSraRefSeqIterator GetRefSeqIterator(const CSeq_id_Handle& idh);
522     CCSraShortReadIterator GetShortReadIterator(const CSeq_id_Handle& idh);
523 
524     CDataLoader::TTSE_LockSet GetRecords(CDataSource* data_source,
525                                          const CSeq_id_Handle& idh,
526                                          CDataLoader::EChoice choice);
527     CRef<CCSRABlobId> GetBlobId(const CSeq_id_Handle& idh);
528     CRef<CCSRABlobId> GetBlobId(const TRefLock& lock, CCSRABlobId::EBlobType type);
529     CRef<CCSRABlobId> GetReadsBlobId(const TFileLock& lock, TVDBRowId spot_id);
530     CTSE_LoadLock GetBlobById(CDataSource* data_source,
531                               const CCSRABlobId& blob_id);
532     void LoadBlob(const CCSRABlobId& blob_id,
533                   CTSE_LoadLock& load_lock);
534     void LoadChunk(const CCSRABlobId& blob_id,
535                    CTSE_Chunk_Info& chunk);
536 
537     typedef CCSRADataLoader::TAnnotNames TAnnotNames;
538     TAnnotNames GetPossibleAnnotNames(void) const;
539 
540     typedef vector<CSeq_id_Handle> TIds;
541     void GetIds(const CSeq_id_Handle& idh, TIds& ids);
542     CDataSource::SAccVerFound GetAccVer(const CSeq_id_Handle& idh);
543     CDataSource::SGiFound GetGi(const CSeq_id_Handle& idh);
544     string GetLabel(const CSeq_id_Handle& idh);
545     TTaxId GetTaxId(const CSeq_id_Handle& idh);
546     TSeqPos GetSequenceLength(const CSeq_id_Handle& idh);
547     CDataSource::STypeFound GetSequenceType(const CSeq_id_Handle& idh);
548 
549 protected:
550     friend class CCSRAFileInfo;
551     struct SDirSeqInfo {
552         CSeq_id_Handle m_SeqId;
553         string m_CSRAFileName;
554         string m_CSRASeqLabel;
555         string m_Label;
556     };
557 
558 private:
559     // first:
560     //   false if explicitly listed file in the loader params
561     //   true if dynamically loaded SRA
562     // second: SRA accession or csra file path
563 
564     typedef map<string, CRef<CCSRAFileInfo> > TFixedFiles;
565 
566     // mutex guarding input into the map
567     CVDBMgr m_Mgr;
568     string  m_DirPath;
569     int m_MinMapQuality;
570     bool m_PileupGraphs;
571     bool m_QualityGraphs;
572     bool m_SpotReadAlign;
573     int m_PathInId;
574     int m_SpotGroups;
575     TFixedFiles m_FixedFiles;
576     CRef<TSRRFiles> m_SRRFiles;
577     AutoPtr<IIdMapper> m_IdMapper;
578     string m_AnnotName;
579 };
580 
581 END_SCOPE(objects)
582 END_NCBI_SCOPE
583 
584 #endif  // OBJTOOLS_DATA_LOADERS_CSRA___CSRALOADER_IMPL__HPP
585