1 #ifndef OBJTOOLS_DATA_LOADERS_CSRA___CSRALOADER_IMPL__HPP 2 #define OBJTOOLS_DATA_LOADERS_CSRA___CSRALOADER_IMPL__HPP 3 4 /* $Id: csraloader_impl.hpp 610971 2020-06-26 12:57:19Z grichenk $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Eugene Vasilchenko 30 * 31 * File Description: CSRA file data loader 32 * 33 * =========================================================================== 34 */ 35 36 37 #include <corelib/ncbistd.hpp> 38 #include <corelib/ncbimtx.hpp> 39 #include <sra/data_loaders/csra/csraloader.hpp> 40 #include <sra/readers/sra/csraread.hpp> 41 #include <objtools/readers/iidmapper.hpp> 42 #include <util/limited_size_map.hpp> 43 44 BEGIN_NCBI_SCOPE 45 BEGIN_SCOPE(objects) 46 47 class CDataLoader; 48 class CCSRADataLoader_Impl; 49 class CCSRARefSeqChunkInfo; 50 class CCSRARefSeqInfo; 51 class CCSRAFileInfo; 52 53 template<class Key, class Value, class Less = less<Key> > 54 class CCacheWithLock : public CObject 55 { 56 public: 57 typedef Key key_type; 58 typedef Value mapped_type; 59 60 protected: 61 class CSlot; 62 typedef Less TLess; 63 typedef map<key_type, CRef<CSlot>, TLess> TMap; 64 typedef typename TMap::iterator TMapIterator; 65 typedef typename TMap::const_iterator TMapConstIterator; 66 typedef list<TMapIterator> TRemoveList; 67 typedef typename TRemoveList::iterator TRemoveListIterator; 68 69 class CSlot : public CObject { 70 public: CSlot()71 CSlot() { 72 m_LockCounter.Set(1); 73 } 74 TMapIterator m_MapIter; 75 TRemoveListIterator m_RemoveListIter; 76 CAtomicCounter m_LockCounter; 77 CFastMutex m_ValueMutex; 78 mapped_type m_Value; 79 }; 80 81 TMap m_Map; 82 size_t m_SizeLimit; 83 size_t m_RemoveSize; 84 TRemoveList m_RemoveList; 85 CMutex m_Mutex; 86 87 public: 88 class CLock { 89 protected: 90 CRef<CCacheWithLock> m_Cache; 91 CRef<CSlot> m_Slot; 92 friend class CCacheWithLock<key_type, mapped_type, TLess>; 93 CLock(CCacheWithLock * cache,CSlot * slot)94 CLock(CCacheWithLock* cache, CSlot* slot) 95 : m_Cache(cache), 96 m_Slot(slot) 97 { 98 _ASSERT(cache); 99 _ASSERT(slot->m_LockCounter.Get() > 0); 100 } 101 102 public: CLock()103 CLock() { 104 } ~CLock()105 ~CLock() { 106 Reset(); 107 } CLock(const CLock & lock)108 CLock(const CLock& lock) 109 : m_Cache(lock.m_Cache), 110 m_Slot(lock.m_Slot) 111 { 112 if ( m_Slot ) { 113 m_Slot->m_LockCounter.Add(1); 114 } 115 } operator =(const CLock & lock)116 CLock& operator=(const CLock& lock) 117 { 118 if ( m_Slot != lock.m_Slot ) { 119 if ( m_Slot ) { 120 m_Cache->Unlock(m_Slot); 121 } 122 m_Cache = lock.m_Cache; 123 m_Slot = lock.m_Slot; 124 if ( m_Slot ) { 125 m_Slot->m_LockCounter.Add(1); 126 } 127 } 128 return *this; 129 } CLock(CLock && lock)130 CLock(CLock&& lock) 131 : m_Cache(move(lock.m_Cache)), 132 m_Slot(move(lock.m_Slot)) 133 { 134 } operator =(CLock && lock)135 CLock& operator=(CLock&& lock) 136 { 137 if ( m_Slot != lock.m_Slot ) { 138 Reset(); 139 m_Cache.Swap(lock.m_Cache); 140 m_Slot.Swap(lock.m_Slot); 141 } 142 return *this; 143 } 144 Reset()145 void Reset() { 146 if ( m_Slot ) { 147 m_Cache->Unlock(m_Slot); 148 m_Slot = null; 149 m_Cache = null; 150 } 151 } 152 GetValueMutex()153 CFastMutex& GetValueMutex() { return m_Slot.GetNCObject().m_ValueMutex; } 154 operator *() const155 mapped_type& operator*() const { return m_Slot.GetNCObject().m_Value; } operator ->() const156 mapped_type* operator->() const { return m_Slot.GetNCPointer().m_Value; } 157 operator ==(CLock a) const158 bool operator==(CLock a) const { 159 return m_Slot == a.m_Slot; 160 } operator !=(CLock a) const161 bool operator!=(CLock a) const { 162 return !(*this == a); 163 } 164 }; 165 CCacheWithLock(size_t size_limit=0)166 CCacheWithLock(size_t size_limit = 0) 167 : m_SizeLimit(size_limit), 168 m_RemoveSize(0) 169 { 170 } 171 get_lock(const key_type & key)172 CLock get_lock(const key_type& key) { 173 CMutexGuard guard(m_Mutex); 174 TMapIterator iter = m_Map.lower_bound(key); 175 if ( iter == m_Map.end() || m_Map.key_comp()(key, iter->first) ) { 176 // insert 177 typedef typename TMap::value_type TValue; 178 iter = m_Map.insert(iter, TValue(key, Ref(new CSlot()))); 179 iter->second->m_MapIter = iter; 180 } 181 else if ( iter->second->m_LockCounter.Add(1) == 1 ) { 182 // first lock from remove list 183 _ASSERT(m_RemoveSize > 0); 184 _ASSERT(m_RemoveSize == m_RemoveList.size()); 185 m_RemoveList.erase(iter->second->m_RemoveListIter); 186 --m_RemoveSize; 187 } 188 return CLock(this, iter->second); 189 } 190 get_size_limit(void) const191 size_t get_size_limit(void) const { 192 return m_SizeLimit; 193 } set_size_limit(size_t size_limit)194 void set_size_limit(size_t size_limit) { 195 if ( size_limit != m_SizeLimit ) { 196 CMutexGuard guard(m_Mutex); 197 m_SizeLimit = size_limit; 198 x_GC(); 199 } 200 } 201 202 protected: Unlock(CSlot * slot)203 void Unlock(CSlot* slot) { 204 CMutexGuard guard(m_Mutex); 205 _ASSERT(slot); 206 _ASSERT(slot->m_MapIter->second == slot); 207 if ( slot->m_LockCounter.Add(-1) == 0 ) { 208 // last lock removed 209 slot->m_RemoveListIter = 210 m_RemoveList.insert(m_RemoveList.end(), slot->m_MapIter); 211 ++m_RemoveSize; 212 x_GC(); 213 } 214 } 215 x_GC()216 void x_GC() { 217 while ( m_RemoveSize > m_SizeLimit ) { 218 m_Map.erase(m_RemoveList.front()); 219 m_RemoveList.pop_front(); 220 --m_RemoveSize; 221 } 222 } 223 224 public: 225 }; 226 227 228 class CCSRABlobId : public CBlobId 229 { 230 public: 231 enum EBlobType { 232 eBlobType_annot, // refseq coverage/pileup graphs and alignments 233 eBlobType_refseq, // refseq itself 234 eBlobType_reads, // short reads 235 eBlobType_reads_align // short reads primary alignments 236 }; 237 typedef CCacheWithLock<string, CRef<CCSRAFileInfo> > TSRRFiles; 238 typedef pair<CRef<CCSRAFileInfo>, TSRRFiles::CLock> TFileLock; 239 typedef pair<CRef<CCSRARefSeqInfo>, TSRRFiles::CLock> TRefLock; 240 241 explicit CCSRABlobId(const CTempString& str); 242 CCSRABlobId(EBlobType blob_type, 243 const TRefLock& ref); 244 CCSRABlobId(const TFileLock& file, 245 TVDBRowId first_spot_id); 246 ~CCSRABlobId(void); 247 248 EBlobType m_BlobType; 249 CCSraDb::ERefIdType m_RefIdType; 250 // cSRA file name or SRR accession 251 string m_File; 252 // Ref Seq-id for annot blobs 253 // First short read Seq-id for reads' blobs 254 CSeq_id_Handle m_SeqId; 255 TVDBRowId m_FirstSpotId; 256 TSRRFiles::CLock m_FileLock; 257 258 // returns length of accession part or NPOS 259 static SIZE_TYPE ParseReadId(CTempString str, 260 TVDBRowId* spot_id_ptr = 0, 261 Uint4* read_id_ptr = 0); 262 static bool GetGeneralSRAAccLabel(const CSeq_id_Handle& idh, 263 string* srr_acc_ptr = 0, 264 string* label_ptr = 0); 265 static bool GetGeneralSRAAccReadId(const CSeq_id_Handle& idh, 266 string* srr_acc_ptr = 0, 267 TVDBRowId* spot_id_ptr = 0, 268 Uint4* read_id_ptr = 0); 269 270 enum EGeneralIdType { 271 eNotGeneralIdType = 0, 272 eGeneralIdType_refseq = 1<<0, 273 eGeneralIdType_read = 1<<1, 274 eGeneralIdType_both = eGeneralIdType_refseq|eGeneralIdType_read 275 }; 276 static EGeneralIdType GetGeneralIdType(const CSeq_id_Handle& idh, 277 EGeneralIdType allow_type, 278 const string* srr = 0); GetGeneralIdType(const CSeq_id_Handle & idh,EGeneralIdType allow_type,const string & srr)279 static EGeneralIdType GetGeneralIdType(const CSeq_id_Handle& idh, 280 EGeneralIdType allow_type, 281 const string& srr) 282 { 283 return GetGeneralIdType(idh, allow_type, &srr); 284 } 285 286 // string blob id representation: 287 // eBlobType_annot_plain_id 288 string ToString(void) const; 289 void FromString(CTempString str); 290 291 bool operator<(const CBlobId& id) const; 292 bool operator==(const CBlobId& id) const; 293 }; 294 295 296 class CCSRARefSeqChunkInfo 297 { 298 public: 299 typedef CRange<TSeqPos> TRange; 300 GetRefSeqRangeStart(void) const301 const TRange& GetRefSeqRangeStart(void) const 302 { 303 return m_RefSeqRangeStart; 304 } 305 306 protected: 307 friend class CCSRARefSeqInfo; 308 309 TRange m_RefSeqRangeStart; // range of alignments' start positions 310 }; 311 312 313 enum ECSRAAnnotChunkIdType { 314 eCSRAAnnotChunk_align, 315 eCSRAAnnotChunk_pileup_graph, 316 eCSRAAnnotChunk_mul 317 }; 318 319 320 class CCSRARefSeqInfo : public CObject 321 { 322 public: 323 CCSRARefSeqInfo(CCSRAFileInfo* csra_file, 324 const CSeq_id_Handle& seq_id); 325 GetRefSeqId(void) const326 const CSeq_id_Handle& GetRefSeqId(void) const 327 { 328 return m_RefSeqId; 329 } 330 331 CCSraRefSeqIterator GetRefSeqIterator(void) const; 332 333 //CRef<CCSRABlobId> GetBlobId(CCSRABlobId::EBlobType type) const; 334 int GetAnnotChunkId(TSeqPos ref_pos) const; 335 336 void LoadRanges(void); 337 338 void LoadAnnotBlob(CTSE_LoadLock& load_lock); 339 void LoadAnnotChunk(CTSE_Chunk_Info& chunk_info); 340 341 void LoadAnnotMainSplit(CTSE_LoadLock& load_lock); 342 void LoadAnnotMainChunk(CTSE_Chunk_Info& chunk_info); 343 void LoadAnnotAlignChunk(CTSE_Chunk_Info& chunk_info); 344 void LoadAnnotPileupChunk(CTSE_Chunk_Info& chunk_info); 345 346 void LoadRefSeqBlob(CTSE_LoadLock& load_lock); 347 void LoadRefSeqChunk(CTSE_Chunk_Info& chunk_info); 348 349 void LoadRefSeqMainEntry(CTSE_LoadLock& load_lock); 350 351 protected: 352 friend class CCSRADataLoader_Impl; 353 friend class CCSRABlobId; 354 355 // start of chunk and number of alignments in the chunk 356 struct SChunkInfo { 357 TSeqPos start_pos; 358 unsigned align_count; 359 operator ()CCSRARefSeqInfo::SChunkInfo360 bool operator()(TSeqPos pos, const SChunkInfo& chunk) const 361 { return pos < chunk.start_pos; } 362 }; 363 typedef vector<SChunkInfo> TChunks; 364 365 void x_LoadRangesStat(void); 366 367 CCSRAFileInfo* m_File; 368 CSeq_id_Handle m_RefSeqId; 369 CRef<CSeq_annot> m_CovAnnot; 370 int m_MinMapQuality; 371 TChunks m_AlignChunks; 372 TChunks m_GraphChunks; 373 }; 374 375 376 class CCSRAFileInfo : public CObject 377 { 378 public: 379 CCSRAFileInfo(CCSRADataLoader_Impl& impl, 380 const string& csra, 381 CCSraDb::ERefIdType ref_id_type); 382 GetCSRAName(void) const383 const string& GetCSRAName(void) const 384 { 385 return m_CSRAName; 386 } GetBaseAnnotName(void) const387 const string& GetBaseAnnotName(void) const 388 { 389 return m_AnnotName; 390 } 391 string GetAnnotName(const string& spot_group, 392 ECSRAAnnotChunkIdType type) const; 393 string GetAlignAnnotName(void) const; 394 string GetAlignAnnotName(const string& spot_group) const; 395 string GetPileupAnnotName(void) const; 396 string GetPileupAnnotName(const string& spot_group) const; 397 GetRefIdType(void) const398 CCSraDb::ERefIdType GetRefIdType(void) const 399 { 400 return m_RefIdType; 401 } GetMinMapQuality(void) const402 int GetMinMapQuality(void) const 403 { 404 return m_MinMapQuality; 405 } GetPileupGraphs(void) const406 bool GetPileupGraphs(void) const 407 { 408 return m_PileupGraphs; 409 } GetQualityGraphs(void) const410 bool GetQualityGraphs(void) const 411 { 412 return m_QualityGraphs; 413 } 414 415 bool IsValidReadId(TVDBRowId spot_id, Uint4 read_id, 416 CRef<CCSRARefSeqInfo>* ref_ptr = 0, 417 TSeqPos* ref_pos_ptr = 0); 418 //CRef<CCSRABlobId> GetReadsBlobId(TVDBRowId spot_id) const; 419 420 void GetAnnotBlobId(CRef<CCSRABlobId>& ret, 421 const CSeq_id_Handle& idh); 422 423 CRef<CCSRARefSeqInfo> GetRefSeqInfo(const CSeq_id_Handle& seq_id); GetRefSeqInfo(const CCSRABlobId & blob_id)424 CRef<CCSRARefSeqInfo> GetRefSeqInfo(const CCSRABlobId& blob_id) 425 { 426 return GetRefSeqInfo(blob_id.m_SeqId); 427 } 428 GetDb(void)429 CCSraDb& GetDb(void) 430 { 431 return m_CSRADb; 432 } operator CCSraDb&(void)433 operator CCSraDb&(void) 434 { 435 return GetDb(); 436 } 437 438 void AddRefSeq(const string& refseq_label, 439 const CSeq_id_Handle& refseq_id); 440 GetSeparateSpotGroups(void) const441 const vector<string>& GetSeparateSpotGroups(void) const 442 { 443 return m_SeparateSpotGroups; 444 } 445 446 typedef CCSRADataLoader::TAnnotNames TAnnotNames; 447 void GetPossibleAnnotNames(TAnnotNames& names) const; 448 449 void LoadReadsBlob(const CCSRABlobId& blob_id, 450 CTSE_LoadLock& load_lock); 451 452 protected: 453 friend class CCSRADataLoader_Impl; 454 455 typedef map<CSeq_id_Handle, CRef<CCSRARefSeqInfo> > TRefSeqs; 456 457 void x_Initialize(CCSRADataLoader_Impl& impl, 458 const string& csra, 459 CCSraDb::ERefIdType ref_id_type); 460 461 string m_CSRAName; 462 CCSraDb::ERefIdType m_RefIdType; 463 string m_AnnotName; 464 int m_MinMapQuality; 465 bool m_PileupGraphs; 466 bool m_QualityGraphs; 467 CCSraDb m_CSRADb; 468 vector<string> m_SeparateSpotGroups; 469 TRefSeqs m_RefSeqs; 470 }; 471 472 473 class CCSRADataLoader_Impl : public CObject 474 { 475 public: 476 explicit CCSRADataLoader_Impl(const CCSRADataLoader::SLoaderParams& params); 477 ~CCSRADataLoader_Impl(void); 478 479 void AddSrzDef(void); 480 void AddCSRAFile(const string& csra); 481 482 typedef CCacheWithLock<string, CRef<CCSRAFileInfo> > TSRRFiles; 483 typedef pair<CRef<CCSRAFileInfo>, TSRRFiles::CLock> TFileLock; 484 typedef pair<CRef<CCSRARefSeqInfo>, TSRRFiles::CLock> TRefLock; 485 486 TFileLock GetSRRFile(const string& acc); 487 GetMinMapQuality(void) const488 int GetMinMapQuality(void) const 489 { 490 return m_MinMapQuality; 491 } GetPileupGraphs(void) const492 bool GetPileupGraphs(void) const 493 { 494 return m_PileupGraphs; 495 } GetQualityGraphs(void) const496 bool GetQualityGraphs(void) const 497 { 498 return m_QualityGraphs; 499 } GetSpotReadAlign(void) const500 bool GetSpotReadAlign(void) const 501 { 502 return m_SpotReadAlign; 503 } 504 void SetSpotReadAlign(bool value); GetPathInId(void) const505 int GetPathInId(void) const 506 { 507 return m_PathInId; 508 } GetSpotGroups(void) const509 int GetSpotGroups(void) const 510 { 511 return m_SpotGroups; 512 } 513 514 TRefLock GetRefSeqInfo(const CSeq_id_Handle& idh); 515 TFileLock GetReadsFileInfo(const CSeq_id_Handle& idh, 516 TVDBRowId* spot_id_ptr = 0, 517 Uint4* read_id_ptr = 0, 518 CRef<CCSRARefSeqInfo>* ref_ptr = 0, 519 TSeqPos* ref_pos_ptr = 0); 520 TFileLock GetFileInfo(const CCSRABlobId& blob_id); 521 CCSraRefSeqIterator GetRefSeqIterator(const CSeq_id_Handle& idh); 522 CCSraShortReadIterator GetShortReadIterator(const CSeq_id_Handle& idh); 523 524 CDataLoader::TTSE_LockSet GetRecords(CDataSource* data_source, 525 const CSeq_id_Handle& idh, 526 CDataLoader::EChoice choice); 527 CRef<CCSRABlobId> GetBlobId(const CSeq_id_Handle& idh); 528 CRef<CCSRABlobId> GetBlobId(const TRefLock& lock, CCSRABlobId::EBlobType type); 529 CRef<CCSRABlobId> GetReadsBlobId(const TFileLock& lock, TVDBRowId spot_id); 530 CTSE_LoadLock GetBlobById(CDataSource* data_source, 531 const CCSRABlobId& blob_id); 532 void LoadBlob(const CCSRABlobId& blob_id, 533 CTSE_LoadLock& load_lock); 534 void LoadChunk(const CCSRABlobId& blob_id, 535 CTSE_Chunk_Info& chunk); 536 537 typedef CCSRADataLoader::TAnnotNames TAnnotNames; 538 TAnnotNames GetPossibleAnnotNames(void) const; 539 540 typedef vector<CSeq_id_Handle> TIds; 541 void GetIds(const CSeq_id_Handle& idh, TIds& ids); 542 CDataSource::SAccVerFound GetAccVer(const CSeq_id_Handle& idh); 543 CDataSource::SGiFound GetGi(const CSeq_id_Handle& idh); 544 string GetLabel(const CSeq_id_Handle& idh); 545 TTaxId GetTaxId(const CSeq_id_Handle& idh); 546 TSeqPos GetSequenceLength(const CSeq_id_Handle& idh); 547 CDataSource::STypeFound GetSequenceType(const CSeq_id_Handle& idh); 548 549 protected: 550 friend class CCSRAFileInfo; 551 struct SDirSeqInfo { 552 CSeq_id_Handle m_SeqId; 553 string m_CSRAFileName; 554 string m_CSRASeqLabel; 555 string m_Label; 556 }; 557 558 private: 559 // first: 560 // false if explicitly listed file in the loader params 561 // true if dynamically loaded SRA 562 // second: SRA accession or csra file path 563 564 typedef map<string, CRef<CCSRAFileInfo> > TFixedFiles; 565 566 // mutex guarding input into the map 567 CVDBMgr m_Mgr; 568 string m_DirPath; 569 int m_MinMapQuality; 570 bool m_PileupGraphs; 571 bool m_QualityGraphs; 572 bool m_SpotReadAlign; 573 int m_PathInId; 574 int m_SpotGroups; 575 TFixedFiles m_FixedFiles; 576 CRef<TSRRFiles> m_SRRFiles; 577 AutoPtr<IIdMapper> m_IdMapper; 578 string m_AnnotName; 579 }; 580 581 END_SCOPE(objects) 582 END_NCBI_SCOPE 583 584 #endif // OBJTOOLS_DATA_LOADERS_CSRA___CSRALOADER_IMPL__HPP 585