1 #ifndef SRA__READER__BAM__BGZF__HPP 2 #define SRA__READER__BAM__BGZF__HPP 3 /* $Id: bgzf.hpp 604204 2020-03-24 15:25:24Z vasilche $ 4 * =========================================================================== 5 * 6 * PUBLIC DOMAIN NOTICE 7 * National Center for Biotechnology Information 8 * 9 * This software/database is a "United States Government Work" under the 10 * terms of the United States Copyright Act. It was written as part of 11 * the author's official duties as a United States Government employee and 12 * thus cannot be copyrighted. This software/database is freely available 13 * to the public for use. The National Library of Medicine and the U.S. 14 * Government have not placed any restriction on its use or reproduction. 15 * 16 * Although all reasonable efforts have been taken to ensure the accuracy 17 * and reliability of the software and data, the NLM and the U.S. 18 * Government do not and cannot warrant the performance or results that 19 * may be obtained by using this software or data. The NLM and the U.S. 20 * Government disclaim all warranties, express or implied, including 21 * warranties of performance, merchantability or fitness for any particular 22 * purpose. 23 * 24 * Please cite the author in any work or product based on this material. 25 * 26 * =========================================================================== 27 * 28 * Authors: Eugene Vasilchenko 29 * 30 * File Description: 31 * Access to BGZF files (block GZip file) 32 * 33 */ 34 35 #include <corelib/ncbistd.hpp> 36 #include <corelib/ncbifile.hpp> 37 #include <util/simple_buffer.hpp> 38 #include <sra/readers/bam/vdbfile.hpp> 39 #include <sra/readers/bam/cache_with_lock.hpp> 40 41 BEGIN_NCBI_SCOPE 42 BEGIN_SCOPE(objects) 43 44 class CSeq_entry; 45 class CPagedFile; 46 class CPagedFilePage; 47 class CBGZFFile; 48 class CBGZFStream; 49 50 class CPagedFilePage : public CObject 51 { 52 public: 53 typedef Uint8 TFilePos; 54 55 CPagedFilePage(); 56 ~CPagedFilePage(); 57 GetFilePos() const58 TFilePos GetFilePos() const 59 { 60 return m_FilePos; 61 } GetPageSize() const62 size_t GetPageSize() const 63 { 64 return m_Size; 65 } GetPagePtr() const66 const char* GetPagePtr() const 67 { 68 return m_Ptr; 69 } 70 Contains(TFilePos file_pos) const71 bool Contains(TFilePos file_pos) const 72 { 73 return (file_pos - GetFilePos()) < GetPageSize(); 74 } 75 76 protected: 77 friend class CPagedFile; 78 79 private: 80 volatile TFilePos m_FilePos; 81 size_t m_Size; 82 const char* m_Ptr; 83 CSimpleBufferT<char> m_Buffer; 84 CMemoryFileMap* m_MemFile; 85 }; 86 87 88 class NCBI_BAMREAD_EXPORT CPagedFile : public CObject 89 { 90 public: 91 typedef CPagedFilePage::TFilePos TFilePos; 92 93 explicit 94 CPagedFile(const string& file_name); 95 ~CPagedFile(); 96 97 #define USE_RANGE_CACHE 1 98 #ifdef USE_RANGE_CACHE 99 typedef CBinaryRangeCacheWithLock<TFilePos, CPagedFilePage> TPageCache; 100 #else 101 typedef CCacheWithLock<TFilePos, CPagedFilePage> TPageCache; 102 #endif 103 typedef TPageCache::CLock TPage; 104 105 // return page that contains the file position 106 TPage GetPage(TFilePos pos); 107 108 pair<Uint8, double> GetReadStatistics() const; 109 void SetPreviousReadStatistics(const pair<Uint8, double>& stats); 110 // estimate best next page size to read using collected statistics 111 size_t GetNextPageSizePow2() const; 112 113 private: 114 void x_AddReadStatistics(Uint8 bytes, double seconds); 115 116 void x_ReadPage(CPagedFilePage& page, TFilePos file_pos, size_t size); 117 118 CFastMutex m_Mutex; 119 120 // three variants: direct file IO, memory mapped file, or VDB KFile 121 CFileIO m_File; 122 AutoPtr<CMemoryFileMap> m_MemFile; 123 CBamVDBFile m_VDBFile; 124 125 // cache for loaded pages 126 CRef<TPageCache> m_PageCache; 127 128 volatile Uint8 m_TotalReadBytes; 129 volatile double m_TotalReadSeconds; 130 Uint8 m_PreviousReadBytes; 131 double m_PreviousReadSeconds; 132 }; 133 134 135 class NCBI_BAMREAD_EXPORT CBGZFException : public CException 136 { 137 public: 138 enum EErrCode { 139 eOtherError, 140 eFormatError, ///< includes decompression errors 141 eInvalidArg ///< invalid function argument 142 }; 143 virtual const char* GetErrCodeString(void) const override; 144 NCBI_EXCEPTION_DEFAULT(CBGZFException,CException); 145 }; 146 147 148 struct SBamUtil { 149 // conversion of BAM bytes into larger values - ints and floats 150 // the source data have any alignment 151 MakeUint2SBamUtil152 static Uint2 MakeUint2(const char* buf) 153 { 154 return Uint2(Uint1(buf[0]))| 155 (Uint2(Uint1(buf[1]))<<8); 156 } 157 MakeUint4SBamUtil158 static Uint4 MakeUint4(const char* buf) 159 { 160 return Uint4(Uint1(buf[0]))| 161 (Uint4(Uint1(buf[1]))<<8)| 162 (Uint4(Uint1(buf[2]))<<16)| 163 (Uint4(Uint1(buf[3]))<<24); 164 } 165 MakeUint8SBamUtil166 static Uint8 MakeUint8(const char* buf) 167 { 168 return Uint8(Uint1(buf[0]))| 169 (Uint8(Uint1(buf[1]))<<8)| 170 (Uint8(Uint1(buf[2]))<<16)| 171 (Uint8(Uint1(buf[3]))<<24)| 172 (Uint8(Uint1(buf[4]))<<32)| 173 (Uint8(Uint1(buf[5]))<<40)| 174 (Uint8(Uint1(buf[6]))<<48)| 175 (Uint8(Uint1(buf[7]))<<56); 176 } 177 178 union UFloatUint4 { 179 float f; 180 Uint4 i; 181 }; MakeFloatSBamUtil182 static float MakeFloat(const char* buf) 183 { 184 UFloatUint4 u; 185 u.i = MakeUint4(buf); 186 return u.f; 187 } 188 }; 189 190 191 class CBGZFPos 192 { 193 public: 194 typedef Uint8 TFileBlockPos; // position of block start in a file 195 typedef Uint4 TByteOffset; // position of byte within block 196 typedef Uint8 TVirtualPos; // virtual position, ordered 197 198 static const Uint4 kMaxBlockSize = 1<<16; 199 CBGZFPos()200 CBGZFPos() 201 : m_VirtualPos(0) 202 { 203 } 204 explicit CBGZFPos(TVirtualPos pos)205 CBGZFPos(TVirtualPos pos) 206 : m_VirtualPos(pos) 207 { 208 } CBGZFPos(TFileBlockPos block_pos,TByteOffset byte_offset)209 CBGZFPos(TFileBlockPos block_pos, TByteOffset byte_offset) 210 : m_VirtualPos((block_pos<<16)+byte_offset) 211 { 212 } 213 GetVirtualPos() const214 TVirtualPos GetVirtualPos() const 215 { 216 return m_VirtualPos; 217 } 218 GetFileBlockPos() const219 TFileBlockPos GetFileBlockPos() const 220 { 221 return m_VirtualPos >> 16; 222 } GetByteOffset() const223 TByteOffset GetByteOffset() const 224 { 225 return TByteOffset(m_VirtualPos&(0xffff)); 226 } 227 operator ==(const CBGZFPos & b) const228 bool operator==(const CBGZFPos& b) const 229 { 230 return m_VirtualPos == b.m_VirtualPos; 231 } operator !=(const CBGZFPos & b) const232 bool operator!=(const CBGZFPos& b) const 233 { 234 return m_VirtualPos != b.m_VirtualPos; 235 } operator <(const CBGZFPos & b) const236 bool operator<(const CBGZFPos& b) const 237 { 238 return m_VirtualPos < b.m_VirtualPos; 239 } operator >(const CBGZFPos & b) const240 bool operator>(const CBGZFPos& b) const 241 { 242 return m_VirtualPos > b.m_VirtualPos; 243 } operator <=(const CBGZFPos & b) const244 bool operator<=(const CBGZFPos& b) const 245 { 246 return m_VirtualPos <= b.m_VirtualPos; 247 } operator >=(const CBGZFPos & b) const248 bool operator>=(const CBGZFPos& b) const 249 { 250 return m_VirtualPos >= b.m_VirtualPos; 251 } 252 GetInvalid()253 static CBGZFPos GetInvalid() 254 { 255 return CBGZFPos(TVirtualPos(-1)); 256 } IsInvalid() const257 bool IsInvalid() const 258 { 259 return GetVirtualPos() == TVirtualPos(-1); 260 } 261 262 DECLARE_OPERATOR_BOOL(m_VirtualPos != 0); 263 264 private: 265 TVirtualPos m_VirtualPos; 266 267 }; 268 NCBI_BAMREAD_EXPORT 269 ostream& operator<<(ostream& out, const CBGZFPos& p); 270 271 typedef pair<CBGZFPos, CBGZFPos> CBGZFRange; 272 NCBI_BAMREAD_EXPORT 273 ostream& operator<<(ostream& out, const CBGZFRange& r); 274 275 class CBGZFBlock 276 { 277 public: 278 typedef Uint8 TFileBlockPos; // position of block start in a file 279 typedef Uint4 TFileBlockSize; // size of block in a file 280 typedef Uint4 TDataSize; // size of uncompressed data 281 typedef Uint4 TCRC32; 282 283 CBGZFBlock(); 284 ~CBGZFBlock(); 285 286 GetFileBlockPos() const287 TFileBlockPos GetFileBlockPos() const 288 { 289 return m_FileBlockPos; 290 } GetFileBlockSize() const291 TFileBlockSize GetFileBlockSize() const 292 { 293 return m_FileBlockSize; 294 } GetNextFileBlockPos() const295 TFileBlockPos GetNextFileBlockPos() const 296 { 297 return GetFileBlockPos() + GetFileBlockSize(); 298 } GetDataSize() const299 TDataSize GetDataSize() const 300 { 301 return m_DataSize; 302 } 303 304 static const TFileBlockSize kMaxFileBlockSize = 1<<16; 305 static const TDataSize kMaxDataSize = 1<<16; 306 307 protected: 308 friend class CBGZFFile; 309 friend class CBGZFStream; 310 311 private: 312 volatile TFileBlockPos m_FileBlockPos; 313 TFileBlockSize m_FileBlockSize; 314 TDataSize m_DataSize; 315 AutoArray<char> m_Data; 316 }; 317 318 319 class NCBI_BAMREAD_EXPORT CBGZFFile : public CObject 320 { 321 public: 322 explicit 323 CBGZFFile(const string& file_name); 324 ~CBGZFFile(); 325 GetReadStatistics() const326 pair<Uint8, double> GetReadStatistics() const 327 { 328 return m_File->GetReadStatistics(); 329 } SetPreviousReadStatistics(const pair<Uint8,double> & stats)330 void SetPreviousReadStatistics(const pair<Uint8, double>& stats) 331 { 332 m_File->SetPreviousReadStatistics(stats); 333 } 334 335 pair<Uint8, double> GetUncompressStatistics() const; 336 337 protected: 338 friend class CBGZFStream; 339 340 void x_AddUncompressStatistics(Uint8 bytes, double seconds); 341 342 typedef CBGZFPos::TFileBlockPos TFileBlockPos; 343 typedef CCacheWithLock<TFileBlockPos, CBGZFBlock> TBlockCache; 344 typedef TBlockCache::CLock TBlock; 345 346 TBlock GetBlock(TFileBlockPos file_pos, 347 CPagedFile::TPage& page, 348 CSimpleBufferT<char>& buffer); 349 350 bool x_ReadBlock(CBGZFBlock& block, 351 TFileBlockPos file_pos, 352 CPagedFile::TPage& page, 353 CSimpleBufferT<char>& buffer); 354 355 private: 356 CRef<CPagedFile> m_File; 357 CRef<TBlockCache> m_BlockCache; 358 359 CFastMutex m_Mutex; 360 361 volatile Uint8 m_TotalUncompressBytes; 362 volatile double m_TotalUncompressSeconds; 363 }; 364 365 366 class NCBI_BAMREAD_EXPORT CBGZFStream 367 { 368 public: 369 CBGZFStream(); 370 explicit 371 CBGZFStream(CBGZFFile& file); 372 ~CBGZFStream(); 373 374 void Close(); 375 void Open(CBGZFFile& file); 376 GetBlockDataSize() const377 CBGZFBlock::TDataSize GetBlockDataSize() const 378 { 379 return m_Block? m_Block->GetDataSize(): 0; 380 } GetBlockFilePos() const381 CBGZFBlock::TFileBlockPos GetBlockFilePos() const 382 { 383 return m_Block? m_Block->GetFileBlockPos(): 0; 384 } GetNextBlockFilePos() const385 CBGZFBlock::TFileBlockPos GetNextBlockFilePos() const 386 { 387 return m_Block? m_Block->GetNextFileBlockPos(): 0; 388 } HaveBytesInBlock() const389 bool HaveBytesInBlock() const 390 { 391 return m_ReadPos < GetBlockDataSize(); 392 } 393 GetPos() const394 CBGZFPos GetPos() const 395 { 396 return CBGZFPos(GetBlockFilePos(), m_ReadPos); 397 } GetNextBlockPos() const398 CBGZFPos GetNextBlockPos() const 399 { 400 return CBGZFPos(GetNextBlockFilePos(), 0); 401 } GetSeekPos() const402 CBGZFPos GetSeekPos() const 403 { 404 if ( HaveBytesInBlock() ) { 405 return GetPos(); 406 } 407 else { 408 return GetNextBlockPos(); 409 } 410 } GetEndPos() const411 CBGZFPos GetEndPos() const 412 { 413 return m_EndPos; 414 } 415 // seek to position to read till end_pos, or EOF if end_pos is invalid 416 void Seek(CBGZFPos pos, CBGZFPos end_pos = CBGZFPos::GetInvalid()); 417 418 // return non-zero number of available bytes in current decompressed buffer 419 size_t GetNextAvailableBytes(); 420 // return true if there are more bytes before this position HaveNextAvailableBytes()421 bool HaveNextAvailableBytes() 422 { 423 if ( HaveBytesInBlock() ) { 424 return GetPos() < m_EndPos; 425 } 426 return HaveNextDataBlock(); 427 } 428 // return true if there are more data blocks before this position 429 // current buffer must be read till the end 430 bool HaveNextDataBlock(); 431 432 // read up to count bytes into a buffer, may return smaller number 433 size_t Read(char* buf, size_t count); 434 435 // read count bytes and return pointer to read data 436 // the pointer is either into decompressed buffer or into temporary buffer 437 // the returned pointer is guaranteed to be valid until next read or seek 438 const char* Read(size_t count); 439 440 private: 441 bool x_NextBlock(); 442 443 const char* x_Read(CBGZFPos::TFileBlockPos file_pos, size_t size, char* buffer); 444 445 // returns false if m_EndPos is invalid and EOF happened 446 bool x_ReadBlock(CBGZFPos::TFileBlockPos file_pos); 447 448 CRef<CBGZFFile> m_File; 449 CPagedFile::TPage m_Page; 450 CBGZFFile::TBlock m_Block; 451 CBGZFPos::TByteOffset m_ReadPos; 452 CSimpleBufferT<char> m_InReadBuffer; 453 CSimpleBufferT<char> m_OutReadBuffer; 454 CBGZFPos m_EndPos; 455 }; 456 457 458 END_SCOPE(objects) 459 END_NCBI_SCOPE 460 461 #endif // SRA__READER__BAM__BGZF__HPP 462