1 #ifndef SRA__READER__BAM__BGZF__HPP
2 #define SRA__READER__BAM__BGZF__HPP
3 /*  $Id: bgzf.hpp 604204 2020-03-24 15:25:24Z vasilche $
4  * ===========================================================================
5  *
6  *                            PUBLIC DOMAIN NOTICE
7  *               National Center for Biotechnology Information
8  *
9  *  This software/database is a "United States Government Work" under the
10  *  terms of the United States Copyright Act.  It was written as part of
11  *  the author's official duties as a United States Government employee and
12  *  thus cannot be copyrighted.  This software/database is freely available
13  *  to the public for use. The National Library of Medicine and the U.S.
14  *  Government have not placed any restriction on its use or reproduction.
15  *
16  *  Although all reasonable efforts have been taken to ensure the accuracy
17  *  and reliability of the software and data, the NLM and the U.S.
18  *  Government do not and cannot warrant the performance or results that
19  *  may be obtained by using this software or data. The NLM and the U.S.
20  *  Government disclaim all warranties, express or implied, including
21  *  warranties of performance, merchantability or fitness for any particular
22  *  purpose.
23  *
24  *  Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Authors:  Eugene Vasilchenko
29  *
30  * File Description:
31  *   Access to BGZF files (block GZip file)
32  *
33  */
34 
#include <corelib/ncbistd.hpp>
#include <corelib/ncbifile.hpp>
#include <util/simple_buffer.hpp>
#include <sra/readers/bam/vdbfile.hpp>
#include <sra/readers/bam/cache_with_lock.hpp>

#include <cstring>
40 
41 BEGIN_NCBI_SCOPE
42 BEGIN_SCOPE(objects)
43 
44 class CSeq_entry;
45 class CPagedFile;
46 class CPagedFilePage;
47 class CBGZFFile;
48 class CBGZFStream;
49 
50 class CPagedFilePage : public CObject
51 {
52 public:
53     typedef Uint8 TFilePos;
54 
55     CPagedFilePage();
56     ~CPagedFilePage();
57 
GetFilePos() const58     TFilePos GetFilePos() const
59     {
60         return m_FilePos;
61     }
GetPageSize() const62     size_t GetPageSize() const
63     {
64         return m_Size;
65     }
GetPagePtr() const66     const char* GetPagePtr() const
67     {
68         return m_Ptr;
69     }
70 
Contains(TFilePos file_pos) const71     bool Contains(TFilePos file_pos) const
72     {
73         return (file_pos - GetFilePos()) < GetPageSize();
74     }
75 
76 protected:
77     friend class CPagedFile;
78 
79 private:
80     volatile TFilePos m_FilePos;
81     size_t m_Size;
82     const char* m_Ptr;
83     CSimpleBufferT<char> m_Buffer;
84     CMemoryFileMap* m_MemFile;
85 };
86 
87 
88 class NCBI_BAMREAD_EXPORT CPagedFile : public CObject
89 {
90 public:
91     typedef CPagedFilePage::TFilePos TFilePos;
92 
93     explicit
94     CPagedFile(const string& file_name);
95     ~CPagedFile();
96 
97 #define USE_RANGE_CACHE 1
98 #ifdef USE_RANGE_CACHE
99     typedef CBinaryRangeCacheWithLock<TFilePos, CPagedFilePage> TPageCache;
100 #else
101     typedef CCacheWithLock<TFilePos, CPagedFilePage> TPageCache;
102 #endif
103     typedef TPageCache::CLock TPage;
104 
105     // return page that contains the file position
106     TPage GetPage(TFilePos pos);
107 
108     pair<Uint8, double> GetReadStatistics() const;
109     void SetPreviousReadStatistics(const pair<Uint8, double>& stats);
110     // estimate best next page size to read using collected statistics
111     size_t GetNextPageSizePow2() const;
112 
113 private:
114     void x_AddReadStatistics(Uint8 bytes, double seconds);
115 
116     void x_ReadPage(CPagedFilePage& page, TFilePos file_pos, size_t size);
117 
118     CFastMutex m_Mutex;
119 
120     // three variants: direct file IO, memory mapped file, or VDB KFile
121     CFileIO m_File;
122     AutoPtr<CMemoryFileMap> m_MemFile;
123     CBamVDBFile m_VDBFile;
124 
125     // cache for loaded pages
126     CRef<TPageCache> m_PageCache;
127 
128     volatile Uint8 m_TotalReadBytes;
129     volatile double m_TotalReadSeconds;
130     Uint8 m_PreviousReadBytes;
131     double m_PreviousReadSeconds;
132 };
133 
134 
135 class NCBI_BAMREAD_EXPORT CBGZFException : public CException
136 {
137 public:
138     enum EErrCode {
139         eOtherError,
140         eFormatError,      ///< includes decompression errors
141         eInvalidArg        ///< invalid function argument
142     };
143     virtual const char* GetErrCodeString(void) const override;
144     NCBI_EXCEPTION_DEFAULT(CBGZFException,CException);
145 };
146 
147 
148 struct SBamUtil {
149     // conversion of BAM bytes into larger values - ints and floats
150     // the source data have any alignment
151 
MakeUint2SBamUtil152     static Uint2 MakeUint2(const char* buf)
153         {
154             return Uint2(Uint1(buf[0]))|
155                 (Uint2(Uint1(buf[1]))<<8);
156         }
157 
MakeUint4SBamUtil158     static Uint4 MakeUint4(const char* buf)
159         {
160             return Uint4(Uint1(buf[0]))|
161                 (Uint4(Uint1(buf[1]))<<8)|
162                 (Uint4(Uint1(buf[2]))<<16)|
163                 (Uint4(Uint1(buf[3]))<<24);
164         }
165 
MakeUint8SBamUtil166     static Uint8 MakeUint8(const char* buf)
167         {
168             return Uint8(Uint1(buf[0]))|
169                 (Uint8(Uint1(buf[1]))<<8)|
170                 (Uint8(Uint1(buf[2]))<<16)|
171                 (Uint8(Uint1(buf[3]))<<24)|
172                 (Uint8(Uint1(buf[4]))<<32)|
173                 (Uint8(Uint1(buf[5]))<<40)|
174                 (Uint8(Uint1(buf[6]))<<48)|
175                 (Uint8(Uint1(buf[7]))<<56);
176         }
177 
178     union UFloatUint4 {
179         float f;
180         Uint4 i;
181     };
MakeFloatSBamUtil182     static float MakeFloat(const char* buf)
183         {
184             UFloatUint4 u;
185             u.i = MakeUint4(buf);
186             return u.f;
187         }
188 };
189 
190 
191 class CBGZFPos
192 {
193 public:
194     typedef Uint8 TFileBlockPos; // position of block start in a file
195     typedef Uint4 TByteOffset; // position of byte within block
196     typedef Uint8 TVirtualPos; // virtual position, ordered
197 
198     static const Uint4 kMaxBlockSize = 1<<16;
199 
CBGZFPos()200     CBGZFPos()
201         : m_VirtualPos(0)
202         {
203         }
204     explicit
CBGZFPos(TVirtualPos pos)205     CBGZFPos(TVirtualPos pos)
206         : m_VirtualPos(pos)
207         {
208         }
CBGZFPos(TFileBlockPos block_pos,TByteOffset byte_offset)209     CBGZFPos(TFileBlockPos block_pos, TByteOffset byte_offset)
210         : m_VirtualPos((block_pos<<16)+byte_offset)
211         {
212         }
213 
GetVirtualPos() const214     TVirtualPos GetVirtualPos() const
215         {
216             return m_VirtualPos;
217         }
218 
GetFileBlockPos() const219     TFileBlockPos GetFileBlockPos() const
220         {
221             return m_VirtualPos >> 16;
222         }
GetByteOffset() const223     TByteOffset GetByteOffset() const
224         {
225             return TByteOffset(m_VirtualPos&(0xffff));
226         }
227 
operator ==(const CBGZFPos & b) const228     bool operator==(const CBGZFPos& b) const
229         {
230             return m_VirtualPos == b.m_VirtualPos;
231         }
operator !=(const CBGZFPos & b) const232     bool operator!=(const CBGZFPos& b) const
233         {
234             return m_VirtualPos != b.m_VirtualPos;
235         }
operator <(const CBGZFPos & b) const236     bool operator<(const CBGZFPos& b) const
237         {
238             return m_VirtualPos < b.m_VirtualPos;
239         }
operator >(const CBGZFPos & b) const240     bool operator>(const CBGZFPos& b) const
241         {
242             return m_VirtualPos > b.m_VirtualPos;
243         }
operator <=(const CBGZFPos & b) const244     bool operator<=(const CBGZFPos& b) const
245         {
246             return m_VirtualPos <= b.m_VirtualPos;
247         }
operator >=(const CBGZFPos & b) const248     bool operator>=(const CBGZFPos& b) const
249         {
250             return m_VirtualPos >= b.m_VirtualPos;
251         }
252 
GetInvalid()253     static CBGZFPos GetInvalid()
254         {
255             return CBGZFPos(TVirtualPos(-1));
256         }
IsInvalid() const257     bool IsInvalid() const
258         {
259             return GetVirtualPos() == TVirtualPos(-1);
260         }
261 
262     DECLARE_OPERATOR_BOOL(m_VirtualPos != 0);
263 
264 private:
265     TVirtualPos m_VirtualPos;
266 
267 };
268 NCBI_BAMREAD_EXPORT
269 ostream& operator<<(ostream& out, const CBGZFPos& p);
270 
271 typedef pair<CBGZFPos, CBGZFPos> CBGZFRange;
272 NCBI_BAMREAD_EXPORT
273 ostream& operator<<(ostream& out, const CBGZFRange& r);
274 
275 class CBGZFBlock
276 {
277 public:
278     typedef Uint8 TFileBlockPos; // position of block start in a file
279     typedef Uint4 TFileBlockSize; // size of block in a file
280     typedef Uint4 TDataSize; // size of uncompressed data
281     typedef Uint4 TCRC32;
282 
283     CBGZFBlock();
284     ~CBGZFBlock();
285 
286 
GetFileBlockPos() const287     TFileBlockPos GetFileBlockPos() const
288         {
289             return m_FileBlockPos;
290         }
GetFileBlockSize() const291     TFileBlockSize GetFileBlockSize() const
292         {
293             return m_FileBlockSize;
294         }
GetNextFileBlockPos() const295     TFileBlockPos GetNextFileBlockPos() const
296         {
297             return GetFileBlockPos() + GetFileBlockSize();
298         }
GetDataSize() const299     TDataSize GetDataSize() const
300         {
301             return m_DataSize;
302         }
303 
304     static const TFileBlockSize kMaxFileBlockSize = 1<<16;
305     static const TDataSize kMaxDataSize = 1<<16;
306 
307 protected:
308     friend class CBGZFFile;
309     friend class CBGZFStream;
310 
311 private:
312     volatile TFileBlockPos m_FileBlockPos;
313     TFileBlockSize m_FileBlockSize;
314     TDataSize m_DataSize;
315     AutoArray<char> m_Data;
316 };
317 
318 
319 class NCBI_BAMREAD_EXPORT CBGZFFile : public CObject
320 {
321 public:
322     explicit
323     CBGZFFile(const string& file_name);
324     ~CBGZFFile();
325 
GetReadStatistics() const326     pair<Uint8, double> GetReadStatistics() const
327         {
328             return m_File->GetReadStatistics();
329         }
SetPreviousReadStatistics(const pair<Uint8,double> & stats)330     void SetPreviousReadStatistics(const pair<Uint8, double>& stats)
331         {
332             m_File->SetPreviousReadStatistics(stats);
333         }
334 
335     pair<Uint8, double> GetUncompressStatistics() const;
336 
337 protected:
338     friend class CBGZFStream;
339 
340     void x_AddUncompressStatistics(Uint8 bytes, double seconds);
341 
342     typedef CBGZFPos::TFileBlockPos TFileBlockPos;
343     typedef CCacheWithLock<TFileBlockPos, CBGZFBlock> TBlockCache;
344     typedef TBlockCache::CLock TBlock;
345 
346     TBlock GetBlock(TFileBlockPos file_pos,
347                     CPagedFile::TPage& page,
348                     CSimpleBufferT<char>& buffer);
349 
350     bool x_ReadBlock(CBGZFBlock& block,
351                      TFileBlockPos file_pos,
352                      CPagedFile::TPage& page,
353                      CSimpleBufferT<char>& buffer);
354 
355 private:
356     CRef<CPagedFile> m_File;
357     CRef<TBlockCache> m_BlockCache;
358 
359     CFastMutex m_Mutex;
360 
361     volatile Uint8 m_TotalUncompressBytes;
362     volatile double m_TotalUncompressSeconds;
363 };
364 
365 
366 class NCBI_BAMREAD_EXPORT CBGZFStream
367 {
368 public:
369     CBGZFStream();
370     explicit
371     CBGZFStream(CBGZFFile& file);
372     ~CBGZFStream();
373 
374     void Close();
375     void Open(CBGZFFile& file);
376 
GetBlockDataSize() const377     CBGZFBlock::TDataSize GetBlockDataSize() const
378         {
379             return m_Block? m_Block->GetDataSize(): 0;
380         }
GetBlockFilePos() const381     CBGZFBlock::TFileBlockPos GetBlockFilePos() const
382         {
383             return m_Block? m_Block->GetFileBlockPos(): 0;
384         }
GetNextBlockFilePos() const385     CBGZFBlock::TFileBlockPos GetNextBlockFilePos() const
386         {
387             return m_Block? m_Block->GetNextFileBlockPos(): 0;
388         }
HaveBytesInBlock() const389     bool HaveBytesInBlock() const
390         {
391             return m_ReadPos < GetBlockDataSize();
392         }
393 
GetPos() const394     CBGZFPos GetPos() const
395         {
396             return CBGZFPos(GetBlockFilePos(), m_ReadPos);
397         }
GetNextBlockPos() const398     CBGZFPos GetNextBlockPos() const
399         {
400             return CBGZFPos(GetNextBlockFilePos(), 0);
401         }
GetSeekPos() const402     CBGZFPos GetSeekPos() const
403         {
404             if ( HaveBytesInBlock() ) {
405                 return GetPos();
406             }
407             else {
408                 return GetNextBlockPos();
409             }
410         }
GetEndPos() const411     CBGZFPos GetEndPos() const
412         {
413             return m_EndPos;
414         }
415     // seek to position to read till end_pos, or EOF if end_pos is invalid
416     void Seek(CBGZFPos pos, CBGZFPos end_pos = CBGZFPos::GetInvalid());
417 
418     // return non-zero number of available bytes in current decompressed buffer
419     size_t GetNextAvailableBytes();
420     // return true if there are more bytes before this position
HaveNextAvailableBytes()421     bool HaveNextAvailableBytes()
422         {
423             if ( HaveBytesInBlock() ) {
424                 return GetPos() < m_EndPos;
425             }
426             return HaveNextDataBlock();
427         }
428     // return true if there are more data blocks before this position
429     // current buffer must be read till the end
430     bool HaveNextDataBlock();
431 
432     // read up to count bytes into a buffer, may return smaller number
433     size_t Read(char* buf, size_t count);
434 
435     // read count bytes and return pointer to read data
436     // the pointer is either into decompressed buffer or into temporary buffer
437     // the returned pointer is guaranteed to be valid until next read or seek
438     const char* Read(size_t count);
439 
440 private:
441     bool x_NextBlock();
442 
443     const char* x_Read(CBGZFPos::TFileBlockPos file_pos, size_t size, char* buffer);
444 
445     // returns false if m_EndPos is invalid and EOF happened
446     bool x_ReadBlock(CBGZFPos::TFileBlockPos file_pos);
447 
448     CRef<CBGZFFile> m_File;
449     CPagedFile::TPage m_Page;
450     CBGZFFile::TBlock m_Block;
451     CBGZFPos::TByteOffset m_ReadPos;
452     CSimpleBufferT<char> m_InReadBuffer;
453     CSimpleBufferT<char> m_OutReadBuffer;
454     CBGZFPos m_EndPos;
455 };
456 
457 
458 END_SCOPE(objects)
459 END_NCBI_SCOPE
460 
461 #endif // SRA__READER__BAM__BGZF__HPP
462