1 /*
2  * File:   BZ2StreamScanner.h
3  * Author: Yavor Nikolov
4  *
5  * Created on March 6, 2010, 10:07 PM
6  */
7 
8 #ifndef _BZ2STREAMSCANNER_H
9 #define _BZ2STREAMSCANNER_H
10 
11 #include "pbzip2.h"
12 #include <vector>
13 #include <string>
14 
15 using namespace std;
16 
17 namespace pbzip2
18 {
19 
20 class BZ2StreamScanner
21 {
22 public:
23 	typedef unsigned char CharType;
24 
25 	static const size_t DEFAULT_IN_BUFF_CAPACITY = 1024 * 1024; // 1M
26 	static const size_t DEFAULT_OUT_BUFF_LIMIT = 1024 * 1024;
27 
28 	enum BZ2SScannerErrorFlag
29 	{
30 		ERR_MEM_ALLOC_INBUFF = 1,
31 		ERR_MEM_ALLOC_OUTBUFF = 1 << 1,
32 		ERR_IO_READ = 1 << 2,
33 		ERR_IO_INSUFFICIENT_BUFF_CAPACITY = 1 << 3,
34 		ERR_INVALID_STATE = 1 << 4,
35 		ERR_INVALID_FILE_FORMAT = 1 << 5
36 	};
37 
38 	BZ2StreamScanner( int hInFile, size_t inBuffCapacity = DEFAULT_IN_BUFF_CAPACITY );
39 	int init( int hInFile, size_t inBuffCapacity = DEFAULT_IN_BUFF_CAPACITY );
40 
41 	virtual ~BZ2StreamScanner();
42 
43 	outBuff * getNextStream();
44 
getInBuffSize()45 	size_t getInBuffSize() const { return ( _inBuffEnd - _inBuff ); }
getInBuffCapacity()46 	size_t getInBuffCapacity() const { return _inBuffCapacity; }
getHeader()47 	const basic_string<CharType> & getHeader() const { return _bz2Header; }
getHeaderSize()48 	size_t getHeaderSize() const { return _bz2Header.size(); }
getErrState()49 	int getErrState() const { return _errState; }
failed()50 	bool failed() { return ( _errState != 0 ); }
51 
52 	/** true if header has been found since last initialization */
isBz2HeaderFound()53 	bool isBz2HeaderFound() const { return _bz2HeaderFound; }
54 
55 	/** status of last/current search only */
getSearchStatus()56 	bool getSearchStatus() const { return _searchStatus; }
57 
58 	// end of file
eof()59 	bool eof() const { return _eof; }
60 
61 	/** true if out buffer is full enough to produce output block */
isOutBuffFullEnough()62 	bool isOutBuffFullEnough() const { return _outBuff.bufSize >= getOutBuffCapacityLimit(); }
63 
64 	/**
65 	 * dispose memory resources
66 	 */
67 	virtual void dispose();
68 
69 	#ifdef PBZIP_DEBUG
70 	void printCurrentState();
71 	#endif
72 
73 private:
74 	/* disable copy c-tor */
BZ2StreamScanner(const BZ2StreamScanner & orig)75 	BZ2StreamScanner( const BZ2StreamScanner& orig ) {}
76 
77 	void initOutBuff( char * buf = NULL, size_t bufSize = 0, size_t bufCapacity = 0 );
78 	int appendOutBuffData( CharType * end );
appendOutBuffData()79 	int appendOutBuffData() { return appendOutBuffData( getInBuffSearchPtr() ); }
80 	int appendOutBuffDataUpToLimit();
81 	int ensureOutBuffCapacity( size_t newSize );
82 	int readData();
83 
getInBuffEnd()84 	CharType * getInBuffEnd() { return _inBuffEnd; }
getInBuffBegin()85 	CharType * getInBuffBegin() { return _inBuff; }
getInBuffCurrent()86 	CharType * getInBuffCurrent() { return _inBuffCurrent; }
getInBuffSearchPtr()87 	CharType * getInBuffSearchPtr() { return _inBuffSearchPtr; }
getOutBuffEnd()88 	char * getOutBuffEnd() { return _outBuff.buf + _outBuff.bufSize; }
getUnsearchedCount()89 	size_t getUnsearchedCount() const { return _inBuffEnd - _inBuffSearchPtr; }
90 
91 	/**
92 	 * Search next bz2 header. Read more data from file if needed.
93 	 *
94 	 * @return pointer to header is returned if found;
95 	 *         getInBuffEnd() - if not found; NULL - on error.
96 	 */
97 	CharType * searchNextHeader();
98 
99 	/**
100 	 * Search next bz2 header just in currently available input buffer.
101 	 * (Doesn't read more data from file).
102 	 *
103 	 * @return pointer to header or getInBuffEnd() if such is not found.
104 	 */
105 	CharType * searchNextHeaderInBuff();
106 
107 	/**
108 	 * Prepare for next read from file into input buffer.
109 	 * Consumes remaining input data buffer and moves header tail to beginning.
110 	 *
111 	 */
112 	int rewindInBuff();
113 
114 	/**
115 	 * Locate BZh header prefix in buffer. In case of first search - just check
116 	 * the beginning of buffer and signal error if it doesn't match to headers.
117 	 *
118 	 * @return pointer to BZh header prefix if located. getInBuffEnd() if not.
119 	 *         failure() and getErrState() will indicate error if such occurred.
120 	 */
121 	CharType * locateHeaderPrefixInBuff();
122 
getOutBuffCapacityLimit()123 	size_t getOutBuffCapacityLimit() const { return _outBuffCapacityLimit; }
124 
125 	int _hInFile; // input file descriptor
126 	bool _eof;
127 
128 	basic_string<CharType> _bz2Header;
129 	basic_string<CharType> _bz2HeaderZero;
130 	bool _bz2HeaderFound;
131 	bool _searchStatus;
132 
133 	CharType * _inBuff;
134 	CharType * _inBuffEnd; // end of data read from file
135 	CharType * _inBuffCurrent;
136 	CharType * _inBuffSearchPtr;
137 
138 	size_t _inBuffCapacity; // allocated memory capacity for in buffer
139 
140 	outBuff _outBuff;
141 	size_t _outBuffCapacity;
142 	size_t _outBuffCapacityHint; // keep max used capacity
143 	size_t _outBuffCapacityLimit;
144 
145 	unsigned int _errState; // 0 - ok; otherwise error
146 	int _outSequenceNumber; // output block sequence number in bz2 stream (>0 if segmented)
147 	int _streamNumber;
148 };
149 
150 }
151 
152 #endif /* _BZ2STREAMSCANNER_H */
153 
154