1 /*
2 * File: BZ2StreamScanner.cpp
3 * Author: Yavor Nikolov
4 *
5 * Created on March 6, 2010, 10:07 PM
6 */
7
8 #include "pbzip2.h"
9 #include "BZ2StreamScanner.h"
10
11 #include <algorithm>
12 #include <new>
13 #include <exception>
14 #include <limits>
15
16 #include <cstring>
17 #include <cstdio>
18
19 using namespace std;
20
21 namespace pbzip2
22 {
23
// Out-of-class definition of the static constant member. No initializer is
// given here, so the value presumably comes from the in-class declaration
// (TODO confirm against BZ2StreamScanner.h).
const size_t BZ2StreamScanner::DEFAULT_OUT_BUFF_LIMIT;
25
BZ2StreamScanner(int hInFile,size_t inBuffCapacity)26 BZ2StreamScanner::BZ2StreamScanner( int hInFile, size_t inBuffCapacity ):
27 _inBuff( NULL ),
28 _inBuffCapacity( 0 )
29 {
30 _outBuff.buf = NULL;
31 _outBuff.bufSize = 0;
32
33 init( hInFile, inBuffCapacity );
34 }
35
36 /**
37 * Initialize - position to beginning of input file and prepare for searching.
38 *
39 * @return 0 - on success; -1 on error.
40 */
init(int hInFile,size_t inBuffCapacity)41 int BZ2StreamScanner::init( int hInFile, size_t inBuffCapacity )
42 {
43 dispose();
44
45 CharType bz2header[] = "BZh91AY&SY";
46 // zero-terminated string
47 CharType bz2ZeroHeader[] =
48 { 'B', 'Z', 'h', '9', 0x17, 0x72, 0x45, 0x38, 0x50, 0x90, 0 };
49
50 _hInFile = hInFile;
51 _eof = false;
52 _bz2Header = bz2header;
53 _bz2HeaderZero = bz2ZeroHeader;
54 _bz2HeaderFound = false;
55 _inBuffCapacity = 0;
56 _errState = 0;
57 _searchStatus = false;
58 _outBuffCapacityHint = 0;
59 _outBuffCapacityLimit = DEFAULT_OUT_BUFF_LIMIT;
60 _outSequenceNumber = 0;
61 _streamNumber = 0;
62
63 // Prevent too small buffer
64 if ( inBuffCapacity < 2 * _bz2Header.size() )
65 {
66 inBuffCapacity = 2 * _bz2Header.size();
67 }
68
69 // allocate memory to read in file
70 _inBuff = new(std::nothrow) CharType[inBuffCapacity];
71
72 if ( _inBuff == NULL )
73 {
74 _errState |= ERR_MEM_ALLOC_INBUFF;
75 _inBuffEnd = NULL;
76 handle_error( EF_EXIT, -1,
77 "pbzip2: *ERROR: Could not allocate memory (FileData)! Aborting...\n" );
78
79 return -1;
80 }
81
82 _inBuffCapacity = inBuffCapacity;
83
84 _inBuffCurrent = _inBuffSearchPtr = _inBuffEnd = _inBuff;
85
86 return 0;
87 }
88
89 /**
90 * dispose memory resources
91 */
dispose()92 void BZ2StreamScanner::dispose()
93 {
94 disposeMemory( _outBuff.buf );
95 _outBuff.bufSize = 0;
96
97 disposeMemory( _inBuff );
98 _inBuffCapacity = 0;
99
100 // close( _hInFile );
101 }
102
~BZ2StreamScanner()103 BZ2StreamScanner::~BZ2StreamScanner()
104 {
105 dispose();
106 }
107
108 /**
109 * Verify if there is enough space in output buffer. If not - then allocate.
110 */
ensureOutBuffCapacity(size_t newSize)111 int BZ2StreamScanner::ensureOutBuffCapacity( size_t newSize )
112 {
113 #ifdef PBZIP_DEBUG
114 fprintf( stderr, " start ensureOutBuffCapacity/newSize=%" PRIuMAX ": [", (uintmax_t) newSize );
115 printCurrentState();
116 fprintf( stderr, "\n" );
117 #endif
118
119 if ( newSize <= _outBuffCapacity )
120 {
121 return 0; // enough capacity already
122 }
123
124 if ( newSize > _outBuffCapacityHint )
125 {
126 _outBuffCapacityHint = ( 11 * newSize ) / 10;
127
128 if ( ( newSize <= getOutBuffCapacityLimit() ) &&
129 ( _outBuffCapacityHint > getOutBuffCapacityLimit() ) )
130 {
131 _outBuffCapacityHint = getOutBuffCapacityLimit();
132 }
133 }
134
135 char * newBuff = new(std::nothrow) char[_outBuffCapacityHint];
136 if ( newBuff == NULL )
137 {
138 handle_error( EF_EXIT, -1,
139 "pbzip2: *ERROR: Could not allocate memory (ensureOutBuffCapacity/%u)!"
140 "Aborting...\n", _outBuffCapacityHint );
141
142 _errState |= ERR_MEM_ALLOC_OUTBUFF;
143 return -1;
144 }
145
146 if ( _outBuff.buf != NULL )
147 {
148 memcpy( newBuff, _outBuff.buf, _outBuff.bufSize );
149 delete [] _outBuff.buf;
150 }
151
152 initOutBuff( newBuff, _outBuff.bufSize, _outBuffCapacityHint );
153
154 #ifdef PBZIP_DEBUG
155 fprintf( stderr, " end ensureOutBuffCapacity/newSize=%" PRIuMAX ": [", (uintmax_t) newSize );
156 printCurrentState();
157 fprintf( stderr, "\n" );
158 #endif
159
160 return 0;
161 }
162
163 /**
164 * Depending on wether we have already found bz2 header or not - either append
165 * data to output buffer or discard it.
166 * On append [current, end) is appended to output buffer. Output buffer is
167 * extended if there is not enough existing space available in it.
168 *
169 * @return the number of bytes appended to output buff or skipped. -1 on error.
170 */
appendOutBuffData(CharType * end)171 int BZ2StreamScanner::appendOutBuffData(CharType * end)
172 {
173 int additionSize = end - getInBuffCurrent();
174
175 #ifdef PBZIP_DEBUG
176 fprintf( stderr, " start appendOutBuffData/%d: [", additionSize );
177 printCurrentState();
178 fprintf( stderr, "\n" );
179 #endif
180
181 if ( additionSize <= 0 )
182 {
183 return 0;
184 }
185
186 if ( isBz2HeaderFound() )
187 {
188 size_t newSize = _outBuff.bufSize + additionSize;
189
190 if ( ensureOutBuffCapacity( newSize ) != 0 )
191 {
192 return - 1; // failure encountered
193 }
194
195 memcpy( getOutBuffEnd(), getInBuffCurrent(), additionSize );
196 _outBuff.bufSize += additionSize;
197 }
198
199 // slide current position
200 _inBuffCurrent = end;
201
202 #ifdef PBZIP_DEBUG
203 fprintf( stderr, " end appendOutBuffData/%d: [", additionSize );
204 printCurrentState();
205 fprintf( stderr, "\n" );
206 #endif
207
208 return additionSize;
209 }
210
211 /**
212 * Append available data from [current, search pos) to output buffer but
213 * just up to fill current out buffer capacity
214 */
appendOutBuffDataUpToLimit()215 int BZ2StreamScanner::appendOutBuffDataUpToLimit()
216 {
217 size_t maxCapacity = std::max( getOutBuffCapacityLimit(), _outBuffCapacity );
218 int maxAddition = maxCapacity - _outBuff.bufSize;
219 if (maxAddition <= 0 )
220 {
221 return 0;
222 }
223
224 CharType * end1;
225 if ( eof() )
226 {
227 end1 = getInBuffEnd();
228 }
229 else
230 {
231 // subtract header size to keep the tail (since start of next header may be in it)
232 end1 = std::min( getInBuffSearchPtr(), getInBuffEnd() - ( getHeaderSize() - 1 ) );
233 }
234 CharType * end2 = getInBuffCurrent() + maxAddition;
235 CharType * end = std::min( end1, end2 );
236
237 return appendOutBuffData( end );
238 }
239
240 /**
241 * Copy end section of input buffer to beginning just in case the BZIP2 header
242 * is located between two buffer boundaries. Copy the other remaining
243 * data into output buffer.
244 */
rewindInBuff()245 int BZ2StreamScanner::rewindInBuff()
246 {
247 // temporarily mark tail beginning (not real header position)
248 _inBuffSearchPtr = getInBuffEnd() - ( _bz2Header.size() - 1 );
249 int ret = appendOutBuffData();
250 if ( failed() || ( ret < 0 ) )
251 {
252 return -1;
253 }
254 else if ( ret == 0 )
255 {
256 // search pos <= current
257 _inBuffSearchPtr = getInBuffCurrent();
258 }
259
260 int tailSize = getInBuffEnd() - getInBuffSearchPtr();
261
262 #ifdef PBZIP_DEBUG
263 fprintf( stderr, " rewindInBuff: tail len: %d; app ret=%d [", tailSize, ret );
264 printCurrentState();
265 fprintf( stderr, "\n" );
266 #endif
267
268 // copy tail of input buffer to start and cut the rest
269 std::copy( getInBuffSearchPtr(), getInBuffEnd(), getInBuffBegin() );
270 _inBuffEnd = getInBuffBegin() + tailSize;
271 _inBuffCurrent = getInBuffBegin();
272 _inBuffSearchPtr = getInBuffBegin();
273
274 #ifdef PBZIP_DEBUG
275 fprintf( stderr, " end rewindInBuff: tail len: %d; app ret=%d [", tailSize, ret );
276 printCurrentState();
277 fprintf( stderr, "\n" );
278 #endif
279
280 return 0;
281 }
282
283 /**
284 * Load data from file to input buffer. Read untill buffer is full or end of
285 * file is reached or error is encountered.
286 *
287 * Enough additional capacity should be ensured otherwise may return 0 before
288 * eof.
289 *
290 * @return Returns number of read bytes on success; 0 - on end of file; < 0 on error
291 */
readData()292 int BZ2StreamScanner::readData()
293 {
294 rewindInBuff();
295 if ( failed() )
296 {
297 return -1;
298 }
299
300 if ( getInBuffSize() >= getInBuffCapacity() )
301 {
302 handle_error( EF_EXIT, -1,
303 "pbzip2: *ERROR: BZ2StreamScanner::readData not enough buffer free space!"
304 " inBuffSize=%u, _inBuffCapacity=%u! Aborting...\n",
305 getInBuffSize(), getInBuffCapacity() );
306
307 _errState |= ERR_IO_INSUFFICIENT_BUFF_CAPACITY;
308 return -1;
309 }
310
311 int bytesRead = do_read( _hInFile, getInBuffEnd(),
312 getInBuffCapacity() - getInBuffSize() );
313
314 #ifdef PBZIP_DEBUG
315 fprintf( stderr, " readData: %d bytes read\n", bytesRead );
316 #endif
317
318 if ( bytesRead > 0 )
319 {
320 _inBuffEnd += bytesRead;
321 }
322 else if ( bytesRead < 0 )
323 {
324 handle_error( EF_EXIT, -1,
325 "pbzip2: *ERROR: Could not read from input file [err=%d]! Aborting...\n", bytesRead );
326
327 _errState |= ERR_IO_READ;
328 return bytesRead;
329 }
330 else // ( bytesRead == 0 )
331 {
332 _eof = true;
333 }
334
335 return bytesRead;
336 }
337
338 /**
339 * Locate BZh header prefix in buffer. In case of first search - just check
340 * the beginning of buffer and signal error if it doesn't match to headers.
341 *
342 * @return pointer to BZh header prefix if located. getInBuffEnd() if not.
343 * failure() and getErrState() will indicate error if such occurred.
344 */
locateHeaderPrefixInBuff()345 BZ2StreamScanner::CharType * BZ2StreamScanner::locateHeaderPrefixInBuff()
346 {
347 size_t prefixLen = 3;
348
349 #ifdef PBZIP_DEBUG
350 fprintf( stderr, " start locateHeaderPrefixInBuff; " );
351 printCurrentState();
352 fprintf( stderr, "\n" );
353 #endif
354
355 // first search
356 if ( !isBz2HeaderFound() )
357 {
358 if ( ( getInBuffSearchPtr() != getInBuffBegin() ) ||
359 ( getInBuffSize() < _bz2Header.size() ) )
360 {
361 _errState |= ERR_INVALID_FILE_FORMAT;
362 _inBuffSearchPtr = getInBuffEnd();
363 }
364 else if ( _bz2Header.compare( 0, prefixLen, getInBuffSearchPtr(), prefixLen ) == 0 )
365 {
366 // header prefix found
367 }
368 else
369 {
370 _errState |= ERR_INVALID_FILE_FORMAT;
371 _inBuffSearchPtr = getInBuffEnd();
372 }
373 }
374 else
375 {
376 _inBuffSearchPtr = std::search( getInBuffSearchPtr(), getInBuffEnd(),
377 _bz2Header.begin(), _bz2Header.begin() + prefixLen );
378 }
379
380 #ifdef PBZIP_DEBUG
381 if ( getInBuffSearchPtr() != getInBuffEnd() )
382 {
383 fprintf( stderr, " end locateHeaderPrefixInBuff - header prefix found; " );
384 }
385 else
386 {
387 fprintf( stderr, " end locateHeaderPrefixInBuff - header prefix not found; " );
388 }
389 printCurrentState();
390 fprintf( stderr, "\n" );
391 #endif
392
393 return getInBuffSearchPtr();
394 }
395
396
397 /**
398 * Search next bz2 header just in currently available input buffer.
399 * (Doesn't read more data from file).
400 *
401 * @return pointer to header or getInBuffEnd() if such is not found.
402 */
searchNextHeaderInBuff()403 BZ2StreamScanner::CharType * BZ2StreamScanner::searchNextHeaderInBuff()
404 {
405 #ifdef PBZIP_DEBUG
406 fprintf( stderr, " start searchNextHeaderInBuff; " );
407 printCurrentState();
408 fprintf( stderr, "\n" );
409 #endif
410
411 _searchStatus = false;
412 size_t prefixLen = 3;
413 size_t hsp = prefixLen + 1; // header selection position
414
415 locateHeaderPrefixInBuff();
416 while ( !failed() && ( getUnsearchedCount() >= getHeaderSize() ) )
417 {
418 // _inBuffSearchPtr += prefixLen;
419 basic_string<CharType> * pHdr = NULL;
420
421 if ( getInBuffSearchPtr()[hsp] == _bz2Header[hsp] )
422 {
423 pHdr = &_bz2Header;
424 #ifdef PBZIP_DEBUG
425 fprintf( stderr, " searchNextHeaderInBuff - kind of NON-ZERO header\n" );
426 #endif
427 }
428 else if ( getInBuffSearchPtr()[hsp] == _bz2HeaderZero[hsp] )
429 {
430 pHdr = &_bz2HeaderZero;
431 #ifdef PBZIP_DEBUG
432 fprintf( stderr, " searchNextHeaderInBuff - kind of ZERO header\n" );
433 #endif
434 }
435
436 if ( pHdr != NULL )
437 {
438 CharType bwtSizeChar = getInBuffSearchPtr()[prefixLen];
439 if ( ( bwtSizeChar >= '1' ) && ( bwtSizeChar <= '9' ) )
440 {
441 (*pHdr)[prefixLen] = bwtSizeChar;
442
443 // compare the remaining part of magic header
444 int cmpres = pHdr->compare( hsp, pHdr->size() - hsp,
445 getInBuffSearchPtr() + hsp, pHdr->size() - hsp );
446
447 #ifdef PBZIP_DEBUG
448 fprintf( stderr, " searchNextHeaderInBuff:cmpres=%d\n", cmpres );
449 #endif
450 if ( cmpres == 0 )
451 {
452 _searchStatus = true;
453 #ifdef PBZIP_DEBUG
454 fprintf( stderr, " end searchNextHeaderInBuff - found; " );
455 printCurrentState();
456 fprintf( stderr, "\n" );
457 #endif
458 return _inBuffSearchPtr;
459 }
460 }
461 }
462
463 if ( !isBz2HeaderFound() )
464 {
465 // not finding header on first search means failure
466 _errState |= ERR_INVALID_FILE_FORMAT;
467 break;
468 }
469 else
470 {
471 _inBuffSearchPtr += prefixLen;
472 locateHeaderPrefixInBuff();
473 }
474 }
475
476 // no header has been found if we're here
477 _inBuffSearchPtr = getInBuffEnd();
478
479 #ifdef PBZIP_DEBUG
480 fprintf( stderr, " end searchNextHeaderInBuff; " );
481 printCurrentState();
482 fprintf( stderr, "\n" );
483 #endif
484
485 return _inBuffSearchPtr;
486 }
487
488
#ifdef PBZIP_DEBUG
/**
 * Debug helper (compiled with PBZIP_DEBUG only): print buffer positions
 * (as offsets from the beginning of the input buffer), output buffer
 * size/capacity and search flags to stderr. No line break is emitted.
 */
void BZ2StreamScanner::printCurrentState()
{
    const long currentOffset = (long)(getInBuffCurrent() - getInBuffBegin());
    const long searchOffset = (long)(getInBuffSearchPtr() - getInBuffBegin());
    const long endOffset = (long)(getInBuffEnd() - getInBuffBegin());
    const long searchAhead = (long)(getInBuffSearchPtr() - getInBuffCurrent());

    const int outSize = (int)_outBuff.bufSize;
    const int outCapacity = (int)_outBuffCapacity;
    const int headerFound = (int)isBz2HeaderFound();
    const int searchStatus = (int)getSearchStatus();

    fprintf( stderr, "current=%ld, search pos=%ld, end pos=%ld; s-c=%ld"
        "; out buf size=%d; out buf capacity=%d; header found=%d; search status=%d",
        currentOffset, searchOffset, endOffset, searchAhead,
        outSize, outCapacity, headerFound, searchStatus );
}
#endif
504
505 /**
506 * Search next bz2 header. Read more data from file if needed.
507 *
508 * @return pointer to header is returned if found;
509 * getInBuffEnd() - if not found (or error).
510 * One should check failure() or _errorState for error details.
511 */
searchNextHeader()512 BZ2StreamScanner::CharType * BZ2StreamScanner::searchNextHeader()
513 {
514 #ifdef PBZIP_DEBUG
515 fprintf( stderr, " start searchNextHeader %u/%" PRIuMAX "... : ",
516 (unsigned) (getInBuffSearchPtr() - getInBuffBegin()), (uintmax_t) getInBuffSize() );
517 printCurrentState();
518 fprintf( stderr, "\n" );
519 #endif
520
521 if ( getUnsearchedCount() > 0 )
522 {
523 searchNextHeaderInBuff();
524 }
525
526 while ( !getSearchStatus() && !eof() && !failed() && !isOutBuffFullEnough() )
527 {
528 readData();
529
530 if ( failed() )
531 {
532 return getInBuffEnd();
533 }
534
535 searchNextHeaderInBuff();
536 }
537
538 if ( getSearchStatus() )
539 {
540 _bz2HeaderFound = true;
541
542 #ifdef PBZIP_DEBUG
543 fprintf( stderr, " header found; " );
544 printCurrentState();
545 fprintf( stderr, "\n" );
546 #endif
547 }
548
549 if ( failed() )
550 {
551 return _inBuffSearchPtr = getInBuffEnd();
552 }
553
554 #ifdef PBZIP_DEBUG
555 fprintf( stderr, " end searchNextHeader %u/%" PRIuMAX "... NOT FOUND: ",
556 (unsigned) (getInBuffSearchPtr() - getInBuffBegin()), (uintmax_t) getInBuffSize() );
557 printCurrentState();
558 fprintf( stderr, "\n" );
559 #endif
560
561 return _inBuffSearchPtr;
562 }
563
564 /**
565 * Get next BZ2 stream from the input.
566 *
567 * @return output buffer initialized with bz2 stream. failure() should be checked
568 * after calling this method - true would mean failure(). If failure() is false:
569 * - outBuff.bufSize == 0 indicates end of file;
570 */
getNextStream()571 outBuff * BZ2StreamScanner::getNextStream()
572 {
573 initOutBuff();
574
575 #ifdef PBZIP_DEBUG
576 static OFF_T blockNum = 0;
577 #endif
578
579 outBuff * res = new(std::nothrow) outBuff;
580 if ( res == NULL )
581 {
582 handle_error( EF_EXIT, -1,
583 "pbzip2: *ERROR: Could not allocate memory (getNextStream/%u)!"
584 "Aborting...\n", (unsigned) sizeof( outBuff ) );
585
586 _errState |= ERR_MEM_ALLOC_OUTBUFF;
587 return res;
588 }
589
590 res->buf = NULL;
591 res->bufSize = std::numeric_limits<unsigned int>::max();
592
593 // first search
594 if ( !failed() && !isBz2HeaderFound() )
595 {
596 #ifdef PBZIP_DEBUG
597 blockNum = 0;
598 fprintf( stderr, " First search start\n" );
599 #endif
600
601 _searchStatus = false;
602 searchNextHeader();
603 }
604
605 if ( failed() )
606 {
607 return res;
608 }
609
610 if ( ( getInBuffCurrent() == getInBuffEnd() ) && eof() )
611 {
612 // end of file
613 #ifdef PBZIP_DEBUG
614 fprintf( stderr, " End of file\n" );
615 #endif
616
617 res->bufSize = 0;
618 return res;
619 }
620
621 if ( ( getInBuffCurrent() == getInBuffSearchPtr() ) ||
622 ( !getSearchStatus() && !eof() ) )
623 {
624 // search for next header
625 // Slide a bit to skip current header in order to search for next one.
626 _inBuffSearchPtr = std::min( getInBuffSearchPtr() + _bz2Header.size(),
627 getInBuffEnd() );
628 _searchStatus = false;
629
630 #ifdef PBZIP_DEBUG
631 fprintf( stderr, " getNextStream - Searching subsequent header... " );
632 printCurrentState();
633 fprintf( stderr, "\n" );
634 #endif
635
636 searchNextHeader();
637 }
638
639 if ( failed() )
640 {
641 return res;
642 }
643
644 appendOutBuffDataUpToLimit();
645
646 if ( failed() )
647 {
648 return res;
649 }
650
651 if ( _outSequenceNumber > 0 )
652 {
653 // continuing an unterminated sequence
654 ++_outSequenceNumber;
655 }
656 else if ( getInBuffCurrent() != getInBuffSearchPtr() )
657 {
658 // start of long multi-part stream
659 _outSequenceNumber = 1;
660 }
661
662 _outBuff.sequenceNumber = _outSequenceNumber;
663 _outBuff.inSize = _outBuff.bufSize;
664 _outBuff.blockNumber = _streamNumber;
665
666 if ( getInBuffCurrent() == getInBuffSearchPtr() )
667 {
668 // we're at end of stream (either single or multi-segment one)
669 _outBuff.isLastInSequence = true;
670 _outSequenceNumber = 0;
671 ++_streamNumber;
672 }
673 else
674 {
675 _outBuff.isLastInSequence = false;
676 }
677
678
679 #ifdef PBZIP_DEBUG
680 OFF_T startBlock = blockNum;
681 blockNum += _outBuff.bufSize;
682
683 fprintf( stderr, " end getNextStream/blockRange=[%" PRIuMAX ", %" PRIuMAX "), stream no=%d; seq=%d: [",
684 (uintmax_t) startBlock, (uintmax_t) blockNum, _outBuff.blockNumber, _outBuff.sequenceNumber );
685 printCurrentState();
686 fprintf( stderr, "\n" );
687 #endif
688
689 *res = _outBuff;
690 // clean-up pointers to returned data.
691 initOutBuff();
692
693 return res;
694 }
695
initOutBuff(char * buf,size_t bufSize,size_t bufCapacity)696 void BZ2StreamScanner::initOutBuff( char * buf, size_t bufSize, size_t bufCapacity )
697 {
698 _outBuff.buf = buf;
699 _outBuff.bufSize = bufSize;
700 _outBuffCapacity = bufCapacity;
701 _outBuff.inSize = 0;
702 }
703
704 } // namespace pbzip2
705
706