1 #include "util/read_compressed.hh"
2 
3 #include "util/file.hh"
4 #include "util/have.hh"
5 #include "util/scoped.hh"
6 
7 #include <algorithm>
8 #include <iostream>
9 
10 #include <cassert>
11 #include <climits>
12 #include <cstdlib>
13 #include <cstring>
14 
15 #ifdef HAVE_ZLIB
16 #include <zlib.h>
17 #endif
18 
19 #ifdef HAVE_BZLIB
20 #include <bzlib.h>
21 #endif
22 
23 #ifdef HAVE_XZLIB
24 #include <lzma.h>
25 #endif
26 
27 namespace util {
28 
CompressedException()29 CompressedException::CompressedException() throw() {}
~CompressedException()30 CompressedException::~CompressedException() throw() {}
31 
GZException()32 GZException::GZException() throw() {}
~GZException()33 GZException::~GZException() throw() {}
34 
BZException()35 BZException::BZException() throw() {}
~BZException()36 BZException::~BZException() throw() {}
37 
XZException()38 XZException::XZException() throw() {}
~XZException()39 XZException::~XZException() throw() {}
40 
ReplaceThis(ReadBase * with,ReadCompressed & thunk)41 void ReadBase::ReplaceThis(ReadBase *with, ReadCompressed &thunk) {
42   thunk.internal_.reset(with);
43 }
44 
Current(ReadCompressed & thunk)45 ReadBase *ReadBase::Current(ReadCompressed &thunk) { return thunk.internal_.get(); }
46 
ReadCount(ReadCompressed & thunk)47 uint64_t &ReadBase::ReadCount(ReadCompressed &thunk) {
48   return thunk.raw_amount_;
49 }
50 
51 namespace {
52 
53 ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, std::size_t already_size, bool require_compressed);
54 
55 // Completed file that other classes can thunk to.
56 class Complete : public ReadBase {
57   public:
Read(void *,std::size_t,ReadCompressed &)58     std::size_t Read(void *, std::size_t, ReadCompressed &) {
59       return 0;
60     }
61 };
62 
63 class Uncompressed : public ReadBase {
64   public:
Uncompressed(int fd)65     explicit Uncompressed(int fd) : fd_(fd) {}
66 
Read(void * to,std::size_t amount,ReadCompressed & thunk)67     std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
68       std::size_t got = PartialRead(fd_.get(), to, amount);
69       ReadCount(thunk) += got;
70       return got;
71     }
72 
73   private:
74     scoped_fd fd_;
75 };
76 
77 class UncompressedWithHeader : public ReadBase {
78   public:
UncompressedWithHeader(int fd,const void * already_data,std::size_t already_size)79     UncompressedWithHeader(int fd, const void *already_data, std::size_t already_size) : fd_(fd) {
80       assert(already_size);
81       buf_.reset(malloc(already_size));
82       if (!buf_.get()) throw std::bad_alloc();
83       memcpy(buf_.get(), already_data, already_size);
84       remain_ = static_cast<uint8_t*>(buf_.get());
85       end_ = remain_ + already_size;
86     }
87 
Read(void * to,std::size_t amount,ReadCompressed & thunk)88     std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
89       assert(buf_.get());
90       assert(remain_ != end_);
91       std::size_t sending = std::min<std::size_t>(amount, end_ - remain_);
92       memcpy(to, remain_, sending);
93       remain_ += sending;
94       if (remain_ == end_) {
95         ReplaceThis(new Uncompressed(fd_.release()), thunk);
96       }
97       return sending;
98     }
99 
100   private:
101     scoped_malloc buf_;
102     uint8_t *remain_;
103     uint8_t *end_;
104 
105     scoped_fd fd_;
106 };
107 
108 static const std::size_t kInputBuffer = 16384;
109 
110 template <class Compression> class StreamCompressed : public ReadBase {
111   public:
StreamCompressed(int fd,const void * already_data,std::size_t already_size)112     StreamCompressed(int fd, const void *already_data, std::size_t already_size)
113       : file_(fd),
114         in_buffer_(MallocOrThrow(kInputBuffer)),
115         back_(memcpy(in_buffer_.get(), already_data, already_size), already_size) {}
116 
Read(void * to,std::size_t amount,ReadCompressed & thunk)117     std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
118       if (amount == 0) return 0;
119       back_.SetOutput(to, amount);
120       do {
121         if (!back_.Stream().avail_in) ReadInput(thunk);
122         if (!back_.Process()) {
123           // reached end, at least for the compressed portion.
124           std::size_t ret = static_cast<const uint8_t *>(static_cast<void*>(back_.Stream().next_out)) - static_cast<const uint8_t*>(to);
125           ReplaceThis(ReadFactory(file_.release(), ReadCount(thunk), back_.Stream().next_in, back_.Stream().avail_in, true), thunk);
126           if (ret) return ret;
127           // We did not read anything this round, so clients might think EOF.  Transfer responsibility to the next reader.
128           return Current(thunk)->Read(to, amount, thunk);
129         }
130       } while (back_.Stream().next_out == to);
131       return static_cast<const uint8_t*>(static_cast<void*>(back_.Stream().next_out)) - static_cast<const uint8_t*>(to);
132     }
133 
134   private:
ReadInput(ReadCompressed & thunk)135     void ReadInput(ReadCompressed &thunk) {
136       assert(!back_.Stream().avail_in);
137       std::size_t got = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer);
138       back_.SetInput(in_buffer_.get(), got);
139       ReadCount(thunk) += got;
140     }
141 
142     scoped_fd file_;
143     scoped_malloc in_buffer_;
144 
145     Compression back_;
146 };
147 
148 #ifdef HAVE_ZLIB
149 class GZip {
150   public:
GZip(const void * base,std::size_t amount)151     GZip(const void *base, std::size_t amount) {
152       SetInput(base, amount);
153       stream_.zalloc = Z_NULL;
154       stream_.zfree = Z_NULL;
155       stream_.opaque = Z_NULL;
156       stream_.msg = NULL;
157       // 32 for zlib and gzip decoding with automatic header detection.
158       // 15 for maximum window size.
159       UTIL_THROW_IF(Z_OK != inflateInit2(&stream_, 32 + 15), GZException, "Failed to initialize zlib.");
160     }
161 
~GZip()162     ~GZip() {
163       if (Z_OK != inflateEnd(&stream_)) {
164         std::cerr << "zlib could not close properly." << std::endl;
165         abort();
166       }
167     }
168 
SetOutput(void * to,std::size_t amount)169     void SetOutput(void *to, std::size_t amount) {
170       stream_.next_out = static_cast<Bytef*>(to);
171       stream_.avail_out = std::min<std::size_t>(std::numeric_limits<uInt>::max(), amount);
172     }
173 
SetInput(const void * base,std::size_t amount)174     void SetInput(const void *base, std::size_t amount) {
175       assert(amount < static_cast<std::size_t>(std::numeric_limits<uInt>::max()));
176       stream_.next_in = const_cast<Bytef*>(static_cast<const Bytef*>(base));
177       stream_.avail_in = amount;
178     }
179 
Stream() const180     const z_stream &Stream() const { return stream_; }
181 
Process()182     bool Process() {
183       int result = inflate(&stream_, 0);
184       switch (result) {
185         case Z_OK:
186           return true;
187         case Z_STREAM_END:
188           return false;
189         case Z_ERRNO:
190           UTIL_THROW(ErrnoException, "zlib error");
191         default:
192           UTIL_THROW(GZException, "zlib encountered " << (stream_.msg ? stream_.msg : "an error ") << " code " << result);
193       }
194     }
195 
196   private:
197     z_stream stream_;
198 };
199 #endif // HAVE_ZLIB
200 
201 #ifdef HAVE_BZLIB
202 class BZip {
203   public:
BZip(const void * base,std::size_t amount)204     BZip(const void *base, std::size_t amount) {
205       memset(&stream_, 0, sizeof(stream_));
206       SetInput(base, amount);
207       HandleError(BZ2_bzDecompressInit(&stream_, 0, 0));
208     }
209 
~BZip()210     ~BZip() {
211       try {
212         HandleError(BZ2_bzDecompressEnd(&stream_));
213       } catch (const std::exception &e) {
214         std::cerr << e.what() << std::endl;
215         abort();
216       }
217     }
218 
Process()219     bool Process() {
220       int ret = BZ2_bzDecompress(&stream_);
221       if (ret == BZ_STREAM_END) return false;
222       HandleError(ret);
223       return true;
224     }
225 
SetOutput(void * base,std::size_t amount)226     void SetOutput(void *base, std::size_t amount) {
227       stream_.next_out = static_cast<char*>(base);
228       stream_.avail_out = std::min<std::size_t>(std::numeric_limits<unsigned int>::max(), amount);
229     }
230 
SetInput(const void * base,std::size_t amount)231     void SetInput(const void *base, std::size_t amount) {
232       stream_.next_in = const_cast<char*>(static_cast<const char*>(base));
233       stream_.avail_in = amount;
234     }
235 
Stream() const236     const bz_stream &Stream() const { return stream_; }
237 
238   private:
HandleError(int value)239     void HandleError(int value) {
240       switch(value) {
241         case BZ_OK:
242           return;
243         case BZ_CONFIG_ERROR:
244           UTIL_THROW(BZException, "bzip2 seems to be miscompiled.");
245         case BZ_PARAM_ERROR:
246           UTIL_THROW(BZException, "bzip2 Parameter error");
247         case BZ_DATA_ERROR:
248           UTIL_THROW(BZException, "bzip2 detected a corrupt file");
249         case BZ_DATA_ERROR_MAGIC:
250           UTIL_THROW(BZException, "bzip2 detected bad magic bytes.  Perhaps this was not a bzip2 file after all?");
251         case BZ_MEM_ERROR:
252           throw std::bad_alloc();
253         default:
254           UTIL_THROW(BZException, "Unknown bzip2 error code " << value);
255       }
256     }
257 
258     bz_stream stream_;
259 };
260 #endif // HAVE_BZLIB
261 
262 #ifdef HAVE_XZLIB
263 class XZip {
264   public:
XZip(const void * base,std::size_t amount)265     XZip(const void *base, std::size_t amount)
266       : stream_(), action_(LZMA_RUN) {
267       memset(&stream_, 0, sizeof(stream_));
268       SetInput(base, amount);
269       HandleError(lzma_stream_decoder(&stream_, UINT64_MAX, 0));
270     }
271 
~XZip()272     ~XZip() {
273       lzma_end(&stream_);
274     }
275 
SetOutput(void * base,std::size_t amount)276     void SetOutput(void *base, std::size_t amount) {
277       stream_.next_out = static_cast<uint8_t*>(base);
278       stream_.avail_out = amount;
279     }
280 
SetInput(const void * base,std::size_t amount)281     void SetInput(const void *base, std::size_t amount) {
282       stream_.next_in = static_cast<const uint8_t*>(base);
283       stream_.avail_in = amount;
284       if (!amount) action_ = LZMA_FINISH;
285     }
286 
Stream() const287     const lzma_stream &Stream() const { return stream_; }
288 
Process()289     bool Process() {
290       lzma_ret status = lzma_code(&stream_, action_);
291       if (status == LZMA_STREAM_END) return false;
292       HandleError(status);
293       return true;
294     }
295 
296   private:
HandleError(lzma_ret value)297     void HandleError(lzma_ret value) {
298       switch (value) {
299         case LZMA_OK:
300           return;
301         case LZMA_MEM_ERROR:
302           throw std::bad_alloc();
303         case LZMA_FORMAT_ERROR:
304           UTIL_THROW(XZException, "xzlib says file format not recognized");
305         case LZMA_OPTIONS_ERROR:
306           UTIL_THROW(XZException, "xzlib says unsupported compression options");
307         case LZMA_DATA_ERROR:
308           UTIL_THROW(XZException, "xzlib says this file is corrupt");
309         case LZMA_BUF_ERROR:
310           UTIL_THROW(XZException, "xzlib says unexpected end of input");
311         default:
312           UTIL_THROW(XZException, "unrecognized xzlib error " << value);
313       }
314     }
315 
316     lzma_stream stream_;
317     lzma_action action_;
318 };
319 #endif // HAVE_XZLIB
320 
321 class IStreamReader : public ReadBase {
322   public:
IStreamReader(std::istream & stream)323     explicit IStreamReader(std::istream &stream) : stream_(stream) {}
324 
Read(void * to,std::size_t amount,ReadCompressed & thunk)325     std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
326       if (!stream_.read(static_cast<char*>(to), amount)) {
327         UTIL_THROW_IF(!stream_.eof(), ErrnoException, "istream error");
328         amount = stream_.gcount();
329       }
330       ReadCount(thunk) += amount;
331       return amount;
332     }
333 
334   private:
335     std::istream &stream_;
336 };
337 
// Outcome of sniffing the first bytes of a file.
enum MagicResult {
  UTIL_UNKNOWN, UTIL_GZIP, UTIL_BZIP, UTIL_XZIP
};

// Classifies the `length` bytes at `from_void` by their leading magic bytes:
// 0x1f 0x8b is gzip, "BZh" is bzip2 and 0xFD "7zXZ" 0x00 is xz.  Returns
// UTIL_UNKNOWN when nothing matches or the input is shorter than the
// signature being tested.
MagicResult DetectMagic(const void *from_void, std::size_t length) {
  static const uint8_t kGZMagic[2] = { 0x1f, 0x8b };
  static const uint8_t kBZMagic[3] = { 'B', 'Z', 'h' };
  static const uint8_t kXZMagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };

  const uint8_t *const bytes = static_cast<const uint8_t*>(from_void);

  const bool is_gzip = length >= sizeof(kGZMagic) && bytes[0] == kGZMagic[0] && bytes[1] == kGZMagic[1];
  if (is_gzip) return UTIL_GZIP;

  const bool is_bzip = length >= sizeof(kBZMagic) && memcmp(bytes, kBZMagic, sizeof(kBZMagic)) == 0;
  if (is_bzip) return UTIL_BZIP;

  const bool is_xz = length >= sizeof(kXZMagic) && memcmp(bytes, kXZMagic, sizeof(kXZMagic)) == 0;
  return is_xz ? UTIL_XZIP : UTIL_UNKNOWN;
}
357 
ReadFactory(int fd,uint64_t & raw_amount,const void * already_data,const std::size_t already_size,bool require_compressed)358 ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, const std::size_t already_size, bool require_compressed) {
359   scoped_fd hold(fd);
360   std::string header(reinterpret_cast<const char*>(already_data), already_size);
361   if (header.size() < ReadCompressed::kMagicSize) {
362     std::size_t original = header.size();
363     header.resize(ReadCompressed::kMagicSize);
364     std::size_t got = ReadOrEOF(fd, &header[original], ReadCompressed::kMagicSize - original);
365     raw_amount += got;
366     header.resize(original + got);
367   }
368   if (header.empty()) {
369     return new Complete();
370   }
371   switch (DetectMagic(&header[0], header.size())) {
372     case UTIL_GZIP:
373 #ifdef HAVE_ZLIB
374       return new StreamCompressed<GZip>(hold.release(), header.data(), header.size());
375 #else
376       UTIL_THROW(CompressedException, "This looks like a gzip file but gzip support was not compiled in.");
377 #endif
378     case UTIL_BZIP:
379 #ifdef HAVE_BZLIB
380       return new StreamCompressed<BZip>(hold.release(), &header[0], header.size());
381 #else
382       UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZh), but bzip support was not compiled in.");
383 #endif
384     case UTIL_XZIP:
385 #ifdef HAVE_XZLIB
386       return new StreamCompressed<XZip>(hold.release(), header.data(), header.size());
387 #else
388       UTIL_THROW(CompressedException, "This looks like an xz file, but xz support was not compiled in.");
389 #endif
390     default:
391       UTIL_THROW_IF(require_compressed, CompressedException, "Uncompressed data detected after a compresssed file.  This could be supported but usually indicates an error.");
392       return new UncompressedWithHeader(hold.release(), header.data(), header.size());
393   }
394 }
395 
396 } // namespace
397 
DetectCompressedMagic(const void * from_void)398 bool ReadCompressed::DetectCompressedMagic(const void *from_void) {
399   return DetectMagic(from_void, kMagicSize) != UTIL_UNKNOWN;
400 }
401 
ReadCompressed(int fd)402 ReadCompressed::ReadCompressed(int fd) {
403   Reset(fd);
404 }
405 
ReadCompressed(std::istream & in)406 ReadCompressed::ReadCompressed(std::istream &in) {
407   Reset(in);
408 }
409 
ReadCompressed()410 ReadCompressed::ReadCompressed() {}
411 
Reset(int fd)412 void ReadCompressed::Reset(int fd) {
413   raw_amount_ = 0;
414   internal_.reset();
415   internal_.reset(ReadFactory(fd, raw_amount_, NULL, 0, false));
416 }
417 
Reset(std::istream & in)418 void ReadCompressed::Reset(std::istream &in) {
419   internal_.reset();
420   internal_.reset(new IStreamReader(in));
421 }
422 
Read(void * to,std::size_t amount)423 std::size_t ReadCompressed::Read(void *to, std::size_t amount) {
424   return internal_->Read(to, amount, *this);
425 }
426 
ReadOrEOF(void * const to_in,std::size_t amount)427 std::size_t ReadCompressed::ReadOrEOF(void *const to_in, std::size_t amount) {
428   uint8_t *to = reinterpret_cast<uint8_t*>(to_in);
429   while (amount) {
430     std::size_t got = Read(to, amount);
431     if (!got) break;
432     to += got;
433     amount -= got;
434   }
435   return to - reinterpret_cast<uint8_t*>(to_in);
436 }
437 
438 } // namespace util
439