1 #include "util/read_compressed.hh"
2
3 #include "util/file.hh"
4 #include "util/have.hh"
5 #include "util/scoped.hh"
6
7 #include <algorithm>
8 #include <iostream>
9
10 #include <cassert>
11 #include <climits>
12 #include <cstdlib>
13 #include <cstring>
14
15 #ifdef HAVE_ZLIB
16 #include <zlib.h>
17 #endif
18
19 #ifdef HAVE_BZLIB
20 #include <bzlib.h>
21 #endif
22
23 #ifdef HAVE_XZLIB
24 #include <lzma.h>
25 #endif
26
27 namespace util {
28
CompressedException()29 CompressedException::CompressedException() throw() {}
~CompressedException()30 CompressedException::~CompressedException() throw() {}
31
GZException()32 GZException::GZException() throw() {}
~GZException()33 GZException::~GZException() throw() {}
34
BZException()35 BZException::BZException() throw() {}
~BZException()36 BZException::~BZException() throw() {}
37
XZException()38 XZException::XZException() throw() {}
~XZException()39 XZException::~XZException() throw() {}
40
ReplaceThis(ReadBase * with,ReadCompressed & thunk)41 void ReadBase::ReplaceThis(ReadBase *with, ReadCompressed &thunk) {
42 thunk.internal_.reset(with);
43 }
44
Current(ReadCompressed & thunk)45 ReadBase *ReadBase::Current(ReadCompressed &thunk) { return thunk.internal_.get(); }
46
ReadCount(ReadCompressed & thunk)47 uint64_t &ReadBase::ReadCount(ReadCompressed &thunk) {
48 return thunk.raw_amount_;
49 }
50
51 namespace {
52
53 ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, std::size_t already_size, bool require_compressed);
54
55 // Completed file that other classes can thunk to.
56 class Complete : public ReadBase {
57 public:
Read(void *,std::size_t,ReadCompressed &)58 std::size_t Read(void *, std::size_t, ReadCompressed &) {
59 return 0;
60 }
61 };
62
63 class Uncompressed : public ReadBase {
64 public:
Uncompressed(int fd)65 explicit Uncompressed(int fd) : fd_(fd) {}
66
Read(void * to,std::size_t amount,ReadCompressed & thunk)67 std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
68 std::size_t got = PartialRead(fd_.get(), to, amount);
69 ReadCount(thunk) += got;
70 return got;
71 }
72
73 private:
74 scoped_fd fd_;
75 };
76
77 class UncompressedWithHeader : public ReadBase {
78 public:
UncompressedWithHeader(int fd,const void * already_data,std::size_t already_size)79 UncompressedWithHeader(int fd, const void *already_data, std::size_t already_size) : fd_(fd) {
80 assert(already_size);
81 buf_.reset(malloc(already_size));
82 if (!buf_.get()) throw std::bad_alloc();
83 memcpy(buf_.get(), already_data, already_size);
84 remain_ = static_cast<uint8_t*>(buf_.get());
85 end_ = remain_ + already_size;
86 }
87
Read(void * to,std::size_t amount,ReadCompressed & thunk)88 std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
89 assert(buf_.get());
90 assert(remain_ != end_);
91 std::size_t sending = std::min<std::size_t>(amount, end_ - remain_);
92 memcpy(to, remain_, sending);
93 remain_ += sending;
94 if (remain_ == end_) {
95 ReplaceThis(new Uncompressed(fd_.release()), thunk);
96 }
97 return sending;
98 }
99
100 private:
101 scoped_malloc buf_;
102 uint8_t *remain_;
103 uint8_t *end_;
104
105 scoped_fd fd_;
106 };
107
108 static const std::size_t kInputBuffer = 16384;
109
110 template <class Compression> class StreamCompressed : public ReadBase {
111 public:
StreamCompressed(int fd,const void * already_data,std::size_t already_size)112 StreamCompressed(int fd, const void *already_data, std::size_t already_size)
113 : file_(fd),
114 in_buffer_(MallocOrThrow(kInputBuffer)),
115 back_(memcpy(in_buffer_.get(), already_data, already_size), already_size) {}
116
Read(void * to,std::size_t amount,ReadCompressed & thunk)117 std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
118 if (amount == 0) return 0;
119 back_.SetOutput(to, amount);
120 do {
121 if (!back_.Stream().avail_in) ReadInput(thunk);
122 if (!back_.Process()) {
123 // reached end, at least for the compressed portion.
124 std::size_t ret = static_cast<const uint8_t *>(static_cast<void*>(back_.Stream().next_out)) - static_cast<const uint8_t*>(to);
125 ReplaceThis(ReadFactory(file_.release(), ReadCount(thunk), back_.Stream().next_in, back_.Stream().avail_in, true), thunk);
126 if (ret) return ret;
127 // We did not read anything this round, so clients might think EOF. Transfer responsibility to the next reader.
128 return Current(thunk)->Read(to, amount, thunk);
129 }
130 } while (back_.Stream().next_out == to);
131 return static_cast<const uint8_t*>(static_cast<void*>(back_.Stream().next_out)) - static_cast<const uint8_t*>(to);
132 }
133
134 private:
ReadInput(ReadCompressed & thunk)135 void ReadInput(ReadCompressed &thunk) {
136 assert(!back_.Stream().avail_in);
137 std::size_t got = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer);
138 back_.SetInput(in_buffer_.get(), got);
139 ReadCount(thunk) += got;
140 }
141
142 scoped_fd file_;
143 scoped_malloc in_buffer_;
144
145 Compression back_;
146 };
147
148 #ifdef HAVE_ZLIB
149 class GZip {
150 public:
GZip(const void * base,std::size_t amount)151 GZip(const void *base, std::size_t amount) {
152 SetInput(base, amount);
153 stream_.zalloc = Z_NULL;
154 stream_.zfree = Z_NULL;
155 stream_.opaque = Z_NULL;
156 stream_.msg = NULL;
157 // 32 for zlib and gzip decoding with automatic header detection.
158 // 15 for maximum window size.
159 UTIL_THROW_IF(Z_OK != inflateInit2(&stream_, 32 + 15), GZException, "Failed to initialize zlib.");
160 }
161
~GZip()162 ~GZip() {
163 if (Z_OK != inflateEnd(&stream_)) {
164 std::cerr << "zlib could not close properly." << std::endl;
165 abort();
166 }
167 }
168
SetOutput(void * to,std::size_t amount)169 void SetOutput(void *to, std::size_t amount) {
170 stream_.next_out = static_cast<Bytef*>(to);
171 stream_.avail_out = std::min<std::size_t>(std::numeric_limits<uInt>::max(), amount);
172 }
173
SetInput(const void * base,std::size_t amount)174 void SetInput(const void *base, std::size_t amount) {
175 assert(amount < static_cast<std::size_t>(std::numeric_limits<uInt>::max()));
176 stream_.next_in = const_cast<Bytef*>(static_cast<const Bytef*>(base));
177 stream_.avail_in = amount;
178 }
179
Stream() const180 const z_stream &Stream() const { return stream_; }
181
Process()182 bool Process() {
183 int result = inflate(&stream_, 0);
184 switch (result) {
185 case Z_OK:
186 return true;
187 case Z_STREAM_END:
188 return false;
189 case Z_ERRNO:
190 UTIL_THROW(ErrnoException, "zlib error");
191 default:
192 UTIL_THROW(GZException, "zlib encountered " << (stream_.msg ? stream_.msg : "an error ") << " code " << result);
193 }
194 }
195
196 private:
197 z_stream stream_;
198 };
199 #endif // HAVE_ZLIB
200
201 #ifdef HAVE_BZLIB
202 class BZip {
203 public:
BZip(const void * base,std::size_t amount)204 BZip(const void *base, std::size_t amount) {
205 memset(&stream_, 0, sizeof(stream_));
206 SetInput(base, amount);
207 HandleError(BZ2_bzDecompressInit(&stream_, 0, 0));
208 }
209
~BZip()210 ~BZip() {
211 try {
212 HandleError(BZ2_bzDecompressEnd(&stream_));
213 } catch (const std::exception &e) {
214 std::cerr << e.what() << std::endl;
215 abort();
216 }
217 }
218
Process()219 bool Process() {
220 int ret = BZ2_bzDecompress(&stream_);
221 if (ret == BZ_STREAM_END) return false;
222 HandleError(ret);
223 return true;
224 }
225
SetOutput(void * base,std::size_t amount)226 void SetOutput(void *base, std::size_t amount) {
227 stream_.next_out = static_cast<char*>(base);
228 stream_.avail_out = std::min<std::size_t>(std::numeric_limits<unsigned int>::max(), amount);
229 }
230
SetInput(const void * base,std::size_t amount)231 void SetInput(const void *base, std::size_t amount) {
232 stream_.next_in = const_cast<char*>(static_cast<const char*>(base));
233 stream_.avail_in = amount;
234 }
235
Stream() const236 const bz_stream &Stream() const { return stream_; }
237
238 private:
HandleError(int value)239 void HandleError(int value) {
240 switch(value) {
241 case BZ_OK:
242 return;
243 case BZ_CONFIG_ERROR:
244 UTIL_THROW(BZException, "bzip2 seems to be miscompiled.");
245 case BZ_PARAM_ERROR:
246 UTIL_THROW(BZException, "bzip2 Parameter error");
247 case BZ_DATA_ERROR:
248 UTIL_THROW(BZException, "bzip2 detected a corrupt file");
249 case BZ_DATA_ERROR_MAGIC:
250 UTIL_THROW(BZException, "bzip2 detected bad magic bytes. Perhaps this was not a bzip2 file after all?");
251 case BZ_MEM_ERROR:
252 throw std::bad_alloc();
253 default:
254 UTIL_THROW(BZException, "Unknown bzip2 error code " << value);
255 }
256 }
257
258 bz_stream stream_;
259 };
260 #endif // HAVE_BZLIB
261
262 #ifdef HAVE_XZLIB
263 class XZip {
264 public:
XZip(const void * base,std::size_t amount)265 XZip(const void *base, std::size_t amount)
266 : stream_(), action_(LZMA_RUN) {
267 memset(&stream_, 0, sizeof(stream_));
268 SetInput(base, amount);
269 HandleError(lzma_stream_decoder(&stream_, UINT64_MAX, 0));
270 }
271
~XZip()272 ~XZip() {
273 lzma_end(&stream_);
274 }
275
SetOutput(void * base,std::size_t amount)276 void SetOutput(void *base, std::size_t amount) {
277 stream_.next_out = static_cast<uint8_t*>(base);
278 stream_.avail_out = amount;
279 }
280
SetInput(const void * base,std::size_t amount)281 void SetInput(const void *base, std::size_t amount) {
282 stream_.next_in = static_cast<const uint8_t*>(base);
283 stream_.avail_in = amount;
284 if (!amount) action_ = LZMA_FINISH;
285 }
286
Stream() const287 const lzma_stream &Stream() const { return stream_; }
288
Process()289 bool Process() {
290 lzma_ret status = lzma_code(&stream_, action_);
291 if (status == LZMA_STREAM_END) return false;
292 HandleError(status);
293 return true;
294 }
295
296 private:
HandleError(lzma_ret value)297 void HandleError(lzma_ret value) {
298 switch (value) {
299 case LZMA_OK:
300 return;
301 case LZMA_MEM_ERROR:
302 throw std::bad_alloc();
303 case LZMA_FORMAT_ERROR:
304 UTIL_THROW(XZException, "xzlib says file format not recognized");
305 case LZMA_OPTIONS_ERROR:
306 UTIL_THROW(XZException, "xzlib says unsupported compression options");
307 case LZMA_DATA_ERROR:
308 UTIL_THROW(XZException, "xzlib says this file is corrupt");
309 case LZMA_BUF_ERROR:
310 UTIL_THROW(XZException, "xzlib says unexpected end of input");
311 default:
312 UTIL_THROW(XZException, "unrecognized xzlib error " << value);
313 }
314 }
315
316 lzma_stream stream_;
317 lzma_action action_;
318 };
319 #endif // HAVE_XZLIB
320
321 class IStreamReader : public ReadBase {
322 public:
IStreamReader(std::istream & stream)323 explicit IStreamReader(std::istream &stream) : stream_(stream) {}
324
Read(void * to,std::size_t amount,ReadCompressed & thunk)325 std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
326 if (!stream_.read(static_cast<char*>(to), amount)) {
327 UTIL_THROW_IF(!stream_.eof(), ErrnoException, "istream error");
328 amount = stream_.gcount();
329 }
330 ReadCount(thunk) += amount;
331 return amount;
332 }
333
334 private:
335 std::istream &stream_;
336 };
337
// Compression formats that can be recognized from the leading bytes of a file.
enum MagicResult {
  UTIL_UNKNOWN,
  UTIL_GZIP,
  UTIL_BZIP,
  UTIL_XZIP
};

// Classify the first `length` bytes at `from_void` by magic number.
// Formats are tried in the order gzip, bzip2, xz; a format is only reported
// when `length` covers its whole signature.  Returns UTIL_UNKNOWN otherwise.
MagicResult DetectMagic(const void *from_void, std::size_t length) {
  static const uint8_t kGZMagic[2] = { 0x1f, 0x8b };
  static const uint8_t kBZMagic[3] = { 'B', 'Z', 'h' };
  static const uint8_t kXZMagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };

  if (length >= sizeof(kGZMagic) && 0 == memcmp(from_void, kGZMagic, sizeof(kGZMagic))) {
    return UTIL_GZIP;
  }
  if (length >= sizeof(kBZMagic) && 0 == memcmp(from_void, kBZMagic, sizeof(kBZMagic))) {
    return UTIL_BZIP;
  }
  if (length >= sizeof(kXZMagic) && 0 == memcmp(from_void, kXZMagic, sizeof(kXZMagic))) {
    return UTIL_XZIP;
  }
  return UTIL_UNKNOWN;
}
357
ReadFactory(int fd,uint64_t & raw_amount,const void * already_data,const std::size_t already_size,bool require_compressed)358 ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, const std::size_t already_size, bool require_compressed) {
359 scoped_fd hold(fd);
360 std::string header(reinterpret_cast<const char*>(already_data), already_size);
361 if (header.size() < ReadCompressed::kMagicSize) {
362 std::size_t original = header.size();
363 header.resize(ReadCompressed::kMagicSize);
364 std::size_t got = ReadOrEOF(fd, &header[original], ReadCompressed::kMagicSize - original);
365 raw_amount += got;
366 header.resize(original + got);
367 }
368 if (header.empty()) {
369 return new Complete();
370 }
371 switch (DetectMagic(&header[0], header.size())) {
372 case UTIL_GZIP:
373 #ifdef HAVE_ZLIB
374 return new StreamCompressed<GZip>(hold.release(), header.data(), header.size());
375 #else
376 UTIL_THROW(CompressedException, "This looks like a gzip file but gzip support was not compiled in.");
377 #endif
378 case UTIL_BZIP:
379 #ifdef HAVE_BZLIB
380 return new StreamCompressed<BZip>(hold.release(), &header[0], header.size());
381 #else
382 UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZh), but bzip support was not compiled in.");
383 #endif
384 case UTIL_XZIP:
385 #ifdef HAVE_XZLIB
386 return new StreamCompressed<XZip>(hold.release(), header.data(), header.size());
387 #else
388 UTIL_THROW(CompressedException, "This looks like an xz file, but xz support was not compiled in.");
389 #endif
390 default:
391 UTIL_THROW_IF(require_compressed, CompressedException, "Uncompressed data detected after a compresssed file. This could be supported but usually indicates an error.");
392 return new UncompressedWithHeader(hold.release(), header.data(), header.size());
393 }
394 }
395
396 } // namespace
397
DetectCompressedMagic(const void * from_void)398 bool ReadCompressed::DetectCompressedMagic(const void *from_void) {
399 return DetectMagic(from_void, kMagicSize) != UTIL_UNKNOWN;
400 }
401
ReadCompressed(int fd)402 ReadCompressed::ReadCompressed(int fd) {
403 Reset(fd);
404 }
405
ReadCompressed(std::istream & in)406 ReadCompressed::ReadCompressed(std::istream &in) {
407 Reset(in);
408 }
409
ReadCompressed()410 ReadCompressed::ReadCompressed() {}
411
Reset(int fd)412 void ReadCompressed::Reset(int fd) {
413 raw_amount_ = 0;
414 internal_.reset();
415 internal_.reset(ReadFactory(fd, raw_amount_, NULL, 0, false));
416 }
417
Reset(std::istream & in)418 void ReadCompressed::Reset(std::istream &in) {
419 internal_.reset();
420 internal_.reset(new IStreamReader(in));
421 }
422
Read(void * to,std::size_t amount)423 std::size_t ReadCompressed::Read(void *to, std::size_t amount) {
424 return internal_->Read(to, amount, *this);
425 }
426
ReadOrEOF(void * const to_in,std::size_t amount)427 std::size_t ReadCompressed::ReadOrEOF(void *const to_in, std::size_t amount) {
428 uint8_t *to = reinterpret_cast<uint8_t*>(to_in);
429 while (amount) {
430 std::size_t got = Read(to, amount);
431 if (!got) break;
432 to += got;
433 amount -= got;
434 }
435 return to - reinterpret_cast<uint8_t*>(to_in);
436 }
437
438 } // namespace util
439