1 // -*- coding: utf-8 -*-
2 //
3 // zlibstream.hxx --- IOStreams classes for working with RFC 1950 and RFC 1952
4 //                    compression formats (respectively known as the zlib and
5 //                    gzip formats)
6 //
7 // Copyright (C) 2017  Florent Rougon
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Library General Public
11 // License as published by the Free Software Foundation; either
12 // version 2 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17 // Library General Public License for more details.
18 //
19 // You should have received a copy of the GNU Library General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
22 // MA  02110-1301  USA.
23 
24 #ifndef SG_ZLIBSTREAM_HXX
25 #define SG_ZLIBSTREAM_HXX
26 
27 #include <ios>                  // std::streamsize
28 #include <istream>
29 #include <streambuf>
30 #include <memory>               // std::unique_ptr
31 #include <zlib.h>               // struct z_stream
32 
33 #include <simgear/misc/sg_path.hxx>
34 
35 // This file contains:
36 //
37 //  - two stream buffer classes (ZlibCompressorIStreambuf and
38 //    ZlibDecompressorIStreambuf), both based on the same abstract class:
39 //    ZlibAbstractIStreambuf;
40 //
41 //  - two std::istream subclasses (ZlibCompressorIStream and
42 //    ZlibDecompressorIStream), each creating and using the corresponding
43 //    stream buffer class from the previous item.
44 //
45 // All these allow one to work with RFC 1950 and RFC 1952 compression
46 // formats, respectively known as the zlib and gzip formats.
47 //
// These classes are *input* streaming classes, which means they can
// efficiently handle arbitrary amounts of data without using any disk
// space or ever-growing amounts of memory, and allow “client code” to pull
// exactly as much data as it wants at any given time, resuming later
// when it is ready to handle the next chunk.
53 //
54 // So, for example, assuming you've created an instance of
55 // ZlibCompressorIStream (bound to some input stream of your choice, let's
56 // call it iStream), you could read 512 bytes of data from it, and you
57 // would get the first 512 bytes of *compressed* data corresponding to what
58 // iStream provided. Then you could resume at any time and ask for the next
59 // 512 bytes of compressed data (or any other amount), etc.
60 //
61 // Therefore, these classes are well suited, among others, to compress or
62 // decompress data streams while at the same time packing the result into
63 // discrete chunks or packets with size constraints (you can think of the
64 // process as making sausages :).
65 //
66 // The input being in each case an std::istream (for compressing as well as
67 // for decompressing), it can be tied to an arbitrary source: a file with
68 // sg_ifstream or std::ifstream, a memory buffer with std::istringstream or
69 // std::stringstream, a TCP socket with a custom std::streambuf subclass[1]
70 // to interface with the sockets API, etc.
71 //
72 //   [1] Possibly wrapped in an std::istream.
73 //
74 // The stream buffer classes upon which ZlibCompressorIStream and
75 // ZlibDecompressorIStream are built have an xsgetn() implementation that
76 // avoids useless copies of data by asking zlib to write directly to the
77 // destination buffer. This xsgetn() method is used when calling read() on
78 // the std::istream subclasses, or sgetn() if you are using the stream
79 // buffer classes directly (i.e., ZlibCompressorIStreambuf and
80 // ZlibDecompressorIStreambuf). Other std::istream methods may instead rely
81 // only on the internal buffer and the underflow() method, and therefore be
82 // less efficient for large amounts of data. You may want to take a look at
83 // zlibstream_test.cxx to see various ways of using these classes.
84 //
// In case you use std::istream& operator>>(std::istream&, std::string&) or
// its sibling overloads, beware that it splits fields at spaces, and by
// default ignores spaces at the beginning of a field (cf. std::skipws,
// std::noskipws and friends). As far as I understand it, most of these
// operators are mainly intended in the IOStreams library to be used to
// read an int here, a double there, a space-delimited string afterwards,
// etc. (the exception could be the overload writing to a stream buffer;
// however, it doesn't seem to be very efficient on my system with GNU
// libstdc++ [it is not using xsgetn()], so beware of this one too if you
// are handling large amounts of data). For moderately complex or large
// input handling, I'd suggest using std::istream methods such as read(),
// gcount() and getline() (std::getline() can be useful too), or using the
// stream buffer classes directly, in particular with sgetn().
98 
99 
100 namespace simgear
101 {
102 
// Container format of the compressed data. ZLIB designates the RFC 1950
// ("zlib") format and GZIP the RFC 1952 ("gzip") format. AUTODETECT is only
// listed as an accepted value by the decompressor classes.
enum class ZLibCompressionFormat {
  ZLIB       = 0,
  GZIP       = 1,
  AUTODETECT = 2
};
108 
// Memory/speed trade-off selector, accepted as the memStrategy argument of
// the compressor classes.
enum class ZLibMemoryStrategy {
  FAVOR_MEMORY_OVER_SPEED = 0,
  FAVOR_SPEED_OVER_MEMORY = 1
};
113 
114 // Abstract base class for both the compressor and decompressor stream buffers.
115 class ZlibAbstractIStreambuf: public std::streambuf
116 {
117 public:
118   /**
119    *  @brief Constructor for ZlibAbstractIStreambuf.
120    *  @param iStream     Input stream to read from.
121    *  @param path        Optional path to the file corresponding to iStream,
122    *                     if any. Only used for error messages.
123    *  @param inBuf       Pointer to the input buffer (data read from iStream is
124    *                     written there before being compressed or decompressed).
125    *                     If nullptr, the buffer is allocated on the heap in the
126    *                     constructor and deallocated in the destructor.
127    *  @param inBufSize   Size of the input buffer, in chars.
128    *  @param outBuf      Pointer to the output buffer. Data is read by zlib
129    *                     from the input buffer, compressed or decompressed, and
130    *                     the result is directly written to the output buffer.
131    *                     If nullptr, the buffer is allocated on the heap in the
132    *                     constructor and deallocated in the destructor.
133    *  @param outBufSize  Size of the output buffer, in chars.
134    *  @param putbackSize Size of the putback area inside the output buffer, in
135    *                     chars.
136    *
137    *  It is required that putbackSize < outBufSize. It is guaranteed that,
138    *  if at least putbackSize chars have been read without any putback (or
139    *  unget) operation intermixed, then at least putbackSize chars can be
140    *  put back in sequence. If you don't need this feature, use zero for the
141    *  putbackSize value (the default) for best performance.
142   */
143   explicit ZlibAbstractIStreambuf(std::istream& iStream,
144                                   const SGPath& path = SGPath(),
145                                   char* inBuf = nullptr,
146                                   std::size_t inBufSize = 262144,
147                                   char* outBuf = nullptr,
148                                   std::size_t outBufSize = 262144,
149                                   std::size_t putbackSize = 0);
150 
151   // Alternate constructor with sink semantics for the “source” std::istream.
152   // When used, the class takes ownership of the std::istream instance pointed
153   // to by the first constructor argument, and keeps it alive as long as the
154   // object this constructor is for is itself alive.
155   explicit ZlibAbstractIStreambuf(std::unique_ptr<std::istream> iStream_p,
156                                   const SGPath& path = SGPath(),
157                                   char* inBuf = nullptr,
158                                   std::size_t inBufSize = 262144,
159                                   char* outBuf = nullptr,
160                                   std::size_t outBufSize = 262144,
161                                   std::size_t putbackSize = 0);
162 
163   ZlibAbstractIStreambuf(const ZlibAbstractIStreambuf&) = delete;
164   ZlibAbstractIStreambuf& operator=(const ZlibAbstractIStreambuf&) = delete;
165   virtual ~ZlibAbstractIStreambuf();
166 
167 protected:
168   enum class OperationType {
169     COMPRESSION = 0,
170     DECOMPRESSION
171   };
172 
173   virtual OperationType operationType() const = 0;
174 
175   // Either compress or decompress a chunk of data (depending on the
176   // particular subclass implementation). The semantics are the same as for
177   // zlib's inflate() and deflate() functions applied to member _zstream,
178   // concerning 1) the return value and 2) where, and how much to read and
179   // write (which thus depends on _zstream.avail_in, _zstream.next_in,
180   // _zstream.avail_out and _zstream.next_out).
181   virtual int zlibProcessData() = 0;
182 
183   // The input stream, from which data is read before being processed by zlib
184   std::istream& _iStream;
185   // Pointer to the same, used when calling the constructor that takes an
186   // std::unique_ptr<std::istream> as its first argument; empty
187   // std::unique_ptr object otherwise.
188   std::unique_ptr<std::istream> _iStream_p;
189 
190   // Corresponding path, if any (default-constructed SGPath instance otherwise)
191   const SGPath _path;
192   // Structure used to communicate with zlib
193   z_stream _zstream;
194 
195 private:
196   // Callback whose role is to refill the output buffer when it's empty and
197   // the “client” tries to read more.
198   virtual int underflow() override;
199   // Optional override when subclassing std::streambuf. This is the most
200   // efficient way of reading several characters (as soon as we've emptied the
201   // output buffer, data is written by zlib directly to the destination
202   // buffer).
203   virtual std::streamsize xsgetn(char* dest, std::streamsize n) override;
204   // Utility method for xsgetn()
205   std::size_t xsgetn_preparePutbackArea(char* origGptr, char* dest,
206                                         char* writePtr);
207   // Make sure there is data to read in the input buffer, or signal EOF.
208   bool getInputData();
209   // Utility method for fillOutputBuffer()
210   std::size_t fOB_remainingSpace(unsigned char* nextOutPtr) const;
211   // Fill the output buffer (using zlib functions) as much as possible.
212   char* fillOutputBuffer();
213   // Utility method
214   [[ noreturn ]] void handleZ_BUF_ERROR() const;
215 
216   bool _allFinished = false;
217 
218   // The buffers
219   // ~~~~~~~~~~~
220   //
221   // The input buffer receives data obtained from _iStream, before it is
222   // processed by zlib. In underflow(), zlib reads from this buffer it and
223   // writes the resulting data(*) to the output buffer. Then we point the
224   // standard std::streambuf pointers (gptr() and friends) directly towards
225   // the data inside that output buffer. xsgetn() is even more optimized: it
226   // first empties the output buffer, then makes zlib write the remaining data
227   // directly to the destination area.
228   //
229   //   (*) Compressed or decompressed, depending on the particular
230   //       implementation of zlibProcessData() in each subclass.
231   char* _inBuf;
232   const std::size_t _inBufSize;
233   // _inBufEndPtr points right after the last data retrieved from _iStream and
234   // stored into _inBuf. When zlib has read all such data, _zstream.next_in is
235   // equal to _inBufEndPtr (after proper casting). Except in this particular
236   // situation, only _zstream.next_in <= _inBufEndPtr is guaranteed.
237   char* _inBufEndPtr;
238   // Layout of the _outBuf buffer:
239   //
240   // |_outBuf  <putback area>  |_outBuf + _putbackSize    |_outBuf + _outBufSize
241   //
242   // The first _putbackSize chars in _outBuf are reserved for the putback area
243   // (right-aligned at _outBuf + _putbackSize). The actual output buffer thus
244   // starts at _outBuf + _putbackSize. At any given time for callers of this
245   // class, the number of characters that can be put back is gptr() - eback().
246   // It may be lower than _putbackSize if we haven't read that many characters
247   // yet. It may also be larger if gptr() > _outBuf + _putbackSize, i.e.,
248   // when the buffer for pending data is non-empty.
249   //
250   // At any given time, callers should see:
251   //
252   //   _outBuf <= eback() <= _outBuf + _putbackSize <= gptr() <= egptr()
253   //                                                <= _outBuf + _outBufSize
254   //
255   // (hoping this won't get out of sync with the code!)
256   char *_outBuf;
257   const std::size_t _outBufSize;
258   // Space reserved for characters to be put back into the stream. Must be
259   // strictly smaller than _outBufSize (this is checked in the constructor).
260   // It is guaranteed that this number of chars can be put back, except of
261   // course if we haven't read that many characters from the input stream yet.
262   // If characters are buffered in _outBuf[2], then it may be that more
263   // characters than _putbackSize can be put back (it is essentially a matter
264   // for std::streambuf of decreasing the “next pointer for the input
265   // sequence”, i.e., the one returned by gptr()).
266   //
267   //   [2] In the [_outBuf + _putbackSize, _outBuf + _outBufSize) area.
268   const std::size_t _putbackSize;
269 
270   // Since the constructor optionally allocates memory for the input and
271   // output buffers, these members allow the destructor to know which buffers
272   // have to be deallocated, if any.
273   bool _inBufMustBeFreed = false;
274   bool _outBufMustBeFreed = false;
275 };
276 
277 
278 // Stream buffer class for compressing data. Input data is obtained from an
279 // std::istream instance; the corresponding compressed data can be read using
280 // the standard std::streambuf read interface (mainly: sbumpc(), sgetc(),
281 // snextc(), sgetn(), sputbackc(), sungetc()). Input, uncompressed data is
282 // “pulled” as needed for the amount of compressed data requested by the
283 // “client” using the methods I just listed.
284 class ZlibCompressorIStreambuf: public ZlibAbstractIStreambuf
285 {
286 public:
287   // Same parameters as for ZlibAbstractIStreambuf, except:
288   //
289   //   compressionLevel: in the [0,9] range. 0 means no compression at all.
290   //                     Levels 1 to 9 yield compressed data, with 1 giving
291   //                     the highest compression speed but worst compression
292   //                     ratio, and 9 the highest compression ratio but lowest
293   //                     compression speed.
294   //   format            either ZLibCompressionFormat::ZLIB or
295   //                     ZLibCompressionFormat::GZIP
296   //   memStrategy       either ZLibMemoryStrategy::FAVOR_MEMORY_OVER_SPEED or
297   //                     ZLibMemoryStrategy::FAVOR_SPEED_OVER_MEMORY
298   explicit ZlibCompressorIStreambuf(
299     std::istream& iStream,
300     const SGPath& path = SGPath(),
301     int compressionLevel = Z_DEFAULT_COMPRESSION,
302     ZLibCompressionFormat format = ZLibCompressionFormat::ZLIB,
303     ZLibMemoryStrategy memStrategy = ZLibMemoryStrategy::FAVOR_SPEED_OVER_MEMORY,
304     char* inBuf = nullptr,
305     std::size_t inBufSize = 262144,
306     char* outBuf = nullptr,
307     std::size_t outBufSize = 262144,
308     std::size_t putbackSize = 0);
309 
310   // Alternate constructor with sink semantics for the “source” std::istream.
311   explicit ZlibCompressorIStreambuf(
312     std::unique_ptr<std::istream> _iStream_p,
313     const SGPath& path = SGPath(),
314     int compressionLevel = Z_DEFAULT_COMPRESSION,
315     ZLibCompressionFormat format = ZLibCompressionFormat::ZLIB,
316     ZLibMemoryStrategy memStrategy = ZLibMemoryStrategy::FAVOR_SPEED_OVER_MEMORY,
317     char* inBuf = nullptr,
318     std::size_t inBufSize = 262144,
319     char* outBuf = nullptr,
320     std::size_t outBufSize = 262144,
321     std::size_t putbackSize = 0);
322 
323   ZlibCompressorIStreambuf(const ZlibCompressorIStreambuf&) = delete;
324   ZlibCompressorIStreambuf& operator=(const ZlibCompressorIStreambuf&) = delete;
325   virtual ~ZlibCompressorIStreambuf();
326 
327 protected:
328   virtual OperationType operationType() const override;
329   // Initialize the z_stream struct used by zlib
330   void zStreamInit(int compressionLevel, ZLibCompressionFormat format,
331                    ZLibMemoryStrategy memStrategy);
332   // Call zlib's deflate() function to compress data.
333   virtual int zlibProcessData() override;
334 };
335 
336 
337 // Stream buffer class for decompressing data. Input data is obtained from an
338 // std::istream instance; the corresponding decompressed data can be read
339 // using the standard std::streambuf read interface (mainly: sbumpc(),
340 // sgetc(), snextc(), sgetn(), sputbackc(), sungetc()). Input, compressed data
341 // is “pulled” as needed for the amount of uncompressed data requested by the
342 // “client” using the methods I just listed.
343 class ZlibDecompressorIStreambuf: public ZlibAbstractIStreambuf
344 {
345 public:
346   // Same parameters as for ZlibAbstractIStreambuf, except:
347   //
348   //   format            ZLibCompressionFormat::ZLIB,
349   //                     ZLibCompressionFormat::GZIP or
350   //                     ZLibCompressionFormat::AUTODETECT
351   explicit ZlibDecompressorIStreambuf(
352     std::istream& iStream,
353     const SGPath& path = SGPath(),
354     ZLibCompressionFormat format = ZLibCompressionFormat::ZLIB,
355     char* inBuf = nullptr,
356     std::size_t inBufSize = 262144,
357     char* outBuf = nullptr,
358     std::size_t outBufSize = 262144,
359     std::size_t putbackSize = 0); // default optimized for speed
360 
361   // Alternate constructor with sink semantics for the “source” std::istream.
362   explicit ZlibDecompressorIStreambuf(
363     std::unique_ptr<std::istream> _iStream_p,
364     const SGPath& path = SGPath(),
365     ZLibCompressionFormat format = ZLibCompressionFormat::ZLIB,
366     char* inBuf = nullptr,
367     std::size_t inBufSize = 262144,
368     char* outBuf = nullptr,
369     std::size_t outBufSize = 262144,
370     std::size_t putbackSize = 0); // default optimized for speed
371 
372   ZlibDecompressorIStreambuf(const ZlibDecompressorIStreambuf&) = delete;
373   ZlibDecompressorIStreambuf& operator=(const ZlibDecompressorIStreambuf&)
374                                                                       = delete;
375   virtual ~ZlibDecompressorIStreambuf();
376 
377 protected:
378   virtual OperationType operationType() const override;
379   void zStreamInit(ZLibCompressionFormat format);
380   virtual int zlibProcessData() override;
381 };
382 
383 // std::istream subclass for compressing data. Input data is obtained from an
384 // std::istream instance; the corresponding compressed data can be read using
385 // the standard std::istream interface (read(), readsome(), gcount(), get(),
386 // getline(), operator>>(), peek(), putback(), ignore(), unget()... plus
387 // operator overloads such as istream& operator>>(istream&, string&) as
388 // defined in <string>, and std::getline()). Input, uncompressed data is
389 // “pulled” as needed for the amount of compressed data requested by the
390 // “client”.
391 //
392 // To get data efficiently from an instance of this class, use its read()
393 // method (typically in conjunction with gcount(), inside a loop).
394 class ZlibCompressorIStream: public std::istream
395 {
396 public:
397   // Same parameters as for ZlibCompressorIStreambuf
398   explicit ZlibCompressorIStream(
399     std::istream& iStream,
400     const SGPath& path = SGPath(),
401     int compressionLevel = Z_DEFAULT_COMPRESSION,
402     ZLibCompressionFormat format = ZLibCompressionFormat::ZLIB,
403     ZLibMemoryStrategy memStrategy = ZLibMemoryStrategy::FAVOR_SPEED_OVER_MEMORY,
404     char* inBuf = nullptr,
405     std::size_t inBufSize = 262144,
406     char* outBuf = nullptr,
407     std::size_t outBufSize = 262144,
408     std::size_t putbackSize = 0); // default optimized for speed
409 
410   // Alternate constructor with sink semantics for the “source” std::istream.
411   explicit ZlibCompressorIStream(
412     std::unique_ptr<std::istream> _iStream_p,
413     const SGPath& path = SGPath(),
414     int compressionLevel = Z_DEFAULT_COMPRESSION,
415     ZLibCompressionFormat format = ZLibCompressionFormat::ZLIB,
416     ZLibMemoryStrategy memStrategy = ZLibMemoryStrategy::FAVOR_SPEED_OVER_MEMORY,
417     char* inBuf = nullptr,
418     std::size_t inBufSize = 262144,
419     char* outBuf = nullptr,
420     std::size_t outBufSize = 262144,
421     std::size_t putbackSize = 0); // default optimized for speed
422 
423   ZlibCompressorIStream(const ZlibCompressorIStream&) = delete;
424   ZlibCompressorIStream& operator=(const ZlibCompressorIStream&) = delete;
425   virtual ~ZlibCompressorIStream();
426 
427 private:
428   ZlibCompressorIStreambuf _streamBuf;
429 };
430 
431 // std::istream subclass for decompressing data. Input data is obtained from
432 // an std::istream instance; the corresponding decompressed data can be read
433 // using the standard std::istream interface (read(), readsome(), gcount(),
434 // get(), getline(), operator>>(), peek(), putback(), ignore(), unget()...
435 // plus operator overloads such as istream& operator>>(istream&, string&) as
436 // defined in <string>, and std::getline()). Input, compressed data is
437 // “pulled” as needed for the amount of uncompressed data requested by the
438 // “client”.
439 //
440 // To get data efficiently from an instance of this class, use its read()
441 // method (typically in conjunction with gcount(), inside a loop).
442 class ZlibDecompressorIStream: public std::istream
443 {
444 public:
445   // Same parameters as for ZlibDecompressorIStreambuf
446   explicit ZlibDecompressorIStream(
447     std::istream& iStream,
448     const SGPath& path = SGPath(),
449     ZLibCompressionFormat format = ZLibCompressionFormat::ZLIB,
450     char* inBuf = nullptr,
451     std::size_t inBufSize = 262144,
452     char* outBuf = nullptr,
453     std::size_t outBufSize = 262144,
454     std::size_t putbackSize = 0); // default optimized for speed
455 
456   // Alternate constructor with sink semantics for the “source” std::istream.
457   explicit ZlibDecompressorIStream(
458     std::unique_ptr<std::istream> _iStream_p,
459     const SGPath& path = SGPath(),
460     ZLibCompressionFormat format = ZLibCompressionFormat::ZLIB,
461     char* inBuf = nullptr,
462     std::size_t inBufSize = 262144,
463     char* outBuf = nullptr,
464     std::size_t outBufSize = 262144,
465     std::size_t putbackSize = 0); // default optimized for speed
466 
467   ZlibDecompressorIStream(const ZlibDecompressorIStream&) = delete;
468   ZlibDecompressorIStream& operator=(const ZlibDecompressorIStream&) = delete;
469   virtual ~ZlibDecompressorIStream();
470 
471 private:
472   ZlibDecompressorIStreambuf _streamBuf;
473 };
474 
475 } // of namespace simgear
476 
477 #endif  // of SG_ZLIBSTREAM_HXX
478