1 /*
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cstdint>
20 #include <limits>
21 #include <memory>
22 #include <string>
23 #include <vector>
24 
25 #include <folly/Optional.h>
26 #include <folly/Range.h>
27 #include <folly/io/IOBuf.h>
28 
29 /**
30  * Compression / decompression over IOBufs
31  */
32 
33 namespace folly {
34 namespace io {
35 
36 enum class CodecType {
37   /**
38    * This codec type is not defined; getCodec() will throw an exception
39    * if used. Useful if deriving your own classes from Codec without
40    * going through the getCodec() interface.
41    */
42   USER_DEFINED = 0,
43 
44   /**
45    * Use no compression.
46    * Levels supported: 0
47    */
48   NO_COMPRESSION = 1,
49 
50   /**
51    * Use LZ4 compression.
52    * Levels supported: 1 = fast, 2 = best; default = 1
53    */
54   LZ4 = 2,
55 
56   /**
57    * Use Snappy compression.
58    * Levels supported: 1
59    */
60   SNAPPY = 3,
61 
62   /**
63    * Use zlib compression.
64    * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
65    * Streaming compression is supported.
66    */
67   ZLIB = 4,
68 
69   /**
70    * Use LZ4 compression, prefixed with size (as Varint).
71    */
72   LZ4_VARINT_SIZE = 5,
73 
74   /**
75    * Use LZMA2 compression.
76    * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
77    * Streaming compression is supported.
78    */
79   LZMA2 = 6,
80   LZMA2_VARINT_SIZE = 7,
81 
82   /**
83    * Use ZSTD compression.
84    * Levels supported: 1 = fast, ..., 19 = best; default = 3
85    * Use ZSTD_FAST for the fastest zstd compression (negative levels).
86    * Streaming compression is supported.
87    */
88   ZSTD = 8,
89 
90   /**
91    * Use gzip compression.  This is the same compression algorithm as ZLIB but
92    * gzip-compressed files tend to be easier to work with from the command line.
93    * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
94    * Streaming compression is supported.
95    */
96   GZIP = 9,
97 
98   /**
99    * Use LZ4 frame compression.
100    * Levels supported: 0 = fast, 16 = best; default = 0
101    */
102   LZ4_FRAME = 10,
103 
104   /**
105    * Use bzip2 compression.
106    * Levels supported: 1 = fast, 9 = best; default = 9
107    * Streaming compression is supported BUT FlushOp::FLUSH does NOT ensure that
108    * the decompressor can read all the data up to that point, due to a bug in
109    * the bzip2 library.
110    */
111   BZIP2 = 11,
112 
113   /**
114    * Use ZSTD compression with a negative compression level (1=-1, 2=-2, ...).
115    * Higher compression levels mean faster.
116    * Level 1 is around the same speed as Snappy with better compression.
117    * Level 5 is around the same speed as LZ4 with slightly worse compression.
118    * Each level gains about 6-15% speed and loses 3-7% compression.
119    * Decompression speed improves for each level, and level 1 decompression
120    * speed is around 25% faster than ZSTD.
121    * This codec is fully compatible with ZSTD.
122    * Levels supported: 1 = best, ..., 5 = fast; default = 1
123    * Streaming compression is supported.
124    */
125   ZSTD_FAST = 12,
126 
127   NUM_CODEC_TYPES = 13,
128 };
129 
130 class Codec {
131  public:
~Codec()132   virtual ~Codec() {}
133 
134   static constexpr uint64_t UNLIMITED_UNCOMPRESSED_LENGTH = uint64_t(-1);
135   /**
136    * Return the maximum length of data that may be compressed with this codec.
137    * NO_COMPRESSION and ZLIB support arbitrary lengths;
138    * LZ4 supports up to 1.9GiB; SNAPPY supports up to 4GiB.
139    * May return UNLIMITED_UNCOMPRESSED_LENGTH if unlimited.
140    */
141   uint64_t maxUncompressedLength() const;
142 
143   /**
144    * Return the codec's type.
145    */
type()146   CodecType type() const { return type_; }
147 
148   /**
149    * Does this codec need the exact uncompressed length on decompression?
150    */
151   bool needsUncompressedLength() const;
152 
153   /**
154    * Compress data, returning an IOBuf (which may share storage with data).
155    * Throws std::invalid_argument if data is larger than
156    * maxUncompressedLength().
157    */
158   std::unique_ptr<IOBuf> compress(const folly::IOBuf* data);
159 
160   /**
161    * Compresses data. May involve additional copies compared to the overload
162    * that takes and returns IOBufs. Has the same error semantics as the IOBuf
163    * version.
164    */
165   std::string compress(StringPiece data);
166 
167   /**
168    * Uncompress data. Throws std::runtime_error on decompression error.
169    *
170    * Some codecs (LZ4) require the exact uncompressed length; this is indicated
171    * by needsUncompressedLength().
172    *
173    * For other codes (zlib), knowing the exact uncompressed length ahead of
174    * time might be faster.
175    *
176    * Regardless of the behavior of the underlying compressor, uncompressing
177    * an empty IOBuf chain will return an empty IOBuf chain.
178    */
179   std::unique_ptr<IOBuf> uncompress(
180       const IOBuf* data,
181       folly::Optional<uint64_t> uncompressedLength = folly::none);
182 
183   /**
184    * Uncompresses data. May involve additional copies compared to the overload
185    * that takes and returns IOBufs. Has the same error semantics as the IOBuf
186    * version.
187    */
188   std::string uncompress(
189       StringPiece data,
190       folly::Optional<uint64_t> uncompressedLength = folly::none);
191 
192   /**
193    * Returns a bound on the maximum compressed length when compressing data with
194    * the given uncompressed length.
195    */
196   uint64_t maxCompressedLength(uint64_t uncompressedLength) const;
197 
198   /**
199    * Extracts the uncompressed length from the compressed data if possible.
200    * If the codec doesn't store the uncompressed length, or the data is
201    * corrupted it returns the given uncompressedLength.
202    * If the uncompressed length is stored in the compressed data and
203    * uncompressedLength is not none and they do not match a std::runtime_error
204    * is thrown.
205    */
206   folly::Optional<uint64_t> getUncompressedLength(
207       const folly::IOBuf* data,
208       folly::Optional<uint64_t> uncompressedLength = folly::none) const;
209 
210   /**
211    * Helper wrapper around getUncompressedLength(IOBuf)
212    */
213   folly::Optional<uint64_t> getUncompressedLength(
214       folly::StringPiece data,
215       folly::Optional<uint64_t> uncompressedLength = folly::none) const;
216 
217  protected:
218   Codec(
219       CodecType type,
220       folly::Optional<int> level = folly::none,
221       folly::StringPiece name = {});
222 
223  public:
224   /**
225    * Returns a superset of the set of prefixes for which canUncompress() will
226    * return true. A superset is allowed for optimizations in canUncompress()
227    * based on other knowledge such as length. None of the prefixes may be empty.
228    * default: No prefixes.
229    */
230   virtual std::vector<std::string> validPrefixes() const;
231 
232   /**
233    * Returns true if the codec thinks it can uncompress the data.
234    * If a codec doesn't have magic bytes at the beginning, like LZ4 and Snappy,
235    * it can always return false.
236    * default: Returns false.
237    */
238   virtual bool canUncompress(
239       const folly::IOBuf* data,
240       folly::Optional<uint64_t> uncompressedLength = folly::none) const;
241 
242   /**
243    * Helper wrapper around canUncompress(IOBuf)
244    */
245   bool canUncompress(
246       folly::StringPiece data,
247       folly::Optional<uint64_t> uncompressedLength = folly::none) const;
248 
249  private:
250   // default: no limits (save for special value UNKNOWN_UNCOMPRESSED_LENGTH)
251   virtual uint64_t doMaxUncompressedLength() const;
252   // default: doesn't need uncompressed length
253   virtual bool doNeedsUncompressedLength() const;
254   virtual std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) = 0;
255   virtual std::unique_ptr<IOBuf> doUncompress(
256       const folly::IOBuf* data,
257       folly::Optional<uint64_t> uncompressedLength) = 0;
258   // default: an implementation is provided by default to wrap the strings into
259   // IOBufs and delegate to the IOBuf methods. This incurs a copy of the output
260   // from IOBuf to string. Implementers, at their discretion, can override
261   // these methods to avoid the copy.
262   virtual std::string doCompressString(StringPiece data);
263   virtual std::string doUncompressString(
264       StringPiece data, folly::Optional<uint64_t> uncompressedLength);
265 
266   virtual uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const = 0;
267   // default: returns the passed uncompressedLength.
268   virtual folly::Optional<uint64_t> doGetUncompressedLength(
269       const folly::IOBuf* data,
270       folly::Optional<uint64_t> uncompressedLength) const;
271 
272   CodecType type_;
273 };
274 
275 class StreamCodec : public Codec {
276  public:
~StreamCodec()277   ~StreamCodec() override {}
278 
279   /**
280    * Does the codec need the data length before compression streaming?
281    */
282   bool needsDataLength() const;
283 
284   /*****************************************************************************
285    * Streaming API
286    *****************************************************************************
287    * A low-level stateful streaming API.
288    * Streaming operations can be started in two ways:
289    *   1. From a clean Codec on which no non-const methods have been called.
290    *   2. A call to resetStream(), which will reset any codec to a clean state.
291    * After a streaming operation has begun, either compressStream() or
292    * uncompressStream() must be called until the streaming operation ends.
293    * compressStream() ends when it returns true with flushOp END.
294    * uncompressStream() ends when it returns true. At this point the codec
295    * may be reused by calling resetStream().
296    *
297    * compress() and uncompress() can be called at any time, but they interrupt
298    * any ongoing streaming operations (state is lost and resetStream() must be
299    * called before another streaming operation).
300    */
301 
302   /**
303    * Reset the state of the codec, and set the uncompressed length for the next
304    * streaming operation. If uncompressedLength is not none it must be exactly
305    * the uncompressed length. compressStream() must be passed exactly
306    * uncompressedLength input bytes before the stream is ended.
307    * uncompressStream() must be passed a compressed frame that uncompresses to
308    * uncompressedLength.
309    */
310   void resetStream(folly::Optional<uint64_t> uncompressedLength = folly::none);
311 
312   enum class FlushOp { NONE, FLUSH, END };
313 
314   /**
315    * Compresses some data from the input buffer and writes the compressed data
316    * into the output buffer. It may read input without producing any output,
317    * except when forced to flush.
318    *
319    * The input buffer is advanced to point to the range of data that hasn't yet
320    * been read. Compression will resume at this point for the next call to
321    * compressStream(). The output buffer is advanced one byte past the last byte
322    * written.
323    *
324    * The default flushOp is NONE, which allows compressStream() complete
325    * discretion in how much data to gather before writing any output.
326    *
327    * If flushOp is END, all pending and input data is flushed to the output
328    * buffer, and the frame is ended. compressStream() must be called with the
329    * same input and flushOp END until it returns true. At this point the caller
330    * must call resetStream() to use the codec again.
331    *
332    * If flushOp is FLUSH, all pending and input data is flushed to the output
333    * buffer, but the frame is not ended. compressStream() must be called with
334    * the same input and flushOp END until it returns true. At this point the
335    * caller can continue to compressStream() with any input data and flushOp.
336    * The uncompressor, if passed all the produced output data, will be able to
337    * uncompress all the input data passed to compressStream() so far. Excessive
338    * use of flushOp FLUSH will deteriorate compression ratio. This is useful for
339    * stateful streaming across a network. Most users don't need to use this
340    * flushOp.
341    *
342    * A std::logic_error is thrown on incorrect usage of the API.
343    * A std::runtime_error is thrown upon error conditions or if no forward
344    * progress could be made twice in a row.
345    */
346   bool compressStream(
347       folly::ByteRange& input,
348       folly::MutableByteRange& output,
349       FlushOp flushOp = StreamCodec::FlushOp::NONE);
350 
351   /**
352    * Uncompresses some data from the input buffer and writes the uncompressed
353    * data into the output buffer. It may read input without producing any
354    * output.
355    *
356    * The input buffer is advanced to point to the range of data that hasn't yet
357    * been read. Uncompression will resume at this point for the next call to
358    * uncompressStream(). The output buffer is advanced one byte past the last
359    * byte written.
360    *
361    * The default flushOp is NONE, which allows uncompressStream() complete
362    * discretion in how much output data to flush. The uncompressor may not make
363    * maximum forward progress, but will make some forward progress when
364    * possible.
365    *
366    * If flushOp is END, the caller guarantees that no more input will be
367    * presented to uncompressStream(). uncompressStream() must be called with the
368    * same input and flushOp END until it returns true. This is not mandatory,
369    * but if the input is all available in one buffer, and there is enough output
370    * space to write the entire frame, codecs can uncompress faster.
371    *
372    * If flushOp is FLUSH, uncompressStream() is guaranteed to make the maximum
373    * amount of forward progress possible. When using this flushOp and
374    * uncompressStream() returns with `!output.empty()` the caller knows that all
375    * pending output has been flushed. This is useful for stateful streaming
376    * across a network, and it should be used in conjunction with
377    * compressStream() with flushOp FLUSH. Most users don't need to use this
378    * flushOp.
379    *
380    * A std::runtime_error is thrown upon error conditions or if no forward
381    * progress could be made upon two consecutive calls to the function (only the
382    * second call will throw an exception).
383    *
384    * Returns true at the end of a frame. At this point resetStream() must be
385    * called to reuse the codec.
386    */
387   bool uncompressStream(
388       folly::ByteRange& input,
389       folly::MutableByteRange& output,
390       FlushOp flushOp = StreamCodec::FlushOp::NONE);
391 
392  protected:
393   StreamCodec(
394       CodecType type,
395       folly::Optional<int> level = folly::none,
396       folly::StringPiece name = {})
Codec(type,std::move (level),name)397       : Codec(type, std::move(level), name) {}
398 
399   // Returns the uncompressed length last passed to resetStream() or none if it
400   // hasn't been called yet.
uncompressedLength()401   folly::Optional<uint64_t> uncompressedLength() const {
402     return uncompressedLength_;
403   }
404 
405  private:
406   // default: Implemented using the streaming API.
407   std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) override;
408   std::unique_ptr<IOBuf> doUncompress(
409       const folly::IOBuf* data,
410       folly::Optional<uint64_t> uncompressedLength) override;
411 
412   // default: Returns false
413   virtual bool doNeedsDataLength() const;
414   virtual void doResetStream() = 0;
415   virtual bool doCompressStream(
416       folly::ByteRange& input,
417       folly::MutableByteRange& output,
418       FlushOp flushOp) = 0;
419   virtual bool doUncompressStream(
420       folly::ByteRange& input,
421       folly::MutableByteRange& output,
422       FlushOp flushOp) = 0;
423 
424   enum class State {
425     RESET,
426     COMPRESS,
427     COMPRESS_FLUSH,
428     COMPRESS_END,
429     UNCOMPRESS,
430     END,
431   };
432   void assertStateIs(State expected) const;
433 
434   State state_{State::RESET};
435   ByteRange previousInput_{};
436   folly::Optional<uint64_t> uncompressedLength_{};
437   bool progressMade_{true};
438 };
439 
440 constexpr int COMPRESSION_LEVEL_FASTEST = -1;
441 constexpr int COMPRESSION_LEVEL_DEFAULT = -2;
442 constexpr int COMPRESSION_LEVEL_BEST = -3;
443 
444 /**
445  * Return a codec for the given type. Throws on error.  The level
446  * is a non-negative codec-dependent integer indicating the level of
447  * compression desired, or one of the following constants:
448  *
449  * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory,
450  *   worst compression)
451  * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between
452  *   FASTEST and BEST)
453  * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory,
454  *   best compression)
455  *
456  * When decompressing, the compression level is ignored. All codecs will
457  * decompress all data compressed with the a codec of the same type, regardless
458  * of compression level.
459  */
460 std::unique_ptr<Codec> getCodec(
461     CodecType type, int level = COMPRESSION_LEVEL_DEFAULT);
462 
463 /**
464  * Return a codec for the given type. Throws on error.  The level
465  * is a non-negative codec-dependent integer indicating the level of
466  * compression desired, or one of the following constants:
467  *
468  * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory,
469  *   worst compression)
470  * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between
471  *   FASTEST and BEST)
472  * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory,
473  *   best compression)
474  *
475  * When decompressing, the compression level is ignored. All codecs will
476  * decompress all data compressed with the a codec of the same type, regardless
477  * of compression level.
478  */
479 std::unique_ptr<StreamCodec> getStreamCodec(
480     CodecType type, int level = COMPRESSION_LEVEL_DEFAULT);
481 
482 /**
483  * Returns a codec that can uncompress any of the given codec types as well as
484  * {LZ4_FRAME, ZSTD, ZLIB, GZIP, LZMA2, BZIP2}. Appends each default codec to
485  * customCodecs in order, so long as a codec with the same type() isn't already
486  * present in customCodecs or as the terminalCodec. When uncompress() is called,
487  * each codec's canUncompress() is called in the order that they are given.
488  * Appended default codecs are checked last.  uncompress() is called on the
489  * first codec whose canUncompress() returns true.
490  *
491  * In addition, an optional `terminalCodec` can be provided. This codec's
492  * uncompress() will be called either when no other codec canUncompress() the
493  * data or the chosen codec throws an exception on the data. The terminalCodec
494  * is intended for ambiguous headers, when canUncompress() is false for some
495  * data it can actually uncompress. The terminalCodec does not need to override
496  * validPrefixes() or canUncompress() and overriding these functions will have
497  * no effect on the returned codec's validPrefixes() or canUncompress()
498  * functions. The terminalCodec's needsUncompressedLength() and
499  * maxUncompressedLength() will affect the returned codec's respective
500  * functions. The terminalCodec must not be duplicated in customCodecs.
501  *
502  * An exception is thrown if no codec canUncompress() the data and either no
503  * terminal codec was provided or a terminal codec was provided and it throws on
504  * the data.
505  * An exception is thrown if the chosen codec's uncompress() throws on the data
506  * and either no terminal codec was provided or a terminal codec was provided
507  * and it also throws on the data.
508  * An exception is thrown if compress() is called on the returned codec.
509  *
510  * Requirements are checked in debug mode and are as follows:
511  * Let headers be the concatenation of every codec's validPrefixes().
512  *  1. Each codec must override validPrefixes() and canUncompress().
513  *  2. No codec's validPrefixes() may be empty.
514  *  3. No header in headers may be empty.
515  *  4. headers must not contain any duplicate elements.
516  *  5. No strict non-empty prefix of any header in headers may be in headers.
517  *  6. The terminalCodec's type must not be the same as any other codec's type
518  *     (with USER_DEFINED being the exception).
519  */
520 std::unique_ptr<Codec> getAutoUncompressionCodec(
521     std::vector<std::unique_ptr<Codec>> customCodecs = {},
522     std::unique_ptr<Codec> terminalCodec = {});
523 
524 /**
525  * Check if a specified codec is supported.
526  */
527 bool hasCodec(CodecType type);
528 
529 /**
530  * Check if a specified codec is supported and supports streaming.
531  */
532 bool hasStreamCodec(CodecType type);
533 
534 /**
535  * Added here so users of folly can figure out whether the header
536  * folly/compression/CompressionContextPoolSingletons.h is present, and
537  * therefore whether it can be included.
538  */
539 #define FOLLY_COMPRESSION_HAS_CONTEXT_POOL_SINGLETONS
540 } // namespace io
541 } // namespace folly
542