1 /* 2 * Copyright (c) Facebook, Inc. and its affiliates. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #pragma once 18 19 #include <cstdint> 20 #include <limits> 21 #include <memory> 22 #include <string> 23 #include <vector> 24 25 #include <folly/Optional.h> 26 #include <folly/Range.h> 27 #include <folly/io/IOBuf.h> 28 29 /** 30 * Compression / decompression over IOBufs 31 */ 32 33 namespace folly { 34 namespace io { 35 36 enum class CodecType { 37 /** 38 * This codec type is not defined; getCodec() will throw an exception 39 * if used. Useful if deriving your own classes from Codec without 40 * going through the getCodec() interface. 41 */ 42 USER_DEFINED = 0, 43 44 /** 45 * Use no compression. 46 * Levels supported: 0 47 */ 48 NO_COMPRESSION = 1, 49 50 /** 51 * Use LZ4 compression. 52 * Levels supported: 1 = fast, 2 = best; default = 1 53 */ 54 LZ4 = 2, 55 56 /** 57 * Use Snappy compression. 58 * Levels supported: 1 59 */ 60 SNAPPY = 3, 61 62 /** 63 * Use zlib compression. 64 * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6 65 * Streaming compression is supported. 66 */ 67 ZLIB = 4, 68 69 /** 70 * Use LZ4 compression, prefixed with size (as Varint). 71 */ 72 LZ4_VARINT_SIZE = 5, 73 74 /** 75 * Use LZMA2 compression. 76 * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6 77 * Streaming compression is supported. 78 */ 79 LZMA2 = 6, 80 LZMA2_VARINT_SIZE = 7, 81 82 /** 83 * Use ZSTD compression. 84 * Levels supported: 1 = fast, ..., 19 = best; default = 3 85 * Use ZSTD_FAST for the fastest zstd compression (negative levels). 86 * Streaming compression is supported. 87 */ 88 ZSTD = 8, 89 90 /** 91 * Use gzip compression. This is the same compression algorithm as ZLIB but 92 * gzip-compressed files tend to be easier to work with from the command line. 93 * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6 94 * Streaming compression is supported. 95 */ 96 GZIP = 9, 97 98 /** 99 * Use LZ4 frame compression. 100 * Levels supported: 0 = fast, 16 = best; default = 0 101 */ 102 LZ4_FRAME = 10, 103 104 /** 105 * Use bzip2 compression. 106 * Levels supported: 1 = fast, 9 = best; default = 9 107 * Streaming compression is supported BUT FlushOp::FLUSH does NOT ensure that 108 * the decompressor can read all the data up to that point, due to a bug in 109 * the bzip2 library. 110 */ 111 BZIP2 = 11, 112 113 /** 114 * Use ZSTD compression with a negative compression level (1=-1, 2=-2, ...). 115 * Higher compression levels mean faster. 116 * Level 1 is around the same speed as Snappy with better compression. 117 * Level 5 is around the same speed as LZ4 with slightly worse compression. 118 * Each level gains about 6-15% speed and loses 3-7% compression. 119 * Decompression speed improves for each level, and level 1 decompression 120 * speed is around 25% faster than ZSTD. 121 * This codec is fully compatible with ZSTD. 122 * Levels supported: 1 = best, ..., 5 = fast; default = 1 123 * Streaming compression is supported. 124 */ 125 ZSTD_FAST = 12, 126 127 NUM_CODEC_TYPES = 13, 128 }; 129 130 class Codec { 131 public: ~Codec()132 virtual ~Codec() {} 133 134 static constexpr uint64_t UNLIMITED_UNCOMPRESSED_LENGTH = uint64_t(-1); 135 /** 136 * Return the maximum length of data that may be compressed with this codec. 137 * NO_COMPRESSION and ZLIB support arbitrary lengths; 138 * LZ4 supports up to 1.9GiB; SNAPPY supports up to 4GiB. 139 * May return UNLIMITED_UNCOMPRESSED_LENGTH if unlimited. 140 */ 141 uint64_t maxUncompressedLength() const; 142 143 /** 144 * Return the codec's type. 145 */ type()146 CodecType type() const { return type_; } 147 148 /** 149 * Does this codec need the exact uncompressed length on decompression? 150 */ 151 bool needsUncompressedLength() const; 152 153 /** 154 * Compress data, returning an IOBuf (which may share storage with data). 155 * Throws std::invalid_argument if data is larger than 156 * maxUncompressedLength(). 157 */ 158 std::unique_ptr<IOBuf> compress(const folly::IOBuf* data); 159 160 /** 161 * Compresses data. May involve additional copies compared to the overload 162 * that takes and returns IOBufs. Has the same error semantics as the IOBuf 163 * version. 164 */ 165 std::string compress(StringPiece data); 166 167 /** 168 * Uncompress data. Throws std::runtime_error on decompression error. 169 * 170 * Some codecs (LZ4) require the exact uncompressed length; this is indicated 171 * by needsUncompressedLength(). 172 * 173 * For other codes (zlib), knowing the exact uncompressed length ahead of 174 * time might be faster. 175 * 176 * Regardless of the behavior of the underlying compressor, uncompressing 177 * an empty IOBuf chain will return an empty IOBuf chain. 178 */ 179 std::unique_ptr<IOBuf> uncompress( 180 const IOBuf* data, 181 folly::Optional<uint64_t> uncompressedLength = folly::none); 182 183 /** 184 * Uncompresses data. May involve additional copies compared to the overload 185 * that takes and returns IOBufs. Has the same error semantics as the IOBuf 186 * version. 187 */ 188 std::string uncompress( 189 StringPiece data, 190 folly::Optional<uint64_t> uncompressedLength = folly::none); 191 192 /** 193 * Returns a bound on the maximum compressed length when compressing data with 194 * the given uncompressed length. 195 */ 196 uint64_t maxCompressedLength(uint64_t uncompressedLength) const; 197 198 /** 199 * Extracts the uncompressed length from the compressed data if possible. 200 * If the codec doesn't store the uncompressed length, or the data is 201 * corrupted it returns the given uncompressedLength. 202 * If the uncompressed length is stored in the compressed data and 203 * uncompressedLength is not none and they do not match a std::runtime_error 204 * is thrown. 205 */ 206 folly::Optional<uint64_t> getUncompressedLength( 207 const folly::IOBuf* data, 208 folly::Optional<uint64_t> uncompressedLength = folly::none) const; 209 210 /** 211 * Helper wrapper around getUncompressedLength(IOBuf) 212 */ 213 folly::Optional<uint64_t> getUncompressedLength( 214 folly::StringPiece data, 215 folly::Optional<uint64_t> uncompressedLength = folly::none) const; 216 217 protected: 218 Codec( 219 CodecType type, 220 folly::Optional<int> level = folly::none, 221 folly::StringPiece name = {}); 222 223 public: 224 /** 225 * Returns a superset of the set of prefixes for which canUncompress() will 226 * return true. A superset is allowed for optimizations in canUncompress() 227 * based on other knowledge such as length. None of the prefixes may be empty. 228 * default: No prefixes. 229 */ 230 virtual std::vector<std::string> validPrefixes() const; 231 232 /** 233 * Returns true if the codec thinks it can uncompress the data. 234 * If a codec doesn't have magic bytes at the beginning, like LZ4 and Snappy, 235 * it can always return false. 236 * default: Returns false. 237 */ 238 virtual bool canUncompress( 239 const folly::IOBuf* data, 240 folly::Optional<uint64_t> uncompressedLength = folly::none) const; 241 242 /** 243 * Helper wrapper around canUncompress(IOBuf) 244 */ 245 bool canUncompress( 246 folly::StringPiece data, 247 folly::Optional<uint64_t> uncompressedLength = folly::none) const; 248 249 private: 250 // default: no limits (save for special value UNKNOWN_UNCOMPRESSED_LENGTH) 251 virtual uint64_t doMaxUncompressedLength() const; 252 // default: doesn't need uncompressed length 253 virtual bool doNeedsUncompressedLength() const; 254 virtual std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) = 0; 255 virtual std::unique_ptr<IOBuf> doUncompress( 256 const folly::IOBuf* data, 257 folly::Optional<uint64_t> uncompressedLength) = 0; 258 // default: an implementation is provided by default to wrap the strings into 259 // IOBufs and delegate to the IOBuf methods. This incurs a copy of the output 260 // from IOBuf to string. Implementers, at their discretion, can override 261 // these methods to avoid the copy. 262 virtual std::string doCompressString(StringPiece data); 263 virtual std::string doUncompressString( 264 StringPiece data, folly::Optional<uint64_t> uncompressedLength); 265 266 virtual uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const = 0; 267 // default: returns the passed uncompressedLength. 268 virtual folly::Optional<uint64_t> doGetUncompressedLength( 269 const folly::IOBuf* data, 270 folly::Optional<uint64_t> uncompressedLength) const; 271 272 CodecType type_; 273 }; 274 275 class StreamCodec : public Codec { 276 public: ~StreamCodec()277 ~StreamCodec() override {} 278 279 /** 280 * Does the codec need the data length before compression streaming? 281 */ 282 bool needsDataLength() const; 283 284 /***************************************************************************** 285 * Streaming API 286 ***************************************************************************** 287 * A low-level stateful streaming API. 288 * Streaming operations can be started in two ways: 289 * 1. From a clean Codec on which no non-const methods have been called. 290 * 2. A call to resetStream(), which will reset any codec to a clean state. 291 * After a streaming operation has begun, either compressStream() or 292 * uncompressStream() must be called until the streaming operation ends. 293 * compressStream() ends when it returns true with flushOp END. 294 * uncompressStream() ends when it returns true. At this point the codec 295 * may be reused by calling resetStream(). 296 * 297 * compress() and uncompress() can be called at any time, but they interrupt 298 * any ongoing streaming operations (state is lost and resetStream() must be 299 * called before another streaming operation). 300 */ 301 302 /** 303 * Reset the state of the codec, and set the uncompressed length for the next 304 * streaming operation. If uncompressedLength is not none it must be exactly 305 * the uncompressed length. compressStream() must be passed exactly 306 * uncompressedLength input bytes before the stream is ended. 307 * uncompressStream() must be passed a compressed frame that uncompresses to 308 * uncompressedLength. 309 */ 310 void resetStream(folly::Optional<uint64_t> uncompressedLength = folly::none); 311 312 enum class FlushOp { NONE, FLUSH, END }; 313 314 /** 315 * Compresses some data from the input buffer and writes the compressed data 316 * into the output buffer. It may read input without producing any output, 317 * except when forced to flush. 318 * 319 * The input buffer is advanced to point to the range of data that hasn't yet 320 * been read. Compression will resume at this point for the next call to 321 * compressStream(). The output buffer is advanced one byte past the last byte 322 * written. 323 * 324 * The default flushOp is NONE, which allows compressStream() complete 325 * discretion in how much data to gather before writing any output. 326 * 327 * If flushOp is END, all pending and input data is flushed to the output 328 * buffer, and the frame is ended. compressStream() must be called with the 329 * same input and flushOp END until it returns true. At this point the caller 330 * must call resetStream() to use the codec again. 331 * 332 * If flushOp is FLUSH, all pending and input data is flushed to the output 333 * buffer, but the frame is not ended. compressStream() must be called with 334 * the same input and flushOp END until it returns true. At this point the 335 * caller can continue to compressStream() with any input data and flushOp. 336 * The uncompressor, if passed all the produced output data, will be able to 337 * uncompress all the input data passed to compressStream() so far. Excessive 338 * use of flushOp FLUSH will deteriorate compression ratio. This is useful for 339 * stateful streaming across a network. Most users don't need to use this 340 * flushOp. 341 * 342 * A std::logic_error is thrown on incorrect usage of the API. 343 * A std::runtime_error is thrown upon error conditions or if no forward 344 * progress could be made twice in a row. 345 */ 346 bool compressStream( 347 folly::ByteRange& input, 348 folly::MutableByteRange& output, 349 FlushOp flushOp = StreamCodec::FlushOp::NONE); 350 351 /** 352 * Uncompresses some data from the input buffer and writes the uncompressed 353 * data into the output buffer. It may read input without producing any 354 * output. 355 * 356 * The input buffer is advanced to point to the range of data that hasn't yet 357 * been read. Uncompression will resume at this point for the next call to 358 * uncompressStream(). The output buffer is advanced one byte past the last 359 * byte written. 360 * 361 * The default flushOp is NONE, which allows uncompressStream() complete 362 * discretion in how much output data to flush. The uncompressor may not make 363 * maximum forward progress, but will make some forward progress when 364 * possible. 365 * 366 * If flushOp is END, the caller guarantees that no more input will be 367 * presented to uncompressStream(). uncompressStream() must be called with the 368 * same input and flushOp END until it returns true. This is not mandatory, 369 * but if the input is all available in one buffer, and there is enough output 370 * space to write the entire frame, codecs can uncompress faster. 371 * 372 * If flushOp is FLUSH, uncompressStream() is guaranteed to make the maximum 373 * amount of forward progress possible. When using this flushOp and 374 * uncompressStream() returns with `!output.empty()` the caller knows that all 375 * pending output has been flushed. This is useful for stateful streaming 376 * across a network, and it should be used in conjunction with 377 * compressStream() with flushOp FLUSH. Most users don't need to use this 378 * flushOp. 379 * 380 * A std::runtime_error is thrown upon error conditions or if no forward 381 * progress could be made upon two consecutive calls to the function (only the 382 * second call will throw an exception). 383 * 384 * Returns true at the end of a frame. At this point resetStream() must be 385 * called to reuse the codec. 386 */ 387 bool uncompressStream( 388 folly::ByteRange& input, 389 folly::MutableByteRange& output, 390 FlushOp flushOp = StreamCodec::FlushOp::NONE); 391 392 protected: 393 StreamCodec( 394 CodecType type, 395 folly::Optional<int> level = folly::none, 396 folly::StringPiece name = {}) Codec(type,std::move (level),name)397 : Codec(type, std::move(level), name) {} 398 399 // Returns the uncompressed length last passed to resetStream() or none if it 400 // hasn't been called yet. uncompressedLength()401 folly::Optional<uint64_t> uncompressedLength() const { 402 return uncompressedLength_; 403 } 404 405 private: 406 // default: Implemented using the streaming API. 407 std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) override; 408 std::unique_ptr<IOBuf> doUncompress( 409 const folly::IOBuf* data, 410 folly::Optional<uint64_t> uncompressedLength) override; 411 412 // default: Returns false 413 virtual bool doNeedsDataLength() const; 414 virtual void doResetStream() = 0; 415 virtual bool doCompressStream( 416 folly::ByteRange& input, 417 folly::MutableByteRange& output, 418 FlushOp flushOp) = 0; 419 virtual bool doUncompressStream( 420 folly::ByteRange& input, 421 folly::MutableByteRange& output, 422 FlushOp flushOp) = 0; 423 424 enum class State { 425 RESET, 426 COMPRESS, 427 COMPRESS_FLUSH, 428 COMPRESS_END, 429 UNCOMPRESS, 430 END, 431 }; 432 void assertStateIs(State expected) const; 433 434 State state_{State::RESET}; 435 ByteRange previousInput_{}; 436 folly::Optional<uint64_t> uncompressedLength_{}; 437 bool progressMade_{true}; 438 }; 439 440 constexpr int COMPRESSION_LEVEL_FASTEST = -1; 441 constexpr int COMPRESSION_LEVEL_DEFAULT = -2; 442 constexpr int COMPRESSION_LEVEL_BEST = -3; 443 444 /** 445 * Return a codec for the given type. Throws on error. The level 446 * is a non-negative codec-dependent integer indicating the level of 447 * compression desired, or one of the following constants: 448 * 449 * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory, 450 * worst compression) 451 * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between 452 * FASTEST and BEST) 453 * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory, 454 * best compression) 455 * 456 * When decompressing, the compression level is ignored. All codecs will 457 * decompress all data compressed with the a codec of the same type, regardless 458 * of compression level. 459 */ 460 std::unique_ptr<Codec> getCodec( 461 CodecType type, int level = COMPRESSION_LEVEL_DEFAULT); 462 463 /** 464 * Return a codec for the given type. Throws on error. The level 465 * is a non-negative codec-dependent integer indicating the level of 466 * compression desired, or one of the following constants: 467 * 468 * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory, 469 * worst compression) 470 * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between 471 * FASTEST and BEST) 472 * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory, 473 * best compression) 474 * 475 * When decompressing, the compression level is ignored. All codecs will 476 * decompress all data compressed with the a codec of the same type, regardless 477 * of compression level. 478 */ 479 std::unique_ptr<StreamCodec> getStreamCodec( 480 CodecType type, int level = COMPRESSION_LEVEL_DEFAULT); 481 482 /** 483 * Returns a codec that can uncompress any of the given codec types as well as 484 * {LZ4_FRAME, ZSTD, ZLIB, GZIP, LZMA2, BZIP2}. Appends each default codec to 485 * customCodecs in order, so long as a codec with the same type() isn't already 486 * present in customCodecs or as the terminalCodec. When uncompress() is called, 487 * each codec's canUncompress() is called in the order that they are given. 488 * Appended default codecs are checked last. uncompress() is called on the 489 * first codec whose canUncompress() returns true. 490 * 491 * In addition, an optional `terminalCodec` can be provided. This codec's 492 * uncompress() will be called either when no other codec canUncompress() the 493 * data or the chosen codec throws an exception on the data. The terminalCodec 494 * is intended for ambiguous headers, when canUncompress() is false for some 495 * data it can actually uncompress. The terminalCodec does not need to override 496 * validPrefixes() or canUncompress() and overriding these functions will have 497 * no effect on the returned codec's validPrefixes() or canUncompress() 498 * functions. The terminalCodec's needsUncompressedLength() and 499 * maxUncompressedLength() will affect the returned codec's respective 500 * functions. The terminalCodec must not be duplicated in customCodecs. 501 * 502 * An exception is thrown if no codec canUncompress() the data and either no 503 * terminal codec was provided or a terminal codec was provided and it throws on 504 * the data. 505 * An exception is thrown if the chosen codec's uncompress() throws on the data 506 * and either no terminal codec was provided or a terminal codec was provided 507 * and it also throws on the data. 508 * An exception is thrown if compress() is called on the returned codec. 509 * 510 * Requirements are checked in debug mode and are as follows: 511 * Let headers be the concatenation of every codec's validPrefixes(). 512 * 1. Each codec must override validPrefixes() and canUncompress(). 513 * 2. No codec's validPrefixes() may be empty. 514 * 3. No header in headers may be empty. 515 * 4. headers must not contain any duplicate elements. 516 * 5. No strict non-empty prefix of any header in headers may be in headers. 517 * 6. The terminalCodec's type must not be the same as any other codec's type 518 * (with USER_DEFINED being the exception). 519 */ 520 std::unique_ptr<Codec> getAutoUncompressionCodec( 521 std::vector<std::unique_ptr<Codec>> customCodecs = {}, 522 std::unique_ptr<Codec> terminalCodec = {}); 523 524 /** 525 * Check if a specified codec is supported. 526 */ 527 bool hasCodec(CodecType type); 528 529 /** 530 * Check if a specified codec is supported and supports streaming. 531 */ 532 bool hasStreamCodec(CodecType type); 533 534 /** 535 * Added here so users of folly can figure out whether the header 536 * folly/compression/CompressionContextPoolSingletons.h is present, and 537 * therefore whether it can be included. 538 */ 539 #define FOLLY_COMPRESSION_HAS_CONTEXT_POOL_SINGLETONS 540 } // namespace io 541 } // namespace folly 542