// Copyright 2005 Google Inc. All Rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "snappy-internal.h"
#include "snappy-sinksource.h"
#include "snappy.h"

#if !defined(SNAPPY_HAVE_SSSE3)
// __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD
// support between SSE2 and AVX (so SSSE3 instructions require AVX support),
// and defines __AVX__ when AVX support is available.
#if defined(__SSSE3__) || defined(__AVX__)
#define SNAPPY_HAVE_SSSE3 1
#else
#define SNAPPY_HAVE_SSSE3 0
#endif
#endif  // !defined(SNAPPY_HAVE_SSSE3)

#if !defined(SNAPPY_HAVE_BMI2)
// __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
// specifically, but it does define __AVX2__ when AVX2 support is available.
// Fortunately, AVX2 was introduced in Haswell, just like BMI2.
//
// BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
// GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
// case issuing BMI2 instructions results in a compiler error.
#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
#define SNAPPY_HAVE_BMI2 1
#else
#define SNAPPY_HAVE_BMI2 0
#endif
#endif  // !defined(SNAPPY_HAVE_BMI2)

#if SNAPPY_HAVE_SSSE3
// Please do not replace with <x86intrin.h> or with headers that assume more
// advanced SSE versions without checking with all the OWNERS.
#include <tmmintrin.h>
#endif

#if SNAPPY_HAVE_BMI2
// Please do not replace with <x86intrin.h> or with headers that assume more
// advanced SSE versions without checking with all the OWNERS.
#include <immintrin.h>
#endif

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
#include <utility>
#include <vector>

namespace snappy {

namespace {

// The number of slop bytes that writers rely on for unconditional copies.
constexpr int kSlopBytes = 64;

using internal::char_table;
using internal::COPY_1_BYTE_OFFSET;
using internal::COPY_2_BYTE_OFFSET;
using internal::COPY_4_BYTE_OFFSET;
using internal::kMaximumTagLength;
using internal::LITERAL;

// We translate the information encoded in a tag through a lookup table to a
// format that requires fewer instructions to decode. Effectively we store
// the length minus the tag part of the offset. The least significant byte
// thus stores the length, while length - offset is given by
// entry - ExtractOffset(type). The nice thing is that the subtraction
// immediately sets the flags for the necessary check that offset >= length.
// This folds the cmp with the sub. We engineer the long literals and copy-4
// tags to always fail this check, so their presence doesn't affect the fast
// path. To prevent literals from triggering the guard against offset < length
// (offset does not apply to literals) the table gives them a spurious offset
// of 256.
inline constexpr int16_t MakeEntry(int16_t len, int16_t offset) {
  return len - (offset << 8);
}

inline constexpr int16_t LengthMinusOffset(int data, int type) {
  return type == 3   ? 0xFF                                  // copy-4
         : type == 2 ? MakeEntry(data + 1, 0)                // copy-2
         : type == 1 ? MakeEntry((data & 7) + 4, data >> 3)  // copy-1
         : data < 60 ? MakeEntry(data + 1, 1)  // note spurious offset.
                     : 0xFF;                   // long literal
}

inline constexpr int16_t LengthMinusOffset(uint8_t tag) {
  return LengthMinusOffset(tag >> 2, tag & 3);
}
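
// Illustrative compile-time check (added for exposition, not part of the
// upstream source): tag byte 0x09 has type 1 (copy-1) and data field 2, so it
// decodes to a copy of length (2 & 7) + 4 == 6 whose tag encodes the top
// offset bits 2 >> 3 == 0; its table entry is therefore MakeEntry(6, 0).
static_assert(LengthMinusOffset(uint8_t{0x09}) == MakeEntry(6, 0),
              "worked example of the copy-1 table entry described above");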

template <size_t... Ints>
struct index_sequence {};

template <std::size_t N, size_t... Is>
struct make_index_sequence : make_index_sequence<N - 1, N - 1, Is...> {};

template <size_t... Is>
struct make_index_sequence<0, Is...> : index_sequence<Is...> {};

template <size_t... seq>
constexpr std::array<int16_t, 256> MakeTable(index_sequence<seq...>) {
  return std::array<int16_t, 256>{LengthMinusOffset(seq)...};
}

// We maximally co-locate the two tables so that only one register needs to be
// reserved for the table address.
struct {
  alignas(64) const std::array<int16_t, 256> length_minus_offset;
  uint32_t extract_masks[4];  // Used for extracting offset based on tag type.
} table = {MakeTable(make_index_sequence<256>{}), {0, 0xFF, 0xFFFF, 0}};

// Any hash function will produce a valid compressed bitstream, but a good
// hash function reduces the number of collisions and thus yields better
// compression for compressible input, and more speed for incompressible
// input. Of course, it doesn't hurt if the hash function is reasonably fast
// either, as it gets called a lot.
inline uint32_t HashBytes(uint32_t bytes, uint32_t mask) {
  constexpr uint32_t kMagic = 0x1e35a7bd;
  return ((kMagic * bytes) >> (32 - kMaxHashTableBits)) & mask;
}

}  // namespace

size_t MaxCompressedLength(size_t source_bytes) {
  // Compressed data can be defined as:
  //    compressed := item* literal*
  //    item       := literal* copy
  //
  // The trailing literal sequence has a space blowup of at most 62/60
  // since a literal of length 60 needs one tag byte + one extra byte
  // for length information.
  //
  // Item blowup is trickier to measure. Suppose the "copy" op copies
  // 4 bytes of data. Because of a special check in the encoding code,
  // we produce a 4-byte copy only if the offset is < 65536. Therefore
  // the copy op takes 3 bytes to encode, and this type of item leads
  // to at most the 62/60 blowup for representing literals.
  //
  // Suppose the "copy" op copies 5 bytes of data. If the offset is big
  // enough, it will take 5 bytes to encode the copy op. Therefore the
  // worst case here is a one-byte literal followed by a five-byte copy.
  // I.e., 6 bytes of input turn into 7 bytes of "compressed" data.
  //
  // This last factor dominates the blowup, so the final estimate is:
  return 32 + source_bytes + source_bytes / 6;
}
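
// Worked example of the bound above (an illustration added here, not a new
// guarantee): for a 1 MiB input, MaxCompressedLength(1 << 20) returns
// 32 + 1048576 + 174762 = 1223370 bytes, i.e. a caller should reserve about
// 1.17 MiB of output space before compressing a 1 MiB block.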

namespace {

void UnalignedCopy64(const void* src, void* dst) {
  char tmp[8];
  std::memcpy(tmp, src, 8);
  std::memcpy(dst, tmp, 8);
}

void UnalignedCopy128(const void* src, void* dst) {
  // std::memcpy() gets vectorized when the appropriate compiler options are
  // used. For example, x86 compilers targeting SSE2+ will optimize to an SSE2
  // load and store.
  char tmp[16];
  std::memcpy(tmp, src, 16);
  std::memcpy(dst, tmp, 16);
}

template <bool use_16bytes_chunk>
inline void ConditionalUnalignedCopy128(const char* src, char* dst) {
  if (use_16bytes_chunk) {
    UnalignedCopy128(src, dst);
  } else {
    UnalignedCopy64(src, dst);
    UnalignedCopy64(src + 8, dst + 8);
  }
}

// Copy [src, src+(op_limit-op)) to [op, op_limit) a byte at a time. Used
// for handling COPY operations where the input and output regions may overlap.
// For example, suppose:
//    src       == "ab"
//    op        == src + 2
//    op_limit  == op + 20
// After IncrementalCopySlow(src, op, op_limit), the result will have eleven
// copies of "ab"
//    ababababababababababab
// Note that this does not match the semantics of either std::memcpy() or
// std::memmove().
inline char* IncrementalCopySlow(const char* src, char* op,
                                 char* const op_limit) {
  // TODO: Remove pragma when LLVM is aware this
  // function is only called in cold regions and when cold regions don't get
  // vectorized or unrolled.
#ifdef __clang__
#pragma clang loop unroll(disable)
#endif
  while (op < op_limit) {
    *op++ = *src++;
  }
  return op_limit;
}

#if SNAPPY_HAVE_SSSE3

// Computes the bytes for shuffle control mask (please read comments on
// 'pattern_generation_masks' as well) for the given index_offset and
// pattern_size. For example, when the 'offset' is 6, it will generate a
// repeating pattern of size 6. So, the first 16 byte indexes will correspond
// to the pattern-bytes {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3} and
// the next 16 byte indexes will correspond to the pattern-bytes {4, 5, 0, 1,
// 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are
// generated by calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and
// MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively.
template <size_t... indexes>
inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
    int index_offset, int pattern_size, index_sequence<indexes...>) {
  return {static_cast<char>((index_offset + indexes) % pattern_size)...};
}

// Computes the shuffle control mask bytes array for given pattern-sizes and
// returns an array.
template <size_t... pattern_sizes_minus_one>
inline constexpr std::array<std::array<char, sizeof(__m128i)>,
                            sizeof...(pattern_sizes_minus_one)>
MakePatternMaskBytesTable(int index_offset,
                          index_sequence<pattern_sizes_minus_one...>) {
  return {MakePatternMaskBytes(
      index_offset, pattern_sizes_minus_one + 1,
      make_index_sequence</*indexes=*/sizeof(__m128i)>())...};
}

// This is an array of shuffle control masks that can be used as the source
// operand for PSHUFB to permute the contents of the destination XMM register
// into a repeating byte pattern.
alignas(16) constexpr std::array<std::array<char, sizeof(__m128i)>,
                                 16> pattern_generation_masks =
    MakePatternMaskBytesTable(
        /*index_offset=*/0,
        /*pattern_sizes_minus_one=*/make_index_sequence<16>());

// Similar to 'pattern_generation_masks', this table is used to "rotate" the
// pattern so that we can copy the *next 16 bytes* consistent with the pattern.
// Basically, pattern_reshuffle_masks is a continuation of
// pattern_generation_masks. It follows that pattern_reshuffle_masks is the
// same as pattern_generation_masks for offsets 1, 2, 4, 8 and 16.
alignas(16) constexpr std::array<std::array<char, sizeof(__m128i)>,
                                 16> pattern_reshuffle_masks =
    MakePatternMaskBytesTable(
        /*index_offset=*/16,
        /*pattern_sizes_minus_one=*/make_index_sequence<16>());

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
static inline __m128i LoadPattern(const char* src, const size_t pattern_size) {
  __m128i generation_mask = _mm_load_si128(reinterpret_cast<const __m128i*>(
      pattern_generation_masks[pattern_size - 1].data()));
  // Uninitialized bytes are masked out by the shuffle mask.
  // TODO: remove annotation and macro defs once MSan is fixed.
  SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(src + pattern_size, 16 - pattern_size);
  return _mm_shuffle_epi8(
      _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)), generation_mask);
}

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
static inline std::pair<__m128i /* pattern */, __m128i /* reshuffle_mask */>
LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
  __m128i pattern = LoadPattern(src, pattern_size);

  // This mask will generate the next 16 bytes in-place. Doing so enables us
  // to write data by at most 4 _mm_storeu_si128.
  //
  // For example, suppose pattern is:        abcdefabcdefabcd
  // Shuffling with this mask will generate: efabcdefabcdefab
  // Shuffling again will generate:          cdefabcdefabcdef
  __m128i reshuffle_mask = _mm_load_si128(reinterpret_cast<const __m128i*>(
      pattern_reshuffle_masks[pattern_size - 1].data()));
  return {pattern, reshuffle_mask};
}

#endif  // SNAPPY_HAVE_SSSE3

// Fallback for when we need to copy while extending the pattern, for example
// copying 10 bytes from 3 positions back abc -> abcabcabcabca.
//
// REQUIRES: [dst - offset, dst + 64) is a valid address range.
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
#if SNAPPY_HAVE_SSSE3
  if (SNAPPY_PREDICT_TRUE(offset <= 16)) {
    switch (offset) {
      case 0:
        return false;
      case 1: {
        std::memset(dst, dst[-1], 64);
        return true;
      }
      case 2:
      case 4:
      case 8:
      case 16: {
        __m128i pattern = LoadPattern(dst - offset, offset);
        for (int i = 0; i < 4; i++) {
          _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16 * i), pattern);
        }
        return true;
      }
      default: {
        auto pattern_and_reshuffle_mask =
            LoadPatternAndReshuffleMask(dst - offset, offset);
        __m128i pattern = pattern_and_reshuffle_mask.first;
        __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;
        for (int i = 0; i < 4; i++) {
          _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16 * i), pattern);
          pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
        }
        return true;
      }
    }
  }
#else
  if (SNAPPY_PREDICT_TRUE(offset < 16)) {
    if (SNAPPY_PREDICT_FALSE(offset == 0)) return false;
    // Extend the pattern to the first 16 bytes.
    for (int i = 0; i < 16; i++) dst[i] = dst[i - offset];
    // Find a multiple of pattern >= 16.
    static std::array<uint8_t, 16> pattern_sizes = []() {
      std::array<uint8_t, 16> res;
      for (int i = 1; i < 16; i++) res[i] = (16 / i + 1) * i;
      return res;
    }();
    offset = pattern_sizes[offset];
    for (int i = 1; i < 4; i++) {
      std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);
    }
    return true;
  }
#endif  // SNAPPY_HAVE_SSSE3

  // Very rare.
  for (int i = 0; i < 4; i++) {
    std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);
  }
  return true;
}

// Copy [src, src+(op_limit-op)) to [op, op_limit) but faster than
// IncrementalCopySlow. buf_limit is the address past the end of the writable
// region of the buffer.
inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
                             char* const buf_limit) {
#if SNAPPY_HAVE_SSSE3
  constexpr int big_pattern_size_lower_bound = 16;
#else
  constexpr int big_pattern_size_lower_bound = 8;
#endif

  // Terminology:
  //
  //   slop = buf_limit - op
  //   pat  = op - src
  //   len  = op_limit - op
  assert(src < op);
  assert(op < op_limit);
  assert(op_limit <= buf_limit);
  // NOTE: The copy tags use 3 or 6 bits to store the copy length, so len <= 64.
  assert(op_limit - op <= 64);
  // NOTE: In practice the compressor always emits len >= 4, so it is ok to
  // assume that to optimize this function, but this is not guaranteed by the
  // compression format, so we have to also handle len < 4 in case the input
  // does not satisfy these conditions.

  size_t pattern_size = op - src;
  // The cases are split into different branches to allow the branch
  // predictor, FDO, and static prediction hints to work better. For each
  // input we list the ratio of invocations that match each condition.
  //
  // input         slop < 16   pat < 8   len > 16
  // ------------------------------------------
  // html|html4|cp    0%          1.01%    27.73%
  // urls             0%          0.88%    14.79%
  // jpg              0%         64.29%     7.14%
  // pdf              0%          2.56%    58.06%
  // txt[1-4]         0%          0.23%     0.97%
  // pb               0%          0.96%    13.88%
  // bin              0.01%      22.27%    41.17%
  //
  // It is very rare that we don't have enough slop for doing block copies. It
  // is also rare that we need to expand a pattern. Small patterns are common
  // for incompressible formats and for those we are plenty fast already.
  // Lengths are normally not greater than 16 but they vary depending on the
  // input. In general if we always predict len <= 16 it would be an ok
  // prediction.
  //
  // In order to be fast we want a pattern >= 16 bytes (or 8 bytes in non-SSE)
  // and an unrolled loop copying 1x 16 bytes (or 2x 8 bytes in non-SSE) at a
  // time.

  // Handle the uncommon case where pattern is less than 16 (or 8 in non-SSE)
  // bytes.
  if (pattern_size < big_pattern_size_lower_bound) {
#if SNAPPY_HAVE_SSSE3
    // Load the first eight bytes into a 128-bit XMM register, then use PSHUFB
    // to permute the register's contents in-place into a repeating sequence
    // of the first "pattern_size" bytes.
    // For example, suppose:
    //    src       == "abc"
    //    op        == src + 3
    // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
    // followed by one byte of slop: abcabcabcabcabca.
    //
    // The non-SSE fallback implementation suffers from store-forwarding stalls
    // because its loads and stores partly overlap. By expanding the pattern
    // in-place, we avoid the penalty.

    // Typically, the op_limit is the gating factor so try to simplify the
    // loop based on that.
    if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
      auto pattern_and_reshuffle_mask =
          LoadPatternAndReshuffleMask(src, pattern_size);
      __m128i pattern = pattern_and_reshuffle_mask.first;
      __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;

      // There is at least one, and at most four 16-byte blocks. Writing four
      // conditionals instead of a loop allows FDO to lay out the code with
      // respect to the actual probabilities of each length.
      // TODO: Replace with loop with trip count hint.
      _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);

      if (op + 16 < op_limit) {
        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 16), pattern);
      }
      if (op + 32 < op_limit) {
        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 32), pattern);
      }
      if (op + 48 < op_limit) {
        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 48), pattern);
      }
      return op_limit;
    }
    char* const op_end = buf_limit - 15;
    if (SNAPPY_PREDICT_TRUE(op < op_end)) {
      auto pattern_and_reshuffle_mask =
          LoadPatternAndReshuffleMask(src, pattern_size);
      __m128i pattern = pattern_and_reshuffle_mask.first;
      __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;

      // This code path is relatively cold however so we save code size
      // by avoiding unrolling and vectorizing.
      //
      // TODO: Remove pragma when cold regions don't get
      // vectorized or unrolled.
#ifdef __clang__
#pragma clang loop unroll(disable)
#endif
      do {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
        op += 16;
      } while (SNAPPY_PREDICT_TRUE(op < op_end));
    }
    return IncrementalCopySlow(op - pattern_size, op, op_limit);
#else   // !SNAPPY_HAVE_SSSE3
    // If plenty of buffer space remains, expand the pattern to at least 8
    // bytes. The way the following loop is written, we need 8 bytes of buffer
    // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
    // bytes if pattern_size is 2. Precisely encoding that is probably not
    // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
    // (because 11 are required in the worst case).
    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
      while (pattern_size < 8) {
        UnalignedCopy64(src, op);
        op += pattern_size;
        pattern_size *= 2;
      }
      if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
    } else {
      return IncrementalCopySlow(src, op, op_limit);
    }
#endif  // SNAPPY_HAVE_SSSE3
  }
  assert(pattern_size >= big_pattern_size_lower_bound);
  constexpr bool use_16bytes_chunk = big_pattern_size_lower_bound == 16;

  // Copy 1x 16 bytes (or 2x 8 bytes in non-SSE) at a time. Because op - src
  // can be < 16 in non-SSE, a single UnalignedCopy128 might overwrite data in
  // op. UnalignedCopy64 is safe because expanding the pattern to at least 8
  // bytes guarantees that op - src >= 8.
  //
  // Typically, the op_limit is the gating factor so try to simplify the loop
  // based on that.
  if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
    // There is at least one, and at most four 16-byte blocks. Writing four
    // conditionals instead of a loop allows FDO to lay out the code with
    // respect to the actual probabilities of each length.
    // TODO: Replace with loop with trip count hint.
    ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
    if (op + 16 < op_limit) {
      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 16, op + 16);
    }
    if (op + 32 < op_limit) {
      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 32, op + 32);
    }
    if (op + 48 < op_limit) {
      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 48, op + 48);
    }
    return op_limit;
  }

  // Fall back to doing as much as we can with the available slop in the
  // buffer. This code path is relatively cold however so we save code size by
  // avoiding unrolling and vectorizing.
  //
  // TODO: Remove pragma when cold regions don't get vectorized
  // or unrolled.
#ifdef __clang__
#pragma clang loop unroll(disable)
#endif
  for (char* op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
    ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
  }
  if (op >= op_limit) return op_limit;

  // We only take this branch if we didn't have enough slop and we can do a
  // single 8 byte copy.
  if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
    UnalignedCopy64(src, op);
    src += 8;
    op += 8;
  }
  return IncrementalCopySlow(src, op, op_limit);
}

}  // namespace

template <bool allow_fast_path>
static inline char* EmitLiteral(char* op, const char* literal, int len) {
  // The vast majority of copies are below 16 bytes, for which a
  // call to std::memcpy() is overkill. This fast path can sometimes
  // copy up to 15 bytes too much, but that is okay in the
  // main loop, since we have a bit to go on for both sides:
  //
  //   - The input will always have kInputMarginBytes = 15 extra
  //     available bytes, as long as we're in the main loop, and
  //     if not, allow_fast_path = false.
  //   - The output will always have 32 spare bytes (see
  //     MaxCompressedLength).
  assert(len > 0);  // Zero-length literals are disallowed
  int n = len - 1;
  if (allow_fast_path && len <= 16) {
    // Fits in tag byte
    *op++ = LITERAL | (n << 2);

    UnalignedCopy128(literal, op);
    return op + len;
  }

  if (n < 60) {
    // Fits in tag byte
    *op++ = LITERAL | (n << 2);
  } else {
    int count = (Bits::Log2Floor(n) >> 3) + 1;
    assert(count >= 1);
    assert(count <= 4);
    *op++ = LITERAL | ((59 + count) << 2);
    // Encode in upcoming bytes.
    // Write 4 bytes, though we may care about only 1 of them. The output
    // buffer is guaranteed to have at least 3 more bytes of space left as
    // 'len >= 61' holds here and there is a std::memcpy() of size 'len'
    // below.
    LittleEndian::Store32(op, n);
    op += count;
  }
  std::memcpy(op, literal, len);
  return op + len;
}
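
// Illustrative walk-through of the tags produced above (added commentary, not
// part of the upstream source): a 5-byte literal "hello" has n = 4, so the
// fast path writes the single tag byte LITERAL | (4 << 2) == 0x10 followed by
// the 5 literal bytes; a 100-byte literal has n = 99 >= 60, so it is emitted
// as tag byte LITERAL | (60 << 2) == 0xF0 followed by one length byte 99 and
// then the literal bytes.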

template <bool len_less_than_12>
static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
  assert(len <= 64);
  assert(len >= 4);
  assert(offset < 65536);
  assert(len_less_than_12 == (len < 12));

  if (len_less_than_12) {
    uint32_t u = (uint32_t)((len << 2) + (offset << 8));
    uint32_t copy1 = COPY_1_BYTE_OFFSET - (4 << 2) + ((offset >> 3) & 0xe0);
    uint32_t copy2 = COPY_2_BYTE_OFFSET - (1 << 2);
    // It turns out that offset < 2048 is a difficult-to-predict branch.
    // `perf record` shows this is the highest percentage of branch misses in
    // benchmarks. This code produces branch-free code; the data dependency
    // chain that bottlenecks the throughput is so long that a few extra
    // instructions are completely free (IPC << 6 because of data deps).
    u += offset < 2048 ? copy1 : copy2;
    LittleEndian::Store32(op, u);
    op += offset < 2048 ? 2 : 3;
  } else {
    // Write 4 bytes, though we only care about 3 of them. The output buffer
    // is required to have some slack, so the extra byte won't overrun it.
    uint32_t u =
        COPY_2_BYTE_OFFSET + (uint32_t)(((len - 1) << 2) + (offset << 8));
    LittleEndian::Store32(op, u);
    op += 3;
  }
  return op;
}
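
// Worked example of the branch-free encoding above (added for exposition, not
// part of the upstream source): for len == 7 and offset == 100 we have
// u == (7 << 2) + (100 << 8) == 25628 and, since offset < 2048,
// u += COPY_1_BYTE_OFFSET - (4 << 2) + ((100 >> 3) & 0xe0) == -15, giving
// 25613 == 0x640D. The little-endian store then emits the two bytes
// 0x0D 0x64: tag 0x0D decodes as a copy-1 of length (3 & 7) + 4 == 7 with
// offset high bits 0, and 0x64 == 100 is the low offset byte.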

template <bool len_less_than_12>
static inline char* EmitCopy(char* op, size_t offset, size_t len) {
  assert(len_less_than_12 == (len < 12));
  if (len_less_than_12) {
    return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
  } else {
    // A special case for len <= 64 might help, but so far measurements
    // suggest it's in the noise.

    // Emit 64 byte copies but make sure to keep at least four bytes reserved.
    while (SNAPPY_PREDICT_FALSE(len >= 68)) {
      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
      len -= 64;
    }

    // One or two copies will now finish the job.
    if (len > 64) {
      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
      len -= 60;
    }

    // Emit remainder.
    if (len < 12) {
      op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
    } else {
      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
    }
    return op;
  }
}

bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
  uint32_t v = 0;
  const char* limit = start + n;
  if (Varint::Parse32WithLimit(start, limit, &v) != NULL) {
    *result = v;
    return true;
  } else {
    return false;
  }
}
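
// Illustrative caller pattern (an assumption about typical use, not code from
// this file): a consumer that wants to size its output buffer before
// decompressing would first do
//   size_t uncompressed_len;
//   if (!GetUncompressedLength(data, size, &uncompressed_len)) return false;
// and only then allocate uncompressed_len bytes for the decompressed output.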

namespace {
uint32_t CalculateTableSize(uint32_t input_size) {
  static_assert(
      kMaxHashTableSize >= kMinHashTableSize,
      "kMaxHashTableSize should be greater or equal to kMinHashTableSize.");
  if (input_size > kMaxHashTableSize) {
    return kMaxHashTableSize;
  }
  if (input_size < kMinHashTableSize) {
    return kMinHashTableSize;
  }
  // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
  // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
  return 2u << Bits::Log2Floor(input_size - 1);
}
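
// For example (illustrative, assuming the value lies between
// kMinHashTableSize and kMaxHashTableSize): an input of 5000 bytes yields
// 2u << Log2Floor(4999) == 2u << 12 == 8192, i.e. the smallest power of two
// that is >= input_size.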
}  // namespace

namespace internal {
WorkingMemory::WorkingMemory(size_t input_size) {
  const size_t max_fragment_size = std::min(input_size, kBlockSize);
  const size_t table_size = CalculateTableSize((uint32_t)max_fragment_size);
  size_ = table_size * sizeof(*table_) + max_fragment_size +
          MaxCompressedLength(max_fragment_size);
  mem_ = std::allocator<char>().allocate(size_);
  table_ = reinterpret_cast<uint16_t*>(mem_);
  input_ = mem_ + table_size * sizeof(*table_);
  output_ = input_ + max_fragment_size;
}

WorkingMemory::~WorkingMemory() {
  std::allocator<char>().deallocate(mem_, size_);
}

uint16_t* WorkingMemory::GetHashTable(size_t fragment_size,
                                      int* table_size) const {
  const size_t htsize = CalculateTableSize((uint32_t)fragment_size);
  memset(table_, 0, htsize * sizeof(*table_));
  *table_size = (int)htsize;
  return table_;
}
}  // end namespace internal

// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
// REQUIRES: "input" is at most "kBlockSize" bytes long.
// REQUIRES: "op" points to an array of memory that is at least
// "MaxCompressedLength(input.size())" in size.
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
// REQUIRES: "table_size" is a power of two
//
// Returns an "end" pointer into "op" buffer.
// "end - op" is the compressed size of "input".
namespace internal {
char* CompressFragment(const char* input, size_t input_size, char* op,
                       uint16_t* table, const int table_size) {
  // "ip" is the input pointer, and "op" is the output pointer.
  const char* ip = input;
  assert(input_size <= kBlockSize);
  assert((table_size & (table_size - 1)) == 0);  // table must be power of two
  const uint32_t mask = table_size - 1;
  const char* ip_end = input + input_size;
  const char* base_ip = ip;

  const size_t kInputMarginBytes = 15;
  if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
    const char* ip_limit = input + input_size - kInputMarginBytes;

    for (uint32_t preload = LittleEndian::Load32(ip + 1);;) {
      // Bytes in [next_emit, ip) will be emitted as literal bytes. Or
      // [next_emit, ip_end) after the main loop.
      const char* next_emit = ip++;
      uint64_t data = LittleEndian::Load64(ip);
      // The body of this loop calls EmitLiteral once and then EmitCopy one or
      // more times. (The exception is that when we're close to exhausting
      // the input we goto emit_remainder.)
      //
      // In the first iteration of this loop we're just starting, so
      // there's nothing to copy, so calling EmitLiteral once is
      // necessary. And we only start a new iteration when the
      // current iteration has determined that a call to EmitLiteral will
      // precede the next call to EmitCopy (if any).
      //
      // Step 1: Scan forward in the input looking for a 4-byte-long match.
      // If we get close to exhausting the input then goto emit_remainder.
      //
      // Heuristic match skipping: If 32 bytes are scanned with no matches
      // found, start looking only at every other byte. If 32 more bytes are
      // scanned (or skipped), look at every third byte, etc. When a match is
      // found, immediately go back to looking at every byte. This is a small
      // loss (~5% performance, ~0.1% density) for compressible data due to
      // more bookkeeping, but for non-compressible data (such as JPEG) it's
      // a huge win since the compressor quickly "realizes" the data is
      // incompressible and doesn't bother looking for matches everywhere.
      //
      // The "skip" variable keeps track of how many bytes there are since
      // the last match; dividing it by 32 (ie. right-shifting by five) gives
      // the number of bytes to move ahead for each iteration.
      uint32_t skip = 32;

      const char* candidate;
      if (ip_limit - ip >= 16) {
        auto delta = ip - base_ip;
        for (int j = 0; j < 4; ++j) {
          for (int k = 0; k < 4; ++k) {
            int i = 4 * j + k;
            // These for-loops are meant to be unrolled. So we can freely
            // special case the first iteration to use the value already
            // loaded in preload.
            uint32_t dword = i == 0 ? preload : static_cast<uint32_t>(data);
            assert(dword == LittleEndian::Load32(ip + i));
            uint32_t hash = HashBytes(dword, mask);
            candidate = base_ip + table[hash];
            assert(candidate >= base_ip);
            assert(candidate < ip + i);
            table[hash] = (uint16_t)(delta + i);
            if (SNAPPY_PREDICT_FALSE(LittleEndian::Load32(candidate) ==
                                     dword)) {
              *op = LITERAL | (i << 2);
              UnalignedCopy128(next_emit, op + 1);
              ip += i;
              op = op + i + 2;
              goto emit_match;
            }
            data >>= 8;
          }
          data = LittleEndian::Load64(ip + 4 * j + 4);
        }
        ip += 16;
        skip += 16;
      }
      while (true) {
        assert(static_cast<uint32_t>(data) == LittleEndian::Load32(ip));
        uint32_t hash = HashBytes((uint32_t)data, mask);
        uint32_t bytes_between_hash_lookups = skip >> 5;
        skip += bytes_between_hash_lookups;
        const char* next_ip = ip + bytes_between_hash_lookups;
        if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
          ip = next_emit;
          goto emit_remainder;
        }
        candidate = base_ip + table[hash];
        assert(candidate >= base_ip);
        assert(candidate < ip);

        table[hash] = (uint16_t)(ip - base_ip);
        if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) ==
                                 LittleEndian::Load32(candidate))) {
          break;
        }
        data = LittleEndian::Load32(next_ip);
        ip = next_ip;
      }

      // Step 2: A 4-byte match has been found. We'll later see if more
      // than 4 bytes match. But, prior to the match, input
      // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
      assert(next_emit + 16 <= ip_end);
      op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit,
                                                 (int)(ip - next_emit));

      // Step 3: Call EmitCopy, and then see if another EmitCopy could
      // be our next move. Repeat until we find no match for the
      // input immediately after what was consumed by the last EmitCopy call.
      //
      // If we exit this loop normally then we need to call EmitLiteral next,
      // though we don't yet know how big the literal will be. We handle that
      // by proceeding to the next iteration of the main loop. We also can
      // exit this loop via goto if we get close to exhausting the input.
    emit_match:
      do {
        // We have a 4-byte match at ip, and no need to emit any
        // "literal bytes" prior to ip.
        const char* base = ip;
        std::pair<size_t, bool> p =
            FindMatchLength(candidate + 4, ip + 4, ip_end, &data);
        size_t matched = 4 + p.first;
        ip += matched;
        size_t offset = base - candidate;
        assert(0 == memcmp(base, candidate, matched));
        if (p.second) {
          op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
        } else {
          op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
        }
        if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
          goto emit_remainder;
        }
        // Expect 5 bytes to match
        assert((data & 0xFFFFFFFFFF) ==
               (LittleEndian::Load64(ip) & 0xFFFFFFFFFF));
        // We are now looking for a 4-byte match again. We read
        // table[Hash(ip, shift)] for that. To improve compression,
        // we also update table[Hash(ip - 1, mask)] and table[Hash(ip, mask)].
        table[HashBytes(LittleEndian::Load32(ip - 1), mask)] =
            (uint16_t)(ip - base_ip - 1);
        uint32_t hash = HashBytes((uint32_t)data, mask);
        candidate = base_ip + table[hash];
        table[hash] = (uint16_t)(ip - base_ip);
        // Measurements on the benchmarks have shown the following
        // probabilities for the loop to exit (ie. avg. number of iterations
        // is reciprocal).
        // BM_Flat/6  txt1    p = 0.3-0.4
        // BM_Flat/7  txt2    p = 0.35
        // BM_Flat/8  txt3    p = 0.3-0.4
        // BM_Flat/9  txt3    p = 0.34-0.4
        // BM_Flat/10 pb      p = 0.4
        // BM_Flat/11 gaviota p = 0.1
        // BM_Flat/12 cp      p = 0.5
        // BM_Flat/13 c       p = 0.3
      } while (static_cast<uint32_t>(data) == LittleEndian::Load32(candidate));
      // Because the least significant 5 bytes matched, we can utilize data
      // for the next iteration.
      preload = (uint32_t)(data >> 8);
    }
  }

emit_remainder:
  // Emit the remaining bytes as a literal
  if (ip < ip_end) {
    op = EmitLiteral</*allow_fast_path=*/false>(op, ip, (int)(ip_end - ip));
  }

  return op;
}
}  // end namespace internal

// Called back at every compression call to trace parameters and sizes.
static inline void Report(const char* algorithm, size_t compressed_size,
                          size_t uncompressed_size) {
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
  (void)algorithm;
  (void)compressed_size;
  (void)uncompressed_size;
}

// Signature of output types needed by decompression code.
// The decompression code is templatized on a type that obeys this
// signature so that we do not pay virtual function call overhead in
// the middle of a tight decompression loop.
//
// class DecompressionWriter {
//  public:
//   // Called before decompression
//   void SetExpectedLength(size_t length);
//
//   // For performance a writer may choose to donate the cursor variable to
//   // the decompression function. The decompression will inject it in all
//   // its function calls to the writer. Keeping the important output cursor
//   // as a function-local stack variable allows the compiler to keep it in
//   // a register, which greatly aids performance by avoiding loads and
//   // stores of this variable in the fast path loop iterations.
//   T GetOutputPtr() const;
//
//   // At end of decompression the loop donates the ownership of the cursor
//   // variable back to the writer by calling this function.
//   void SetOutputPtr(T op);
//
//   // Called after decompression
//   bool CheckLength() const;
//
//   // Called repeatedly during decompression
//   // Each function gets a pointer to the op (output pointer), that the
//   // writer can use and update. Note it's important that these functions
//   // get fully inlined so that no actual address of the local variable
//   // needs to be taken.
//   bool Append(const char* ip, size_t length, T* op);
//   bool AppendFromSelf(uint32_t offset, size_t length, T* op);
//
//   // The rules for how TryFastAppend differs from Append are somewhat
//   // convoluted:
//   //
//   //  - TryFastAppend is allowed to decline (return false) at any
//   //    time, for any reason -- just "return false" would be
//   //    a perfectly legal implementation of TryFastAppend.
//   //    The intention is for TryFastAppend to allow a fast path
//   //    in the common case of a small append.
//   //  - TryFastAppend is allowed to read up to <available> bytes
//   //    from the input buffer, whereas Append is allowed to read
//   //    <length>. However, if it returns true, it must leave
//   //    at least five (kMaximumTagLength) bytes in the input buffer
//   //    afterwards, so that there is always enough space to read the
//   //    next tag without checking for a refill.
//   //  - TryFastAppend must always decline (return false)
//   //    if <length> is 61 or more, as in this case the literal length is
//   //    not decoded fully. In practice, this should not be a big problem,
//   //    as it is unlikely that one would implement a fast path accepting
//   //    this much data.
//   //
//   bool TryFastAppend(const char* ip, size_t available, size_t length, T* op);
// };

static inline uint32_t ExtractLowBytes(uint32_t v, int n) {
  assert(n >= 0);
  assert(n <= 4);
#if SNAPPY_HAVE_BMI2
  return _bzhi_u32(v, 8 * n);
#else
  // This needs to be wider than uint32_t otherwise `mask << 32` will be
  // undefined.
  uint64_t mask = 0xffffffff;
  return v & ~(mask << (8 * n));
#endif
}
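
// For example (illustrative, not part of the upstream source):
// ExtractLowBytes(0x12345678, 2) == 0x5678, since only the low 2 * 8 = 16
// bits survive the mask (or the BMI2 _bzhi_u32 zero-high-bits instruction).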

static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) {
  assert(shift < 32);
  static const uint8_t masks[] = {
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
      0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
  return (value & masks[shift]) != 0;
}

inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) {
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
  (void)dst;
  return offset != 0;
}

void MemCopy(char* dst, const uint8_t* src, size_t size) {
  std::memcpy(dst, src, size);
}

void MemCopy(ptrdiff_t dst, const uint8_t* src, size_t size) {
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
  (void)dst;
  (void)src;
  (void)size;
}

void MemMove(char* dst, const void* src, size_t size) {
  std::memmove(dst, src, size);
}

void MemMove(ptrdiff_t dst, const void* src, size_t size) {
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
  (void)dst;
  (void)src;
  (void)size;
}

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
size_t AdvanceToNextTag(const uint8_t** ip_p, size_t* tag) {
  const uint8_t*& ip = *ip_p;
  // This section is crucial for the throughput of the decompression loop.
  // The latency of an iteration is fundamentally constrained by the
  // following data chain on ip.
  // ip -> c = Load(ip) -> ip1 = ip + 1 + (c & 3) -> ip = ip1 or ip2
  //                       ip2 = ip + 2 + (c >> 2)
  // This amounts to 8 cycles.
  // 5 (load) + 1 (c & 3) + 1 (lea ip1, [ip + (c & 3) + 1]) + 1 (cmov)
  size_t literal_len = *tag >> 2;
  size_t tag_type = *tag;
  bool is_literal;
#if defined(__GNUC__) && defined(__x86_64__) && defined(__GCC_ASM_FLAG_OUTPUTS__)
  // TODO clang misses the fact that the (c & 3) already correctly
  // sets the zero flag.
  asm("and $3, %k[tag_type]\n\t"
      : [tag_type] "+r"(tag_type), "=@ccz"(is_literal));
#else
  tag_type &= 3;
  is_literal = (tag_type == 0);
#endif
  // TODO
  // This code is subtle. Loading the values first and then doing the cmov has
  // less latency than cmov'ing ip and then loading. However clang would move
  // the loads in an optimization phase; volatile prevents this transformation.
  // Note that we have enough slop bytes (64) that the loads are always valid.
  size_t tag_literal =
      static_cast<const volatile uint8_t*>(ip)[1 + literal_len];
  size_t tag_copy = static_cast<const volatile uint8_t*>(ip)[tag_type];
  *tag = is_literal ? tag_literal : tag_copy;
  const uint8_t* ip_copy = ip + 1 + tag_type;
  const uint8_t* ip_literal = ip + 2 + literal_len;
  ip = is_literal ? ip_literal : ip_copy;
#if defined(__GNUC__) && defined(__x86_64__)
  // TODO Clang is "optimizing" the zero-extension (a totally free operation);
  // this means that after the cmov of tag, it emits another movzb
  // tag, byte(tag). It really matters as it's on the core chain. This dummy
  // asm persuades clang to do the zero-extension at the load (it's automatic),
  // removing the expensive movzb.
  asm("" ::"r"(tag_copy));
#endif
  return tag_type;
}

// Extracts the offset for copy-1 and copy-2; returns 0 for literals or copy-4.
inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) {
  return val & table.extract_masks[tag_type];
}

// Core decompression loop, when there is enough data available.
// Decompresses the input buffer [ip, ip_limit) into the output buffer
// [op, op_limit_min_slop). Returning when either we are too close to the end
// of the input buffer, or we exceed op_limit_min_slop, or when an exceptional
// tag is encountered (literal of length > 60) or a copy-4.
// Returns {ip, op} at the points it stopped decoding.
// TODO This function probably does not need to be inlined, as it
// should decode large chunks at a time. This allows runtime dispatch to
// implementations based on CPU capability (BMI2 / perhaps 32 / 64 byte memcpy).
template <typename T>
std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
    const uint8_t* ip, const uint8_t* ip_limit, ptrdiff_t op, T op_base,
    ptrdiff_t op_limit_min_slop) {
  // We unroll the inner loop twice so we need twice the spare room.
  op_limit_min_slop -= kSlopBytes;
  if (2 * (kSlopBytes + 1) < ip_limit - ip && op < op_limit_min_slop) {
    const uint8_t* const ip_limit_min_slop = ip_limit - 2 * kSlopBytes - 1;
    ip++;
    // ip points just past the tag and we are touching at maximum kSlopBytes
    // in an iteration.
    size_t tag = ip[-1];
    do {
      // The throughput is limited by instructions, unrolling the inner loop
      // twice reduces the amount of instructions checking limits and also
      // leads to reduced mov's.
      for (int i = 0; i < 2; i++) {
        const uint8_t* old_ip = ip;
        assert(tag == ip[-1]);
        // For literals tag_type = 0, hence we will always obtain 0 from
        // ExtractLowBytes. For literals offset will thus be kLiteralOffset.
        ptrdiff_t len_min_offset = table.length_minus_offset[tag];
        size_t tag_type = AdvanceToNextTag(&ip, &tag);
        uint32_t next = LittleEndian::Load32(old_ip);
        size_t len = len_min_offset & 0xFF;
        len_min_offset -= ExtractOffset(next, tag_type);
        if (SNAPPY_PREDICT_FALSE(len_min_offset > 0)) {
          if (SNAPPY_PREDICT_FALSE(len & 0x80)) {
            // Exceptional case (long literal or copy 4).
            // Actually doing the copy here is negatively impacting the main
            // loop due to compiler incorrectly allocating a register for
            // this fallback. Hence we just break.
          break_loop:
            ip = old_ip;
            goto exit;
          }
          // Only copy-1 or copy-2 tags can get here.
          assert(tag_type == 1 || tag_type == 2);
          std::ptrdiff_t delta = op + len_min_offset - len;
          // Guard against copies before the buffer start.
          if (SNAPPY_PREDICT_FALSE(delta < 0 ||
                                   !Copy64BytesWithPatternExtension(
                                       op_base + op, len - len_min_offset))) {
            goto break_loop;
          }
          op += len;
          continue;
        }
        std::ptrdiff_t delta = op + len_min_offset - len;
        if (SNAPPY_PREDICT_FALSE(delta < 0)) {
#if defined(__GNUC__) && defined(__x86_64__)
          // TODO
          // When validating, both code paths reduce to `op += len`. Ie. this
          // becomes effectively
          //
          // if (delta < 0) if (tag_type != 0) goto break_loop;
          // op += len;
          //
          // The compiler interchanges the predictable and almost always false
          // first if-statement with the completely unpredictable second
          // if-statement, putting an unpredictable branch on every iteration.
          // This empty asm is worth almost 2x, which I think qualifies for an
          // award for the most load-bearing empty statement.
          asm("");
#endif

          // Due to the spurious offset that literals have, this will trigger
          // at the start of a block when op is still smaller than 256.
          if (tag_type != 0) goto break_loop;
          MemCopy(op_base + op, old_ip, 64);
          op += len;
          continue;
        }

        // For copies we need to copy from op_base + delta, for literals
        // we need to copy from ip instead of from the stream.
        const void* from =
            tag_type ? reinterpret_cast<void*>(op_base + delta) : old_ip;
        MemMove(op_base + op, from, 64);
        op += len;
      }
    } while (ip < ip_limit_min_slop && op < op_limit_min_slop);
  exit:
    ip--;
    assert(ip <= ip_limit);
  }
  return {ip, op};
}

// Helper class for decompression
class SnappyDecompressor {
 private:
  Source* reader_;        // Underlying source of bytes to decompress
  const char* ip_;        // Points to next buffered byte
  const char* ip_limit_;  // Points just past buffered bytes
  // If ip < ip_limit_min_maxtaglen_ it's safe to read kMaxTagLength from
  // buffer.
  const char* ip_limit_min_maxtaglen_;
  uint32_t peeked_;                  // Bytes peeked from reader (need to skip)
  bool eof_;                         // Hit end of input without an error?
  char scratch_[kMaximumTagLength];  // See RefillTag().

  // Ensure that all of the tag metadata for the next tag is available
  // in [ip_..ip_limit_-1]. Also ensures that [ip,ip+4] is readable even
  // if (ip_limit_ - ip_ < 5).
  //
  // Returns true on success, false on error or end of input.
  bool RefillTag();

  void ResetLimit(const char* ip) {
    ip_limit_min_maxtaglen_ =
        ip_limit_ - std::min<ptrdiff_t>(ip_limit_ - ip, kMaximumTagLength - 1);
  }

 public:
  explicit SnappyDecompressor(Source* reader)
      : reader_(reader), ip_(NULL), ip_limit_(NULL), peeked_(0), eof_(false) {}

  ~SnappyDecompressor() {
    // Advance past any bytes we peeked at from the reader
    reader_->Skip(peeked_);
  }

  // Returns true iff we have hit the end of the input without an error.
  bool eof() const { return eof_; }

  // Read the uncompressed length stored at the start of the compressed data.
  // On success, stores the length in *result and returns true.
  // On failure, returns false.
  bool ReadUncompressedLength(uint32_t* result) {
    assert(ip_ == NULL);  // Must not have read anything yet
    // Length is encoded in 1..5 bytes
    *result = 0;
    uint32_t shift = 0;
    while (true) {
      if (shift >= 32) return false;
      size_t n;
      const char* ip = reader_->Peek(&n);
      if (n == 0) return false;
      const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
      reader_->Skip(1);
      uint32_t val = c & 0x7f;
      if (LeftShiftOverflows(static_cast<uint8_t>(val), shift)) return false;
      *result |= val << shift;
      if (c < 128) {
        break;
      }
      shift += 7;
    }
    return true;
  }

  // Process the next item found in the input.
  // Returns true if successful, false on error or end of input.
  template <class Writer>
#if defined(__GNUC__) && defined(__x86_64__)
  __attribute__((aligned(32)))
#endif
  void
  DecompressAllTags(Writer* writer) {
    const char* ip = ip_;
    ResetLimit(ip);
    auto op = writer->GetOutputPtr();
    // We could have put this refill fragment only at the beginning of the
    // loop. However, duplicating it at the end of each branch gives the
    // compiler more scope to optimize the <ip_limit_ - ip> expression based
    // on the local context, which overall increases speed.
#define MAYBE_REFILL()                                       \
  if (SNAPPY_PREDICT_FALSE(ip >= ip_limit_min_maxtaglen_)) { \
    ip_ = ip;                                                \
    if (SNAPPY_PREDICT_FALSE(!RefillTag())) goto exit;       \
    ip = ip_;                                                \
    ResetLimit(ip);                                          \
  }                                                          \
  preload = static_cast<uint8_t>(*ip)

    // At the start of the for loop below the least significant byte of
    // preload contains the tag.
    uint32_t preload;
    MAYBE_REFILL();
    for (;;) {
      {
        ptrdiff_t op_limit_min_slop;
        auto op_base = writer->GetBase(&op_limit_min_slop);
        if (op_base) {
          auto res =
              DecompressBranchless(reinterpret_cast<const uint8_t*>(ip),
                                   reinterpret_cast<const uint8_t*>(ip_limit_),
                                   op - op_base, op_base, op_limit_min_slop);
          ip = reinterpret_cast<const char*>(res.first);
          op = op_base + res.second;
          MAYBE_REFILL();
        }
      }
      const uint8_t c = static_cast<uint8_t>(preload);
      ip++;

      // Ratio of iterations that have LITERAL vs non-LITERAL for different
      // inputs.
      //
      // input          LITERAL NON_LITERAL
      // -----------------------------------
      // html|html4|cp   23%      77%
      // urls            36%      64%
      // jpg             47%      53%
      // pdf             19%      81%
      // txt[1-4]        25%      75%
      // pb              24%      76%
      // bin             24%      76%
      if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) {
        size_t literal_length = (c >> 2) + 1u;
        if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length, &op)) {
          assert(literal_length < 61);
          ip += literal_length;
          // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
          // will not return true unless there's already at least five spare
          // bytes in addition to the literal.
          preload = static_cast<uint8_t>(*ip);
          continue;
        }
        if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) {
          // Long literal.
          const size_t literal_length_length = literal_length - 60;
          literal_length =
              ExtractLowBytes(LittleEndian::Load32(ip),
                              (uint32_t)literal_length_length) +
              1;
          ip += literal_length_length;
        }

        size_t avail = ip_limit_ - ip;
        while (avail < literal_length) {
          if (!writer->Append(ip, avail, &op)) goto exit;
          literal_length -= avail;
          reader_->Skip(peeked_);
          size_t n;
          ip = reader_->Peek(&n);
          avail = n;
          peeked_ = (uint32_t)avail;
          if (avail == 0) goto exit;
          ip_limit_ = ip + avail;
          ResetLimit(ip);
        }
        if (!writer->Append(ip, literal_length, &op)) goto exit;
        ip += literal_length;
        MAYBE_REFILL();
      } else {
        if (SNAPPY_PREDICT_FALSE((c & 3) == COPY_4_BYTE_OFFSET)) {
          const size_t copy_offset = LittleEndian::Load32(ip);
          const size_t length = (c >> 2) + 1;
          ip += 4;

          if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;
        } else {
          const ptrdiff_t entry = table.length_minus_offset[c];
          preload = LittleEndian::Load32(ip);
          const uint32_t trailer = ExtractLowBytes(preload, c & 3);
          const uint32_t length = entry & 0xff;
          assert(length > 0);

          // copy_offset/256 is encoded in bits 8..10. By just fetching
          // those bits, we get copy_offset (since the bit-field starts at
          // bit 8).
          const uint32_t copy_offset = (uint32_t)(trailer - entry + length);
          if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;

          ip += (c & 3);
          // By using the result of the previous load we reduce the critical
          // dependency chain of ip to 4 cycles.
          preload >>= (c & 3) * 8;
          if (ip < ip_limit_min_maxtaglen_) continue;
        }
        MAYBE_REFILL();
      }
    }
#undef MAYBE_REFILL
  exit:
    writer->SetOutputPtr(op);
  }
};

constexpr uint32_t CalculateNeeded(uint8_t tag) {
  return ((tag & 3) == 0 && tag >= (60 * 4))
             ? (tag >> 2) - 58
             : (0x05030201 >> ((tag * 8) & 31)) & 0xFF;
}
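
// Illustrative compile-time checks (added for exposition, not part of the
// upstream source): a copy-2 tag needs 3 consecutive bytes (tag plus a 2-byte
// offset), and the long-literal tag 0xF0 (length field 60) needs 2 bytes (tag
// plus 1 extra length byte).
static_assert(CalculateNeeded(/*copy-2 tag*/ 0x02) == 3,
              "copy-2 tags need the tag byte plus a 2-byte offset");
static_assert(CalculateNeeded(/*long literal tag*/ 0xF0) == 2,
              "a length-field-60 literal needs 1 extra length byte");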

#if __cplusplus >= 201402L
constexpr bool VerifyCalculateNeeded() {
  for (int i = 0; i < 256; i++) {
    if (CalculateNeeded(i) != (char_table[i] >> 11) + 1) return false;
  }
  return true;
}

// Make sure CalculateNeeded is correct by verifying it against the established
// table encoding the number of added bytes needed.
static_assert(VerifyCalculateNeeded(), "");
#endif  // c++14
1373
1374 bool SnappyDecompressor::RefillTag() {
1375 const char* ip = ip_;
1376 if (ip == ip_limit_) {
1377 // Fetch a new fragment from the reader
1378 reader_->Skip(peeked_); // All peeked bytes are used up
1379 size_t n;
1380 ip = reader_->Peek(&n);
1381 peeked_ = (uint32_t)n;
1382 eof_ = (n == 0);
1383 if (eof_) return false;
1384 ip_limit_ = ip + n;
1385 }
1386
1387 // Read the tag character
1388 assert(ip < ip_limit_);
1389 const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
1390   // At this point make sure that the data for the next tag is consecutive.
1391   // For copy 1 this means the next 2 bytes (tag and 1 byte offset).
1392   // For copy 2, the next 3 bytes (tag and 2 byte offset).
1393   // For copy 4, the next 5 bytes (tag and 4 byte offset).
1394   // For all small literals we only need 1 byte, but for literals 60...63 the
1395   // length is encoded in 1...4 extra bytes.
1396 const uint32_t needed = CalculateNeeded(c);
1397 assert(needed <= sizeof(scratch_));
1398
1399 // Read more bytes from reader if needed
1400 uint32_t nbuf = (uint32_t)(ip_limit_ - ip);
1401 if (nbuf < needed) {
1402 // Stitch together bytes from ip and reader to form the word
1403 // contents. We store the needed bytes in "scratch_". They
1404 // will be consumed immediately by the caller since we do not
1405 // read more than we need.
1406 std::memmove(scratch_, ip, nbuf);
1407 reader_->Skip(peeked_); // All peeked bytes are used up
1408 peeked_ = 0;
1409 while (nbuf < needed) {
1410 size_t length;
1411 const char* src = reader_->Peek(&length);
1412 if (length == 0) return false;
1413 uint32_t to_add = std::min<uint32_t>(needed - nbuf, (uint32_t)length);
1414 std::memcpy(scratch_ + nbuf, src, to_add);
1415 nbuf += to_add;
1416 reader_->Skip(to_add);
1417 }
1418 assert(nbuf == needed);
1419 ip_ = scratch_;
1420 ip_limit_ = scratch_ + needed;
1421 } else if (nbuf < kMaximumTagLength) {
1422 // Have enough bytes, but move into scratch_ so that we do not
1423 // read past end of input
1424 std::memmove(scratch_, ip, nbuf);
1425 reader_->Skip(peeked_); // All peeked bytes are used up
1426 peeked_ = 0;
1427 ip_ = scratch_;
1428 ip_limit_ = scratch_ + nbuf;
1429 } else {
1430 // Pass pointer to buffer returned by reader_.
1431 ip_ = ip;
1432 }
1433 return true;
1434 }
1435
1436 template <typename Writer>
1437 static bool InternalUncompress(Source* r, Writer* writer) {
1438 // Read the uncompressed length from the front of the compressed input
1439 SnappyDecompressor decompressor(r);
1440 uint32_t uncompressed_len = 0;
1441 if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;
1442
1443 return InternalUncompressAllTags(&decompressor, writer, (uint32_t)r->Available(),
1444 uncompressed_len);
1445 }
1446
1447 template <typename Writer>
1448 static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
1449 Writer* writer, uint32_t compressed_len,
1450 uint32_t uncompressed_len) {
1451 Report("snappy_uncompress", compressed_len, uncompressed_len);
1452
1453 writer->SetExpectedLength(uncompressed_len);
1454
1455 // Process the entire input
1456 decompressor->DecompressAllTags(writer);
1457 writer->Flush();
1458 return (decompressor->eof() && writer->CheckLength());
1459 }
1460
1461 bool GetUncompressedLength(Source* source, uint32_t* result) {
1462 SnappyDecompressor decompressor(source);
1463 return decompressor.ReadUncompressedLength(result);
1464 }
1465
1466 size_t Compress(Source* reader, Sink* writer) {
1467 size_t written = 0;
1468 size_t N = reader->Available();
1469 const size_t uncompressed_size = N;
1470 char ulength[Varint::kMax32];
1471 char* p = Varint::Encode32(ulength, (uint32_t)N);
1472 writer->Append(ulength, p - ulength);
1473 written += (p - ulength);
1474
1475 internal::WorkingMemory wmem(N);
1476
1477 while (N > 0) {
1478 // Get next block to compress (without copying if possible)
1479 size_t fragment_size;
1480 const char* fragment = reader->Peek(&fragment_size);
1481 assert(fragment_size != 0); // premature end of input
1482 const size_t num_to_read = std::min(N, kBlockSize);
1483 size_t bytes_read = fragment_size;
1484
1485 size_t pending_advance = 0;
1486 if (bytes_read >= num_to_read) {
1487 // Buffer returned by reader is large enough
1488 pending_advance = num_to_read;
1489 fragment_size = num_to_read;
1490 } else {
1491 char* scratch = wmem.GetScratchInput();
1492 std::memcpy(scratch, fragment, bytes_read);
1493 reader->Skip(bytes_read);
1494
1495 while (bytes_read < num_to_read) {
1496 fragment = reader->Peek(&fragment_size);
1497 size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
1498 std::memcpy(scratch + bytes_read, fragment, n);
1499 bytes_read += n;
1500 reader->Skip(n);
1501 }
1502 assert(bytes_read == num_to_read);
1503 fragment = scratch;
1504 fragment_size = num_to_read;
1505 }
1506 assert(fragment_size == num_to_read);
1507
1508 // Get encoding table for compression
1509 int table_size;
1510 uint16_t* table = wmem.GetHashTable(num_to_read, &table_size);
1511
1512 // Compress input_fragment and append to dest
1513 const int max_output = (int)MaxCompressedLength(num_to_read);
1514
1515 // Need a scratch buffer for the output, in case the byte sink doesn't
1516 // have room for us directly.
1517
1518 // Since we encode kBlockSize regions followed by a region
1519 // which is <= kBlockSize in length, a previously allocated
1520 // scratch_output[] region is big enough for this iteration.
1521 char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
1522 char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
1523 table_size);
1524 writer->Append(dest, end - dest);
1525 written += (end - dest);
1526
1527 N -= num_to_read;
1528 reader->Skip(pending_advance);
1529 }
1530
1531 Report("snappy_compress", written, uncompressed_size);
1532
1533 return written;
1534 }
1535
1536 // -----------------------------------------------------------------------
1537 // IOVec interfaces
1538 // -----------------------------------------------------------------------
1539
1540 // A type that writes to an iovec.
1541 // Note that this is not a "ByteSink", but a type that matches the
1542 // Writer template argument to SnappyDecompressor::DecompressAllTags().
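// For reference, the implicit "Writer" contract that DecompressAllTags()
// relies on (no formal interface is declared; this mirrors the writers in
// this file, where Ptr is char* for real output and size_t for the
// validator):
//
//   void SetExpectedLength(size_t len);
//   bool CheckLength() const;
//   bool Append(const char* ip, size_t len, Ptr* op_p);
//   bool TryFastAppend(const char* ip, size_t available, size_t len, Ptr* op_p);
//   bool AppendFromSelf(size_t offset, size_t len, Ptr* op_p);
//   Ptr GetOutputPtr();
//   Ptr GetBase(ptrdiff_t* op_limit_min_slop);
//   void SetOutputPtr(Ptr op);
//   void Flush();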
1543 class SnappyIOVecWriter {
1544 private:
1545 // output_iov_end_ is set to iov + count and used to determine when
1546 // the end of the iovs is reached.
1547 const struct iovec* output_iov_end_;
1548
1549 #if !defined(NDEBUG)
1550 const struct iovec* output_iov_;
1551 #endif // !defined(NDEBUG)
1552
1553 // Current iov that is being written into.
1554 const struct iovec* curr_iov_;
1555
1556 // Pointer to current iov's write location.
1557 char* curr_iov_output_;
1558
1559 // Remaining bytes to write into curr_iov_output.
1560 size_t curr_iov_remaining_;
1561
1562 // Total bytes decompressed into output_iov_ so far.
1563 size_t total_written_;
1564
1565 // Maximum number of bytes that will be decompressed into output_iov_.
1566 size_t output_limit_;
1567
1568   static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
1569 return reinterpret_cast<char*>(iov->iov_base) + offset;
1570 }
1571
1572 public:
1573 // Does not take ownership of iov. iov must be valid during the
1574 // entire lifetime of the SnappyIOVecWriter.
1575   inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
1576 : output_iov_end_(iov + iov_count),
1577 #if !defined(NDEBUG)
1578 output_iov_(iov),
1579 #endif // !defined(NDEBUG)
1580 curr_iov_(iov),
1581 curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
1582 : nullptr),
1583 curr_iov_remaining_(iov_count ? iov->iov_len : 0),
1584 total_written_(0),
1585 output_limit_(-1) {
1586 }
1587
1588   inline void SetExpectedLength(size_t len) { output_limit_ = len; }
1589
1590   inline bool CheckLength() const { return total_written_ == output_limit_; }
1591
1592   inline bool Append(const char* ip, size_t len, char**) {
1593 if (total_written_ + len > output_limit_) {
1594 return false;
1595 }
1596
1597 return AppendNoCheck(ip, len);
1598 }
1599
1600   char* GetOutputPtr() { return nullptr; }
1601   char* GetBase(ptrdiff_t*) { return nullptr; }
1602   void SetOutputPtr(char* op) {
1603 // TODO: Switch to [[maybe_unused]] when we can assume C++17.
1604 (void)op;
1605 }
1606
1607   inline bool AppendNoCheck(const char* ip, size_t len) {
1608 while (len > 0) {
1609 if (curr_iov_remaining_ == 0) {
1610 // This iovec is full. Go to the next one.
1611 if (curr_iov_ + 1 >= output_iov_end_) {
1612 return false;
1613 }
1614 ++curr_iov_;
1615 curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
1616 curr_iov_remaining_ = curr_iov_->iov_len;
1617 }
1618
1619 const size_t to_write = std::min(len, curr_iov_remaining_);
1620 std::memcpy(curr_iov_output_, ip, to_write);
1621 curr_iov_output_ += to_write;
1622 curr_iov_remaining_ -= to_write;
1623 total_written_ += to_write;
1624 ip += to_write;
1625 len -= to_write;
1626 }
1627
1628 return true;
1629 }
1630
1631   inline bool TryFastAppend(const char* ip, size_t available, size_t len,
1632 char**) {
1633 const size_t space_left = output_limit_ - total_written_;
1634 if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
1635 curr_iov_remaining_ >= 16) {
1636 // Fast path, used for the majority (about 95%) of invocations.
1637 UnalignedCopy128(ip, curr_iov_output_);
1638 curr_iov_output_ += len;
1639 curr_iov_remaining_ -= len;
1640 total_written_ += len;
1641 return true;
1642 }
1643
1644 return false;
1645 }
1646
1647   inline bool AppendFromSelf(size_t offset, size_t len, char**) {
1648 // See SnappyArrayWriter::AppendFromSelf for an explanation of
1649 // the "offset - 1u" trick.
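    // In short: for offset == 0 the subtraction wraps around to SIZE_MAX, so
    // this single unsigned comparison rejects both a zero offset and an
    // offset that reaches before the start of the produced output.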
1650 if (offset - 1u >= total_written_) {
1651 return false;
1652 }
1653 const size_t space_left = output_limit_ - total_written_;
1654 if (len > space_left) {
1655 return false;
1656 }
1657
1658 // Locate the iovec from which we need to start the copy.
1659 const iovec* from_iov = curr_iov_;
1660 size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
1661 while (offset > 0) {
1662 if (from_iov_offset >= offset) {
1663 from_iov_offset -= offset;
1664 break;
1665 }
1666
1667 offset -= from_iov_offset;
1668 --from_iov;
1669 #if !defined(NDEBUG)
1670 assert(from_iov >= output_iov_);
1671 #endif // !defined(NDEBUG)
1672 from_iov_offset = from_iov->iov_len;
1673 }
1674
1675     // Copy <len> bytes starting from the iovec pointed to by from_iov to
1676     // the current iovec.
1677 while (len > 0) {
1678 assert(from_iov <= curr_iov_);
1679 if (from_iov != curr_iov_) {
1680 const size_t to_copy =
1681 std::min(from_iov->iov_len - from_iov_offset, len);
1682 AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
1683 len -= to_copy;
1684 if (len > 0) {
1685 ++from_iov;
1686 from_iov_offset = 0;
1687 }
1688 } else {
1689 size_t to_copy = curr_iov_remaining_;
1690 if (to_copy == 0) {
1691 // This iovec is full. Go to the next one.
1692 if (curr_iov_ + 1 >= output_iov_end_) {
1693 return false;
1694 }
1695 ++curr_iov_;
1696 curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
1697 curr_iov_remaining_ = curr_iov_->iov_len;
1698 continue;
1699 }
1700 if (to_copy > len) {
1701 to_copy = len;
1702 }
1703 assert(to_copy > 0);
1704
1705 IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
1706 curr_iov_output_, curr_iov_output_ + to_copy,
1707 curr_iov_output_ + curr_iov_remaining_);
1708 curr_iov_output_ += to_copy;
1709 curr_iov_remaining_ -= to_copy;
1710 from_iov_offset += to_copy;
1711 total_written_ += to_copy;
1712 len -= to_copy;
1713 }
1714 }
1715
1716 return true;
1717 }
1718
1719   inline void Flush() {}
1720 };
1721
1722 bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
1723 const struct iovec* iov, size_t iov_cnt) {
1724 ByteArraySource reader(compressed, compressed_length);
1725 return RawUncompressToIOVec(&reader, iov, iov_cnt);
1726 }
1727
1728 bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov,
1729 size_t iov_cnt) {
1730 SnappyIOVecWriter output(iov, iov_cnt);
1731 return InternalUncompress(compressed, &output);
1732 }
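// Example usage (illustrative): scatter the decompressed bytes across two
// caller-owned buffers; the iovecs together must cover the full uncompressed
// length for decompression to succeed.
//
//   char head[1024], tail[64 * 1024];
//   struct iovec iov[2] = {{head, sizeof(head)}, {tail, sizeof(tail)}};
//   bool ok = snappy::RawUncompressToIOVec(compressed, compressed_length,
//                                          iov, 2);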
1733
1734 // -----------------------------------------------------------------------
1735 // Flat array interfaces
1736 // -----------------------------------------------------------------------
1737
1738 // A type that writes to a flat array.
1739 // Note that this is not a "ByteSink", but a type that matches the
1740 // Writer template argument to SnappyDecompressor::DecompressAllTags().
1741 class SnappyArrayWriter {
1742 private:
1743 char* base_;
1744 char* op_;
1745 char* op_limit_;
1746 // If op < op_limit_min_slop_ then it's safe to unconditionally write
1747 // kSlopBytes starting at op.
1748 char* op_limit_min_slop_;
1749
1750 public:
1751   inline explicit SnappyArrayWriter(char* dst)
1752 : base_(dst),
1753 op_(dst),
1754 op_limit_(dst),
1755 op_limit_min_slop_(dst) {} // Safe default see invariant.
1756
1757   inline void SetExpectedLength(size_t len) {
1758 op_limit_ = op_ + len;
1759 // Prevent pointer from being past the buffer.
1760 op_limit_min_slop_ = op_limit_ - std::min<size_t>(kSlopBytes - 1, len);
1761 }
1762
1763   inline bool CheckLength() const { return op_ == op_limit_; }
1764
1765   char* GetOutputPtr() { return op_; }
1766   char* GetBase(ptrdiff_t* op_limit_min_slop) {
1767 *op_limit_min_slop = op_limit_min_slop_ - base_;
1768 return base_;
1769 }
1770   void SetOutputPtr(char* op) { op_ = op; }
1771
1772   inline bool Append(const char* ip, size_t len, char** op_p) {
1773 char* op = *op_p;
1774 const size_t space_left = op_limit_ - op;
1775 if (space_left < len) return false;
1776 std::memcpy(op, ip, len);
1777 *op_p = op + len;
1778 return true;
1779 }
1780
1781   inline bool TryFastAppend(const char* ip, size_t available, size_t len,
1782 char** op_p) {
1783 char* op = *op_p;
1784 const size_t space_left = op_limit_ - op;
1785 if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
1786 // Fast path, used for the majority (about 95%) of invocations.
1787 UnalignedCopy128(ip, op);
1788 *op_p = op + len;
1789 return true;
1790 } else {
1791 return false;
1792 }
1793 }
1794
1795 SNAPPY_ATTRIBUTE_ALWAYS_INLINE
1796   inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) {
1797 assert(len > 0);
1798 char* const op = *op_p;
1799 assert(op >= base_);
1800 char* const op_end = op + len;
1801
1802 // Check if we try to append from before the start of the buffer.
1803 if (SNAPPY_PREDICT_FALSE(static_cast<size_t>(op - base_) < offset))
1804 return false;
1805
1806 if (SNAPPY_PREDICT_FALSE((kSlopBytes < 64 && len > kSlopBytes) ||
1807 op >= op_limit_min_slop_ || offset < len)) {
1808 if (op_end > op_limit_ || offset == 0) return false;
1809 *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);
1810 return true;
1811 }
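    // Fast path: op < op_limit_min_slop_ guarantees room for kSlopBytes past
    // op, tag-encoded copies are at most kSlopBytes long, and offset >= len,
    // so one fixed-size memmove covers the copy; bytes written beyond len
    // stay inside the buffer and are overwritten by later appends.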
1812 std::memmove(op, op - offset, kSlopBytes);
1813 *op_p = op_end;
1814 return true;
1815 }
1816   inline size_t Produced() const {
1817     assert(op_ >= base_);
1818     return op_ - base_;
1819   }
1820   inline void Flush() {}
1821 };
1822
1823 bool RawUncompress(const char* compressed, size_t compressed_length,
1824 char* uncompressed) {
1825 ByteArraySource reader(compressed, compressed_length);
1826 return RawUncompress(&reader, uncompressed);
1827 }
1828
1829 bool RawUncompress(Source* compressed, char* uncompressed) {
1830   SnappyArrayWriter output(uncompressed);
1831   return InternalUncompress(compressed, &output);
1832 }
1833
1834 bool Uncompress(const char* compressed, size_t compressed_length,
1835 std::string* uncompressed) {
1836 size_t ulength;
1837 if (!GetUncompressedLength(compressed, compressed_length, &ulength)) {
1838 return false;
1839 }
1840 // On 32-bit builds: max_size() < kuint32max. Check for that instead
1841 // of crashing (e.g., consider externally specified compressed data).
1842 if (ulength > uncompressed->max_size()) {
1843 return false;
1844 }
1845 STLStringResizeUninitialized(uncompressed, ulength);
1846 return RawUncompress(compressed, compressed_length,
1847 string_as_array(uncompressed));
1848 }
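// Typical round trip through the std::string overloads (illustrative only,
// for some std::string input):
//
//   std::string compressed, restored;
//   snappy::Compress(input.data(), input.size(), &compressed);
//   if (!snappy::Uncompress(compressed.data(), compressed.size(), &restored)) {
//     // Corrupt or truncated input.
//   }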
1849
1850 // A Writer that drops everything on the floor and just does validation
1851 class SnappyDecompressionValidator {
1852 private:
1853 size_t expected_;
1854 size_t produced_;
1855
1856 public:
1857   inline SnappyDecompressionValidator() : expected_(0), produced_(0) {}
1858   inline void SetExpectedLength(size_t len) { expected_ = len; }
1859   size_t GetOutputPtr() { return produced_; }
1860   size_t GetBase(ptrdiff_t* op_limit_min_slop) {
1861     *op_limit_min_slop = std::numeric_limits<ptrdiff_t>::max() - kSlopBytes + 1;
1862     return 1;
1863   }
1864   void SetOutputPtr(size_t op) { produced_ = op; }
1865   inline bool CheckLength() const { return expected_ == produced_; }
1866   inline bool Append(const char* ip, size_t len, size_t* produced) {
1867 // TODO: Switch to [[maybe_unused]] when we can assume C++17.
1868 (void)ip;
1869
1870 *produced += len;
1871 return *produced <= expected_;
1872 }
1873   inline bool TryFastAppend(const char* ip, size_t available, size_t length,
1874 size_t* produced) {
1875 // TODO: Switch to [[maybe_unused]] when we can assume C++17.
1876 (void)ip;
1877 (void)available;
1878 (void)length;
1879 (void)produced;
1880
1881 return false;
1882 }
1883   inline bool AppendFromSelf(size_t offset, size_t len, size_t* produced) {
1884 // See SnappyArrayWriter::AppendFromSelf for an explanation of
1885 // the "offset - 1u" trick.
1886 if (*produced <= offset - 1u) return false;
1887 *produced += len;
1888 return *produced <= expected_;
1889 }
1890   inline void Flush() {}
1891 };
1892
1893 bool IsValidCompressedBuffer(const char* compressed, size_t compressed_length) {
1894   ByteArraySource reader(compressed, compressed_length);
1895   SnappyDecompressionValidator writer;
1896   return InternalUncompress(&reader, &writer);
1897 }
1898
1899 bool IsValidCompressed(Source* compressed) {
1900 SnappyDecompressionValidator writer;
1901 return InternalUncompress(compressed, &writer);
1902 }
1903
1904 void RawCompress(const char* input, size_t input_length, char* compressed,
1905 size_t* compressed_length) {
1906 ByteArraySource reader(input, input_length);
1907 UncheckedByteArraySink writer(compressed);
1908 Compress(&reader, &writer);
1909
1910 // Compute how many bytes were added
1911 *compressed_length = (writer.CurrentDestination() - compressed);
1912 }
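// Note: "compressed" must point to at least MaxCompressedLength(input_length)
// writable bytes; the std::string overload below sizes its buffer that way
// before calling RawCompress().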
1913
1914 size_t Compress(const char* input, size_t input_length,
1915 std::string* compressed) {
1916 // Pre-grow the buffer to the max length of the compressed output
1917 STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
1918
1919 size_t compressed_length;
1920 RawCompress(input, input_length, string_as_array(compressed),
1921 &compressed_length);
1922 compressed->resize(compressed_length);
1923 return compressed_length;
1924 }
1925
1926 // -----------------------------------------------------------------------
1927 // Sink interface
1928 // -----------------------------------------------------------------------
1929
1930 // A type that decompresses into a Sink. The template parameter Allocator
1931 // must export "char* Allocate(int size);" to allocate output blocks, and
1932 // "void Flush(size_t size);" to hand the generated blocks to the destination.
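//
// A minimal Allocator sketch (illustrative only; SnappySinkAllocator further
// below is the real implementation used for the Sink-based API):
//
//   class VectorAllocator {
//    public:
//     char* Allocate(int size) {
//       blocks_.emplace_back(size);
//       return blocks_.back().data();
//     }
//     void Flush(size_t /*total_size*/) {}  // Nothing to hand off.
//    private:
//     std::vector<std::vector<char>> blocks_;  // Keeps blocks alive.
//   };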
1933 template <typename Allocator>
1934 class SnappyScatteredWriter {
1935 Allocator allocator_;
1936
1937 // We need random access into the data generated so far. Therefore
1938 // we keep track of all of the generated data as an array of blocks.
1939 // All of the blocks except the last have length kBlockSize.
1940 std::vector<char*> blocks_;
1941 size_t expected_;
1942
1943 // Total size of all fully generated blocks so far
1944 size_t full_size_;
1945
1946 // Pointer into current output block
1947 char* op_base_; // Base of output block
1948 char* op_ptr_; // Pointer to next unfilled byte in block
1949 char* op_limit_; // Pointer just past block
1950 // If op < op_limit_min_slop_ then it's safe to unconditionally write
1951 // kSlopBytes starting at op.
1952 char* op_limit_min_slop_;
1953
1954   inline size_t Size() const { return full_size_ + (op_ptr_ - op_base_); }
1955
1956 bool SlowAppend(const char* ip, size_t len);
1957 bool SlowAppendFromSelf(size_t offset, size_t len);
1958
1959 public:
1960   inline explicit SnappyScatteredWriter(const Allocator& allocator)
1961 : allocator_(allocator),
1962 full_size_(0),
1963 op_base_(NULL),
1964 op_ptr_(NULL),
1965 op_limit_(NULL),
1966 op_limit_min_slop_(NULL) {}
1967   char* GetOutputPtr() { return op_ptr_; }
1968   char* GetBase(ptrdiff_t* op_limit_min_slop) {
1969 *op_limit_min_slop = op_limit_min_slop_ - op_base_;
1970 return op_base_;
1971 }
1972   void SetOutputPtr(char* op) { op_ptr_ = op; }
1973
1974   inline void SetExpectedLength(size_t len) {
1975 assert(blocks_.empty());
1976 expected_ = len;
1977 }
1978
1979   inline bool CheckLength() const { return Size() == expected_; }
1980
1981   // Return the number of bytes actually uncompressed so far
1982   inline size_t Produced() const { return Size(); }
1983
1984   inline bool Append(const char* ip, size_t len, char** op_p) {
1985 char* op = *op_p;
1986 size_t avail = op_limit_ - op;
1987 if (len <= avail) {
1988 // Fast path
1989 std::memcpy(op, ip, len);
1990 *op_p = op + len;
1991 return true;
1992 } else {
1993 op_ptr_ = op;
1994 bool res = SlowAppend(ip, len);
1995 *op_p = op_ptr_;
1996 return res;
1997 }
1998 }
1999
2000   inline bool TryFastAppend(const char* ip, size_t available, size_t length,
2001 char** op_p) {
2002 char* op = *op_p;
2003 const int space_left = (int)(op_limit_ - op);
2004 if (length <= 16 && available >= 16 + kMaximumTagLength &&
2005 space_left >= 16) {
2006 // Fast path, used for the majority (about 95%) of invocations.
2007 UnalignedCopy128(ip, op);
2008 *op_p = op + length;
2009 return true;
2010 } else {
2011 return false;
2012 }
2013 }
2014
2015   inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) {
2016 char* op = *op_p;
2017 assert(op >= op_base_);
2018 // Check if we try to append from before the start of the buffer.
2019 if (SNAPPY_PREDICT_FALSE((kSlopBytes < 64 && len > kSlopBytes) ||
2020 static_cast<size_t>(op - op_base_) < offset ||
2021 op >= op_limit_min_slop_ || offset < len)) {
2022 if (offset == 0) return false;
2023 if (SNAPPY_PREDICT_FALSE(static_cast<size_t>(op - op_base_) < offset ||
2024 op + len > op_limit_)) {
2025 op_ptr_ = op;
2026 bool res = SlowAppendFromSelf(offset, len);
2027 *op_p = op_ptr_;
2028 return res;
2029 }
2030 *op_p = IncrementalCopy(op - offset, op, op + len, op_limit_);
2031 return true;
2032 }
2033 // Fast path
2034 char* const op_end = op + len;
2035 std::memmove(op, op - offset, kSlopBytes);
2036 *op_p = op_end;
2037 return true;
2038 }
2039
2040   // Called at the end of the decompression. We ask the allocator to
2041   // write all blocks to the sink.
2042   inline void Flush() { allocator_.Flush(Produced()); }
2043 };
2044
2045 template <typename Allocator>
2046 bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) {
2047 size_t avail = op_limit_ - op_ptr_;
2048 while (len > avail) {
2049 // Completely fill this block
2050 std::memcpy(op_ptr_, ip, avail);
2051 op_ptr_ += avail;
2052 assert(op_limit_ - op_ptr_ == 0);
2053 full_size_ += (op_ptr_ - op_base_);
2054 len -= avail;
2055 ip += avail;
2056
2057 // Bounds check
2058 if (full_size_ + len > expected_) return false;
2059
2060 // Make new block
2061 size_t bsize = std::min<size_t>(kBlockSize, expected_ - full_size_);
2062 op_base_ = allocator_.Allocate((int)bsize);
2063 op_ptr_ = op_base_;
2064 op_limit_ = op_base_ + bsize;
2065 op_limit_min_slop_ = op_limit_ - std::min<size_t>(kSlopBytes - 1, bsize);
2066
2067 blocks_.push_back(op_base_);
2068 avail = bsize;
2069 }
2070
2071 std::memcpy(op_ptr_, ip, len);
2072 op_ptr_ += len;
2073 return true;
2074 }
2075
2076 template <typename Allocator>
2077 bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset,
2078 size_t len) {
2079 // Overflow check
2080 // See SnappyArrayWriter::AppendFromSelf for an explanation of
2081 // the "offset - 1u" trick.
2082 const size_t cur = Size();
2083 if (offset - 1u >= cur) return false;
2084 if (expected_ - cur < len) return false;
2085
2086 // Currently we shouldn't ever hit this path because Compress() chops the
2087 // input into blocks and does not create cross-block copies. However, it is
2088 // nice if we do not rely on that, since we can get better compression if we
2089 // allow cross-block copies and thus might want to change the compressor in
2090 // the future.
2091   // TODO: Replace this with a properly optimized path. This is not
2092   // triggered right now, but it is so slow that it would regress
2093   // performance unacceptably if it were.
2094 size_t src = cur - offset;
2095 char* op = op_ptr_;
2096 while (len-- > 0) {
2097 char c = blocks_[src >> kBlockLog][src & (kBlockSize - 1)];
2098 if (!Append(&c, 1, &op)) {
2099 op_ptr_ = op;
2100 return false;
2101 }
2102 src++;
2103 }
2104 op_ptr_ = op;
2105 return true;
2106 }
2107
2108 class SnappySinkAllocator {
2109 public:
2110   explicit SnappySinkAllocator(Sink* dest) : dest_(dest) {}
2111   ~SnappySinkAllocator() {}
2112
2113   char* Allocate(int size) {
2114 Datablock block(new char[size], size);
2115 blocks_.push_back(block);
2116 return block.data;
2117 }
2118
2119 // We flush only at the end, because the writer wants
2120 // random access to the blocks and once we hand the
2121 // block over to the sink, we can't access it anymore.
2122   // Also, we never hand over more than has actually been written
2123   // to the blocks.
2124   void Flush(size_t size) {
2125 size_t size_written = 0;
2126 for (Datablock& block : blocks_) {
2127 size_t block_size = std::min<size_t>(block.size, size - size_written);
2128 dest_->AppendAndTakeOwnership(block.data, block_size,
2129 &SnappySinkAllocator::Deleter, NULL);
2130 size_written += block_size;
2131 }
2132 blocks_.clear();
2133 }
2134
2135 private:
2136 struct Datablock {
2137 char* data;
2138 size_t size;
2139     Datablock(char* p, size_t s) : data(p), size(s) {}
2140   };
2141
2142   static void Deleter(void* arg, const char* bytes, size_t size) {
2143 // TODO: Switch to [[maybe_unused]] when we can assume C++17.
2144 (void)arg;
2145 (void)size;
2146
2147 delete[] bytes;
2148 }
2149
2150 Sink* dest_;
2151 std::vector<Datablock> blocks_;
2152
2153 // Note: copying this object is allowed
2154 };
2155
2156 size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed) {
2157 SnappySinkAllocator allocator(uncompressed);
2158 SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
2159 InternalUncompress(compressed, &writer);
2160 return writer.Produced();
2161 }
2162
2163 bool Uncompress(Source* compressed, Sink* uncompressed) {
2164 // Read the uncompressed length from the front of the compressed input
2165 SnappyDecompressor decompressor(compressed);
2166 uint32_t uncompressed_len = 0;
2167 if (!decompressor.ReadUncompressedLength(&uncompressed_len)) {
2168 return false;
2169 }
2170
2171 char c;
2172 size_t allocated_size;
2173 char* buf = uncompressed->GetAppendBufferVariable(1, uncompressed_len, &c, 1,
2174 &allocated_size);
2175
2176 const size_t compressed_len = compressed->Available();
2177 // If we can get a flat buffer, then use it, otherwise do block by block
2178 // uncompression
2179 if (allocated_size >= uncompressed_len) {
2180 SnappyArrayWriter writer(buf);
2181 bool result = InternalUncompressAllTags(&decompressor, &writer,
2182 (uint32_t)compressed_len, uncompressed_len);
2183 uncompressed->Append(buf, writer.Produced());
2184 return result;
2185 } else {
2186 SnappySinkAllocator allocator(uncompressed);
2187 SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
2188 return InternalUncompressAllTags(&decompressor, &writer, (uint32_t)compressed_len,
2189 uncompressed_len);
2190 }
2191 }
2192
2193 } // namespace snappy
2194