1 // Copyright 2005 Google Inc. All Rights Reserved.
2 //
3 // Redistribution and use in source and binary forms, with or without
4 // modification, are permitted provided that the following conditions are
5 // met:
6 //
7 // * Redistributions of source code must retain the above copyright
8 // notice, this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above
10 // copyright notice, this list of conditions and the following disclaimer
11 // in the documentation and/or other materials provided with the
12 // distribution.
13 // * Neither the name of Google Inc. nor the names of its
14 // contributors may be used to endorse or promote products derived from
15 // this software without specific prior written permission.
16 //
17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29 #include "snappy.h"
30 #include "snappy-internal.h"
31 #include "snappy-sinksource.h"
32
33 #ifndef SNAPPY_HAVE_SSE2
34 #if defined(__SSE2__) || defined(_M_X64) || \
35 (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
36 #define SNAPPY_HAVE_SSE2 1
37 #else
38 #define SNAPPY_HAVE_SSE2 0
39 #endif
40 #endif
41
42 #if SNAPPY_HAVE_SSE2
43 #include <emmintrin.h>
44 #endif
45 #include <stdio.h>
46
47 #include <algorithm>
48 #include <string>
49 #include <vector>
50
51
52 namespace snappy {
53
54 using internal::COPY_1_BYTE_OFFSET;
55 using internal::COPY_2_BYTE_OFFSET;
56 using internal::LITERAL;
57 using internal::char_table;
58 using internal::kMaximumTagLength;
59
60 // Any hash function will produce a valid compressed bitstream, but a good
61 // hash function reduces the number of collisions and thus yields better
62 // compression for compressible input, and more speed for incompressible
63 // input. Of course, it doesn't hurt if the hash function is reasonably fast
64 // either, as it gets called a lot.
static inline uint32 HashBytes(uint32 bytes, int shift) {
66 uint32 kMul = 0x1e35a7bd;
67 return (bytes * kMul) >> shift;
68 }
static inline uint32 Hash(const char* p, int shift) {
70 return HashBytes(UNALIGNED_LOAD32(p), shift);
71 }
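// For example, a 16384-entry hash table makes CompressFragment() call these
// with shift == 32 - 14 == 18, so the hash is the top 14 bits of bytes * kMul,
// i.e. an index in [0, 16383].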
72
size_t MaxCompressedLength(size_t source_len) {
74 // Compressed data can be defined as:
75 // compressed := item* literal*
76 // item := literal* copy
77 //
78 // The trailing literal sequence has a space blowup of at most 62/60
79 // since a literal of length 60 needs one tag byte + one extra byte
80 // for length information.
81 //
82 // Item blowup is trickier to measure. Suppose the "copy" op copies
83 // 4 bytes of data. Because of a special check in the encoding code,
84 // we produce a 4-byte copy only if the offset is < 65536. Therefore
85 // the copy op takes 3 bytes to encode, and this type of item leads
86 // to at most the 62/60 blowup for representing literals.
87 //
88 // Suppose the "copy" op copies 5 bytes of data. If the offset is big
89 // enough, it will take 5 bytes to encode the copy op. Therefore the
90 // worst case here is a one-byte literal followed by a five-byte copy.
91 // I.e., 6 bytes of input turn into 7 bytes of "compressed" data.
92 //
93 // This last factor dominates the blowup, so the final estimate is:
94 return 32 + source_len + source_len/6;
95 }
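// For example, MaxCompressedLength(1 << 20) == 32 + 1048576 + 174762
// == 1223370, i.e. a worst-case blowup of roughly 17%.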
96
97 namespace {
98
void UnalignedCopy64(const void* src, void* dst) {
100 char tmp[8];
101 memcpy(tmp, src, 8);
102 memcpy(dst, tmp, 8);
103 }
104
void UnalignedCopy128(const void* src, void* dst) {
106 // TODO(alkis): Remove this when we upgrade to a recent compiler that emits
107 // SSE2 moves for memcpy(dst, src, 16).
108 #if SNAPPY_HAVE_SSE2
109 __m128i x = _mm_loadu_si128(static_cast<const __m128i*>(src));
110 _mm_storeu_si128(static_cast<__m128i*>(dst), x);
111 #else
112 char tmp[16];
113 memcpy(tmp, src, 16);
114 memcpy(dst, tmp, 16);
115 #endif
116 }
117
// Copy [src, src+(op_limit-op)) to [op, op_limit) a byte at a time. Used
119 // for handling COPY operations where the input and output regions may overlap.
120 // For example, suppose:
121 // src == "ab"
122 // op == src + 2
123 // op_limit == op + 20
124 // After IncrementalCopySlow(src, op, op_limit), the result will have eleven
125 // copies of "ab"
126 // ababababababababababab
127 // Note that this does not match the semantics of either memcpy() or memmove().
inline char* IncrementalCopySlow(const char* src, char* op,
                                 char* const op_limit) {
130 while (op < op_limit) {
131 *op++ = *src++;
132 }
133 return op_limit;
134 }
135
// Copy [src, src+(op_limit-op)) to [op, op_limit) but faster than
137 // IncrementalCopySlow. buf_limit is the address past the end of the writable
138 // region of the buffer.
inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
                             char* const buf_limit) {
141 // Terminology:
142 //
143 // slop = buf_limit - op
144 // pat = op - src
145 // len = limit - op
146 assert(src < op);
147 assert(op_limit <= buf_limit);
  // NOTE: The compressor always emits 4 <= len <= 64. It is okay to assume
  // that to optimize this function, but we also have to handle the case where
  // the input does not satisfy these conditions.
151
152 size_t pattern_size = op - src;
153 // The cases are split into different branches to allow the branch predictor,
154 // FDO, and static prediction hints to work better. For each input we list the
155 // ratio of invocations that match each condition.
156 //
157 // input slop < 16 pat < 8 len > 16
158 // ------------------------------------------
159 // html|html4|cp 0% 1.01% 27.73%
160 // urls 0% 0.88% 14.79%
161 // jpg 0% 64.29% 7.14%
162 // pdf 0% 2.56% 58.06%
163 // txt[1-4] 0% 0.23% 0.97%
164 // pb 0% 0.96% 13.88%
165 // bin 0.01% 22.27% 41.17%
166 //
167 // It is very rare that we don't have enough slop for doing block copies. It
168 // is also rare that we need to expand a pattern. Small patterns are common
169 // for incompressible formats and for those we are plenty fast already.
170 // Lengths are normally not greater than 16 but they vary depending on the
171 // input. In general if we always predict len <= 16 it would be an ok
172 // prediction.
173 //
174 // In order to be fast we want a pattern >= 8 bytes and an unrolled loop
175 // copying 2x 8 bytes at a time.
176
177 // Handle the uncommon case where pattern is less than 8 bytes.
178 if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
    // Expand pattern to at least 8 bytes. The worst-case scenario in terms of
180 // buffer usage is when the pattern is size 3. ^ is the original position
181 // of op. x are irrelevant bytes copied by the last UnalignedCopy64.
182 //
183 // abc
184 // abcabcxxxxx
185 // abcabcabcabcxxxxx
186 // ^
187 // The last x is 14 bytes after ^.
188 if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 14)) {
189 while (pattern_size < 8) {
190 UnalignedCopy64(src, op);
191 op += pattern_size;
192 pattern_size *= 2;
193 }
194 if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
195 } else {
196 return IncrementalCopySlow(src, op, op_limit);
197 }
198 }
199 assert(pattern_size >= 8);
200
201 // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single
202 // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
203 // because expanding the pattern to at least 8 bytes guarantees that
204 // op - src >= 8.
205 while (op <= buf_limit - 16) {
206 UnalignedCopy64(src, op);
207 UnalignedCopy64(src + 8, op + 8);
208 src += 16;
209 op += 16;
210 if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
211 }
212 // We only take this branch if we didn't have enough slop and we can do a
213 // single 8 byte copy.
214 if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
215 UnalignedCopy64(src, op);
216 src += 8;
217 op += 8;
218 }
219 return IncrementalCopySlow(src, op, op_limit);
220 }
221
222 } // namespace
223
static inline char* EmitLiteral(char* op,
                                const char* literal,
                                int len,
                                bool allow_fast_path) {
228 // The vast majority of copies are below 16 bytes, for which a
229 // call to memcpy is overkill. This fast path can sometimes
230 // copy up to 15 bytes too much, but that is okay in the
231 // main loop, since we have a bit to go on for both sides:
232 //
233 // - The input will always have kInputMarginBytes = 15 extra
234 // available bytes, as long as we're in the main loop, and
235 // if not, allow_fast_path = false.
236 // - The output will always have 32 spare bytes (see
237 // MaxCompressedLength).
238 assert(len > 0); // Zero-length literals are disallowed
239 int n = len - 1;
240 if (allow_fast_path && len <= 16) {
241 // Fits in tag byte
242 *op++ = LITERAL | (n << 2);
243
244 UnalignedCopy128(literal, op);
245 return op + len;
246 }
247
248 if (n < 60) {
249 // Fits in tag byte
250 *op++ = LITERAL | (n << 2);
251 } else {
252 // Encode in upcoming bytes
253 char* base = op;
254 int count = 0;
255 op++;
256 while (n > 0) {
257 *op++ = n & 0xff;
258 n >>= 8;
259 count++;
260 }
261 assert(count >= 1);
262 assert(count <= 4);
263 *base = LITERAL | ((59+count) << 2);
264 }
265 memcpy(op, literal, len);
266 return op + len;
267 }
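// For example, EmitLiteral() encodes a 5-byte literal as the tag byte
// LITERAL | (4 << 2) == 0x10 followed by the five literal bytes, and a
// 100-byte literal as the tag byte LITERAL | (60 << 2) == 0xf0, one length
// byte (99), and then the literal bytes.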
268
static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
                                     bool len_less_than_12) {
271 assert(len <= 64);
272 assert(len >= 4);
273 assert(offset < 65536);
274 assert(len_less_than_12 == (len < 12));
275
276 if (len_less_than_12 && SNAPPY_PREDICT_TRUE(offset < 2048)) {
277 // offset fits in 11 bits. The 3 highest go in the top of the first byte,
278 // and the rest go in the second byte.
279 *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0);
280 *op++ = offset & 0xff;
281 } else {
282 // Write 4 bytes, though we only care about 3 of them. The output buffer
283 // is required to have some slack, so the extra byte won't overrun it.
284 uint32 u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8);
285 LittleEndian::Store32(op, u);
286 op += 3;
287 }
288 return op;
289 }
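// For example, EmitCopyAtMost64() encodes a 7-byte copy at offset 100 as the
// two bytes 0x0d 0x64 (1-byte-offset form), while the same copy at offset
// 3000 takes the 2-byte-offset form 0x1a 0xb8 0x0b.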
290
static inline char* EmitCopy(char* op, size_t offset, size_t len,
                             bool len_less_than_12) {
293 assert(len_less_than_12 == (len < 12));
294 if (len_less_than_12) {
295 return EmitCopyAtMost64(op, offset, len, true);
296 } else {
297 // A special case for len <= 64 might help, but so far measurements suggest
298 // it's in the noise.
299
300 // Emit 64 byte copies but make sure to keep at least four bytes reserved.
301 while (SNAPPY_PREDICT_FALSE(len >= 68)) {
302 op = EmitCopyAtMost64(op, offset, 64, false);
303 len -= 64;
304 }
305
306 // One or two copies will now finish the job.
307 if (len > 64) {
308 op = EmitCopyAtMost64(op, offset, 60, false);
309 len -= 60;
310 }
311
312 // Emit remainder.
313 op = EmitCopyAtMost64(op, offset, len, len < 12);
314 return op;
315 }
316 }
317
bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
319 uint32 v = 0;
320 const char* limit = start + n;
321 if (Varint::Parse32WithLimit(start, limit, &v) != NULL) {
322 *result = v;
323 return true;
324 } else {
325 return false;
326 }
327 }
328
329 namespace internal {
uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
331 // Use smaller hash table when input.size() is smaller, since we
332 // fill the table, incurring O(hash table size) overhead for
333 // compression, and if the input is short, we won't need that
334 // many hash table entries anyway.
335 assert(kMaxHashTableSize >= 256);
336 size_t htsize = 256;
337 while (htsize < kMaxHashTableSize && htsize < input_size) {
338 htsize <<= 1;
339 }
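  // For example, a 3000-byte input gets a 4096-entry table: htsize doubles
  // from 256 until it reaches 4096 >= 3000 (or hits kMaxHashTableSize).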
340
341 uint16* table;
342 if (htsize <= ARRAYSIZE(small_table_)) {
343 table = small_table_;
344 } else {
345 if (large_table_ == NULL) {
346 large_table_ = new uint16[kMaxHashTableSize];
347 }
348 table = large_table_;
349 }
350
351 *table_size = htsize;
352 memset(table, 0, htsize * sizeof(*table));
353 return table;
354 }
355 } // end namespace internal
356
357 // For 0 <= offset <= 4, GetUint32AtOffset(GetEightBytesAt(p), offset) will
358 // equal UNALIGNED_LOAD32(p + offset). Motivation: On x86-64 hardware we have
359 // empirically found that overlapping loads such as
360 // UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
361 // are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32.
362 //
363 // We have different versions for 64- and 32-bit; ideally we would avoid the
364 // two functions and just inline the UNALIGNED_LOAD64 call into
365 // GetUint32AtOffset, but GCC (at least not as of 4.6) is seemingly not clever
366 // enough to avoid loading the value multiple times then. For 64-bit, the load
367 // is done when GetEightBytesAt() is called, whereas for 32-bit, the load is
368 // done at GetUint32AtOffset() time.
369
370 #ifdef ARCH_K8
371
372 typedef uint64 EightBytesReference;
373
static inline EightBytesReference GetEightBytesAt(const char* ptr) {
375 return UNALIGNED_LOAD64(ptr);
376 }
377
static inline uint32 GetUint32AtOffset(uint64 v, int offset) {
379 assert(offset >= 0);
380 assert(offset <= 4);
381 return v >> (LittleEndian::IsLittleEndian() ? 8 * offset : 32 - 8 * offset);
382 }
383
384 #else
385
386 typedef const char* EightBytesReference;
387
static inline EightBytesReference GetEightBytesAt(const char* ptr) {
389 return ptr;
390 }
391
static inline uint32 GetUint32AtOffset(const char* v, int offset) {
393 assert(offset >= 0);
394 assert(offset <= 4);
395 return UNALIGNED_LOAD32(v + offset);
396 }
397
398 #endif
399
400 // Flat array compression that does not emit the "uncompressed length"
401 // prefix. Compresses "input" string to the "*op" buffer.
402 //
403 // REQUIRES: "input" is at most "kBlockSize" bytes long.
404 // REQUIRES: "op" points to an array of memory that is at least
405 // "MaxCompressedLength(input.size())" in size.
406 // REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
407 // REQUIRES: "table_size" is a power of two
408 //
409 // Returns an "end" pointer into "op" buffer.
410 // "end - op" is the compressed size of "input".
411 namespace internal {
char* CompressFragment(const char* input,
                       size_t input_size,
                       char* op,
                       uint16* table,
                       const int table_size) {
417 // "ip" is the input pointer, and "op" is the output pointer.
418 const char* ip = input;
419 assert(input_size <= kBlockSize);
420 assert((table_size & (table_size - 1)) == 0); // table must be power of two
421 const int shift = 32 - Bits::Log2Floor(table_size);
422 assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
423 const char* ip_end = input + input_size;
424 const char* base_ip = ip;
425 // Bytes in [next_emit, ip) will be emitted as literal bytes. Or
426 // [next_emit, ip_end) after the main loop.
427 const char* next_emit = ip;
428
429 const size_t kInputMarginBytes = 15;
430 if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
431 const char* ip_limit = input + input_size - kInputMarginBytes;
432
433 for (uint32 next_hash = Hash(++ip, shift); ; ) {
434 assert(next_emit < ip);
435 // The body of this loop calls EmitLiteral once and then EmitCopy one or
436 // more times. (The exception is that when we're close to exhausting
437 // the input we goto emit_remainder.)
438 //
439 // In the first iteration of this loop we're just starting, so
440 // there's nothing to copy, so calling EmitLiteral once is
441 // necessary. And we only start a new iteration when the
442 // current iteration has determined that a call to EmitLiteral will
443 // precede the next call to EmitCopy (if any).
444 //
445 // Step 1: Scan forward in the input looking for a 4-byte-long match.
446 // If we get close to exhausting the input then goto emit_remainder.
447 //
448 // Heuristic match skipping: If 32 bytes are scanned with no matches
449 // found, start looking only at every other byte. If 32 more bytes are
      // scanned (or skipped), look at every third byte, etc. When a match is
451 // found, immediately go back to looking at every byte. This is a small
452 // loss (~5% performance, ~0.1% density) for compressible data due to more
453 // bookkeeping, but for non-compressible data (such as JPEG) it's a huge
454 // win since the compressor quickly "realizes" the data is incompressible
455 // and doesn't bother looking for matches everywhere.
456 //
457 // The "skip" variable keeps track of how many bytes there are since the
      // last match; dividing it by 32 (i.e. right-shifting by five) gives the
459 // number of bytes to move ahead for each iteration.
460 uint32 skip = 32;
461
462 const char* next_ip = ip;
463 const char* candidate;
464 do {
465 ip = next_ip;
466 uint32 hash = next_hash;
467 assert(hash == Hash(ip, shift));
468 uint32 bytes_between_hash_lookups = skip >> 5;
469 skip += bytes_between_hash_lookups;
470 next_ip = ip + bytes_between_hash_lookups;
471 if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
472 goto emit_remainder;
473 }
474 next_hash = Hash(next_ip, shift);
475 candidate = base_ip + table[hash];
476 assert(candidate >= base_ip);
477 assert(candidate < ip);
478
479 table[hash] = ip - base_ip;
480 } while (SNAPPY_PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
481 UNALIGNED_LOAD32(candidate)));
482
483 // Step 2: A 4-byte match has been found. We'll later see if more
484 // than 4 bytes match. But, prior to the match, input
485 // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
486 assert(next_emit + 16 <= ip_end);
487 op = EmitLiteral(op, next_emit, ip - next_emit, true);
488
489 // Step 3: Call EmitCopy, and then see if another EmitCopy could
490 // be our next move. Repeat until we find no match for the
491 // input immediately after what was consumed by the last EmitCopy call.
492 //
493 // If we exit this loop normally then we need to call EmitLiteral next,
494 // though we don't yet know how big the literal will be. We handle that
495 // by proceeding to the next iteration of the main loop. We also can exit
496 // this loop via goto if we get close to exhausting the input.
497 EightBytesReference input_bytes;
498 uint32 candidate_bytes = 0;
499
500 do {
501 // We have a 4-byte match at ip, and no need to emit any
502 // "literal bytes" prior to ip.
503 const char* base = ip;
504 std::pair<size_t, bool> p =
505 FindMatchLength(candidate + 4, ip + 4, ip_end);
506 size_t matched = 4 + p.first;
507 ip += matched;
508 size_t offset = base - candidate;
509 assert(0 == memcmp(base, candidate, matched));
510 op = EmitCopy(op, offset, matched, p.second);
511 next_emit = ip;
512 if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
513 goto emit_remainder;
514 }
515 // We are now looking for a 4-byte match again. We read
516 // table[Hash(ip, shift)] for that. To improve compression,
517 // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
518 input_bytes = GetEightBytesAt(ip - 1);
519 uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
520 table[prev_hash] = ip - base_ip - 1;
521 uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
522 candidate = base_ip + table[cur_hash];
523 candidate_bytes = UNALIGNED_LOAD32(candidate);
524 table[cur_hash] = ip - base_ip;
525 } while (GetUint32AtOffset(input_bytes, 1) == candidate_bytes);
526
527 next_hash = HashBytes(GetUint32AtOffset(input_bytes, 2), shift);
528 ++ip;
529 }
530 }
531
532 emit_remainder:
533 // Emit the remaining bytes as a literal
534 if (next_emit < ip_end) {
535 op = EmitLiteral(op, next_emit, ip_end - next_emit, false);
536 }
537
538 return op;
539 }
540 } // end namespace internal
541
// Called back at every compression call to trace parameters and sizes.
static inline void Report(const char *algorithm, size_t compressed_size,
                          size_t uncompressed_size) {}
545
546 // Signature of output types needed by decompression code.
547 // The decompression code is templatized on a type that obeys this
548 // signature so that we do not pay virtual function call overhead in
549 // the middle of a tight decompression loop.
550 //
551 // class DecompressionWriter {
552 // public:
553 // // Called before decompression
554 // void SetExpectedLength(size_t length);
555 //
556 // // Called after decompression
557 // bool CheckLength() const;
558 //
559 // // Called repeatedly during decompression
560 // bool Append(const char* ip, size_t length);
561 // bool AppendFromSelf(uint32 offset, size_t length);
562 //
563 // // The rules for how TryFastAppend differs from Append are somewhat
564 // // convoluted:
565 // //
566 // // - TryFastAppend is allowed to decline (return false) at any
567 // // time, for any reason -- just "return false" would be
568 // // a perfectly legal implementation of TryFastAppend.
569 // // The intention is for TryFastAppend to allow a fast path
570 // // in the common case of a small append.
571 // // - TryFastAppend is allowed to read up to <available> bytes
572 // // from the input buffer, whereas Append is allowed to read
573 // // <length>. However, if it returns true, it must leave
574 // // at least five (kMaximumTagLength) bytes in the input buffer
575 // // afterwards, so that there is always enough space to read the
576 // // next tag without checking for a refill.
//   //  - TryFastAppend must always decline (return false)
578 // // if <length> is 61 or more, as in this case the literal length is not
579 // // decoded fully. In practice, this should not be a big problem,
580 // // as it is unlikely that one would implement a fast path accepting
581 // // this much data.
582 // //
583 // bool TryFastAppend(const char* ip, size_t available, size_t length);
584 // };
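// SnappyIOVecWriter, SnappyArrayWriter, SnappyDecompressionValidator and
// SnappyScatteredWriter below all provide this interface.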
585
586 namespace internal {
587
588 // Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
589 static const uint32 wordmask[] = {
590 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
591 };
592
593 } // end namespace internal
594
595 // Helper class for decompression
596 class SnappyDecompressor {
597 private:
598 Source* reader_; // Underlying source of bytes to decompress
599 const char* ip_; // Points to next buffered byte
600 const char* ip_limit_; // Points just past buffered bytes
601 uint32 peeked_; // Bytes peeked from reader (need to skip)
602 bool eof_; // Hit end of input without an error?
603 char scratch_[kMaximumTagLength]; // See RefillTag().
604
605 // Ensure that all of the tag metadata for the next tag is available
606 // in [ip_..ip_limit_-1]. Also ensures that [ip,ip+4] is readable even
607 // if (ip_limit_ - ip_ < 5).
608 //
609 // Returns true on success, false on error or end of input.
610 bool RefillTag();
611
612 public:
  explicit SnappyDecompressor(Source* reader)
614 : reader_(reader),
615 ip_(NULL),
616 ip_limit_(NULL),
617 peeked_(0),
618 eof_(false) {
619 }
620
  ~SnappyDecompressor() {
622 // Advance past any bytes we peeked at from the reader
623 reader_->Skip(peeked_);
624 }
625
626 // Returns true iff we have hit the end of the input without an error.
  bool eof() const {
628 return eof_;
629 }
630
  // Read the uncompressed length stored at the start of the compressed data.
  // On success, stores the length in *result and returns true.
  // On failure, returns false.
  bool ReadUncompressedLength(uint32* result) {
635 assert(ip_ == NULL); // Must not have read anything yet
636 // Length is encoded in 1..5 bytes
637 *result = 0;
638 uint32 shift = 0;
639 while (true) {
640 if (shift >= 32) return false;
641 size_t n;
642 const char* ip = reader_->Peek(&n);
643 if (n == 0) return false;
644 const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
645 reader_->Skip(1);
646 uint32 val = c & 0x7f;
647 if (((val << shift) >> shift) != val) return false;
648 *result |= val << shift;
649 if (c < 128) {
650 break;
651 }
652 shift += 7;
653 }
654 return true;
655 }
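  // For example, an uncompressed length of 65536 is stored as the three
  // varint bytes 0x80 0x80 0x04; lengths below 128 take a single byte.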
656
657 // Process the next item found in the input.
658 // Returns true if successful, false on error or end of input.
659 template <class Writer>
  void DecompressAllTags(Writer* writer) {
661 const char* ip = ip_;
662 // For position-independent executables, accessing global arrays can be
663 // slow. Move wordmask array onto the stack to mitigate this.
664 uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)];
665 // Do not use memcpy to copy internal::wordmask to
666 // wordmask. LLVM converts stack arrays to global arrays if it detects
667 // const stack arrays and this hurts the performance of position
668 // independent code. This change is temporary and can be reverted when
669 // https://reviews.llvm.org/D30759 is approved.
670 wordmask[0] = internal::wordmask[0];
671 wordmask[1] = internal::wordmask[1];
672 wordmask[2] = internal::wordmask[2];
673 wordmask[3] = internal::wordmask[3];
674 wordmask[4] = internal::wordmask[4];
675
676 // We could have put this refill fragment only at the beginning of the loop.
677 // However, duplicating it at the end of each branch gives the compiler more
678 // scope to optimize the <ip_limit_ - ip> expression based on the local
679 // context, which overall increases speed.
680 #define MAYBE_REFILL() \
681 if (ip_limit_ - ip < kMaximumTagLength) { \
682 ip_ = ip; \
683 if (!RefillTag()) return; \
684 ip = ip_; \
685 }
686
687 MAYBE_REFILL();
    // Add a loop alignment directive. Without this directive, we observed
    // significant performance degradation on several Intel architectures
    // in the snappy benchmark built with LLVM. The degradation was caused by
    // increased branch mispredictions.
692 #if defined(__clang__) && defined(__x86_64__)
693 asm volatile (".p2align 5");
694 #endif
695 for ( ;; ) {
696 const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
697
698 // Ratio of iterations that have LITERAL vs non-LITERAL for different
699 // inputs.
700 //
701 // input LITERAL NON_LITERAL
702 // -----------------------------------
703 // html|html4|cp 23% 77%
704 // urls 36% 64%
705 // jpg 47% 53%
706 // pdf 19% 81%
707 // txt[1-4] 25% 75%
708 // pb 24% 76%
709 // bin 24% 76%
710 if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) {
711 size_t literal_length = (c >> 2) + 1u;
712 if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
713 assert(literal_length < 61);
714 ip += literal_length;
715 // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
716 // will not return true unless there's already at least five spare
717 // bytes in addition to the literal.
718 continue;
719 }
720 if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) {
721 // Long literal.
722 const size_t literal_length_length = literal_length - 60;
723 literal_length =
724 (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
725 ip += literal_length_length;
726 }
727
728 size_t avail = ip_limit_ - ip;
729 while (avail < literal_length) {
730 if (!writer->Append(ip, avail)) return;
731 literal_length -= avail;
732 reader_->Skip(peeked_);
733 size_t n;
734 ip = reader_->Peek(&n);
735 avail = n;
736 peeked_ = avail;
737 if (avail == 0) return; // Premature end of input
738 ip_limit_ = ip + avail;
739 }
740 if (!writer->Append(ip, literal_length)) {
741 return;
742 }
743 ip += literal_length;
744 MAYBE_REFILL();
745 } else {
746 const size_t entry = char_table[c];
747 const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
748 const size_t length = entry & 0xff;
749 ip += entry >> 11;
750
751 // copy_offset/256 is encoded in bits 8..10. By just fetching
752 // those bits, we get copy_offset (since the bit-field starts at
753 // bit 8).
754 const size_t copy_offset = entry & 0x700;
755 if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
756 return;
757 }
758 MAYBE_REFILL();
759 }
760 }
761
762 #undef MAYBE_REFILL
763 }
764 };
765
bool SnappyDecompressor::RefillTag() {
767 const char* ip = ip_;
768 if (ip == ip_limit_) {
769 // Fetch a new fragment from the reader
770 reader_->Skip(peeked_); // All peeked bytes are used up
771 size_t n;
772 ip = reader_->Peek(&n);
773 peeked_ = n;
774 eof_ = (n == 0);
775 if (eof_) return false;
776 ip_limit_ = ip + n;
777 }
778
779 // Read the tag character
780 assert(ip < ip_limit_);
781 const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
782 const uint32 entry = char_table[c];
783 const uint32 needed = (entry >> 11) + 1; // +1 byte for 'c'
784 assert(needed <= sizeof(scratch_));
785
786 // Read more bytes from reader if needed
787 uint32 nbuf = ip_limit_ - ip;
788 if (nbuf < needed) {
789 // Stitch together bytes from ip and reader to form the word
790 // contents. We store the needed bytes in "scratch_". They
791 // will be consumed immediately by the caller since we do not
792 // read more than we need.
793 memmove(scratch_, ip, nbuf);
794 reader_->Skip(peeked_); // All peeked bytes are used up
795 peeked_ = 0;
796 while (nbuf < needed) {
797 size_t length;
798 const char* src = reader_->Peek(&length);
799 if (length == 0) return false;
800 uint32 to_add = std::min<uint32>(needed - nbuf, length);
801 memcpy(scratch_ + nbuf, src, to_add);
802 nbuf += to_add;
803 reader_->Skip(to_add);
804 }
805 assert(nbuf == needed);
806 ip_ = scratch_;
807 ip_limit_ = scratch_ + needed;
808 } else if (nbuf < kMaximumTagLength) {
809 // Have enough bytes, but move into scratch_ so that we do not
810 // read past end of input
811 memmove(scratch_, ip, nbuf);
812 reader_->Skip(peeked_); // All peeked bytes are used up
813 peeked_ = 0;
814 ip_ = scratch_;
815 ip_limit_ = scratch_ + nbuf;
816 } else {
817 // Pass pointer to buffer returned by reader_.
818 ip_ = ip;
819 }
820 return true;
821 }
822
823 template <typename Writer>
static bool InternalUncompress(Source* r, Writer* writer) {
825 // Read the uncompressed length from the front of the compressed input
826 SnappyDecompressor decompressor(r);
827 uint32 uncompressed_len = 0;
828 if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;
829
830 return InternalUncompressAllTags(&decompressor, writer, r->Available(),
831 uncompressed_len);
832 }
833
834 template <typename Writer>
static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
                                      Writer* writer,
                                      uint32 compressed_len,
                                      uint32 uncompressed_len) {
839 Report("snappy_uncompress", compressed_len, uncompressed_len);
840
841 writer->SetExpectedLength(uncompressed_len);
842
843 // Process the entire input
844 decompressor->DecompressAllTags(writer);
845 writer->Flush();
846 return (decompressor->eof() && writer->CheckLength());
847 }
848
bool GetUncompressedLength(Source* source, uint32* result) {
850 SnappyDecompressor decompressor(source);
851 return decompressor.ReadUncompressedLength(result);
852 }
853
size_t Compress(Source* reader, Sink* writer) {
855 size_t written = 0;
856 size_t N = reader->Available();
857 const size_t uncompressed_size = N;
858 char ulength[Varint::kMax32];
859 char* p = Varint::Encode32(ulength, N);
860 writer->Append(ulength, p-ulength);
861 written += (p - ulength);
862
863 internal::WorkingMemory wmem;
864 char* scratch = NULL;
865 char* scratch_output = NULL;
866
867 while (N > 0) {
868 // Get next block to compress (without copying if possible)
869 size_t fragment_size;
870 const char* fragment = reader->Peek(&fragment_size);
871 assert(fragment_size != 0); // premature end of input
872 const size_t num_to_read = std::min(N, kBlockSize);
873 size_t bytes_read = fragment_size;
874
875 size_t pending_advance = 0;
876 if (bytes_read >= num_to_read) {
877 // Buffer returned by reader is large enough
878 pending_advance = num_to_read;
879 fragment_size = num_to_read;
880 } else {
881 // Read into scratch buffer
882 if (scratch == NULL) {
883 // If this is the last iteration, we want to allocate N bytes
884 // of space, otherwise the max possible kBlockSize space.
885 // num_to_read contains exactly the correct value
886 scratch = new char[num_to_read];
887 }
888 memcpy(scratch, fragment, bytes_read);
889 reader->Skip(bytes_read);
890
891 while (bytes_read < num_to_read) {
892 fragment = reader->Peek(&fragment_size);
893 size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
894 memcpy(scratch + bytes_read, fragment, n);
895 bytes_read += n;
896 reader->Skip(n);
897 }
898 assert(bytes_read == num_to_read);
899 fragment = scratch;
900 fragment_size = num_to_read;
901 }
902 assert(fragment_size == num_to_read);
903
904 // Get encoding table for compression
905 int table_size;
906 uint16* table = wmem.GetHashTable(num_to_read, &table_size);
907
908 // Compress input_fragment and append to dest
909 const int max_output = MaxCompressedLength(num_to_read);
910
911 // Need a scratch buffer for the output, in case the byte sink doesn't
912 // have room for us directly.
913 if (scratch_output == NULL) {
914 scratch_output = new char[max_output];
915 } else {
916 // Since we encode kBlockSize regions followed by a region
917 // which is <= kBlockSize in length, a previously allocated
918 // scratch_output[] region is big enough for this iteration.
919 }
920 char* dest = writer->GetAppendBuffer(max_output, scratch_output);
921 char* end = internal::CompressFragment(fragment, fragment_size,
922 dest, table, table_size);
923 writer->Append(dest, end - dest);
924 written += (end - dest);
925
926 N -= num_to_read;
927 reader->Skip(pending_advance);
928 }
929
930 Report("snappy_compress", written, uncompressed_size);
931
932 delete[] scratch;
933 delete[] scratch_output;
934
935 return written;
936 }
937
938 // -----------------------------------------------------------------------
939 // IOVec interfaces
940 // -----------------------------------------------------------------------
941
942 // A type that writes to an iovec.
943 // Note that this is not a "ByteSink", but a type that matches the
944 // Writer template argument to SnappyDecompressor::DecompressAllTags().
945 class SnappyIOVecWriter {
946 private:
947 const struct iovec* output_iov_;
948 const size_t output_iov_count_;
949
950 // We are currently writing into output_iov_[curr_iov_index_].
951 size_t curr_iov_index_;
952
953 // Bytes written to output_iov_[curr_iov_index_] so far.
954 size_t curr_iov_written_;
955
956 // Total bytes decompressed into output_iov_ so far.
957 size_t total_written_;
958
959 // Maximum number of bytes that will be decompressed into output_iov_.
960 size_t output_limit_;
961
  inline char* GetIOVecPointer(size_t index, size_t offset) {
963 return reinterpret_cast<char*>(output_iov_[index].iov_base) +
964 offset;
965 }
966
967 public:
968 // Does not take ownership of iov. iov must be valid during the
969 // entire lifetime of the SnappyIOVecWriter.
  inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
971 : output_iov_(iov),
972 output_iov_count_(iov_count),
973 curr_iov_index_(0),
974 curr_iov_written_(0),
975 total_written_(0),
976 output_limit_(-1) {
977 }
978
  inline void SetExpectedLength(size_t len) {
980 output_limit_ = len;
981 }
982
  inline bool CheckLength() const {
984 return total_written_ == output_limit_;
985 }
986
  inline bool Append(const char* ip, size_t len) {
988 if (total_written_ + len > output_limit_) {
989 return false;
990 }
991
992 while (len > 0) {
993 assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
994 if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
995 // This iovec is full. Go to the next one.
996 if (curr_iov_index_ + 1 >= output_iov_count_) {
997 return false;
998 }
999 curr_iov_written_ = 0;
1000 ++curr_iov_index_;
1001 }
1002
1003 const size_t to_write = std::min(
1004 len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
1005 memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
1006 ip,
1007 to_write);
1008 curr_iov_written_ += to_write;
1009 total_written_ += to_write;
1010 ip += to_write;
1011 len -= to_write;
1012 }
1013
1014 return true;
1015 }
1016
  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
1018 const size_t space_left = output_limit_ - total_written_;
1019 if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
1020 output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
1021 // Fast path, used for the majority (about 95%) of invocations.
1022 char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
1023 UnalignedCopy128(ip, ptr);
1024 curr_iov_written_ += len;
1025 total_written_ += len;
1026 return true;
1027 }
1028
1029 return false;
1030 }
1031
  inline bool AppendFromSelf(size_t offset, size_t len) {
1033 if (offset > total_written_ || offset == 0) {
1034 return false;
1035 }
1036 const size_t space_left = output_limit_ - total_written_;
1037 if (len > space_left) {
1038 return false;
1039 }
1040
1041 // Locate the iovec from which we need to start the copy.
1042 size_t from_iov_index = curr_iov_index_;
1043 size_t from_iov_offset = curr_iov_written_;
1044 while (offset > 0) {
1045 if (from_iov_offset >= offset) {
1046 from_iov_offset -= offset;
1047 break;
1048 }
1049
1050 offset -= from_iov_offset;
1051 assert(from_iov_index > 0);
1052 --from_iov_index;
1053 from_iov_offset = output_iov_[from_iov_index].iov_len;
1054 }
1055
1056 // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
1057 // the current iovec.
1058 while (len > 0) {
1059 assert(from_iov_index <= curr_iov_index_);
1060 if (from_iov_index != curr_iov_index_) {
1061 const size_t to_copy = std::min(
1062 output_iov_[from_iov_index].iov_len - from_iov_offset,
1063 len);
1064 Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
1065 len -= to_copy;
1066 if (len > 0) {
1067 ++from_iov_index;
1068 from_iov_offset = 0;
1069 }
1070 } else {
1071 assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
1072 size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
1073 curr_iov_written_,
1074 len);
1075 if (to_copy == 0) {
1076 // This iovec is full. Go to the next one.
1077 if (curr_iov_index_ + 1 >= output_iov_count_) {
1078 return false;
1079 }
1080 ++curr_iov_index_;
1081 curr_iov_written_ = 0;
1082 continue;
1083 }
1084 if (to_copy > len) {
1085 to_copy = len;
1086 }
1087 IncrementalCopySlow(
1088 GetIOVecPointer(from_iov_index, from_iov_offset),
1089 GetIOVecPointer(curr_iov_index_, curr_iov_written_),
1090 GetIOVecPointer(curr_iov_index_, curr_iov_written_) + to_copy);
1091 curr_iov_written_ += to_copy;
1092 from_iov_offset += to_copy;
1093 total_written_ += to_copy;
1094 len -= to_copy;
1095 }
1096 }
1097
1098 return true;
1099 }
1100
  inline void Flush() {}
1102 };
1103
bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
                          const struct iovec* iov, size_t iov_cnt) {
1106 ByteArraySource reader(compressed, compressed_length);
1107 return RawUncompressToIOVec(&reader, iov, iov_cnt);
1108 }
1109
bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov,
                          size_t iov_cnt) {
1112 SnappyIOVecWriter output(iov, iov_cnt);
1113 return InternalUncompress(compressed, &output);
1114 }
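// Illustrative use (the buffers and their sizes are the caller's own):
//
//   struct iovec iov[2] = {{buf0, len0}, {buf1, len1}};
//   snappy::RawUncompressToIOVec(compressed, compressed_length, iov, 2);
//
// This succeeds only if the input is valid and the iovecs together provide
// at least the uncompressed length of room.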
1115
1116 // -----------------------------------------------------------------------
1117 // Flat array interfaces
1118 // -----------------------------------------------------------------------
1119
1120 // A type that writes to a flat array.
1121 // Note that this is not a "ByteSink", but a type that matches the
1122 // Writer template argument to SnappyDecompressor::DecompressAllTags().
1123 class SnappyArrayWriter {
1124 private:
1125 char* base_;
1126 char* op_;
1127 char* op_limit_;
1128
1129 public:
  inline explicit SnappyArrayWriter(char* dst)
1131 : base_(dst),
1132 op_(dst),
1133 op_limit_(dst) {
1134 }
1135
  inline void SetExpectedLength(size_t len) {
1137 op_limit_ = op_ + len;
1138 }
1139
  inline bool CheckLength() const {
1141 return op_ == op_limit_;
1142 }
1143
  inline bool Append(const char* ip, size_t len) {
1145 char* op = op_;
1146 const size_t space_left = op_limit_ - op;
1147 if (space_left < len) {
1148 return false;
1149 }
1150 memcpy(op, ip, len);
1151 op_ = op + len;
1152 return true;
1153 }
1154
  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
1156 char* op = op_;
1157 const size_t space_left = op_limit_ - op;
1158 if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
1159 // Fast path, used for the majority (about 95%) of invocations.
1160 UnalignedCopy128(ip, op);
1161 op_ = op + len;
1162 return true;
1163 } else {
1164 return false;
1165 }
1166 }
1167
  inline bool AppendFromSelf(size_t offset, size_t len) {
1169 char* const op_end = op_ + len;
1170
1171 // Check if we try to append from before the start of the buffer.
1172 // Normally this would just be a check for "produced < offset",
1173 // but "produced <= offset - 1u" is equivalent for every case
1174 // except the one where offset==0, where the right side will wrap around
1175 // to a very big number. This is convenient, as offset==0 is another
1176 // invalid case that we also want to catch, so that we do not go
1177 // into an infinite loop.
1178 if (Produced() <= offset - 1u || op_end > op_limit_) return false;
1179 op_ = IncrementalCopy(op_ - offset, op_, op_end, op_limit_);
1180
1181 return true;
1182 }
  inline size_t Produced() const {
1184 assert(op_ >= base_);
1185 return op_ - base_;
1186 }
  inline void Flush() {}
1188 };
1189
bool RawUncompress(const char* compressed, size_t n, char* uncompressed) {
1191 ByteArraySource reader(compressed, n);
1192 return RawUncompress(&reader, uncompressed);
1193 }
1194
bool RawUncompress(Source* compressed, char* uncompressed) {
1196 SnappyArrayWriter output(uncompressed);
1197 return InternalUncompress(compressed, &output);
1198 }
1199
bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
1201 size_t ulength;
1202 if (!GetUncompressedLength(compressed, n, &ulength)) {
1203 return false;
1204 }
1205 // On 32-bit builds: max_size() < kuint32max. Check for that instead
1206 // of crashing (e.g., consider externally specified compressed data).
1207 if (ulength > uncompressed->max_size()) {
1208 return false;
1209 }
1210 STLStringResizeUninitialized(uncompressed, ulength);
1211 return RawUncompress(compressed, n, string_as_array(uncompressed));
1212 }
1213
1214 // A Writer that drops everything on the floor and just does validation
1215 class SnappyDecompressionValidator {
1216 private:
1217 size_t expected_;
1218 size_t produced_;
1219
1220 public:
  inline SnappyDecompressionValidator() : expected_(0), produced_(0) { }
  inline void SetExpectedLength(size_t len) {
1223 expected_ = len;
1224 }
  inline bool CheckLength() const {
1226 return expected_ == produced_;
1227 }
  inline bool Append(const char* ip, size_t len) {
1229 produced_ += len;
1230 return produced_ <= expected_;
1231 }
  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
1233 return false;
1234 }
  inline bool AppendFromSelf(size_t offset, size_t len) {
1236 // See SnappyArrayWriter::AppendFromSelf for an explanation of
1237 // the "offset - 1u" trick.
1238 if (produced_ <= offset - 1u) return false;
1239 produced_ += len;
1240 return produced_ <= expected_;
1241 }
  inline void Flush() {}
1243 };
1244
bool IsValidCompressedBuffer(const char* compressed, size_t n) {
1246 ByteArraySource reader(compressed, n);
1247 SnappyDecompressionValidator writer;
1248 return InternalUncompress(&reader, &writer);
1249 }
1250
bool IsValidCompressed(Source* compressed) {
1252 SnappyDecompressionValidator writer;
1253 return InternalUncompress(compressed, &writer);
1254 }
1255
void RawCompress(const char* input,
                 size_t input_length,
                 char* compressed,
                 size_t* compressed_length) {
1260 ByteArraySource reader(input, input_length);
1261 UncheckedByteArraySink writer(compressed);
1262 Compress(&reader, &writer);
1263
1264 // Compute how many bytes were added
1265 *compressed_length = (writer.CurrentDestination() - compressed);
1266 }
1267
size_t Compress(const char* input, size_t input_length, string* compressed) {
1269 // Pre-grow the buffer to the max length of the compressed output
1270 STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
1271
1272 size_t compressed_length;
1273 RawCompress(input, input_length, string_as_array(compressed),
1274 &compressed_length);
1275 compressed->resize(compressed_length);
1276 return compressed_length;
1277 }
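// Illustrative round trip through the string-based API ("original" is the
// caller's data):
//
//   std::string compressed, restored;
//   snappy::Compress(original.data(), original.size(), &compressed);
//   if (snappy::Uncompress(compressed.data(), compressed.size(), &restored))
//     assert(restored == original);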
1278
1279 // -----------------------------------------------------------------------
1280 // Sink interface
1281 // -----------------------------------------------------------------------
1282
1283 // A type that decompresses into a Sink. The template parameter
1284 // Allocator must export one method "char* Allocate(int size);", which
1285 // allocates a buffer of "size" and appends that to the destination.
1286 template <typename Allocator>
1287 class SnappyScatteredWriter {
1288 Allocator allocator_;
1289
1290 // We need random access into the data generated so far. Therefore
1291 // we keep track of all of the generated data as an array of blocks.
1292 // All of the blocks except the last have length kBlockSize.
1293 std::vector<char*> blocks_;
1294 size_t expected_;
1295
1296 // Total size of all fully generated blocks so far
1297 size_t full_size_;
1298
1299 // Pointer into current output block
1300 char* op_base_; // Base of output block
1301 char* op_ptr_; // Pointer to next unfilled byte in block
1302 char* op_limit_; // Pointer just past block
1303
  inline size_t Size() const {
1305 return full_size_ + (op_ptr_ - op_base_);
1306 }
1307
1308 bool SlowAppend(const char* ip, size_t len);
1309 bool SlowAppendFromSelf(size_t offset, size_t len);
1310
1311 public:
  inline explicit SnappyScatteredWriter(const Allocator& allocator)
1313 : allocator_(allocator),
1314 full_size_(0),
1315 op_base_(NULL),
1316 op_ptr_(NULL),
1317 op_limit_(NULL) {
1318 }
1319
  inline void SetExpectedLength(size_t len) {
1321 assert(blocks_.empty());
1322 expected_ = len;
1323 }
1324
  inline bool CheckLength() const {
1326 return Size() == expected_;
1327 }
1328
1329 // Return the number of bytes actually uncompressed so far
  inline size_t Produced() const {
1331 return Size();
1332 }
1333
  inline bool Append(const char* ip, size_t len) {
1335 size_t avail = op_limit_ - op_ptr_;
1336 if (len <= avail) {
1337 // Fast path
1338 memcpy(op_ptr_, ip, len);
1339 op_ptr_ += len;
1340 return true;
1341 } else {
1342 return SlowAppend(ip, len);
1343 }
1344 }
1345
  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
1347 char* op = op_ptr_;
1348 const int space_left = op_limit_ - op;
1349 if (length <= 16 && available >= 16 + kMaximumTagLength &&
1350 space_left >= 16) {
1351 // Fast path, used for the majority (about 95%) of invocations.
1352 UnalignedCopy128(ip, op);
1353 op_ptr_ = op + length;
1354 return true;
1355 } else {
1356 return false;
1357 }
1358 }
1359
  inline bool AppendFromSelf(size_t offset, size_t len) {
1361 char* const op_end = op_ptr_ + len;
1362 // See SnappyArrayWriter::AppendFromSelf for an explanation of
1363 // the "offset - 1u" trick.
1364 if (SNAPPY_PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ &&
1365 op_end <= op_limit_)) {
1366 // Fast path: src and dst in current block.
1367 op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_);
1368 return true;
1369 }
1370 return SlowAppendFromSelf(offset, len);
1371 }
1372
  // Called at the end of the decompression. We ask the allocator to
  // write all blocks to the sink.
  inline void Flush() { allocator_.Flush(Produced()); }
1376 };
1377
1378 template<typename Allocator>
bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) {
1380 size_t avail = op_limit_ - op_ptr_;
1381 while (len > avail) {
1382 // Completely fill this block
1383 memcpy(op_ptr_, ip, avail);
1384 op_ptr_ += avail;
1385 assert(op_limit_ - op_ptr_ == 0);
1386 full_size_ += (op_ptr_ - op_base_);
1387 len -= avail;
1388 ip += avail;
1389
1390 // Bounds check
1391 if (full_size_ + len > expected_) {
1392 return false;
1393 }
1394
1395 // Make new block
1396 size_t bsize = std::min<size_t>(kBlockSize, expected_ - full_size_);
1397 op_base_ = allocator_.Allocate(bsize);
1398 op_ptr_ = op_base_;
1399 op_limit_ = op_base_ + bsize;
1400 blocks_.push_back(op_base_);
1401 avail = bsize;
1402 }
1403
1404 memcpy(op_ptr_, ip, len);
1405 op_ptr_ += len;
1406 return true;
1407 }
1408
1409 template<typename Allocator>
bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset,
                                                          size_t len) {
1412 // Overflow check
1413 // See SnappyArrayWriter::AppendFromSelf for an explanation of
1414 // the "offset - 1u" trick.
1415 const size_t cur = Size();
1416 if (offset - 1u >= cur) return false;
1417 if (expected_ - cur < len) return false;
1418
1419 // Currently we shouldn't ever hit this path because Compress() chops the
1420 // input into blocks and does not create cross-block copies. However, it is
1421 // nice if we do not rely on that, since we can get better compression if we
1422 // allow cross-block copies and thus might want to change the compressor in
1423 // the future.
1424 size_t src = cur - offset;
1425 while (len-- > 0) {
1426 char c = blocks_[src >> kBlockLog][src & (kBlockSize-1)];
1427 Append(&c, 1);
1428 src++;
1429 }
1430 return true;
1431 }
1432
1433 class SnappySinkAllocator {
1434 public:
  explicit SnappySinkAllocator(Sink* dest): dest_(dest) {}
  ~SnappySinkAllocator() {}
1437
  char* Allocate(int size) {
1439 Datablock block(new char[size], size);
1440 blocks_.push_back(block);
1441 return block.data;
1442 }
1443
1444 // We flush only at the end, because the writer wants
1445 // random access to the blocks and once we hand the
1446 // block over to the sink, we can't access it anymore.
1447 // Also we don't write more than has been actually written
1448 // to the blocks.
  void Flush(size_t size) {
1450 size_t size_written = 0;
1451 size_t block_size;
1452 for (int i = 0; i < blocks_.size(); ++i) {
1453 block_size = std::min<size_t>(blocks_[i].size, size - size_written);
1454 dest_->AppendAndTakeOwnership(blocks_[i].data, block_size,
1455 &SnappySinkAllocator::Deleter, NULL);
1456 size_written += block_size;
1457 }
1458 blocks_.clear();
1459 }
1460
1461 private:
1462 struct Datablock {
1463 char* data;
1464 size_t size;
    Datablock(char* p, size_t s) : data(p), size(s) {}
1466 };
1467
  static void Deleter(void* arg, const char* bytes, size_t size) {
1469 delete[] bytes;
1470 }
1471
1472 Sink* dest_;
1473 std::vector<Datablock> blocks_;
1474
1475 // Note: copying this object is allowed
1476 };
1477
size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed) {
1479 SnappySinkAllocator allocator(uncompressed);
1480 SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
1481 InternalUncompress(compressed, &writer);
1482 return writer.Produced();
1483 }
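// Note that, unlike Uncompress(), the return value does not indicate whether
// the input was valid; it is simply the number of bytes produced before
// decompression stopped.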
1484
bool Uncompress(Source* compressed, Sink* uncompressed) {
1486 // Read the uncompressed length from the front of the compressed input
1487 SnappyDecompressor decompressor(compressed);
1488 uint32 uncompressed_len = 0;
1489 if (!decompressor.ReadUncompressedLength(&uncompressed_len)) {
1490 return false;
1491 }
1492
1493 char c;
1494 size_t allocated_size;
1495 char* buf = uncompressed->GetAppendBufferVariable(
1496 1, uncompressed_len, &c, 1, &allocated_size);
1497
1498 const size_t compressed_len = compressed->Available();
1499 // If we can get a flat buffer, then use it, otherwise do block by block
1500 // uncompression
1501 if (allocated_size >= uncompressed_len) {
1502 SnappyArrayWriter writer(buf);
1503 bool result = InternalUncompressAllTags(&decompressor, &writer,
1504 compressed_len, uncompressed_len);
1505 uncompressed->Append(buf, writer.Produced());
1506 return result;
1507 } else {
1508 SnappySinkAllocator allocator(uncompressed);
1509 SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
1510 return InternalUncompressAllTags(&decompressor, &writer, compressed_len,
1511 uncompressed_len);
1512 }
1513 }
1514
1515 } // end namespace snappy
1516