// Copyright 2005 Google Inc. All Rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "snappy.h"
#include "snappy-internal.h"
#include "snappy-sinksource.h"

#ifndef SNAPPY_HAVE_SSE2
#if defined(__SSE2__) || defined(_M_X64) || \
    (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define SNAPPY_HAVE_SSE2 1
#else
#define SNAPPY_HAVE_SSE2 0
#endif
#endif

#if SNAPPY_HAVE_SSE2
#include <emmintrin.h>
#endif
#include <stdio.h>

#include <algorithm>
#include <string>
#include <vector>


namespace snappy {

using internal::COPY_1_BYTE_OFFSET;
using internal::COPY_2_BYTE_OFFSET;
using internal::LITERAL;
using internal::char_table;
using internal::kMaximumTagLength;

// Any hash function will produce a valid compressed bitstream, but a good
// hash function reduces the number of collisions and thus yields better
// compression for compressible input, and more speed for incompressible
// input. Of course, it doesn't hurt if the hash function is reasonably fast
// either, as it gets called a lot.
static inline uint32 HashBytes(uint32 bytes, int shift) {
  uint32 kMul = 0x1e35a7bd;
  return (bytes * kMul) >> shift;
}
static inline uint32 Hash(const char* p, int shift) {
  return HashBytes(UNALIGNED_LOAD32(p), shift);
}

size_t MaxCompressedLength(size_t source_len) {
  // Compressed data can be defined as:
  //    compressed := item* literal*
  //    item       := literal* copy
  //
  // The trailing literal sequence has a space blowup of at most 62/60
  // since a literal of length 60 needs one tag byte + one extra byte
  // for length information.
  //
  // Item blowup is trickier to measure.  Suppose the "copy" op copies
  // 4 bytes of data.  Because of a special check in the encoding code,
  // we produce a 4-byte copy only if the offset is < 65536.  Therefore
  // the copy op takes 3 bytes to encode, and this type of item leads
  // to at most the 62/60 blowup for representing literals.
  //
  // Suppose the "copy" op copies 5 bytes of data.  If the offset is big
  // enough, it will take 5 bytes to encode the copy op.  Therefore the
  // worst case here is a one-byte literal followed by a five-byte copy.
  // I.e., 6 bytes of input turn into 7 bytes of "compressed" data.
  //
  // This last factor dominates the blowup, so the final estimate is:
  return 32 + source_len + source_len/6;
}
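// A quick illustrative check of the bound above (not used by the code): for
// a 60000-byte input, MaxCompressedLength returns
//   32 + 60000 + 60000/6 = 70032
// i.e. roughly a 17% worst-case blowup plus a small constant, matching the
// dominant 7/6 item blowup derived in the comment.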

namespace {

void UnalignedCopy64(const void* src, void* dst) {
  char tmp[8];
  memcpy(tmp, src, 8);
  memcpy(dst, tmp, 8);
}

void UnalignedCopy128(const void* src, void* dst) {
  // TODO(alkis): Remove this when we upgrade to a recent compiler that emits
  // SSE2 moves for memcpy(dst, src, 16).
#if SNAPPY_HAVE_SSE2
  __m128i x = _mm_loadu_si128(static_cast<const __m128i*>(src));
  _mm_storeu_si128(static_cast<__m128i*>(dst), x);
#else
  char tmp[16];
  memcpy(tmp, src, 16);
  memcpy(dst, tmp, 16);
#endif
}

// Copy [src, src+(op_limit-op)) to [op, op_limit) a byte at a time. Used
// for handling COPY operations where the input and output regions may overlap.
// For example, suppose:
//    src       == "ab"
//    op        == src + 2
//    op_limit  == op + 20
// After IncrementalCopySlow(src, op, op_limit), the result will have eleven
// copies of "ab"
//    ababababababababababab
// Note that this does not match the semantics of either memcpy() or memmove().
inline char* IncrementalCopySlow(const char* src, char* op,
                                 char* const op_limit) {
  while (op < op_limit) {
    *op++ = *src++;
  }
  return op_limit;
}

// Copy [src, src+(op_limit-op)) to [op, op_limit) but faster than
// IncrementalCopySlow. buf_limit is the address past the end of the writable
// region of the buffer.
inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
                             char* const buf_limit) {
  // Terminology:
  //
  // slop = buf_limit - op
  // pat  = op - src
  // len  = limit - op
  assert(src < op);
  assert(op_limit <= buf_limit);
  // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
  // to optimize this function, but we also have to handle other cases in case
  // the input does not satisfy these conditions.

  size_t pattern_size = op - src;
  // The cases are split into different branches to allow the branch predictor,
  // FDO, and static prediction hints to work better. For each input we list the
  // ratio of invocations that match each condition.
  //
  // input        slop < 16   pat < 8  len > 16
  // ------------------------------------------
  // html|html4|cp   0%         1.01%    27.73%
  // urls            0%         0.88%    14.79%
  // jpg             0%        64.29%     7.14%
  // pdf             0%         2.56%    58.06%
  // txt[1-4]        0%         0.23%     0.97%
  // pb              0%         0.96%    13.88%
  // bin             0.01%     22.27%    41.17%
  //
  // It is very rare that we don't have enough slop for doing block copies. It
  // is also rare that we need to expand a pattern. Small patterns are common
  // for incompressible formats and for those we are plenty fast already.
  // Lengths are normally not greater than 16 but they vary depending on the
  // input. In general if we always predict len <= 16 it would be an ok
  // prediction.
  //
  // In order to be fast we want a pattern >= 8 bytes and an unrolled loop
  // copying 2x 8 bytes at a time.

  // Handle the uncommon case where pattern is less than 8 bytes.
  if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
    // Expand pattern to at least 8 bytes. The worst case scenario in terms of
    // buffer usage is when the pattern is size 3. ^ is the original position
    // of op. x are irrelevant bytes copied by the last UnalignedCopy64.
    //
    // abc
    // abcabcxxxxx
    // abcabcabcabcxxxxx
    //    ^
    // The last x is 14 bytes after ^.
    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 14)) {
      while (pattern_size < 8) {
        UnalignedCopy64(src, op);
        op += pattern_size;
        pattern_size *= 2;
      }
      if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
    } else {
      return IncrementalCopySlow(src, op, op_limit);
    }
  }
  assert(pattern_size >= 8);

  // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single
  // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
  // because expanding the pattern to at least 8 bytes guarantees that
  // op - src >= 8.
  while (op <= buf_limit - 16) {
    UnalignedCopy64(src, op);
    UnalignedCopy64(src + 8, op + 8);
    src += 16;
    op += 16;
    if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
  }
  // We only take this branch if we didn't have enough slop and we can do a
  // single 8 byte copy.
  if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
    UnalignedCopy64(src, op);
    src += 8;
    op += 8;
  }
  return IncrementalCopySlow(src, op, op_limit);
}

}  // namespace

static inline char* EmitLiteral(char* op,
                                const char* literal,
                                int len,
                                bool allow_fast_path) {
  // The vast majority of copies are below 16 bytes, for which a
  // call to memcpy is overkill. This fast path can sometimes
  // copy up to 15 bytes too much, but that is okay in the
  // main loop, since we have a bit to go on for both sides:
  //
  //   - The input will always have kInputMarginBytes = 15 extra
  //     available bytes, as long as we're in the main loop, and
  //     if not, allow_fast_path = false.
  //   - The output will always have 32 spare bytes (see
  //     MaxCompressedLength).
  assert(len > 0);      // Zero-length literals are disallowed
  int n = len - 1;
  if (allow_fast_path && len <= 16) {
    // Fits in tag byte
    *op++ = LITERAL | (n << 2);

    UnalignedCopy128(literal, op);
    return op + len;
  }

  if (n < 60) {
    // Fits in tag byte
    *op++ = LITERAL | (n << 2);
  } else {
    // Encode in upcoming bytes
    char* base = op;
    int count = 0;
    op++;
    while (n > 0) {
      *op++ = n & 0xff;
      n >>= 8;
      count++;
    }
    assert(count >= 1);
    assert(count <= 4);
    *base = LITERAL | ((59+count) << 2);
  }
  memcpy(op, literal, len);
  return op + len;
}
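// Worked examples of the two literal encodings above (illustrative only):
//
//   len = 5:   n = 4 < 60, so the length fits in the tag byte:
//              tag = LITERAL | (4 << 2) = 0x10, followed by 5 data bytes.
//
//   len = 100: n = 99 >= 60, so the length goes in one extra byte:
//              tag = LITERAL | ((59 + 1) << 2) = 0xf0, then 0x63 (= 99),
//              then 100 data bytes.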

static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
                                     bool len_less_than_12) {
  assert(len <= 64);
  assert(len >= 4);
  assert(offset < 65536);
  assert(len_less_than_12 == (len < 12));

  if (len_less_than_12 && SNAPPY_PREDICT_TRUE(offset < 2048)) {
    // offset fits in 11 bits.  The 3 highest go in the top of the first byte,
    // and the rest go in the second byte.
    *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0);
    *op++ = offset & 0xff;
  } else {
    // Write 4 bytes, though we only care about 3 of them.  The output buffer
    // is required to have some slack, so the extra byte won't overrun it.
    uint32 u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8);
    LittleEndian::Store32(op, u);
    op += 3;
  }
  return op;
}
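// Worked examples of the two copy encodings above (illustrative only):
//
//   offset = 700, len = 8 (len < 12, offset < 2048): 1-byte-offset form.
//     byte 0: COPY_1_BYTE_OFFSET + ((8 - 4) << 2) + ((700 >> 3) & 0xe0)
//           = 0x01 + 0x10 + 0x40 = 0x51
//     byte 1: 700 & 0xff = 0xbc
//
//   offset = 3000, len = 20: 2-byte-offset form (3 bytes emitted).
//     byte 0: COPY_2_BYTE_OFFSET + ((20 - 1) << 2) = 0x4e
//     bytes 1-2: 3000 = 0x0bb8, stored little-endian as 0xb8 0x0b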

static inline char* EmitCopy(char* op, size_t offset, size_t len,
                             bool len_less_than_12) {
  assert(len_less_than_12 == (len < 12));
  if (len_less_than_12) {
    return EmitCopyAtMost64(op, offset, len, true);
  } else {
    // A special case for len <= 64 might help, but so far measurements suggest
    // it's in the noise.

    // Emit 64 byte copies but make sure to keep at least four bytes reserved.
    while (SNAPPY_PREDICT_FALSE(len >= 68)) {
      op = EmitCopyAtMost64(op, offset, 64, false);
      len -= 64;
    }

    // One or two copies will now finish the job.
    if (len > 64) {
      op = EmitCopyAtMost64(op, offset, 60, false);
      len -= 60;
    }

    // Emit remainder.
    op = EmitCopyAtMost64(op, offset, len, len < 12);
    return op;
  }
}
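// Illustrative trace of the splitting above: len = 130 is emitted as copies
// of 64, 60 and 6 bytes.  The loop threshold is 68 (not 64) so that the
// remainder after a 64-byte copy is still at least the 4 bytes that
// EmitCopyAtMost64 requires, and the 60-byte step guarantees the same for
// the final piece (e.g. len = 67 becomes 60 + 7 rather than 64 + 3).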

bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
  uint32 v = 0;
  const char* limit = start + n;
  if (Varint::Parse32WithLimit(start, limit, &v) != NULL) {
    *result = v;
    return true;
  } else {
    return false;
  }
}

namespace internal {
uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
  // Use smaller hash table when input.size() is smaller, since we
  // fill the table, incurring O(hash table size) overhead for
  // compression, and if the input is short, we won't need that
  // many hash table entries anyway.
  assert(kMaxHashTableSize >= 256);
  size_t htsize = 256;
  while (htsize < kMaxHashTableSize && htsize < input_size) {
    htsize <<= 1;
  }
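  // For example (illustrative): a 5000-byte input gets an 8192-entry table
  // (the first power of two >= the input size), a 100-byte input gets the
  // 256-entry minimum, and very large inputs are capped at kMaxHashTableSize
  // entries.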

  uint16* table;
  if (htsize <= ARRAYSIZE(small_table_)) {
    table = small_table_;
  } else {
    if (large_table_ == NULL) {
      large_table_ = new uint16[kMaxHashTableSize];
    }
    table = large_table_;
  }

  *table_size = htsize;
  memset(table, 0, htsize * sizeof(*table));
  return table;
}
}  // end namespace internal

// For 0 <= offset <= 4, GetUint32AtOffset(GetEightBytesAt(p), offset) will
// equal UNALIGNED_LOAD32(p + offset).  Motivation: On x86-64 hardware we have
// empirically found that overlapping loads such as
//  UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
// are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32.
//
// We have different versions for 64- and 32-bit; ideally we would avoid the
// two functions and just inline the UNALIGNED_LOAD64 call into
// GetUint32AtOffset, but GCC (at least not as of 4.6) is seemingly not clever
// enough to avoid loading the value multiple times then. For 64-bit, the load
// is done when GetEightBytesAt() is called, whereas for 32-bit, the load is
// done at GetUint32AtOffset() time.

#ifdef ARCH_K8

typedef uint64 EightBytesReference;

static inline EightBytesReference GetEightBytesAt(const char* ptr) {
  return UNALIGNED_LOAD64(ptr);
}

static inline uint32 GetUint32AtOffset(uint64 v, int offset) {
  assert(offset >= 0);
  assert(offset <= 4);
  return v >> (LittleEndian::IsLittleEndian() ? 8 * offset : 32 - 8 * offset);
}

#else

typedef const char* EightBytesReference;

static inline EightBytesReference GetEightBytesAt(const char* ptr) {
  return ptr;
}

static inline uint32 GetUint32AtOffset(const char* v, int offset) {
  assert(offset >= 0);
  assert(offset <= 4);
  return UNALIGNED_LOAD32(v + offset);
}

#endif
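// For example (illustrative): if p points at the bytes 01 02 03 04 05 06 07 08
// (hex), then on a little-endian machine GetEightBytesAt(p) yields
// 0x0807060504030201 and GetUint32AtOffset(v, 2) == 0x06050403, which is
// exactly what UNALIGNED_LOAD32(p + 2) would return.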

// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
// REQUIRES: "input" is at most "kBlockSize" bytes long.
// REQUIRES: "op" points to an array of memory that is at least
// "MaxCompressedLength(input.size())" in size.
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
// REQUIRES: "table_size" is a power of two
//
// Returns an "end" pointer into "op" buffer.
// "end - op" is the compressed size of "input".
namespace internal {
char* CompressFragment(const char* input,
                       size_t input_size,
                       char* op,
                       uint16* table,
                       const int table_size) {
  // "ip" is the input pointer, and "op" is the output pointer.
  const char* ip = input;
  assert(input_size <= kBlockSize);
  assert((table_size & (table_size - 1)) == 0); // table must be power of two
  const int shift = 32 - Bits::Log2Floor(table_size);
  assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
  const char* ip_end = input + input_size;
  const char* base_ip = ip;
  // Bytes in [next_emit, ip) will be emitted as literal bytes.  Or
  // [next_emit, ip_end) after the main loop.
  const char* next_emit = ip;

  const size_t kInputMarginBytes = 15;
  if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
    const char* ip_limit = input + input_size - kInputMarginBytes;

    for (uint32 next_hash = Hash(++ip, shift); ; ) {
      assert(next_emit < ip);
      // The body of this loop calls EmitLiteral once and then EmitCopy one or
      // more times.  (The exception is that when we're close to exhausting
      // the input we goto emit_remainder.)
      //
      // In the first iteration of this loop we're just starting, so
      // there's nothing to copy, so calling EmitLiteral once is
      // necessary.  And we only start a new iteration when the
      // current iteration has determined that a call to EmitLiteral will
      // precede the next call to EmitCopy (if any).
      //
      // Step 1: Scan forward in the input looking for a 4-byte-long match.
      // If we get close to exhausting the input then goto emit_remainder.
      //
      // Heuristic match skipping: If 32 bytes are scanned with no matches
      // found, start looking only at every other byte. If 32 more bytes are
      // scanned (or skipped), look at every third byte, etc.  When a match is
      // found, immediately go back to looking at every byte. This is a small
      // loss (~5% performance, ~0.1% density) for compressible data due to more
      // bookkeeping, but for non-compressible data (such as JPEG) it's a huge
      // win since the compressor quickly "realizes" the data is incompressible
      // and doesn't bother looking for matches everywhere.
      //
      // The "skip" variable keeps track of how many bytes there are since the
      // last match; dividing it by 32 (i.e. right-shifting by five) gives the
      // number of bytes to move ahead for each iteration.
      uint32 skip = 32;
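      // Illustrative trace of the skipping schedule: skip starts at 32, so
      // bytes_between_hash_lookups is 1 for the first 32 probes; once skip
      // reaches 64 the stride becomes 2, at 96 it becomes 3, and so on.
      // Roughly, after 32*k bytes have been scanned without a match, the
      // scan advances about k+1 bytes per probe.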

      const char* next_ip = ip;
      const char* candidate;
      do {
        ip = next_ip;
        uint32 hash = next_hash;
        assert(hash == Hash(ip, shift));
        uint32 bytes_between_hash_lookups = skip >> 5;
        skip += bytes_between_hash_lookups;
        next_ip = ip + bytes_between_hash_lookups;
        if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
          goto emit_remainder;
        }
        next_hash = Hash(next_ip, shift);
        candidate = base_ip + table[hash];
        assert(candidate >= base_ip);
        assert(candidate < ip);

        table[hash] = ip - base_ip;
      } while (SNAPPY_PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
                                 UNALIGNED_LOAD32(candidate)));

      // Step 2: A 4-byte match has been found.  We'll later see if more
      // than 4 bytes match.  But, prior to the match, input
      // bytes [next_emit, ip) are unmatched.  Emit them as "literal bytes."
      assert(next_emit + 16 <= ip_end);
      op = EmitLiteral(op, next_emit, ip - next_emit, true);

      // Step 3: Call EmitCopy, and then see if another EmitCopy could
      // be our next move.  Repeat until we find no match for the
      // input immediately after what was consumed by the last EmitCopy call.
      //
      // If we exit this loop normally then we need to call EmitLiteral next,
      // though we don't yet know how big the literal will be.  We handle that
      // by proceeding to the next iteration of the main loop.  We also can exit
      // this loop via goto if we get close to exhausting the input.
      EightBytesReference input_bytes;
      uint32 candidate_bytes = 0;

      do {
        // We have a 4-byte match at ip, and no need to emit any
        // "literal bytes" prior to ip.
        const char* base = ip;
        std::pair<size_t, bool> p =
            FindMatchLength(candidate + 4, ip + 4, ip_end);
        size_t matched = 4 + p.first;
        ip += matched;
        size_t offset = base - candidate;
        assert(0 == memcmp(base, candidate, matched));
        op = EmitCopy(op, offset, matched, p.second);
        next_emit = ip;
        if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
          goto emit_remainder;
        }
        // We are now looking for a 4-byte match again.  We read
        // table[Hash(ip, shift)] for that.  To improve compression,
        // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
        input_bytes = GetEightBytesAt(ip - 1);
        uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
        table[prev_hash] = ip - base_ip - 1;
        uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
        candidate = base_ip + table[cur_hash];
        candidate_bytes = UNALIGNED_LOAD32(candidate);
        table[cur_hash] = ip - base_ip;
      } while (GetUint32AtOffset(input_bytes, 1) == candidate_bytes);

      next_hash = HashBytes(GetUint32AtOffset(input_bytes, 2), shift);
      ++ip;
    }
  }

 emit_remainder:
  // Emit the remaining bytes as a literal
  if (next_emit < ip_end) {
    op = EmitLiteral(op, next_emit, ip_end - next_emit, false);
  }

  return op;
}
}  // end namespace internal

// Called back at every compression call to trace parameters and sizes.
static inline void Report(const char *algorithm, size_t compressed_size,
                          size_t uncompressed_size) {}

// Signature of output types needed by decompression code.
// The decompression code is templatized on a type that obeys this
// signature so that we do not pay virtual function call overhead in
// the middle of a tight decompression loop.
//
// class DecompressionWriter {
//  public:
//   // Called before decompression
//   void SetExpectedLength(size_t length);
//
//   // Called after decompression
//   bool CheckLength() const;
//
//   // Called repeatedly during decompression
//   bool Append(const char* ip, size_t length);
//   bool AppendFromSelf(uint32 offset, size_t length);
//
//   // The rules for how TryFastAppend differs from Append are somewhat
//   // convoluted:
//   //
//   //  - TryFastAppend is allowed to decline (return false) at any
//   //    time, for any reason -- just "return false" would be
//   //    a perfectly legal implementation of TryFastAppend.
//   //    The intention is for TryFastAppend to allow a fast path
//   //    in the common case of a small append.
//   //  - TryFastAppend is allowed to read up to <available> bytes
//   //    from the input buffer, whereas Append is allowed to read
//   //    <length>. However, if it returns true, it must leave
//   //    at least five (kMaximumTagLength) bytes in the input buffer
//   //    afterwards, so that there is always enough space to read the
//   //    next tag without checking for a refill.
//   //  - TryFastAppend must always decline (return false)
//   //    if <length> is 61 or more, as in this case the literal length is not
//   //    decoded fully. In practice, this should not be a big problem,
//   //    as it is unlikely that one would implement a fast path accepting
//   //    this much data.
//   //
//   bool TryFastAppend(const char* ip, size_t available, size_t length);
// };
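// As a minimal illustration of the contract above, a writer that merely
// counts bytes (essentially what SnappyDecompressionValidator further down
// does) could look like this sketch:
//
//   class CountingWriter {
//     size_t expected_ = 0, produced_ = 0;
//    public:
//     void SetExpectedLength(size_t length) { expected_ = length; }
//     bool CheckLength() const { return produced_ == expected_; }
//     bool Append(const char* ip, size_t n) { produced_ += n; return true; }
//     bool AppendFromSelf(uint32 offset, size_t n) {
//       produced_ += n;
//       return true;
//     }
//     bool TryFastAppend(const char* ip, size_t available, size_t n) {
//       return false;  // always declining is legal
//     }
//     void Flush() {}
//   };
//
// (The concrete writers below also implement Flush(), which
// InternalUncompressAllTags() calls once decompression is done.)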

namespace internal {

// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
static const uint32 wordmask[] = {
  0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
};

}  // end namespace internal

// Helper class for decompression
class SnappyDecompressor {
 private:
  Source*       reader_;         // Underlying source of bytes to decompress
  const char*   ip_;             // Points to next buffered byte
  const char*   ip_limit_;       // Points just past buffered bytes
  uint32        peeked_;         // Bytes peeked from reader (need to skip)
  bool          eof_;            // Hit end of input without an error?
  char          scratch_[kMaximumTagLength];  // See RefillTag().

  // Ensure that all of the tag metadata for the next tag is available
  // in [ip_..ip_limit_-1].  Also ensures that [ip,ip+4] is readable even
  // if (ip_limit_ - ip_ < 5).
  //
  // Returns true on success, false on error or end of input.
  bool RefillTag();

 public:
  explicit SnappyDecompressor(Source* reader)
      : reader_(reader),
        ip_(NULL),
        ip_limit_(NULL),
        peeked_(0),
        eof_(false) {
  }

  ~SnappyDecompressor() {
    // Advance past any bytes we peeked at from the reader
    reader_->Skip(peeked_);
  }

  // Returns true iff we have hit the end of the input without an error.
  bool eof() const {
    return eof_;
  }

  // Read the uncompressed length stored at the start of the compressed data.
  // On success, stores the length in *result and returns true.
  // On failure, returns false.
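  //
  // Worked example (illustrative): a stream whose first bytes are 0xac 0x02
  // encodes the length 300: the first byte contributes 0x2c (44) and has the
  // continuation bit set, the second contributes 2 << 7 = 256, and
  // 44 + 256 = 300.  Lengths up to 2^32 - 1 take at most five such bytes.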
  bool ReadUncompressedLength(uint32* result) {
    assert(ip_ == NULL);       // Must not have read anything yet
    // Length is encoded in 1..5 bytes
    *result = 0;
    uint32 shift = 0;
    while (true) {
      if (shift >= 32) return false;
      size_t n;
      const char* ip = reader_->Peek(&n);
      if (n == 0) return false;
      const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
      reader_->Skip(1);
      uint32 val = c & 0x7f;
      if (((val << shift) >> shift) != val) return false;
      *result |= val << shift;
      if (c < 128) {
        break;
      }
      shift += 7;
    }
    return true;
  }

  // Decompress tags from the input and dispatch them to the writer until
  // the input is exhausted or the writer reports an error.
  template <class Writer>
  void DecompressAllTags(Writer* writer) {
    const char* ip = ip_;
    // For position-independent executables, accessing global arrays can be
    // slow.  Move wordmask array onto the stack to mitigate this.
    uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)];
    // Do not use memcpy to copy internal::wordmask to
    // wordmask.  LLVM converts stack arrays to global arrays if it detects
    // const stack arrays and this hurts the performance of position
    // independent code. This change is temporary and can be reverted when
    // https://reviews.llvm.org/D30759 is approved.
    wordmask[0] = internal::wordmask[0];
    wordmask[1] = internal::wordmask[1];
    wordmask[2] = internal::wordmask[2];
    wordmask[3] = internal::wordmask[3];
    wordmask[4] = internal::wordmask[4];

    // We could have put this refill fragment only at the beginning of the loop.
    // However, duplicating it at the end of each branch gives the compiler more
    // scope to optimize the <ip_limit_ - ip> expression based on the local
    // context, which overall increases speed.
    #define MAYBE_REFILL() \
        if (ip_limit_ - ip < kMaximumTagLength) { \
          ip_ = ip; \
          if (!RefillTag()) return; \
          ip = ip_; \
        }

    MAYBE_REFILL();
    // Add loop alignment directive. Without this directive, we observed
    // significant performance degradation on several Intel architectures
    // in the snappy benchmark built with LLVM. The degradation was caused by
    // increased branch mispredictions.
#if defined(__clang__) && defined(__x86_64__)
    asm volatile (".p2align 5");
#endif
    for ( ;; ) {
      const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));

      // Ratio of iterations that have LITERAL vs non-LITERAL for different
      // inputs.
      //
      // input          LITERAL  NON_LITERAL
      // -----------------------------------
      // html|html4|cp   23%        77%
      // urls            36%        64%
      // jpg             47%        53%
      // pdf             19%        81%
      // txt[1-4]        25%        75%
      // pb              24%        76%
      // bin             24%        76%
      if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) {
        size_t literal_length = (c >> 2) + 1u;
        if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
          assert(literal_length < 61);
          ip += literal_length;
          // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
          // will not return true unless there's already at least five spare
          // bytes in addition to the literal.
          continue;
        }
        if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) {
          // Long literal.
          const size_t literal_length_length = literal_length - 60;
          literal_length =
              (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
          ip += literal_length_length;
        }

        size_t avail = ip_limit_ - ip;
        while (avail < literal_length) {
          if (!writer->Append(ip, avail)) return;
          literal_length -= avail;
          reader_->Skip(peeked_);
          size_t n;
          ip = reader_->Peek(&n);
          avail = n;
          peeked_ = avail;
          if (avail == 0) return;  // Premature end of input
          ip_limit_ = ip + avail;
        }
        if (!writer->Append(ip, literal_length)) {
          return;
        }
        ip += literal_length;
        MAYBE_REFILL();
      } else {
        const size_t entry = char_table[c];
        const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
        const size_t length = entry & 0xff;
        ip += entry >> 11;

        // copy_offset/256 is encoded in bits 8..10.  By just fetching
        // those bits, we get copy_offset (since the bit-field starts at
        // bit 8).
        const size_t copy_offset = entry & 0x700;
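        // Worked example (illustrative): for the tag byte c = 0x51, a
        // 1-byte-offset copy with length 8 and offset high bits 2 (cf. the
        // EmitCopyAtMost64 example above), entry yields length = 8, one
        // trailer byte and copy_offset = 0x200; with a following byte of
        // 0xbc the copy refers to offset 0x200 + 0xbc = 700.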
        if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
          return;
        }
        MAYBE_REFILL();
      }
    }

#undef MAYBE_REFILL
  }
};

bool SnappyDecompressor::RefillTag() {
  const char* ip = ip_;
  if (ip == ip_limit_) {
    // Fetch a new fragment from the reader
    reader_->Skip(peeked_);   // All peeked bytes are used up
    size_t n;
    ip = reader_->Peek(&n);
    peeked_ = n;
    eof_ = (n == 0);
    if (eof_) return false;
    ip_limit_ = ip + n;
  }

  // Read the tag character
  assert(ip < ip_limit_);
  const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
  const uint32 entry = char_table[c];
  const uint32 needed = (entry >> 11) + 1;  // +1 byte for 'c'
  assert(needed <= sizeof(scratch_));

  // Read more bytes from reader if needed
  uint32 nbuf = ip_limit_ - ip;
  if (nbuf < needed) {
    // Stitch together bytes from ip and reader to form the word
    // contents.  We store the needed bytes in "scratch_".  They
    // will be consumed immediately by the caller since we do not
    // read more than we need.
    memmove(scratch_, ip, nbuf);
    reader_->Skip(peeked_);  // All peeked bytes are used up
    peeked_ = 0;
    while (nbuf < needed) {
      size_t length;
      const char* src = reader_->Peek(&length);
      if (length == 0) return false;
      uint32 to_add = std::min<uint32>(needed - nbuf, length);
      memcpy(scratch_ + nbuf, src, to_add);
      nbuf += to_add;
      reader_->Skip(to_add);
    }
    assert(nbuf == needed);
    ip_ = scratch_;
    ip_limit_ = scratch_ + needed;
  } else if (nbuf < kMaximumTagLength) {
    // Have enough bytes, but move into scratch_ so that we do not
    // read past end of input
    memmove(scratch_, ip, nbuf);
    reader_->Skip(peeked_);  // All peeked bytes are used up
    peeked_ = 0;
    ip_ = scratch_;
    ip_limit_ = scratch_ + nbuf;
  } else {
    // Pass pointer to buffer returned by reader_.
    ip_ = ip;
  }
  return true;
}

template <typename Writer>
static bool InternalUncompress(Source* r, Writer* writer) {
  // Read the uncompressed length from the front of the compressed input
  SnappyDecompressor decompressor(r);
  uint32 uncompressed_len = 0;
  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;

  return InternalUncompressAllTags(&decompressor, writer, r->Available(),
                                   uncompressed_len);
}

template <typename Writer>
static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
                                      Writer* writer,
                                      uint32 compressed_len,
                                      uint32 uncompressed_len) {
  Report("snappy_uncompress", compressed_len, uncompressed_len);

  writer->SetExpectedLength(uncompressed_len);

  // Process the entire input
  decompressor->DecompressAllTags(writer);
  writer->Flush();
  return (decompressor->eof() && writer->CheckLength());
}

bool GetUncompressedLength(Source* source, uint32* result) {
  SnappyDecompressor decompressor(source);
  return decompressor.ReadUncompressedLength(result);
}

size_t Compress(Source* reader, Sink* writer) {
  size_t written = 0;
  size_t N = reader->Available();
  const size_t uncompressed_size = N;
  char ulength[Varint::kMax32];
  char* p = Varint::Encode32(ulength, N);
  writer->Append(ulength, p-ulength);
  written += (p - ulength);

  internal::WorkingMemory wmem;
  char* scratch = NULL;
  char* scratch_output = NULL;

  while (N > 0) {
    // Get next block to compress (without copying if possible)
    size_t fragment_size;
    const char* fragment = reader->Peek(&fragment_size);
    assert(fragment_size != 0);  // premature end of input
    const size_t num_to_read = std::min(N, kBlockSize);
    size_t bytes_read = fragment_size;

    size_t pending_advance = 0;
    if (bytes_read >= num_to_read) {
      // Buffer returned by reader is large enough
      pending_advance = num_to_read;
      fragment_size = num_to_read;
    } else {
      // Read into scratch buffer
      if (scratch == NULL) {
        // If this is the last iteration, we want to allocate N bytes
        // of space, otherwise the max possible kBlockSize space.
        // num_to_read contains exactly the correct value
        scratch = new char[num_to_read];
      }
      memcpy(scratch, fragment, bytes_read);
      reader->Skip(bytes_read);

      while (bytes_read < num_to_read) {
        fragment = reader->Peek(&fragment_size);
        size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
        memcpy(scratch + bytes_read, fragment, n);
        bytes_read += n;
        reader->Skip(n);
      }
      assert(bytes_read == num_to_read);
      fragment = scratch;
      fragment_size = num_to_read;
    }
    assert(fragment_size == num_to_read);

    // Get encoding table for compression
    int table_size;
    uint16* table = wmem.GetHashTable(num_to_read, &table_size);

    // Compress input_fragment and append to dest
    const int max_output = MaxCompressedLength(num_to_read);

    // Need a scratch buffer for the output, in case the byte sink doesn't
    // have room for us directly.
    if (scratch_output == NULL) {
      scratch_output = new char[max_output];
    } else {
      // Since we encode kBlockSize regions followed by a region
      // which is <= kBlockSize in length, a previously allocated
      // scratch_output[] region is big enough for this iteration.
    }
    char* dest = writer->GetAppendBuffer(max_output, scratch_output);
    char* end = internal::CompressFragment(fragment, fragment_size,
                                           dest, table, table_size);
    writer->Append(dest, end - dest);
    written += (end - dest);

    N -= num_to_read;
    reader->Skip(pending_advance);
  }

  Report("snappy_compress", written, uncompressed_size);

  delete[] scratch;
  delete[] scratch_output;

  return written;
}

// -----------------------------------------------------------------------
// IOVec interfaces
// -----------------------------------------------------------------------

// A type that writes to an iovec.
// Note that this is not a "ByteSink", but a type that matches the
// Writer template argument to SnappyDecompressor::DecompressAllTags().
class SnappyIOVecWriter {
 private:
  const struct iovec* output_iov_;
  const size_t output_iov_count_;

  // We are currently writing into output_iov_[curr_iov_index_].
  size_t curr_iov_index_;

  // Bytes written to output_iov_[curr_iov_index_] so far.
  size_t curr_iov_written_;

  // Total bytes decompressed into output_iov_ so far.
  size_t total_written_;

  // Maximum number of bytes that will be decompressed into output_iov_.
  size_t output_limit_;

  inline char* GetIOVecPointer(size_t index, size_t offset) {
    return reinterpret_cast<char*>(output_iov_[index].iov_base) +
        offset;
  }

 public:
  // Does not take ownership of iov. iov must be valid during the
  // entire lifetime of the SnappyIOVecWriter.
  inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
      : output_iov_(iov),
        output_iov_count_(iov_count),
        curr_iov_index_(0),
        curr_iov_written_(0),
        total_written_(0),
        output_limit_(-1) {
  }

  inline void SetExpectedLength(size_t len) {
    output_limit_ = len;
  }

  inline bool CheckLength() const {
    return total_written_ == output_limit_;
  }

  inline bool Append(const char* ip, size_t len) {
    if (total_written_ + len > output_limit_) {
      return false;
    }

    while (len > 0) {
      assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
      if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
        // This iovec is full. Go to the next one.
        if (curr_iov_index_ + 1 >= output_iov_count_) {
          return false;
        }
        curr_iov_written_ = 0;
        ++curr_iov_index_;
      }

      const size_t to_write = std::min(
          len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
      memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
             ip,
             to_write);
      curr_iov_written_ += to_write;
      total_written_ += to_write;
      ip += to_write;
      len -= to_write;
    }

    return true;
  }

  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
    const size_t space_left = output_limit_ - total_written_;
    if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
        output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
      // Fast path, used for the majority (about 95%) of invocations.
      char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
      UnalignedCopy128(ip, ptr);
      curr_iov_written_ += len;
      total_written_ += len;
      return true;
    }

    return false;
  }

  inline bool AppendFromSelf(size_t offset, size_t len) {
    if (offset > total_written_ || offset == 0) {
      return false;
    }
    const size_t space_left = output_limit_ - total_written_;
    if (len > space_left) {
      return false;
    }

    // Locate the iovec from which we need to start the copy.
    size_t from_iov_index = curr_iov_index_;
    size_t from_iov_offset = curr_iov_written_;
    while (offset > 0) {
      if (from_iov_offset >= offset) {
        from_iov_offset -= offset;
        break;
      }

      offset -= from_iov_offset;
      assert(from_iov_index > 0);
      --from_iov_index;
      from_iov_offset = output_iov_[from_iov_index].iov_len;
    }

    // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
    // the current iovec.
    while (len > 0) {
      assert(from_iov_index <= curr_iov_index_);
      if (from_iov_index != curr_iov_index_) {
        const size_t to_copy = std::min(
            output_iov_[from_iov_index].iov_len - from_iov_offset,
            len);
        Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
        len -= to_copy;
        if (len > 0) {
          ++from_iov_index;
          from_iov_offset = 0;
        }
      } else {
        assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
        size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
                                      curr_iov_written_,
                                  len);
        if (to_copy == 0) {
          // This iovec is full. Go to the next one.
          if (curr_iov_index_ + 1 >= output_iov_count_) {
            return false;
          }
          ++curr_iov_index_;
          curr_iov_written_ = 0;
          continue;
        }
        if (to_copy > len) {
          to_copy = len;
        }
        IncrementalCopySlow(
            GetIOVecPointer(from_iov_index, from_iov_offset),
            GetIOVecPointer(curr_iov_index_, curr_iov_written_),
            GetIOVecPointer(curr_iov_index_, curr_iov_written_) + to_copy);
        curr_iov_written_ += to_copy;
        from_iov_offset += to_copy;
        total_written_ += to_copy;
        len -= to_copy;
      }
    }

    return true;
  }

  inline void Flush() {}
};

bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
                          const struct iovec* iov, size_t iov_cnt) {
  ByteArraySource reader(compressed, compressed_length);
  return RawUncompressToIOVec(&reader, iov, iov_cnt);
}

bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov,
                          size_t iov_cnt) {
  SnappyIOVecWriter output(iov, iov_cnt);
  return InternalUncompress(compressed, &output);
}

// -----------------------------------------------------------------------
// Flat array interfaces
// -----------------------------------------------------------------------

// A type that writes to a flat array.
// Note that this is not a "ByteSink", but a type that matches the
// Writer template argument to SnappyDecompressor::DecompressAllTags().
class SnappyArrayWriter {
 private:
  char* base_;
  char* op_;
  char* op_limit_;

 public:
  inline explicit SnappyArrayWriter(char* dst)
      : base_(dst),
        op_(dst),
        op_limit_(dst) {
  }

  inline void SetExpectedLength(size_t len) {
    op_limit_ = op_ + len;
  }

  inline bool CheckLength() const {
    return op_ == op_limit_;
  }

  inline bool Append(const char* ip, size_t len) {
    char* op = op_;
    const size_t space_left = op_limit_ - op;
    if (space_left < len) {
      return false;
    }
    memcpy(op, ip, len);
    op_ = op + len;
    return true;
  }

  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
    char* op = op_;
    const size_t space_left = op_limit_ - op;
    if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
      // Fast path, used for the majority (about 95%) of invocations.
      UnalignedCopy128(ip, op);
      op_ = op + len;
      return true;
    } else {
      return false;
    }
  }

  inline bool AppendFromSelf(size_t offset, size_t len) {
    char* const op_end = op_ + len;

    // Check if we try to append from before the start of the buffer.
    // Normally this would just be a check for "produced < offset",
    // but "produced <= offset - 1u" is equivalent for every case
    // except the one where offset==0, where the right side will wrap around
    // to a very big number. This is convenient, as offset==0 is another
    // invalid case that we also want to catch, so that we do not go
    // into an infinite loop.
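    // For instance (illustrative): with Produced() == 10, offset == 0 wraps
    // to a huge value and fails the check, offset == 11 fails because
    // 10 <= 10, and offset == 10 (copying from the start of the buffer)
    // passes.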
    if (Produced() <= offset - 1u || op_end > op_limit_) return false;
    op_ = IncrementalCopy(op_ - offset, op_, op_end, op_limit_);

    return true;
  }
  inline size_t Produced() const {
    assert(op_ >= base_);
    return op_ - base_;
  }
  inline void Flush() {}
};

bool RawUncompress(const char* compressed, size_t n, char* uncompressed) {
  ByteArraySource reader(compressed, n);
  return RawUncompress(&reader, uncompressed);
}

bool RawUncompress(Source* compressed, char* uncompressed) {
  SnappyArrayWriter output(uncompressed);
  return InternalUncompress(compressed, &output);
}

bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
  size_t ulength;
  if (!GetUncompressedLength(compressed, n, &ulength)) {
    return false;
  }
  // On 32-bit builds: max_size() < kuint32max.  Check for that instead
  // of crashing (e.g., consider externally specified compressed data).
  if (ulength > uncompressed->max_size()) {
    return false;
  }
  STLStringResizeUninitialized(uncompressed, ulength);
  return RawUncompress(compressed, n, string_as_array(uncompressed));
}

// A Writer that drops everything on the floor and just does validation
class SnappyDecompressionValidator {
 private:
  size_t expected_;
  size_t produced_;

 public:
  inline SnappyDecompressionValidator() : expected_(0), produced_(0) { }
  inline void SetExpectedLength(size_t len) {
    expected_ = len;
  }
  inline bool CheckLength() const {
    return expected_ == produced_;
  }
  inline bool Append(const char* ip, size_t len) {
    produced_ += len;
    return produced_ <= expected_;
  }
  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
    return false;
  }
  inline bool AppendFromSelf(size_t offset, size_t len) {
    // See SnappyArrayWriter::AppendFromSelf for an explanation of
    // the "offset - 1u" trick.
    if (produced_ <= offset - 1u) return false;
    produced_ += len;
    return produced_ <= expected_;
  }
  inline void Flush() {}
};

bool IsValidCompressedBuffer(const char* compressed, size_t n) {
  ByteArraySource reader(compressed, n);
  SnappyDecompressionValidator writer;
  return InternalUncompress(&reader, &writer);
}

bool IsValidCompressed(Source* compressed) {
  SnappyDecompressionValidator writer;
  return InternalUncompress(compressed, &writer);
}

void RawCompress(const char* input,
                 size_t input_length,
                 char* compressed,
                 size_t* compressed_length) {
  ByteArraySource reader(input, input_length);
  UncheckedByteArraySink writer(compressed);
  Compress(&reader, &writer);

  // Compute how many bytes were added
  *compressed_length = (writer.CurrentDestination() - compressed);
}

size_t Compress(const char* input, size_t input_length, string* compressed) {
  // Pre-grow the buffer to the max length of the compressed output
  STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));

  size_t compressed_length;
  RawCompress(input, input_length, string_as_array(compressed),
              &compressed_length);
  compressed->resize(compressed_length);
  return compressed_length;
}

// -----------------------------------------------------------------------
// Sink interface
// -----------------------------------------------------------------------

// A type that decompresses into a Sink. The template parameter
// Allocator must export one method "char* Allocate(int size);", which
// allocates a buffer of "size" and appends that to the destination.
template <typename Allocator>
class SnappyScatteredWriter {
  Allocator allocator_;

  // We need random access into the data generated so far.  Therefore
  // we keep track of all of the generated data as an array of blocks.
  // All of the blocks except the last have length kBlockSize.
  std::vector<char*> blocks_;
  size_t expected_;

  // Total size of all fully generated blocks so far
  size_t full_size_;

  // Pointer into current output block
  char* op_base_;       // Base of output block
  char* op_ptr_;        // Pointer to next unfilled byte in block
  char* op_limit_;      // Pointer just past block

  inline size_t Size() const {
    return full_size_ + (op_ptr_ - op_base_);
  }

  bool SlowAppend(const char* ip, size_t len);
  bool SlowAppendFromSelf(size_t offset, size_t len);

 public:
  inline explicit SnappyScatteredWriter(const Allocator& allocator)
      : allocator_(allocator),
        full_size_(0),
        op_base_(NULL),
        op_ptr_(NULL),
        op_limit_(NULL) {
  }

  inline void SetExpectedLength(size_t len) {
    assert(blocks_.empty());
    expected_ = len;
  }

  inline bool CheckLength() const {
    return Size() == expected_;
  }

  // Return the number of bytes actually uncompressed so far
  inline size_t Produced() const {
    return Size();
  }

  inline bool Append(const char* ip, size_t len) {
    size_t avail = op_limit_ - op_ptr_;
    if (len <= avail) {
      // Fast path
      memcpy(op_ptr_, ip, len);
      op_ptr_ += len;
      return true;
    } else {
      return SlowAppend(ip, len);
    }
  }

  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
    char* op = op_ptr_;
    const int space_left = op_limit_ - op;
    if (length <= 16 && available >= 16 + kMaximumTagLength &&
        space_left >= 16) {
      // Fast path, used for the majority (about 95%) of invocations.
      UnalignedCopy128(ip, op);
      op_ptr_ = op + length;
      return true;
    } else {
      return false;
    }
  }

  inline bool AppendFromSelf(size_t offset, size_t len) {
    char* const op_end = op_ptr_ + len;
    // See SnappyArrayWriter::AppendFromSelf for an explanation of
    // the "offset - 1u" trick.
    if (SNAPPY_PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ &&
                          op_end <= op_limit_)) {
      // Fast path: src and dst in current block.
      op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_);
      return true;
    }
    return SlowAppendFromSelf(offset, len);
  }

  // Called at the end of the decompress. We ask the allocator to
  // write all blocks to the sink.
  inline void Flush() { allocator_.Flush(Produced()); }
};

template<typename Allocator>
bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) {
  size_t avail = op_limit_ - op_ptr_;
  while (len > avail) {
    // Completely fill this block
    memcpy(op_ptr_, ip, avail);
    op_ptr_ += avail;
    assert(op_limit_ - op_ptr_ == 0);
    full_size_ += (op_ptr_ - op_base_);
    len -= avail;
    ip += avail;

    // Bounds check
    if (full_size_ + len > expected_) {
      return false;
    }

    // Make new block
    size_t bsize = std::min<size_t>(kBlockSize, expected_ - full_size_);
    op_base_ = allocator_.Allocate(bsize);
    op_ptr_ = op_base_;
    op_limit_ = op_base_ + bsize;
    blocks_.push_back(op_base_);
    avail = bsize;
  }

  memcpy(op_ptr_, ip, len);
  op_ptr_ += len;
  return true;
}

template<typename Allocator>
bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset,
                                                         size_t len) {
  // Overflow check
  // See SnappyArrayWriter::AppendFromSelf for an explanation of
  // the "offset - 1u" trick.
  const size_t cur = Size();
  if (offset - 1u >= cur) return false;
  if (expected_ - cur < len) return false;

  // Currently we shouldn't ever hit this path because Compress() chops the
  // input into blocks and does not create cross-block copies. However, it is
  // nice if we do not rely on that, since we can get better compression if we
  // allow cross-block copies and thus might want to change the compressor in
  // the future.
  size_t src = cur - offset;
  while (len-- > 0) {
    char c = blocks_[src >> kBlockLog][src & (kBlockSize-1)];
    Append(&c, 1);
    src++;
  }
  return true;
}

class SnappySinkAllocator {
 public:
  explicit SnappySinkAllocator(Sink* dest): dest_(dest) {}
  ~SnappySinkAllocator() {}

  char* Allocate(int size) {
    Datablock block(new char[size], size);
    blocks_.push_back(block);
    return block.data;
  }

  // We flush only at the end, because the writer wants
  // random access to the blocks and once we hand the
  // block over to the sink, we can't access it anymore.
  // Also we don't write more than has been actually written
  // to the blocks.
  void Flush(size_t size) {
    size_t size_written = 0;
    size_t block_size;
    for (size_t i = 0; i < blocks_.size(); ++i) {
      block_size = std::min<size_t>(blocks_[i].size, size - size_written);
      dest_->AppendAndTakeOwnership(blocks_[i].data, block_size,
                                    &SnappySinkAllocator::Deleter, NULL);
      size_written += block_size;
    }
    blocks_.clear();
  }

 private:
  struct Datablock {
    char* data;
    size_t size;
    Datablock(char* p, size_t s) : data(p), size(s) {}
  };

  static void Deleter(void* arg, const char* bytes, size_t size) {
    delete[] bytes;
  }

  Sink* dest_;
  std::vector<Datablock> blocks_;

  // Note: copying this object is allowed
};

size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed) {
  SnappySinkAllocator allocator(uncompressed);
  SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
  InternalUncompress(compressed, &writer);
  return writer.Produced();
}

bool Uncompress(Source* compressed, Sink* uncompressed) {
  // Read the uncompressed length from the front of the compressed input
  SnappyDecompressor decompressor(compressed);
  uint32 uncompressed_len = 0;
  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) {
    return false;
  }

  char c;
  size_t allocated_size;
  char* buf = uncompressed->GetAppendBufferVariable(
      1, uncompressed_len, &c, 1, &allocated_size);

  const size_t compressed_len = compressed->Available();
  // If we can get a flat buffer, then use it, otherwise do block by block
  // uncompression
  if (allocated_size >= uncompressed_len) {
    SnappyArrayWriter writer(buf);
    bool result = InternalUncompressAllTags(&decompressor, &writer,
                                            compressed_len, uncompressed_len);
    uncompressed->Append(buf, writer.Produced());
    return result;
  } else {
    SnappySinkAllocator allocator(uncompressed);
    SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
    return InternalUncompressAllTags(&decompressor, &writer, compressed_len,
                                     uncompressed_len);
  }
}

} // end namespace snappy