// Copyright 2008 Google Inc. All Rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Internals shared between the Snappy implementation and its unittest.

#ifndef THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
#define THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_

#include "snappy-stubs-internal.h"

namespace snappy {
namespace internal {

// Working memory performs a single allocation to hold all scratch space
// required for compression.
class WorkingMemory {
 public:
  explicit WorkingMemory(size_t input_size);
  ~WorkingMemory();

  // Allocates and clears a hash table using memory in "*this",
  // stores the number of buckets in "*table_size" and returns a pointer to
  // the base of the hash table.
  uint16_t* GetHashTable(size_t fragment_size, int* table_size) const;
  char* GetScratchInput() const { return input_; }
  char* GetScratchOutput() const { return output_; }

 private:
  char* mem_;        // the allocated memory, never nullptr
  size_t size_;      // the size of the allocated memory, never 0
  uint16_t* table_;  // the pointer to the hashtable
  char* input_;      // the pointer to the input scratch buffer
  char* output_;     // the pointer to the output scratch buffer

  // No copying
  WorkingMemory(const WorkingMemory&);
  void operator=(const WorkingMemory&);
};
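
// Illustrative usage sketch (not part of the library; `input_size` here is a
// hypothetical value):
//
//   size_t input_size = 1 << 16;          // size of the block to compress
//   WorkingMemory wmem(input_size);       // single allocation for all scratch
//   int table_size;
//   uint16_t* table = wmem.GetHashTable(input_size, &table_size);
//   // `table` now points at `table_size` zeroed buckets inside wmem's block;
//   // GetScratchInput()/GetScratchOutput() expose the two scratch buffers.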

// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
// REQUIRES: "input_length <= kBlockSize"
// REQUIRES: "op" points to an array of memory that is at least
// "MaxCompressedLength(input_length)" in size.
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
// REQUIRES: "table_size" is a power of two
//
// Returns an "end" pointer into "op" buffer.
// "end - op" is the compressed size of "input".
char* CompressFragment(const char* input,
                       size_t input_length,
                       char* op,
                       uint16_t* table,
                       const int table_size);
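
// A hedged sketch of driving CompressFragment (the real driver lives in
// snappy.cc; `input`, `input_length` and `output` are assumed to satisfy the
// REQUIRES clauses above):
//
//   WorkingMemory wmem(input_length);
//   int table_size;
//   uint16_t* table = wmem.GetHashTable(input_length, &table_size);
//   char* end = CompressFragment(input, input_length, output, table,
//                                table_size);
//   size_t compressed_size = end - output;   // i.e. "end - op" above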

// Find the largest n such that
//
//   s1[0,n-1] == s2[0,n-1]
//   and n <= (s2_limit - s2).
//
// Return make_pair(n, n < 8).
// Does not read *s2_limit or beyond.
// Does not read *(s1 + (s2_limit - s2)) or beyond.
// Requires that s2_limit >= s2.
//
// In addition populate *data with the next 5 bytes from the end of the match.
// This is only done if 8 bytes are available (s2_limit - s2 >= 8). The point
// is that on some architectures this can be done faster in this routine than
// by a subsequent load from s2 + n.
//
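// Worked example of the contract (illustrative values only): with
//
//   s1 = "abcdefgh...", s2 = "abcdeXYZ...", s2_limit = s2 + 16
//
// the first five bytes agree, so the call returns make_pair(5, true) since
// 5 < 8, and because s2_limit - s2 >= 8 it also fills *data with the bytes
// starting at s2 + 5, i.e. 'X', 'Y', 'Z', ...
//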
// Separate implementation for 64-bit, little-endian cpus.
#if !defined(SNAPPY_IS_BIG_ENDIAN) && \
    (defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM))
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  assert(s2_limit >= s2);
  size_t matched = 0;

  // This block isn't necessary for correctness; we could just start looping
  // immediately.  As an optimization though, it is useful.  It creates some not
  // uncommon code paths that determine, without extra effort, whether the match
  // length is less than 8.  In short, we are hoping to avoid a conditional
  // branch, and perhaps get better code layout from the C++ compiler.
  if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (SNAPPY_PREDICT_TRUE(a1 != a2)) {
      // This code is critical for performance. The reason is that it determines
      // how much to advance `ip` (s2). This obviously depends on both the loads
      // from the `candidate` (s1) and `ip`. Furthermore the next `candidate`
      // depends on the advanced `ip` calculated here through a load, hash and
      // new candidate hash lookup (a lot of cycles). This makes s1 (i.e.
      // `candidate`) the variable that limits throughput. This is the reason we
      // go through hoops to have this function update `data` for the next iter.
      // The straightforward code would use *data, given by
      //
      // *data = UNALIGNED_LOAD64(s2 + matched_bytes) (Latency of 5 cycles),
      //
      // as input for the hash table lookup to find the next candidate. However
      // this forces the load on the data dependency chain of s1, because
      // matched_bytes directly depends on s1. However matched_bytes is 0..7, so
      // we can also calculate *data by
      //
      // *data = AlignRight(UNALIGNED_LOAD64(s2), UNALIGNED_LOAD64(s2 + 8),
      //                    matched_bytes);
      //
      // The loads do not depend on s1 anymore and are thus off the bottleneck.
      // The straightforward implementation on x86_64 would be to use
      //
      // shrd rax, rdx, cl  (cl being matched_bytes * 8)
      //
      // unfortunately shrd with a variable shift has a 4 cycle latency. So this
      // only wins 1 cycle. The BMI2 shrx instruction is a 1 cycle variable
      // shift instruction but can only shift 64 bits. If we focus on just
      // obtaining the least significant 4 bytes, we can obtain this by
      //
      // *data = ConditionalMove(matched_bytes < 4, UNALIGNED_LOAD64(s2),
      //     UNALIGNED_LOAD64(s2 + 4) >> ((matched_bytes & 3) * 8));
      //
      // Written like above this is not a big win; the conditional move would be
      // a cmp followed by a cmov (2 cycles) followed by a shift (1 cycle).
      // However matched_bytes < 4 is equal to
      // static_cast<uint32_t>(xorval) != 0. Written that way, the conditional
      // move (2 cycles) can execute in parallel with FindLSBSetNonZero64
      // (tzcnt), which takes 3 cycles.
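      //
      // Worked example (illustrative): if the first mismatching byte is s2[5],
      // then shift is in [40, 47] and matched_bytes == 5. The low 32 bits of
      // xorval are zero, so on x86-64 the cmovzq below replaces a2 with
      // a3 = UNALIGNED_LOAD64(s2 + 4), and a2 >> (shift & (3 * 8)) equals
      // UNALIGNED_LOAD64(s2 + 4) >> 8, whose low bytes are s2[5], s2[6], ...
      // That is exactly what the next hash lookup needs, without waiting on
      // the s1-dependent address s2 + matched_bytes.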
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
#ifndef __x86_64__
      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
#else
      // Ideally this would just be
      //
      // a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
      //
      // However clang correctly infers that the above statement participates in
      // a critical data dependency chain and thus, unfortunately, refuses to
      // use a conditional move (it's tuned to cut data dependencies). In this
      // case there is a longer parallel chain anyway AND this will be fairly
      // unpredictable.
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval));
      *data = a2 >> (shift & (3 * 8));
#endif
      return std::pair<size_t, bool>(matched_bytes, true);
    } else {
      matched = 8;
      s2 += 8;
    }
  }

  // Find out how long the match is. We loop over the data 64 bits at a
  // time until we find a 64-bit block that doesn't match; then we find
  // the first non-matching bit and use that to calculate the total
  // length of the match.
  while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1 + matched);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (a1 == a2) {
      s2 += 8;
      matched += 8;
    } else {
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
#ifndef __x86_64__
      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
#else
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval));
      *data = a2 >> (shift & (3 * 8));
#endif
      matched += matched_bytes;
      assert(matched >= 8);
      return std::pair<size_t, bool>(matched, false);
    }
  }
  while (SNAPPY_PREDICT_TRUE(s2 < s2_limit)) {
    if (s1[matched] == *s2) {
      ++s2;
      ++matched;
    } else {
      if (s2 <= s2_limit - 8) {
        *data = UNALIGNED_LOAD64(s2);
      }
      return std::pair<size_t, bool>(matched, matched < 8);
    }
  }
  return std::pair<size_t, bool>(matched, matched < 8);
}
#else
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  // Implementation based on the x86-64 version, above.
  assert(s2_limit >= s2);
  int matched = 0;

  while (s2 <= s2_limit - 4 &&
         UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) {
    s2 += 4;
    matched += 4;
  }
  if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 4) {
    uint32_t x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
    int matching_bits = Bits::FindLSBSetNonZero(x);
    matched += matching_bits >> 3;
    s2 += matching_bits >> 3;
  } else {
    while ((s2 < s2_limit) && (s1[matched] == *s2)) {
      ++s2;
      ++matched;
    }
  }
  if (s2 <= s2_limit - 8) *data = LittleEndian::Load64(s2);
  return std::pair<size_t, bool>(matched, matched < 8);
}
#endif

// Lookup tables for decompression code.  Give --snappy_dump_decompression_table
// to the unit test to recompute char_table.

enum {
  LITERAL = 0,
  COPY_1_BYTE_OFFSET = 1,  // 3 bit length + 3 bits of offset in opcode
  COPY_2_BYTE_OFFSET = 2,
  COPY_4_BYTE_OFFSET = 3
};
static const int kMaximumTagLength = 5;  // COPY_4_BYTE_OFFSET plus the actual offset.

// Data stored per entry in lookup table:
//      Range   Bits-used       Description
//      ------------------------------------
//      1..64   0..7            Literal/copy length encoded in opcode byte
//      0..7    8..10           Copy offset encoded in opcode byte / 256
//      0..4    11..13          Extra bytes after opcode
//
// We use eight bits for the length even though 7 would have sufficed,
// for efficiency reasons:
//      (1) Extracting a byte is faster than a bit-field
//      (2) It properly aligns copy offset so we do not need a <<8
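//
// For example, entry 1 below (tag byte 0x01) is 0x0804: bits 0..7 give a
// length of 4, bits 8..10 give a copy-offset-high of 0, and bits 11..13 say
// one extra byte follows the opcode, i.e. a COPY_1_BYTE_OFFSET copy of
// length 4 whose low offset byte trails the tag.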
static const uint16_t char_table[256] = {
  0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
  0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
  0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
  0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
  0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
  0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
  0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
  0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
  0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
  0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
  0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
  0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
  0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
  0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
  0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
  0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
  0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
  0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
  0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
  0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
  0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
  0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
  0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
  0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
  0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
  0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
  0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
  0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
  0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
  0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
  0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
  0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
};

}  // end namespace internal
}  // end namespace snappy

#endif  // THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_