1 /*
2 Copyright 2011, Google Inc.
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
7 met:
8 
9   * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11   * Redistributions in binary form must reproduce the above
12 copyright notice, this list of conditions and the following disclaimer
13 in the documentation and/or other materials provided with the
14 distribution.
15   * Neither the name of Google Inc. nor the names of its
16 contributors may be used to endorse or promote products derived from
17 this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 File modified for the Linux Kernel by
32 Zeev Tarantov <zeev.tarantov@gmail.com>
33 
34 File modified for Sereal by
35 Steffen Mueller <smueller@cpan.org>
36 */
37 
38 #include "csnappy_internal.h"
39 #ifdef __KERNEL__
40 #include <linux/kernel.h>
41 #include <linux/module.h>
42 #endif
43 #include "csnappy.h"
44 
45 
46 static INLINE char*
encode_varint32(char * sptr,uint32_t v)47 encode_varint32(char *sptr, uint32_t v)
48 {
49 	uint8_t* ptr = (uint8_t *)sptr;
50 	static const int B = 128;
51 	if (v < (1<<7)) {
52 		*(ptr++) = v;
53 	} else if (v < (1<<14)) {
54 		*(ptr++) = v | B;
55 		*(ptr++) = v>>7;
56 	} else if (v < (1<<21)) {
57 		*(ptr++) = v | B;
58 		*(ptr++) = (v>>7) | B;
59 		*(ptr++) = v>>14;
60 	} else if (v < (1<<28)) {
61 		*(ptr++) = v | B;
62 		*(ptr++) = (v>>7) | B;
63 		*(ptr++) = (v>>14) | B;
64 		*(ptr++) = v>>21;
65 	} else {
66 		*(ptr++) = v | B;
67 		*(ptr++) = (v>>7) | B;
68 		*(ptr++) = (v>>14) | B;
69 		*(ptr++) = (v>>21) | B;
70 		*(ptr++) = v>>28;
71 	}
72 	return (char *)ptr;
73 }
74 
75 /*
76  * *** DO NOT CHANGE THE VALUE OF kBlockSize ***
77 
78  * New Compression code chops up the input into blocks of at most
79  * the following size.  This ensures that back-references in the
80  * output never cross kBlockSize block boundaries.  This can be
81  * helpful in implementing blocked decompression.  However the
82  * decompression code should not rely on this guarantee since older
83  * compression code may not obey it.
84  */
85 #define kBlockLog 15
86 #define kBlockSize (1 << kBlockLog)
87 
88 
89 #if defined(__arm__) && !defined(ARCH_ARM_HAVE_UNALIGNED)
90 
emit_literal(uint8_t * op,const uint8_t * src,const uint8_t * end)91 static uint8_t* emit_literal(
92 	uint8_t *op,
93 	const uint8_t *src,
94 	const uint8_t *end)
95 {
96 	uint32_t length = end - src;
97 	uint32_t n = length - 1;
98 	if (!length)
99 		return op;
100 	if (n < 60) {
101 		/* Fits in tag byte */
102 		*op++ = LITERAL | (n << 2);
103 	} else {
104 		/* Encode in upcoming bytes */
105 		uint8_t *base = op;
106 		op++;
107 		do {
108 			*op++ = n & 0xff;
109 			n >>= 8;
110 		} while (n > 0);
111 		*base = LITERAL | ((59 + (op - base - 1)) << 2);
112 	}
113 	memcpy(op, src, length);
114 	return op + length;
115 }
116 
emit_copy(uint8_t * op,uint32_t offset,uint32_t len)117 static uint8_t* emit_copy(
118 	uint8_t *op,
119 	uint32_t offset,
120 	uint32_t len)
121 {
122 	DCHECK_GT(offset, 0);
123 
124 	/* Emit 64 byte copies but make sure to keep at least four bytes
125 	 * reserved */
126 	while (unlikely(len >= 68)) {
127 		*op++ = COPY_2_BYTE_OFFSET | ((64 - 1) << 2);
128 		*op++ = offset & 255;
129 		*op++ = offset >> 8;
130 		len -= 64;
131 	}
132 
133 	/* Emit an extra 60 byte copy if have too much data to fit in one
134 	 * copy */
135 	if (unlikely(len > 64)) {
136 		*op++ = COPY_2_BYTE_OFFSET | ((60 - 1) << 2);
137 		*op++ = offset & 255;
138 		*op++ = offset >> 8;
139 		len -= 60;
140 	}
141 
142 	/* Emit remainder */
143 	DCHECK_GE(len, 4);
144 	if ((len < 12) && (offset < 2048)) {
145 		int len_minus_4 = len - 4;
146 		*op++ = COPY_1_BYTE_OFFSET   |
147 			((len_minus_4) << 2) |
148 			((offset >> 8) << 5);
149 		*op++ = offset & 0xff;
150 	} else {
151 		*op++ = COPY_2_BYTE_OFFSET | ((len-1) << 2);
152 		*op++ = offset & 255;
153 		*op++ = offset >> 8;
154 	}
155 	return op;
156 }
157 
find_match_length(const uint8_t * s1,const uint8_t * s2,const uint8_t * s2_end)158 static uint32_t find_match_length(
159 	const uint8_t *s1,
160 	const uint8_t *s2,
161 	const uint8_t *s2_end)
162 {
163 	const uint8_t * const s2_start = s2;
164 	while (s2 < s2_end && *s1++ == *s2++) /*nothing*/;
165 	return s2 - s2_start - 1;
166 }
167 
hash(uint32_t v)168 static uint32_t hash(uint32_t v)
169 {
170 	return v * UINT32_C(0x1e35a7bd);
171 }
172 
173 char*
csnappy_compress_fragment(const char * input,const uint32_t input_size,char * dst,void * working_memory,const int workmem_bytes_power_of_two)174 csnappy_compress_fragment(
175 	const char *input,
176 	const uint32_t input_size,
177 	char *dst,
178 	void *working_memory,
179 	const int workmem_bytes_power_of_two)
180 {
181 	const uint8_t * const src_start = (const uint8_t *)input;
182 	const uint8_t * const src_end_minus4 = src_start + input_size - 4;
183 	const uint8_t *src = src_start, *done_upto = src_start, *match;
184 	uint8_t *op = (uint8_t *)dst;
185 	uint16_t *wm = (uint16_t *)working_memory;
186 	int shift = 33 - workmem_bytes_power_of_two;
187 	uint32_t curr_val, curr_hash, match_val, offset, length;
188 	if (unlikely(input_size < 4))
189 		goto the_end;
190 	memset(wm, 0, 1 << workmem_bytes_power_of_two);
191 	for (;;) {
192 		curr_val = (src[1] << 8) | (src[2] << 16) | (src[3] << 24);
193 		do {
194 			src++;
195 			if (unlikely(src >= src_end_minus4))
196 				goto the_end;
197 			curr_val = (curr_val >> 8) | (src[3] << 24);
198 			DCHECK_EQ(curr_val, get_unaligned_le32(src));
199 			curr_hash = hash(curr_val) >> shift;
200 			match = src_start + wm[curr_hash];
201 			DCHECK_LT(match, src);
202 			wm[curr_hash] = src - src_start;
203 			match_val = get_unaligned_le32(match);
204 		} while (likely(curr_val != match_val));
205 		offset = src - match;
206 		length = 4 + find_match_length(
207 			match + 4, src + 4, src_end_minus4 + 4);
208 		DCHECK_EQ(memcmp(src, match, length), 0);
209 		op = emit_literal(op, done_upto, src);
210 		op = emit_copy(op, offset, length);
211 		done_upto = src + length;
212 		src = done_upto - 1;
213 	}
214 the_end:
215 	op = emit_literal(op, done_upto, src_end_minus4 + 4);
216 	return (char *)op;
217 }
218 
219 #else /* !simple */
220 
221 /*
222  * Any hash function will produce a valid compressed bitstream, but a good
223  * hash function reduces the number of collisions and thus yields better
224  * compression for compressible input, and more speed for incompressible
225  * input. Of course, it doesn't hurt if the hash function is reasonably fast
226  * either, as it gets called a lot.
227  */
HashBytes(uint32_t bytes,int shift)228 static INLINE uint32_t HashBytes(uint32_t bytes, int shift)
229 {
230 	uint32_t kMul = 0x1e35a7bd;
231 	return (bytes * kMul) >> shift;
232 }
Hash(const char * p,int shift)233 static INLINE uint32_t Hash(const char *p, int shift)
234 {
235 	return HashBytes(UNALIGNED_LOAD32(p), shift);
236 }
237 
238 
239 /*
240  * Return the largest n such that
241  *
242  *   s1[0,n-1] == s2[0,n-1]
243  *   and n <= (s2_limit - s2).
244  *
245  * Does not read *s2_limit or beyond.
246  * Does not read *(s1 + (s2_limit - s2)) or beyond.
247  * Requires that s2_limit >= s2.
248  *
249  * Separate implementation for x86_64, for speed.  Uses the fact that
250  * x86_64 is little endian.
251  */
252 #if defined(__x86_64__) || defined(__aarch64__)
253 static INLINE int
FindMatchLength(const char * s1,const char * s2,const char * s2_limit)254 FindMatchLength(const char *s1, const char *s2, const char *s2_limit)
255 {
256 	uint64_t x;
257 	int matched, matching_bits;
258 	DCHECK_GE(s2_limit, s2);
259 	matched = 0;
260 	/*
261 	 * Find out how long the match is. We loop over the data 64 bits at a
262 	 * time until we find a 64-bit block that doesn't match; then we find
263 	 * the first non-matching bit and use that to calculate the total
264 	 * length of the match.
265 	 */
266 	while (likely(s2 <= s2_limit - 8)) {
267 		if (unlikely(UNALIGNED_LOAD64(s1 + matched) ==
268 				UNALIGNED_LOAD64(s2))) {
269 			s2 += 8;
270 			matched += 8;
271 		} else {
272 			/*
273 			 * On current (mid-2008) Opteron models there is a 3%
274 			 * more efficient code sequence to find the first
275 			 * non-matching byte. However, what follows is ~10%
276 			 * better on Intel Core 2 and newer, and we expect AMD's
277 			 * bsf instruction to improve.
278 			 */
279 			x = UNALIGNED_LOAD64(s1 + matched) ^
280 				UNALIGNED_LOAD64(s2);
281 			matching_bits = FindLSBSetNonZero64(x);
282 			matched += matching_bits >> 3;
283 			return matched;
284 		}
285 	}
286 	while (likely(s2 < s2_limit)) {
287 		if (likely(s1[matched] == *s2)) {
288 			++s2;
289 			++matched;
290 		} else {
291 			return matched;
292 		}
293 	}
294 	return matched;
295 }
296 #else /* !defined(__x86_64__) && !defined(__aarch64__) */
297 static INLINE int
FindMatchLength(const char * s1,const char * s2,const char * s2_limit)298 FindMatchLength(const char *s1, const char *s2, const char *s2_limit)
299 {
300 	/* Implementation based on the x86-64 version, above. */
301 	int matched = 0;
302 	DCHECK_GE(s2_limit, s2);
303 
304 	while (s2 <= s2_limit - 4 &&
305 		UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) {
306 		s2 += 4;
307 		matched += 4;
308 	}
309 #if __BYTE_ORDER == __LITTLE_ENDIAN
310 	if (s2 <= s2_limit - 4) {
311 		uint32_t x = UNALIGNED_LOAD32(s1 + matched) ^
312 				UNALIGNED_LOAD32(s2);
313 		int matching_bits = FindLSBSetNonZero(x);
314 		matched += matching_bits >> 3;
315 	} else {
316 		while ((s2 < s2_limit) && (s1[matched] == *s2)) {
317 			++s2;
318 			++matched;
319 		}
320 	}
321 #else
322 	while ((s2 < s2_limit) && (s1[matched] == *s2)) {
323 		++s2;
324 		++matched;
325 	}
326 #endif
327 	return matched;
328 }
329 #endif /* !defined(__x86_64__) && !defined(__aarch64__) */
330 
331 
332 static INLINE char*
EmitLiteral(char * op,const char * literal,int len,int allow_fast_path)333 EmitLiteral(char *op, const char *literal, int len, int allow_fast_path)
334 {
335 	int n = len - 1; /* Zero-length literals are disallowed */
336 	if (n < 60) {
337 		/* Fits in tag byte */
338 		*op++ = LITERAL | (n << 2);
339 		/*
340 		The vast majority of copies are below 16 bytes, for which a
341 		call to memcpy is overkill. This fast path can sometimes
342 		copy up to 15 bytes too much, but that is okay in the
343 		main loop, since we have a bit to go on for both sides:
344 		- The input will always have kInputMarginBytes = 15 extra
345 		available bytes, as long as we're in the main loop, and
346 		if not, allow_fast_path = false.
347 		- The output will always have 32 spare bytes (see
348 		snappy_max_compressed_length).
349 		*/
350 		if (allow_fast_path && len <= 16) {
351 			UnalignedCopy64(literal, op);
352 			UnalignedCopy64(literal + 8, op + 8);
353 			return op + len;
354 		}
355 	} else {
356 		/* Encode in upcoming bytes */
357 		char *base = op;
358 		int count = 0;
359 		op++;
360 		while (n > 0) {
361 			*op++ = n & 0xff;
362 			n >>= 8;
363 			count++;
364 		}
365 		DCHECK_GE(count, 1);
366 		DCHECK_LE(count, 4);
367 		*base = LITERAL | ((59+count) << 2);
368 	}
369 	memcpy(op, literal, len);
370 	return op + len;
371 }
372 
373 static INLINE char*
EmitCopyLessThan64(char * op,int offset,int len)374 EmitCopyLessThan64(char *op, int offset, int len)
375 {
376 	DCHECK_LE(len, 64);
377 	DCHECK_GE(len, 4);
378 	DCHECK_LT(offset, 65536);
379 
380 	if ((len < 12) && (offset < 2048)) {
381 		int len_minus_4 = len - 4;
382 		DCHECK_LT(len_minus_4, 8); /* Must fit in 3 bits */
383 		*op++ = COPY_1_BYTE_OFFSET   +
384 			((len_minus_4) << 2) +
385 			((offset >> 8) << 5);
386 		*op++ = offset & 0xff;
387 	} else {
388 		*op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2);
389 		put_unaligned_le16(offset, op);
390 		op += 2;
391 	}
392 	return op;
393 }
394 
395 static INLINE char*
EmitCopy(char * op,int offset,int len)396 EmitCopy(char *op, int offset, int len)
397 {
398 	/* Emit 64 byte copies but make sure to keep at least four bytes
399 	 * reserved */
400 	while (len >= 68) {
401 		op = EmitCopyLessThan64(op, offset, 64);
402 		len -= 64;
403 	}
404 
405 	/* Emit an extra 60 byte copy if have too much data to fit in one
406 	 * copy */
407 	if (len > 64) {
408 		op = EmitCopyLessThan64(op, offset, 60);
409 		len -= 60;
410 	}
411 
412 	/* Emit remainder */
413 	op = EmitCopyLessThan64(op, offset, len);
414 	return op;
415 }
416 
417 
418 /*
419 For 0 <= offset <= 4, GetUint32AtOffset(GetEightBytesAt(p), offset) will
420 equal UNALIGNED_LOAD32(p + offset).  Motivation: On x86-64 hardware we have
421 empirically found that overlapping loads such as
422  UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
423 are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32.
424 
425 We have different versions for 64- and 32-bit; ideally we would avoid the
426 two functions and just INLINE the UNALIGNED_LOAD64 call into
427 GetUint32AtOffset, but GCC (at least not as of 4.6) is seemingly not clever
428 enough to avoid loading the value multiple times then. For 64-bit, the load
429 is done when GetEightBytesAt() is called, whereas for 32-bit, the load is
430 done at GetUint32AtOffset() time.
431 */
432 
433 #if defined(__x86_64__) || (__SIZEOF_SIZE_T__ == 8)
434 
435 typedef uint64_t EightBytesReference;
436 
GetEightBytesAt(const char * ptr)437 static INLINE EightBytesReference GetEightBytesAt(const char* ptr) {
438 	return UNALIGNED_LOAD64(ptr);
439 }
440 
GetUint32AtOffset(uint64_t v,int offset)441 static INLINE uint32_t GetUint32AtOffset(uint64_t v, int offset) {
442 	DCHECK_GE(offset, 0);
443 	DCHECK_LE(offset, 4);
444 #if __BYTE_ORDER == __LITTLE_ENDIAN
445 	return v >> (8 * offset);
446 #else
447 	return v >> (32 - 8 * offset);
448 #endif
449 }
450 
451 #else /* !ARCH_K8 */
452 
453 typedef const char* EightBytesReference;
454 
GetEightBytesAt(const char * ptr)455 static INLINE EightBytesReference GetEightBytesAt(const char* ptr) {
456 	return ptr;
457 }
458 
GetUint32AtOffset(const char * v,int offset)459 static INLINE uint32_t GetUint32AtOffset(const char* v, int offset) {
460 	DCHECK_GE(offset, 0);
461 	DCHECK_LE(offset, 4);
462 	return UNALIGNED_LOAD32(v + offset);
463 }
464 
465 #endif /* !ARCH_K8 */
466 
467 
468 #define kInputMarginBytes 15
469 char*
csnappy_compress_fragment(const char * input,const uint32_t input_size,char * op,void * working_memory,const int workmem_bytes_power_of_two)470 csnappy_compress_fragment(
471 	const char *input,
472 	const uint32_t input_size,
473 	char *op,
474 	void *working_memory,
475 	const int workmem_bytes_power_of_two)
476 {
477 	const char *ip, *ip_end, *base_ip, *next_emit, *ip_limit, *next_ip,
478 			*candidate, *base;
479 	uint16_t *table = (uint16_t *)working_memory;
480 	EightBytesReference input_bytes;
481 	uint32_t hash, next_hash, prev_hash, cur_hash, skip, candidate_bytes;
482 	int shift, matched;
483 
484 	DCHECK_GE(workmem_bytes_power_of_two, 9);
485 	DCHECK_LE(workmem_bytes_power_of_two, 15);
486 	/* Table of 2^X bytes, need (X-1) bits to address table of uint16_t.
487 	 * How many bits of 32bit hash function result are discarded? */
488 	shift = 33 - workmem_bytes_power_of_two;
489 	/* "ip" is the input pointer, and "op" is the output pointer. */
490 	ip = input;
491 	DCHECK_LE(input_size, kBlockSize);
492 	ip_end = input + input_size;
493 	base_ip = ip;
494 	/* Bytes in [next_emit, ip) will be emitted as literal bytes. Or
495 	   [next_emit, ip_end) after the main loop. */
496 	next_emit = ip;
497 
498 	if (unlikely(input_size < kInputMarginBytes))
499 		goto emit_remainder;
500 
501 	memset(working_memory, 0, 1 << workmem_bytes_power_of_two);
502 
503 	ip_limit = input + input_size - kInputMarginBytes;
504 	next_hash = Hash(++ip, shift);
505 
506 main_loop:
507 	DCHECK_LT(next_emit, ip);
508 	/*
509 	* The body of this loop calls EmitLiteral once and then EmitCopy one or
510 	* more times. (The exception is that when we're close to exhausting
511 	* the input we goto emit_remainder.)
512 	*
513 	* In the first iteration of this loop we're just starting, so
514 	* there's nothing to copy, so calling EmitLiteral once is
515 	* necessary. And we only start a new iteration when the
516 	* current iteration has determined that a call to EmitLiteral will
517 	* precede the next call to EmitCopy (if any).
518 	*
519 	* Step 1: Scan forward in the input looking for a 4-byte-long match.
520 	* If we get close to exhausting the input then goto emit_remainder.
521 	*
522 	* Heuristic match skipping: If 32 bytes are scanned with no matches
523 	* found, start looking only at every other byte. If 32 more bytes are
524 	* scanned, look at every third byte, etc.. When a match is found,
525 	* immediately go back to looking at every byte. This is a small loss
526 	* (~5% performance, ~0.1% density) for compressible data due to more
527 	* bookkeeping, but for non-compressible data (such as JPEG) it's a huge
528 	* win since the compressor quickly "realizes" the data is incompressible
529 	* and doesn't bother looking for matches everywhere.
530 	*
531 	* The "skip" variable keeps track of how many bytes there are since the
532 	* last match; dividing it by 32 (ie. right-shifting by five) gives the
533 	* number of bytes to move ahead for each iteration.
534 	*/
535 	skip = 32;
536 
537 	next_ip = ip;
538 	do {
539 		ip = next_ip;
540 		hash = next_hash;
541 		DCHECK_EQ(hash, Hash(ip, shift));
542 		next_ip = ip + (skip++ >> 5);
543 		if (unlikely(next_ip > ip_limit))
544 			goto emit_remainder;
545 		next_hash = Hash(next_ip, shift);
546 		candidate = base_ip + table[hash];
547 		DCHECK_GE(candidate, base_ip);
548 		DCHECK_LT(candidate, ip);
549 
550 		table[hash] = ip - base_ip;
551 	} while (likely(UNALIGNED_LOAD32(ip) !=
552 			UNALIGNED_LOAD32(candidate)));
553 
554 	/*
555 	* Step 2: A 4-byte match has been found. We'll later see if more
556 	* than 4 bytes match. But, prior to the match, input
557 	* bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
558 	*/
559 	DCHECK_LE(next_emit + 16, ip_end);
560 	op = EmitLiteral(op, next_emit, ip - next_emit, 1);
561 
562 	/*
563 	* Step 3: Call EmitCopy, and then see if another EmitCopy could
564 	* be our next move. Repeat until we find no match for the
565 	* input immediately after what was consumed by the last EmitCopy call.
566 	*
567 	* If we exit this loop normally then we need to call EmitLiteral next,
568 	* though we don't yet know how big the literal will be. We handle that
569 	* by proceeding to the next iteration of the main loop. We also can exit
570 	* this loop via goto if we get close to exhausting the input.
571 	*/
572 	candidate_bytes = 0;
573 
574 	do {
575 		/* We have a 4-byte match at ip, and no need to emit any
576 		 "literal bytes" prior to ip. */
577 		base = ip;
578 		matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
579 		ip += matched;
580 		DCHECK_EQ(0, memcmp(base, candidate, matched));
581 		op = EmitCopy(op, base - candidate, matched);
582 		/* We could immediately start working at ip now, but to improve
583 		 compression we first update table[Hash(ip - 1, ...)]. */
584 		next_emit = ip;
585 		if (unlikely(ip >= ip_limit))
586 			goto emit_remainder;
587 		input_bytes = GetEightBytesAt(ip - 1);
588 		prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
589 		table[prev_hash] = ip - base_ip - 1;
590 		cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
591 		candidate = base_ip + table[cur_hash];
592 		candidate_bytes = UNALIGNED_LOAD32(candidate);
593 		table[cur_hash] = ip - base_ip;
594 	} while (GetUint32AtOffset(input_bytes, 1) == candidate_bytes);
595 
596 	next_hash = HashBytes(GetUint32AtOffset(input_bytes, 2), shift);
597 	++ip;
598 	goto main_loop;
599 
600 emit_remainder:
601 	/* Emit the remaining bytes as a literal */
602 	if (next_emit < ip_end)
603 		op = EmitLiteral(op, next_emit, ip_end - next_emit, 0);
604 
605 	return op;
606 }
607 #endif /* !simple */
608 #if defined(__KERNEL__) && !defined(STATIC)
609 EXPORT_SYMBOL(csnappy_compress_fragment);
610 #endif
611 
612 uint32_t __attribute__((const))
csnappy_max_compressed_length(uint32_t source_len)613 csnappy_max_compressed_length(uint32_t source_len)
614 {
615 	return 32 + source_len + source_len/6;
616 }
617 #if defined(__KERNEL__) && !defined(STATIC)
618 EXPORT_SYMBOL(csnappy_max_compressed_length);
619 #endif
620 
621 void
csnappy_compress(const char * input,uint32_t input_length,char * compressed,uint32_t * compressed_length,void * working_memory,const int workmem_bytes_power_of_two)622 csnappy_compress(
623 	const char *input,
624 	uint32_t input_length,
625 	char *compressed,
626 	uint32_t *compressed_length,
627 	void *working_memory,
628 	const int workmem_bytes_power_of_two)
629 {
630 	int workmem_size;
631 	int num_to_read;
632 	uint32_t written = 0;
633 	char *p = encode_varint32(compressed, input_length);
634 	written += (p - compressed);
635 	compressed = p;
636 	while (input_length > 0) {
637 		num_to_read = min(input_length, (uint32_t)kBlockSize);
638 		workmem_size = workmem_bytes_power_of_two;
639 		if (unlikely(num_to_read < kBlockSize)) {
640 			for (workmem_size = 9;
641 			     workmem_size < workmem_bytes_power_of_two;
642 			     ++workmem_size) {
643 				if ((1 << (workmem_size-1)) >= num_to_read)
644 					break;
645 			}
646 		}
647 		p = csnappy_compress_fragment(
648 				input, num_to_read, compressed,
649 				working_memory, workmem_size);
650 		written += (p - compressed);
651 		compressed = p;
652 		input_length -= num_to_read;
653 		input += num_to_read;
654 	}
655 	*compressed_length = written;
656 }
657 #if defined(__KERNEL__) && !defined(STATIC)
658 EXPORT_SYMBOL(csnappy_compress);
659 
660 MODULE_LICENSE("BSD");
661 MODULE_DESCRIPTION("Snappy Compressor");
662 #endif
663