/*
 * adler32.c - Adler-32 checksum algorithm
 *
 * Originally public domain; changes after 2016-09-07 are copyrighted.
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "x86_cpu_features.h"

#include "libdeflate.h"

/* The Adler-32 divisor, or "base", value. */
#define DIVISOR 65521
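/*
 * For reference, Adler-32 is defined over the input bytes D1..Dn as
 *
 *	s1 = (1 + D1 + D2 + ... + Dn) mod 65521
 *	s2 = (n + n*D1 + (n-1)*D2 + ... + Dn) mod 65521
 *
 * i.e. s2 is the sum of the successive values of s1, and the final checksum
 * is (s2 << 16) | s1.
 */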

/*
 * MAX_BYTES_PER_CHUNK is the most bytes that can be processed without the
 * possibility of s2 overflowing when it is represented as an unsigned 32-bit
 * integer.  This value was computed using the following Python script:
 *
 *	divisor = 65521
 *	count = 0
 *	s1 = divisor - 1
 *	s2 = divisor - 1
 *	while True:
 *		s1 += 0xFF
 *		s2 += s1
 *		if s2 > 0xFFFFFFFF:
 *			break
 *		count += 1
 *	print(count)
 *
 * Note that to get the correct worst-case value, we must assume that every
 * byte has value 0xFF and that s1 and s2 started with the highest possible
 * values modulo the divisor.
 */
#define MAX_BYTES_PER_CHUNK	5552
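
/*
 * Illustrative only (not from the original source): the same computation as
 * the Python script above, as a self-contained C program.  It should print
 * 5552.  Kept under "#if 0" so it is not part of the build.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t s1 = 65521 - 1; /* worst-case starting value (divisor - 1) */
	uint64_t s2 = 65521 - 1; /* 64 bits wide so 32-bit overflow is visible */
	unsigned count = 0;

	for (;;) {
		s1 += 0xFF;		/* worst case: every byte is 0xFF */
		s2 += s1;
		if (s2 > 0xFFFFFFFF)	/* s2 no longer fits in a u32 */
			break;
		count++;
	}
	printf("%u\n", count);		/* prints 5552 */
	return 0;
}
#endif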

/* Select the implementations to compile in. */

#define NEED_GENERIC_IMPL 1 /* include generic impl unless overridden */

/* Include the SSE2 implementation? */
#define NEED_SSE2_IMPL 0
#ifdef __SSE2__
#  include <emmintrin.h>
#  undef NEED_SSE2_IMPL
#  define NEED_SSE2_IMPL 1
#  undef NEED_GENERIC_IMPL
#  define NEED_GENERIC_IMPL 0 /* generic impl not needed */
#endif

/* Include the AVX2 implementation? */
#define NEED_AVX2_IMPL 0
#if defined(__AVX2__) || \
	(X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX2_TARGET && \
	 COMPILER_SUPPORTS_TARGET_INTRINSICS)
#  include <immintrin.h>
#  undef NEED_AVX2_IMPL
#  define NEED_AVX2_IMPL 1
#  ifdef __AVX2__ /* compiling for AVX2, i.e. can we assume it's there? */
#    undef NEED_GENERIC_IMPL
#    define NEED_GENERIC_IMPL 0 /* generic impl not needed */
#    undef NEED_SSE2_IMPL
#    define NEED_SSE2_IMPL 0 /* SSE2 impl not needed */
#  endif /* otherwise, we can build an AVX2 version, but we won't know whether
	    we can use it until runtime */
#endif

/* Include the NEON implementation? */
#define NEED_NEON_IMPL 0
#ifdef __ARM_NEON
#  include <arm_neon.h>
#  undef NEED_NEON_IMPL
#  define NEED_NEON_IMPL 1
#  undef NEED_GENERIC_IMPL
#  define NEED_GENERIC_IMPL 0 /* generic impl not needed */
#endif

#define NUM_IMPLS (NEED_GENERIC_IMPL + NEED_SSE2_IMPL + NEED_AVX2_IMPL + \
		   NEED_NEON_IMPL)

/* Define the generic implementation if needed. */
#if NEED_GENERIC_IMPL
static u32 adler32_generic(u32 adler, const void *buffer, size_t size)
{
	u32 s1 = adler & 0xFFFF;
	u32 s2 = adler >> 16;
	const u8 *p = buffer;
	const u8 * const end = p + size;

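	/*
	 * Process the input in chunks of at most MAX_BYTES_PER_CHUNK bytes,
	 * so that s1 and s2 cannot overflow their 32-bit representations;
	 * the relatively expensive modulo reductions then only have to be
	 * done once per chunk rather than once per byte.
	 */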
	while (p != end) {
		size_t chunk_size = MIN(end - p, MAX_BYTES_PER_CHUNK);
		const u8 *chunk_end = p + chunk_size;
		size_t num_unrolled_iterations = chunk_size / 4;

		while (num_unrolled_iterations--) {
			s1 += *p++;
			s2 += s1;
			s1 += *p++;
			s2 += s1;
			s1 += *p++;
			s2 += s1;
			s1 += *p++;
			s2 += s1;
		}
		while (p != chunk_end) {
			s1 += *p++;
			s2 += s1;
		}
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

	return (s2 << 16) | s1;
}
#define DEFAULT_IMPL adler32_generic
#endif /* NEED_GENERIC_IMPL */

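/*
 * Tags identifying the vector targets.  Each block below defines TARGET to
 * one of these, along with FUNCNAME, ALIGNMENT_REQUIRED, BYTES_PER_ITERATION,
 * and ATTRIBUTES, to parameterize the implementation that "adler32_impl.h"
 * defines.
 */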
#define TARGET_SSE2 100
#define TARGET_AVX2 200
#define TARGET_NEON 300

/* Define the SSE2 implementation if needed. */
#if NEED_SSE2_IMPL
#  define FUNCNAME		adler32_sse2
#  define TARGET		TARGET_SSE2
#  define ALIGNMENT_REQUIRED	16
#  define BYTES_PER_ITERATION	32
#  define ATTRIBUTES
#  define DEFAULT_IMPL		adler32_sse2
#  include "adler32_impl.h"
#endif

/* Define the AVX2 implementation if needed. */
#if NEED_AVX2_IMPL
#  define FUNCNAME		adler32_avx2
#  define TARGET		TARGET_AVX2
#  define ALIGNMENT_REQUIRED	32
#  define BYTES_PER_ITERATION	32
#  ifdef __AVX2__
#    define ATTRIBUTES
#    define DEFAULT_IMPL	adler32_avx2
#  else
#    define ATTRIBUTES		__attribute__((target("avx2")))
#  endif
#  include "adler32_impl.h"
#endif

/* Define the NEON implementation if needed. */
#if NEED_NEON_IMPL
#  define FUNCNAME		adler32_neon
#  define TARGET		TARGET_NEON
#  define ALIGNMENT_REQUIRED	16
#  define BYTES_PER_ITERATION	32
#  define ATTRIBUTES
#  define DEFAULT_IMPL		adler32_neon
#  include "adler32_impl.h"
#endif

typedef u32 (*adler32_func_t)(u32, const void *, size_t);

/*
 * If multiple implementations are available, then dispatch among them at
 * runtime based on CPU features.  The first call goes through dispatch(),
 * which chooses the best available implementation, caches the choice in
 * 'adler32_impl', then forwards the call to it; later calls go to the chosen
 * implementation directly.  If only one implementation is available, just
 * call it directly.
 */
#if NUM_IMPLS == 1
#  define adler32_impl DEFAULT_IMPL
#else
static u32 dispatch(u32, const void *, size_t);

static adler32_func_t adler32_impl = dispatch;

static u32 dispatch(u32 adler, const void *buffer, size_t size)
{
	adler32_func_t f = DEFAULT_IMPL;
#if NEED_AVX2_IMPL && !defined(__AVX2__)
	if (x86_have_cpu_features(X86_CPU_FEATURE_AVX2))
		f = adler32_avx2;
#endif
	adler32_impl = f;
	return adler32_impl(adler, buffer, size);
}
#endif /* NUM_IMPLS != 1 */

LIBDEFLATEAPI u32
libdeflate_adler32(u32 adler, const void *buffer, size_t size)
{
	if (buffer == NULL) /* return initial value */
		return 1;
	return adler32_impl(adler, buffer, size);
}
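
/*
 * Usage sketch (illustrative, not part of the original file): computing a
 * checksum either in one call or incrementally.  By the convention above,
 * seeding with 1 (the value returned for a NULL buffer) starts a fresh
 * checksum.  Kept under "#if 0" so it is not part of the build.
 */
#if 0
#include <string.h>

static u32 checksum_example(void)
{
	static const char data[] = "hello, world";
	size_t len = strlen(data);

	/* One call, seeded with the initial value 1. */
	u32 whole = libdeflate_adler32(1, data, len);

	/* Incrementally: feed the same bytes in two pieces, chaining the
	 * result of the first call into the second. */
	u32 parts = libdeflate_adler32(1, data, 5);
	parts = libdeflate_adler32(parts, data + 5, len - 5);

	/* Both approaches yield the same checksum. */
	return (whole == parts) ? whole : 0;
}
#endif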