1 /*
2 * adler32.c - Adler-32 checksum algorithm
3 *
4 * Originally public domain; changes after 2016-09-07 are copyrighted.
5 *
6 * Copyright 2016 Eric Biggers
7 *
8 * Permission is hereby granted, free of charge, to any person
9 * obtaining a copy of this software and associated documentation
10 * files (the "Software"), to deal in the Software without
11 * restriction, including without limitation the rights to use,
12 * copy, modify, merge, publish, distribute, sublicense, and/or sell
13 * copies of the Software, and to permit persons to whom the
14 * Software is furnished to do so, subject to the following
15 * conditions:
16 *
17 * The above copyright notice and this permission notice shall be
18 * included in all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 * OTHER DEALINGS IN THE SOFTWARE.
28 */
29
30 #include "x86_cpu_features.h"
31
32 #include "libdeflate.h"
33
34 /* The Adler-32 divisor, or "base", value. */
35 #define DIVISOR 65521
36
37 /*
38 * MAX_BYTES_PER_CHUNK is the most bytes that can be processed without the
39 * possibility of s2 overflowing when it is represented as an unsigned 32-bit
40 * integer. This value was computed using the following Python script:
41 *
42 * divisor = 65521
43 * count = 0
44 * s1 = divisor - 1
45 * s2 = divisor - 1
46 * while True:
47 * s1 += 0xFF
48 * s2 += s1
49 * if s2 > 0xFFFFFFFF:
50 * break
51 * count += 1
52 * print(count)
53 *
54 * Note that to get the correct worst-case value, we must assume that every byte
55 * has value 0xFF and that s1 and s2 started with the highest possible values
56 * modulo the divisor.
57 */
58 #define MAX_BYTES_PER_CHUNK 5552
59
60 /* Select the implementations to compile in. */
61
62 #define NEED_GENERIC_IMPL 1 /* include generic impl unless overridden */
63
64 /* Include the SSE2 implementation? */
65 #define NEED_SSE2_IMPL 0
66 #ifdef __SSE2__
67 # include <emmintrin.h>
68 # undef NEED_SSE2_IMPL
69 # define NEED_SSE2_IMPL 1
70 # undef NEED_GENERIC_IMPL
71 # define NEED_GENERIC_IMPL 0 /* generic impl not needed */
72 #endif
73
74 /* Include the AVX2 implementation? */
75 #define NEED_AVX2_IMPL 0
76 #if defined(__AVX2__) || \
77 (X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX2_TARGET && \
78 COMPILER_SUPPORTS_TARGET_INTRINSICS)
79 # include <immintrin.h>
80 # undef NEED_AVX2_IMPL
81 # define NEED_AVX2_IMPL 1
82 # ifdef __AVX2__ /* compiling for AVX2, i.e. can we assume it's there? */
83 # undef NEED_GENERIC_IMPL
84 # define NEED_GENERIC_IMPL 0 /* generic impl not needed */
85 # undef NEED_SSE2_IMPL
86 # define NEED_SSE2_IMPL 0 /* SSE2 impl not needed */
87 # endif /* otherwise, we can build an AVX2 version, but we won't know whether
88 we can use it until runtime */
89 #endif
90
91 /* Include the NEON implementation? */
92 #define NEED_NEON_IMPL 0
93 #ifdef __ARM_NEON
94 # include <arm_neon.h>
95 # undef NEED_NEON_IMPL
96 # define NEED_NEON_IMPL 1
97 # undef NEED_GENERIC_IMPL
98 # define NEED_GENERIC_IMPL 0 /* generic impl not needed */
99 #endif
100
101 #define NUM_IMPLS (NEED_GENERIC_IMPL + NEED_SSE2_IMPL + NEED_AVX2_IMPL + \
102 NEED_NEON_IMPL)
103
104 /* Define the generic implementation if needed. */
105 #if NEED_GENERIC_IMPL
/*
 * Generic (portable scalar) Adler-32 implementation.
 *
 * The two 16-bit sums are accumulated in 32-bit variables and reduced
 * modulo DIVISOR only once per chunk of at most MAX_BYTES_PER_CHUNK
 * bytes — the largest chunk size for which the high sum cannot overflow
 * a u32 (see the analysis above).
 */
static u32 adler32_generic(u32 adler, const void *buffer, size_t size)
{
	u32 lo = adler & 0xFFFF;	/* s1: 1 + sum of bytes */
	u32 hi = adler >> 16;		/* s2: sum of the s1 values */
	const u8 *in = buffer;
	size_t remaining = size;

	while (remaining != 0) {
		size_t chunk = MIN(remaining, MAX_BYTES_PER_CHUNK);
		size_t i;

		/* Groups of 4 bytes first (manual unroll), then stragglers. */
		for (i = 0; i + 4 <= chunk; i += 4) {
			lo += in[i + 0];
			hi += lo;
			lo += in[i + 1];
			hi += lo;
			lo += in[i + 2];
			hi += lo;
			lo += in[i + 3];
			hi += lo;
		}
		for (; i < chunk; i++) {
			lo += in[i];
			hi += lo;
		}

		/* One reduction per chunk keeps both sums in range. */
		lo %= DIVISOR;
		hi %= DIVISOR;

		in += chunk;
		remaining -= chunk;
	}

	return (hi << 16) | lo;
}
138 #define DEFAULT_IMPL adler32_generic
139 #endif /* NEED_GENERIC_IMPL */
140
141 #define TARGET_SSE2 100
142 #define TARGET_AVX2 200
143 #define TARGET_NEON 300
144
145 /* Define the SSE2 implementation if needed. */
146 #if NEED_SSE2_IMPL
147 # define FUNCNAME adler32_sse2
148 # define TARGET TARGET_SSE2
149 # define ALIGNMENT_REQUIRED 16
150 # define BYTES_PER_ITERATION 32
151 # define ATTRIBUTES
152 # define DEFAULT_IMPL adler32_sse2
153 # include "adler32_impl.h"
154 #endif
155
156 /* Define the AVX2 implementation if needed. */
157 #if NEED_AVX2_IMPL
158 # define FUNCNAME adler32_avx2
159 # define TARGET TARGET_AVX2
160 # define ALIGNMENT_REQUIRED 32
161 # define BYTES_PER_ITERATION 32
162 # ifdef __AVX2__
163 # define ATTRIBUTES
164 # define DEFAULT_IMPL adler32_avx2
165 # else
166 # define ATTRIBUTES __attribute__((target("avx2")))
167 # endif
168 # include "adler32_impl.h"
169 #endif
170
171 /* Define the NEON implementation if needed. */
172 #if NEED_NEON_IMPL
173 # define FUNCNAME adler32_neon
174 # define TARGET TARGET_NEON
175 # define ALIGNMENT_REQUIRED 16
176 # define BYTES_PER_ITERATION 32
177 # define ATTRIBUTES
178 # define DEFAULT_IMPL adler32_neon
179 # include "adler32_impl.h"
180 #endif
181
182 typedef u32 (*adler32_func_t)(u32, const void *, size_t);
183
184 /*
185 * If multiple implementations are available, then dispatch among them based on
186 * CPU features at runtime. Otherwise just call the single one directly.
187 */
188 #if NUM_IMPLS == 1
189 # define adler32_impl DEFAULT_IMPL
190 #else
static u32 dispatch(u32, const void *, size_t);

/*
 * Pointer to the Adler-32 implementation to use.  It starts out pointing
 * to dispatch(), which selects the best available implementation on the
 * first call and then overwrites this pointer with it, so that later
 * calls go directly to the selected implementation.
 */
static adler32_func_t adler32_impl = dispatch;

/* One-time implementation selector; see adler32_impl above. */
static u32 dispatch(u32 adler, const void *buffer, size_t size)
{
	adler32_func_t f = DEFAULT_IMPL;
#if NEED_AVX2_IMPL && !defined(__AVX2__)
	/* AVX2 code was compiled in, but availability is only known at runtime. */
	if (x86_have_cpu_features(X86_CPU_FEATURE_AVX2))
		f = adler32_avx2;
#endif
	/*
	 * NOTE(review): plain (non-atomic) store; concurrent first calls race
	 * on adler32_impl.  Presumably relied on as benign because every racing
	 * store writes a valid function pointer — confirm for the supported
	 * platforms.
	 */
	adler32_impl = f;
	return adler32_impl(adler, buffer, size);
}
205 #endif /* NUM_IMPLS != 1 */
206
207 LIBDEFLATEAPI u32
libdeflate_adler32(u32 adler,const void * buffer,size_t size)208 libdeflate_adler32(u32 adler, const void *buffer, size_t size)
209 {
210 if (buffer == NULL) /* return initial value */
211 return 1;
212 return adler32_impl(adler, buffer, size);
213 }
214