1 /* functable.c -- Choose relevant optimized functions at runtime
2 * Copyright (C) 2017 Hans Kristian Rosbach
3 * For conditions of distribution and use, see copyright notice in zlib.h
4 */
5
6 #include "zbuild.h"
7 #include "zendian.h"
8 #include "deflate.h"
9 #include "deflate_p.h"
10
11 #include "functable.h"
12
13 #ifdef X86_FEATURES
14 # include "fallback_builtins.h"
15 #endif
16
17 /* insert_string */
18 extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
19 #ifdef X86_SSE42_CRC_HASH
20 extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count);
21 #elif defined(ARM_ACLE_CRC_HASH)
22 extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
23 #endif
24
25 /* quick_insert_string */
26 extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
27 #ifdef X86_SSE42_CRC_HASH
28 extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str);
29 #elif defined(ARM_ACLE_CRC_HASH)
30 extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
31 #endif
32
33 /* slide_hash */
34 #ifdef X86_SSE2
35 void slide_hash_sse2(deflate_state *s);
36 #elif defined(ARM_NEON_SLIDEHASH)
37 void slide_hash_neon(deflate_state *s);
38 #elif defined(POWER8_VSX_SLIDEHASH)
39 void slide_hash_power8(deflate_state *s);
40 #endif
41 #ifdef X86_AVX2
42 void slide_hash_avx2(deflate_state *s);
43 #endif
44
45 /* adler32 */
46 extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
47 #ifdef ARM_NEON_ADLER32
48 extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
49 #endif
50 #ifdef X86_SSSE3_ADLER32
51 extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
52 #endif
53 #ifdef X86_AVX2_ADLER32
54 extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
55 #endif
56 #ifdef POWER8_VSX_ADLER32
57 extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
58 #endif
59
60 /* memory chunking */
61 extern uint32_t chunksize_c(void);
62 extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
63 extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
64 extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
65 extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
66 extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
67 #ifdef X86_SSE2_CHUNKSET
68 extern uint32_t chunksize_sse2(void);
69 extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
70 extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
71 extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
72 extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
73 extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
74 #endif
75 #ifdef X86_AVX_CHUNKSET
76 extern uint32_t chunksize_avx(void);
77 extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len);
78 extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
79 extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len);
80 extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len);
81 extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left);
82 #endif
83 #ifdef ARM_NEON_CHUNKSET
84 extern uint32_t chunksize_neon(void);
85 extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
86 extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
87 extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
88 extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
89 extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
90 #endif
91
92 /* CRC32 */
93 Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);
94
95 #ifdef ARM_ACLE_CRC_HASH
96 extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t);
97 #endif
98
99 #if BYTE_ORDER == LITTLE_ENDIAN
100 extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t);
101 #elif BYTE_ORDER == BIG_ENDIAN
102 extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t);
103 #endif
104
105 /* compare258 */
106 extern uint32_t compare258_c(const unsigned char *src0, const unsigned char *src1);
107 #ifdef UNALIGNED_OK
108 extern uint32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1);
109 extern uint32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1);
110 #ifdef UNALIGNED64_OK
111 extern uint32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1);
112 #endif
113 #ifdef X86_SSE42_CMP_STR
114 extern uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
115 #endif
116 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
117 extern uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
118 #endif
119 #endif
120
121 /* longest_match */
122 extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
123 #ifdef UNALIGNED_OK
124 extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
125 extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
126 #ifdef UNALIGNED64_OK
127 extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
128 #endif
129 #ifdef X86_SSE42_CMP_STR
130 extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
131 #endif
132 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
133 extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
134 #endif
135 #endif
136
137 Z_INTERNAL Z_TLS struct functable_s functable;
138
cpu_check_features(void)139 Z_INTERNAL void cpu_check_features(void)
140 {
141 static int features_checked = 0;
142 if (features_checked)
143 return;
144 #if defined(X86_FEATURES)
145 x86_check_features();
146 #elif defined(ARM_FEATURES)
147 arm_check_features();
148 #elif defined(POWER_FEATURES)
149 power_check_features();
150 #endif
151 features_checked = 1;
152 }
153
154 /* stub functions */
insert_string_stub(deflate_state * const s,const uint32_t str,uint32_t count)155 Z_INTERNAL void insert_string_stub(deflate_state *const s, const uint32_t str, uint32_t count) {
156 // Initialize default
157
158 functable.insert_string = &insert_string_c;
159 cpu_check_features();
160
161 #ifdef X86_SSE42_CRC_HASH
162 if (x86_cpu_has_sse42)
163 functable.insert_string = &insert_string_sse4;
164 #elif defined(ARM_ACLE_CRC_HASH)
165 if (arm_cpu_has_crc32)
166 functable.insert_string = &insert_string_acle;
167 #endif
168
169 functable.insert_string(s, str, count);
170 }
171
quick_insert_string_stub(deflate_state * const s,const uint32_t str)172 Z_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const uint32_t str) {
173 functable.quick_insert_string = &quick_insert_string_c;
174
175 #ifdef X86_SSE42_CRC_HASH
176 if (x86_cpu_has_sse42)
177 functable.quick_insert_string = &quick_insert_string_sse4;
178 #elif defined(ARM_ACLE_CRC_HASH)
179 if (arm_cpu_has_crc32)
180 functable.quick_insert_string = &quick_insert_string_acle;
181 #endif
182
183 return functable.quick_insert_string(s, str);
184 }
185
/* First-call stub for functable.slide_hash: runs CPU feature detection,
 * installs the fastest available slide_hash implementation into the function
 * table so subsequent calls dispatch directly, then performs this call. */
Z_INTERNAL void slide_hash_stub(deflate_state *s) {

    functable.slide_hash = &slide_hash_c;
    cpu_check_features();

#ifdef X86_SSE2
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    /* On 32-bit x86, SSE2 must be verified at runtime; on x86-64 (or with
       X86_NOCHECK_SSE2) the assignment below is unconditional. */
    if (x86_cpu_has_sse2)
# endif
        functable.slide_hash = &slide_hash_sse2;
#elif defined(ARM_NEON_SLIDEHASH)
# ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
# endif
        functable.slide_hash = &slide_hash_neon;
#endif
#ifdef X86_AVX2
    /* Checked after SSE2 so the wider AVX2 variant wins when available. */
    if (x86_cpu_has_avx2)
        functable.slide_hash = &slide_hash_avx2;
#endif
#ifdef POWER8_VSX_SLIDEHASH
    if (power_cpu_has_arch_2_07)
        functable.slide_hash = &slide_hash_power8;
#endif

    functable.slide_hash(s);
}
213
/* First-call stub for functable.adler32: runs CPU feature detection, installs
 * the fastest available Adler-32 implementation into the function table so
 * subsequent calls dispatch directly, then computes this call's checksum. */
Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
    // Initialize default
    functable.adler32 = &adler32_c;
    cpu_check_features();

#ifdef ARM_NEON_ADLER32
# ifndef ARM_NOCHECK_NEON
    /* With ARM_NOCHECK_NEON the runtime test is compiled out and NEON is
       assumed to be present. */
    if (arm_cpu_has_neon)
# endif
        functable.adler32 = &adler32_neon;
#endif
#ifdef X86_SSSE3_ADLER32
    if (x86_cpu_has_ssse3)
        functable.adler32 = &adler32_ssse3;
#endif
#ifdef X86_AVX2_ADLER32
    /* Later checks override earlier ones, so the widest supported
       vector variant wins. */
    if (x86_cpu_has_avx2)
        functable.adler32 = &adler32_avx2;
#endif
#ifdef POWER8_VSX_ADLER32
    if (power_cpu_has_arch_2_07)
        functable.adler32 = &adler32_power8;
#endif

    return functable.adler32(adler, buf, len);
}
240
chunksize_stub(void)241 Z_INTERNAL uint32_t chunksize_stub(void) {
242 // Initialize default
243 functable.chunksize = &chunksize_c;
244
245 #ifdef X86_SSE2_CHUNKSET
246 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
247 if (x86_cpu_has_sse2)
248 # endif
249 functable.chunksize = &chunksize_sse2;
250 #endif
251 #ifdef X86_AVX_CHUNKSET
252 if (x86_cpu_has_avx2)
253 functable.chunksize = &chunksize_avx;
254 #endif
255 #ifdef ARM_NEON_CHUNKSET
256 if (arm_cpu_has_neon)
257 functable.chunksize = &chunksize_neon;
258 #endif
259
260 return functable.chunksize();
261 }
262
chunkcopy_stub(uint8_t * out,uint8_t const * from,unsigned len)263 Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) {
264 // Initialize default
265 functable.chunkcopy = &chunkcopy_c;
266
267 #ifdef X86_SSE2_CHUNKSET
268 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
269 if (x86_cpu_has_sse2)
270 # endif
271 functable.chunkcopy = &chunkcopy_sse2;
272 #endif
273 #ifdef X86_AVX_CHUNKSET
274 if (x86_cpu_has_avx2)
275 functable.chunkcopy = &chunkcopy_avx;
276 #endif
277 #ifdef ARM_NEON_CHUNKSET
278 if (arm_cpu_has_neon)
279 functable.chunkcopy = &chunkcopy_neon;
280 #endif
281
282 return functable.chunkcopy(out, from, len);
283 }
284
chunkcopy_safe_stub(uint8_t * out,uint8_t const * from,unsigned len,uint8_t * safe)285 Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
286 // Initialize default
287 functable.chunkcopy_safe = &chunkcopy_safe_c;
288
289 #ifdef X86_SSE2_CHUNKSET
290 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
291 if (x86_cpu_has_sse2)
292 # endif
293 functable.chunkcopy_safe = &chunkcopy_safe_sse2;
294 #endif
295 #ifdef X86_AVX_CHUNKSET
296 if (x86_cpu_has_avx2)
297 functable.chunkcopy_safe = &chunkcopy_safe_avx;
298 #endif
299 #ifdef ARM_NEON_CHUNKSET
300 if (arm_cpu_has_neon)
301 functable.chunkcopy_safe = &chunkcopy_safe_neon;
302 #endif
303
304 return functable.chunkcopy_safe(out, from, len, safe);
305 }
306
chunkunroll_stub(uint8_t * out,unsigned * dist,unsigned * len)307 Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
308 // Initialize default
309 functable.chunkunroll = &chunkunroll_c;
310
311 #ifdef X86_SSE2_CHUNKSET
312 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
313 if (x86_cpu_has_sse2)
314 # endif
315 functable.chunkunroll = &chunkunroll_sse2;
316 #endif
317 #ifdef X86_AVX_CHUNKSET
318 if (x86_cpu_has_avx2)
319 functable.chunkunroll = &chunkunroll_avx;
320 #endif
321 #ifdef ARM_NEON_CHUNKSET
322 if (arm_cpu_has_neon)
323 functable.chunkunroll = &chunkunroll_neon;
324 #endif
325
326 return functable.chunkunroll(out, dist, len);
327 }
328
chunkmemset_stub(uint8_t * out,unsigned dist,unsigned len)329 Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) {
330 // Initialize default
331 functable.chunkmemset = &chunkmemset_c;
332
333 #ifdef X86_SSE2_CHUNKSET
334 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
335 if (x86_cpu_has_sse2)
336 # endif
337 functable.chunkmemset = &chunkmemset_sse2;
338 #endif
339 #ifdef X86_AVX_CHUNKSET
340 if (x86_cpu_has_avx2)
341 functable.chunkmemset = &chunkmemset_avx;
342 #endif
343 #ifdef ARM_NEON_CHUNKSET
344 if (arm_cpu_has_neon)
345 functable.chunkmemset = &chunkmemset_neon;
346 #endif
347
348 return functable.chunkmemset(out, dist, len);
349 }
350
chunkmemset_safe_stub(uint8_t * out,unsigned dist,unsigned len,unsigned left)351 Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
352 // Initialize default
353 functable.chunkmemset_safe = &chunkmemset_safe_c;
354
355 #ifdef X86_SSE2_CHUNKSET
356 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
357 if (x86_cpu_has_sse2)
358 # endif
359 functable.chunkmemset_safe = &chunkmemset_safe_sse2;
360 #endif
361 #ifdef X86_AVX_CHUNKSET
362 if (x86_cpu_has_avx2)
363 functable.chunkmemset_safe = &chunkmemset_safe_avx;
364 #endif
365 #ifdef ARM_NEON_CHUNKSET
366 if (arm_cpu_has_neon)
367 functable.chunkmemset_safe = &chunkmemset_safe_neon;
368 #endif
369
370 return functable.chunkmemset_safe(out, dist, len, left);
371 }
372
/* First-call stub for functable.crc32: picks a CRC-32 implementation based on
 * pointer width, byte order, and CPU features, installs it into the function
 * table so subsequent calls dispatch directly, then computes this call's CRC. */
Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) {
    /* Use the word-at-a-time tables only when an object pointer fits in a
       ptrdiff_t — presumably excludes segmented-memory targets; the generic
       byte-wise code is used otherwise. TODO(review): confirm rationale. */
    int32_t use_byfour = sizeof(void *) == sizeof(ptrdiff_t);

    Assert(sizeof(uint64_t) >= sizeof(size_t),
           "crc32_z takes size_t but internally we have a uint64_t len");
    /* return a function pointer for optimized arches here after a capability test */

    cpu_check_features();

    if (use_byfour) {
#if BYTE_ORDER == LITTLE_ENDIAN
        functable.crc32 = crc32_little;
# if defined(ARM_ACLE_CRC_HASH)
        /* Hardware CRC32 instructions beat the table-driven code. */
        if (arm_cpu_has_crc32)
            functable.crc32 = crc32_acle;
# endif
#elif BYTE_ORDER == BIG_ENDIAN
        functable.crc32 = crc32_big;
#else
# error No endian defined
#endif
    } else {
        functable.crc32 = crc32_generic;
    }

    return functable.crc32(crc, buf, len);
}
400
compare258_stub(const unsigned char * src0,const unsigned char * src1)401 Z_INTERNAL uint32_t compare258_stub(const unsigned char *src0, const unsigned char *src1) {
402
403 functable.compare258 = &compare258_c;
404
405 #ifdef UNALIGNED_OK
406 # if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
407 functable.compare258 = &compare258_unaligned_64;
408 # elif defined(HAVE_BUILTIN_CTZ)
409 functable.compare258 = &compare258_unaligned_32;
410 # else
411 functable.compare258 = &compare258_unaligned_16;
412 # endif
413 # ifdef X86_SSE42_CMP_STR
414 if (x86_cpu_has_sse42)
415 functable.compare258 = &compare258_unaligned_sse4;
416 # endif
417 # if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
418 if (x86_cpu_has_avx2)
419 functable.compare258 = &compare258_unaligned_avx2;
420 # endif
421 #endif
422
423 return functable.compare258(src0, src1);
424 }
425
longest_match_stub(deflate_state * const s,Pos cur_match)426 Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
427
428 functable.longest_match = &longest_match_c;
429
430 #ifdef UNALIGNED_OK
431 # if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
432 functable.longest_match = &longest_match_unaligned_64;
433 # elif defined(HAVE_BUILTIN_CTZ)
434 functable.longest_match = &longest_match_unaligned_32;
435 # else
436 functable.longest_match = &longest_match_unaligned_16;
437 # endif
438 # ifdef X86_SSE42_CMP_STR
439 if (x86_cpu_has_sse42)
440 functable.longest_match = &longest_match_unaligned_sse4;
441 # endif
442 # if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
443 if (x86_cpu_has_avx2)
444 functable.longest_match = &longest_match_unaligned_avx2;
445 # endif
446 #endif
447
448 return functable.longest_match(s, cur_match);
449 }
450
451 /* functable init */
452 Z_INTERNAL Z_TLS struct functable_s functable = {
453 insert_string_stub,
454 quick_insert_string_stub,
455 adler32_stub,
456 crc32_stub,
457 slide_hash_stub,
458 compare258_stub,
459 longest_match_stub,
460 chunksize_stub,
461 chunkcopy_stub,
462 chunkcopy_safe_stub,
463 chunkunroll_stub,
464 chunkmemset_stub,
465 chunkmemset_safe_stub
466 };
467