/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zendian.h"
#include "deflate.h"
#include "deflate_p.h"

#include "functable.h"

#ifdef X86_FEATURES
#  include "fallback_builtins.h"
#endif

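/* Design note: struct functable_s (initialized at the bottom of this file)
 * holds one function pointer per performance-critical primitive. Every slot
 * starts out pointing at a *_stub function; on first use the stub picks the
 * best implementation available for the running CPU, stores it back into the
 * table, and forwards the call, so all later calls dispatch directly with no
 * further feature checks. */
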
/* insert_string */
extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
#ifdef X86_SSE42_CRC_HASH
extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count);
#elif defined(ARM_ACLE_CRC_HASH)
extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
#endif

/* quick_insert_string */
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
#ifdef X86_SSE42_CRC_HASH
extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str);
#elif defined(ARM_ACLE_CRC_HASH)
extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
#endif

/* slide_hash */
#ifdef X86_SSE2
void slide_hash_sse2(deflate_state *s);
#elif defined(ARM_NEON_SLIDEHASH)
void slide_hash_neon(deflate_state *s);
#elif defined(POWER8_VSX_SLIDEHASH)
void slide_hash_power8(deflate_state *s);
#endif
#ifdef X86_AVX2
void slide_hash_avx2(deflate_state *s);
#endif

/* adler32 */
extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
#ifdef ARM_NEON_ADLER32
extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_SSSE3_ADLER32
extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_AVX2_ADLER32
extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef POWER8_VSX_ADLER32
extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
#endif

/* memory chunking */
extern uint32_t chunksize_c(void);
extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#ifdef X86_SSE2_CHUNKSET
extern uint32_t chunksize_sse2(void);
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef X86_AVX_CHUNKSET
extern uint32_t chunksize_avx(void);
extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef ARM_NEON_CHUNKSET
extern uint32_t chunksize_neon(void);
extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif

/* CRC32 */
Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);

#ifdef ARM_ACLE_CRC_HASH
extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t);
#endif

#if BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t);
#elif BYTE_ORDER == BIG_ENDIAN
extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t);
#endif

/* compare258 */
extern uint32_t compare258_c(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED_OK
extern uint32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1);
extern uint32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED64_OK
extern uint32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
#endif
#endif

/* longest_match */
extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED_OK
extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED64_OK
extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
#endif
#endif

Z_INTERNAL Z_TLS struct functable_s functable;

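/* Run the platform-specific feature detection once and cache the result so
 * the stubs below can cheaply consult flags such as x86_cpu_has_sse42 or
 * arm_cpu_has_neon. */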
Z_INTERNAL void cpu_check_features(void)
{
    static int features_checked = 0;
    if (features_checked)
        return;
#if defined(X86_FEATURES)
    x86_check_features();
#elif defined(ARM_FEATURES)
    arm_check_features();
#elif defined(POWER_FEATURES)
    power_check_features();
#endif
    features_checked = 1;
}

/* stub functions */
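/* Each stub below follows the same pattern: install the portable C
 * implementation as the safe default, upgrade the functable entry if a
 * faster variant was compiled in and the matching CPU feature flag is set,
 * then complete the original call through the (now updated) entry. */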
Z_INTERNAL void insert_string_stub(deflate_state *const s, const uint32_t str, uint32_t count) {
    // Initialize default

    functable.insert_string = &insert_string_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.insert_string = &insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.insert_string = &insert_string_acle;
#endif

    functable.insert_string(s, str, count);
}

Z_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const uint32_t str) {
    functable.quick_insert_string = &quick_insert_string_c;

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.quick_insert_string = &quick_insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.quick_insert_string = &quick_insert_string_acle;
#endif

    return functable.quick_insert_string(s, str);
}

Z_INTERNAL void slide_hash_stub(deflate_state *s) {

    functable.slide_hash = &slide_hash_c;
    cpu_check_features();

#ifdef X86_SSE2
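    /* SSE2 is part of the x86-64 baseline ISA, so the runtime check is only
       needed for 32-bit builds (and can be skipped there too by defining
       X86_NOCHECK_SSE2). */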
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.slide_hash = &slide_hash_sse2;
#elif defined(ARM_NEON_SLIDEHASH)
#  ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
#  endif
        functable.slide_hash = &slide_hash_neon;
#endif
#ifdef X86_AVX2
    if (x86_cpu_has_avx2)
        functable.slide_hash = &slide_hash_avx2;
#endif
#ifdef POWER8_VSX_SLIDEHASH
    if (power_cpu_has_arch_2_07)
        functable.slide_hash = &slide_hash_power8;
#endif

    functable.slide_hash(s);
}

Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
    // Initialize default
    functable.adler32 = &adler32_c;
    cpu_check_features();

#ifdef ARM_NEON_ADLER32
#  ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
#  endif
        functable.adler32 = &adler32_neon;
#endif
#ifdef X86_SSSE3_ADLER32
    if (x86_cpu_has_ssse3)
        functable.adler32 = &adler32_ssse3;
#endif
#ifdef X86_AVX2_ADLER32
    if (x86_cpu_has_avx2)
        functable.adler32 = &adler32_avx2;
#endif
#ifdef POWER8_VSX_ADLER32
    if (power_cpu_has_arch_2_07)
        functable.adler32 = &adler32_power8;
#endif

    return functable.adler32(adler, buf, len);
}

Z_INTERNAL uint32_t chunksize_stub(void) {
    // Initialize default
    functable.chunksize = &chunksize_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunksize = &chunksize_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunksize = &chunksize_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunksize = &chunksize_neon;
#endif

    return functable.chunksize();
}

Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) {
    // Initialize default
    functable.chunkcopy = &chunkcopy_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy = &chunkcopy_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkcopy = &chunkcopy_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy = &chunkcopy_neon;
#endif

    return functable.chunkcopy(out, from, len);
}

Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
    // Initialize default
    functable.chunkcopy_safe = &chunkcopy_safe_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy_safe = &chunkcopy_safe_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkcopy_safe = &chunkcopy_safe_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy_safe = &chunkcopy_safe_neon;
#endif

    return functable.chunkcopy_safe(out, from, len, safe);
}

Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
    // Initialize default
    functable.chunkunroll = &chunkunroll_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkunroll = &chunkunroll_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkunroll = &chunkunroll_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkunroll = &chunkunroll_neon;
#endif

    return functable.chunkunroll(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) {
    // Initialize default
    functable.chunkmemset = &chunkmemset_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset = &chunkmemset_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset = &chunkmemset_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset = &chunkmemset_neon;
#endif

    return functable.chunkmemset(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
    // Initialize default
    functable.chunkmemset_safe = &chunkmemset_safe_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset_safe = &chunkmemset_safe_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset_safe = &chunkmemset_safe_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset_safe = &chunkmemset_safe_neon;
#endif

    return functable.chunkmemset_safe(out, dist, len, left);
}

Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) {
    int32_t use_byfour = sizeof(void *) == sizeof(ptrdiff_t);

    Assert(sizeof(uint64_t) >= sizeof(size_t),
           "crc32_z takes size_t but internally we have a uint64_t len");
    /* return a function pointer for optimized arches here after a capability test */

    cpu_check_features();

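    /* Select the endian-specific "by four" word-at-a-time routines
       (crc32_little/crc32_big) only when pointer and ptrdiff_t widths agree;
       otherwise fall back to crc32_generic. */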
    if (use_byfour) {
#if BYTE_ORDER == LITTLE_ENDIAN
        functable.crc32 = crc32_little;
#  if defined(ARM_ACLE_CRC_HASH)
        if (arm_cpu_has_crc32)
            functable.crc32 = crc32_acle;
#  endif
#elif BYTE_ORDER == BIG_ENDIAN
        functable.crc32 = crc32_big;
#else
#  error No endian defined
#endif
    } else {
        functable.crc32 = crc32_generic;
    }

    return functable.crc32(crc, buf, len);
}

Z_INTERNAL uint32_t compare258_stub(const unsigned char *src0, const unsigned char *src1) {

    functable.compare258 = &compare258_c;

#ifdef UNALIGNED_OK
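    /* Prefer the widest unaligned-load variant the toolchain supports:
       64-bit loads need UNALIGNED64_OK and a 64-bit count-trailing-zeros
       builtin (HAVE_BUILTIN_CTZLL), 32-bit loads need HAVE_BUILTIN_CTZ,
       otherwise fall back to 16-bit loads. The same selection is repeated
       for longest_match below. */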
#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.compare258 = &compare258_unaligned_64;
#  elif defined(HAVE_BUILTIN_CTZ)
    functable.compare258 = &compare258_unaligned_32;
#  else
    functable.compare258 = &compare258_unaligned_16;
#  endif
#  ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.compare258 = &compare258_unaligned_sse4;
#  endif
#  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.compare258 = &compare258_unaligned_avx2;
#  endif
#endif

    return functable.compare258(src0, src1);
}

Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {

    functable.longest_match = &longest_match_c;

#ifdef UNALIGNED_OK
#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.longest_match = &longest_match_unaligned_64;
#  elif defined(HAVE_BUILTIN_CTZ)
    functable.longest_match = &longest_match_unaligned_32;
#  else
    functable.longest_match = &longest_match_unaligned_16;
#  endif
#  ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.longest_match = &longest_match_unaligned_sse4;
#  endif
#  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.longest_match = &longest_match_unaligned_avx2;
#  endif
#endif

    return functable.longest_match(s, cur_match);
}

/* functable init */
Z_INTERNAL Z_TLS struct functable_s functable = {
    insert_string_stub,
    quick_insert_string_stub,
    adler32_stub,
    crc32_stub,
    slide_hash_stub,
    compare258_stub,
    longest_match_stub,
    chunksize_stub,
    chunkcopy_stub,
    chunkcopy_safe_stub,
    chunkunroll_stub,
    chunkmemset_stub,
    chunkmemset_safe_stub
};
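
/*
 * Illustrative usage (a minimal sketch; the real call sites live elsewhere in
 * the library): callers invoke the table entries directly, and the stubs above
 * make the first call self-resolving, e.g.
 *
 *     uint32_t adler = functable.adler32(1, buf, len);
 *     functable.slide_hash(s);
 *
 * After the first call, functable.adler32 (and likewise every other entry)
 * points straight at the implementation selected for the running CPU.
 */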