/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zendian.h"
#include "deflate.h"
#include "deflate_p.h"

#include "functable.h"

#ifdef X86_FEATURES
#  include "fallback_builtins.h"
#endif

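/* Declarations of the architecture-specific implementations.  Each variant is compiled in
 * only when the corresponding feature macro (X86_*, ARM_*, POWER*) is defined by the build
 * system; the plain *_c functions are the portable fallbacks. */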
/* insert_string */
extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
#ifdef X86_SSE42_CRC_HASH
extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count);
#elif defined(ARM_ACLE_CRC_HASH)
extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
#endif

/* quick_insert_string */
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
#ifdef X86_SSE42_CRC_HASH
extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str);
#elif defined(ARM_ACLE_CRC_HASH)
extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
#endif

/* slide_hash */
#ifdef X86_SSE2
void slide_hash_sse2(deflate_state *s);
#elif defined(ARM_NEON_SLIDEHASH)
void slide_hash_neon(deflate_state *s);
#elif defined(POWER8_VSX_SLIDEHASH)
void slide_hash_power8(deflate_state *s);
#endif
#ifdef X86_AVX2
void slide_hash_avx2(deflate_state *s);
#endif

/* adler32 */
extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
#ifdef ARM_NEON_ADLER32
extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_SSSE3_ADLER32
extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_AVX2_ADLER32
extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef POWER8_VSX_ADLER32
extern uint32_t adler32_power8(uint32_t adler, const unsigned char *buf, size_t len);
#endif

/* memory chunking */
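/* A portable C implementation of each chunk helper is always available; SSE2 and NEON
 * variants are declared when the corresponding chunkset feature is enabled. */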
extern uint32_t chunksize_c(void);
extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#ifdef X86_SSE2_CHUNKSET
extern uint32_t chunksize_sse2(void);
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef ARM_NEON_CHUNKSET
extern uint32_t chunksize_neon(void);
extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif

/* CRC32 */
Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);

#ifdef ARM_ACLE_CRC_HASH
extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t);
#endif

#if BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t);
#elif BYTE_ORDER == BIG_ENDIAN
extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t);
#endif

/* compare258 */
extern uint32_t compare258_c(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED_OK
extern uint32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1);
extern uint32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED64_OK
extern uint32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
#endif
#endif

/* longest_match */
extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED_OK
extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED64_OK
extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
#endif
#endif

Z_INTERNAL Z_TLS struct functable_s functable;

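/* Run the architecture-specific feature detection once; the results are stored in the
 * x86_cpu_has_*, arm_cpu_has_* and power_cpu_has_* flags consulted by the stubs below,
 * and repeated calls return immediately. */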
Z_INTERNAL void cpu_check_features(void)
{
    static int features_checked = 0;
    if (features_checked)
        return;
#if defined(X86_FEATURES)
    x86_check_features();
#elif defined(ARM_FEATURES)
    arm_check_features();
#elif defined(POWER_FEATURES)
    power_check_features();
#endif
    features_checked = 1;
}

/* stub functions */
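/* Each entry in the functable is initialized to one of the stubs below.  On first use a
 * stub picks the best implementation available for the current CPU, overwrites its own
 * functable entry with that function, and then forwards the call, so subsequent calls
 * through the table go directly to the selected implementation. */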
Z_INTERNAL void insert_string_stub(deflate_state *const s, const uint32_t str, uint32_t count) {
    // Initialize default

    functable.insert_string = &insert_string_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.insert_string = &insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.insert_string = &insert_string_acle;
#endif

    functable.insert_string(s, str, count);
}

Z_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const uint32_t str) {
    functable.quick_insert_string = &quick_insert_string_c;

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.quick_insert_string = &quick_insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.quick_insert_string = &quick_insert_string_acle;
#endif

    return functable.quick_insert_string(s, str);
}

Z_INTERNAL void slide_hash_stub(deflate_state *s) {

    functable.slide_hash = &slide_hash_c;
    cpu_check_features();

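    /* SSE2 is part of the x86-64 baseline ISA, so on 64-bit targets (or when
       X86_NOCHECK_SSE2 is defined) the runtime check below is compiled out and the
       SSE2 version is installed unconditionally. */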
#ifdef X86_SSE2
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.slide_hash = &slide_hash_sse2;
#elif defined(ARM_NEON_SLIDEHASH)
#  ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
#  endif
        functable.slide_hash = &slide_hash_neon;
#endif
#ifdef X86_AVX2
    if (x86_cpu_has_avx2)
        functable.slide_hash = &slide_hash_avx2;
#endif
#ifdef POWER8_VSX_SLIDEHASH
    if (power_cpu_has_arch_2_07)
        functable.slide_hash = &slide_hash_power8;
#endif

    functable.slide_hash(s);
}

Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
    // Initialize default
    functable.adler32 = &adler32_c;
    cpu_check_features();

#ifdef ARM_NEON_ADLER32
#  ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
#  endif
        functable.adler32 = &adler32_neon;
#endif
#ifdef X86_SSSE3_ADLER32
    if (x86_cpu_has_ssse3)
        functable.adler32 = &adler32_ssse3;
#endif
#ifdef X86_AVX2_ADLER32
    if (x86_cpu_has_avx2)
        functable.adler32 = &adler32_avx2;
#endif
#ifdef POWER8_VSX_ADLER32
    if (power_cpu_has_arch_2_07)
        functable.adler32 = &adler32_power8;
#endif

    return functable.adler32(adler, buf, len);
}

Z_INTERNAL uint32_t chunksize_stub(void) {
    // Initialize default
    functable.chunksize = &chunksize_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunksize = &chunksize_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunksize = &chunksize_neon;
#endif

    return functable.chunksize();
}

Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) {
    // Initialize default
    functable.chunkcopy = &chunkcopy_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy = &chunkcopy_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy = &chunkcopy_neon;
#endif

    return functable.chunkcopy(out, from, len);
}

Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
    // Initialize default
    functable.chunkcopy_safe = &chunkcopy_safe_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy_safe = &chunkcopy_safe_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy_safe = &chunkcopy_safe_neon;
#endif

    return functable.chunkcopy_safe(out, from, len, safe);
}

Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
    // Initialize default
    functable.chunkunroll = &chunkunroll_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkunroll = &chunkunroll_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkunroll = &chunkunroll_neon;
#endif

    return functable.chunkunroll(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) {
    // Initialize default
    functable.chunkmemset = &chunkmemset_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset = &chunkmemset_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset = &chunkmemset_neon;
#endif

    return functable.chunkmemset(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
    // Initialize default
    functable.chunkmemset_safe = &chunkmemset_safe_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset_safe = &chunkmemset_safe_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset_safe = &chunkmemset_safe_neon;
#endif

    return functable.chunkmemset_safe(out, dist, len, left);
}

Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) {

    Assert(sizeof(uint64_t) >= sizeof(size_t),
           "crc32_z takes size_t but internally we have a uint64_t len");
    /* return a function pointer for optimized arches here after a capability test */

    cpu_check_features();

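    /* Use the word-at-a-time crc32_little/crc32_big only when a pointer fits in a
       ptrdiff_t; otherwise fall back to the byte-wise crc32_generic. */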
    if (sizeof(void *) == sizeof(ptrdiff_t)) {
#if BYTE_ORDER == LITTLE_ENDIAN
        functable.crc32 = crc32_little;
#  if defined(ARM_ACLE_CRC_HASH)
        if (arm_cpu_has_crc32)
            functable.crc32 = crc32_acle;
#  endif
#elif BYTE_ORDER == BIG_ENDIAN
        functable.crc32 = crc32_big;
#else
#  error No endian defined
#endif
    } else {
        functable.crc32 = crc32_generic;
    }

    return functable.crc32(crc, buf, len);
}

Z_INTERNAL uint32_t compare258_stub(const unsigned char *src0, const unsigned char *src1) {

    functable.compare258 = &compare258_c;

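    /* Select the widest unaligned-load variant available: 64-bit loads require
       UNALIGNED64_OK and HAVE_BUILTIN_CTZLL, 32-bit loads require HAVE_BUILTIN_CTZ,
       otherwise the 16-bit variant is used; SSE4.2 and AVX2 versions take precedence
       when the CPU supports them.  longest_match_stub below uses the same selection. */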
#ifdef UNALIGNED_OK
#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.compare258 = &compare258_unaligned_64;
#  elif defined(HAVE_BUILTIN_CTZ)
    functable.compare258 = &compare258_unaligned_32;
#  else
    functable.compare258 = &compare258_unaligned_16;
#  endif
#  ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.compare258 = &compare258_unaligned_sse4;
#  endif
#  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.compare258 = &compare258_unaligned_avx2;
#  endif
#endif

    return functable.compare258(src0, src1);
}

Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {

    functable.longest_match = &longest_match_c;

#ifdef UNALIGNED_OK
#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.longest_match = &longest_match_unaligned_64;
#  elif defined(HAVE_BUILTIN_CTZ)
    functable.longest_match = &longest_match_unaligned_32;
#  else
    functable.longest_match = &longest_match_unaligned_16;
#  endif
#  ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.longest_match = &longest_match_unaligned_sse4;
#  endif
#  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.longest_match = &longest_match_unaligned_avx2;
#  endif
#endif

    return functable.longest_match(s, cur_match);
}

/* functable init */
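/* Initial functable contents: every member points at its stub, so the first call through
 * any entry triggers the runtime selection above. */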
Z_INTERNAL Z_TLS struct functable_s functable = {
    insert_string_stub,
    quick_insert_string_stub,
    adler32_stub,
    crc32_stub,
    slide_hash_stub,
    compare258_stub,
    longest_match_stub,
    chunksize_stub,
    chunkcopy_stub,
    chunkcopy_safe_stub,
    chunkunroll_stub,
    chunkmemset_stub,
    chunkmemset_safe_stub
};