1 /* functable.c -- Choose relevant optimized functions at runtime 2 * Copyright (C) 2017 Hans Kristian Rosbach 3 * For conditions of distribution and use, see copyright notice in zlib.h 4 */ 5 6 #include "zbuild.h" 7 #include "zendian.h" 8 #include "deflate.h" 9 #include "deflate_p.h" 10 11 #include "functable.h" 12 13 #ifdef X86_FEATURES 14 # include "fallback_builtins.h" 15 #endif 16 17 /* insert_string */ 18 extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count); 19 #ifdef X86_SSE42_CRC_HASH 20 extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count); 21 #elif defined(ARM_ACLE_CRC_HASH) 22 extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count); 23 #endif 24 25 /* quick_insert_string */ 26 extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str); 27 #ifdef X86_SSE42_CRC_HASH 28 extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str); 29 #elif defined(ARM_ACLE_CRC_HASH) 30 extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str); 31 #endif 32 33 /* slide_hash */ 34 #ifdef X86_SSE2 35 void slide_hash_sse2(deflate_state *s); 36 #elif defined(ARM_NEON_SLIDEHASH) 37 void slide_hash_neon(deflate_state *s); 38 #elif defined(POWER8_VSX_SLIDEHASH) 39 void slide_hash_power8(deflate_state *s); 40 #endif 41 #ifdef X86_AVX2 42 void slide_hash_avx2(deflate_state *s); 43 #endif 44 45 /* adler32 */ 46 extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len); 47 #ifdef ARM_NEON_ADLER32 48 extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len); 49 #endif 50 #ifdef X86_SSSE3_ADLER32 51 extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len); 52 #endif 53 #ifdef X86_AVX2_ADLER32 54 extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len); 55 #endif 56 #ifdef POWER8_VSX_ADLER32 57 extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len); 58 #endif 59 60 /* memory chunking */ 61 extern uint32_t chunksize_c(void); 62 extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len); 63 extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); 64 extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len); 65 extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len); 66 extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left); 67 #ifdef X86_SSE2_CHUNKSET 68 extern uint32_t chunksize_sse2(void); 69 extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len); 70 extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); 71 extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len); 72 extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len); 73 extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left); 74 #endif 75 #ifdef ARM_NEON_CHUNKSET 76 extern uint32_t chunksize_neon(void); 77 extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len); 78 extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); 79 extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len); 80 extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len); 81 extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left); 82 #endif 83 84 /* CRC32 */ 85 Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t); 86 87 #ifdef ARM_ACLE_CRC_HASH 88 extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t); 89 #endif 90 91 #if BYTE_ORDER == LITTLE_ENDIAN 92 extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t); 93 #elif BYTE_ORDER == BIG_ENDIAN 94 extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t); 95 #endif 96 97 /* compare258 */ 98 extern uint32_t compare258_c(const unsigned char *src0, const unsigned char *src1); 99 #ifdef UNALIGNED_OK 100 extern uint32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1); 101 extern uint32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1); 102 #ifdef UNALIGNED64_OK 103 extern uint32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1); 104 #endif 105 #ifdef X86_SSE42_CMP_STR 106 extern uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1); 107 #endif 108 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) 109 extern uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1); 110 #endif 111 #endif 112 113 /* longest_match */ 114 extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match); 115 #ifdef UNALIGNED_OK 116 extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match); 117 extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match); 118 #ifdef UNALIGNED64_OK 119 extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match); 120 #endif 121 #ifdef X86_SSE42_CMP_STR 122 extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match); 123 #endif 124 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) 125 extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match); 126 #endif 127 #endif 128 129 Z_INTERNAL Z_TLS struct functable_s functable; 130 131 Z_INTERNAL void cpu_check_features(void) 132 { 133 static int features_checked = 0; 134 if (features_checked) 135 return; 136 #if defined(X86_FEATURES) 137 x86_check_features(); 138 #elif defined(ARM_FEATURES) 139 arm_check_features(); 140 #elif defined(POWER_FEATURES) 141 power_check_features(); 142 #endif 143 features_checked = 1; 144 } 145 146 /* stub functions */ 147 Z_INTERNAL void insert_string_stub(deflate_state *const s, const uint32_t str, uint32_t count) { 148 // Initialize default 149 150 functable.insert_string = &insert_string_c; 151 cpu_check_features(); 152 153 #ifdef X86_SSE42_CRC_HASH 154 if (x86_cpu_has_sse42) 155 functable.insert_string = &insert_string_sse4; 156 #elif defined(ARM_ACLE_CRC_HASH) 157 if (arm_cpu_has_crc32) 158 functable.insert_string = &insert_string_acle; 159 #endif 160 161 functable.insert_string(s, str, count); 162 } 163 164 Z_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const uint32_t str) { 165 functable.quick_insert_string = &quick_insert_string_c; 166 167 #ifdef X86_SSE42_CRC_HASH 168 if (x86_cpu_has_sse42) 169 functable.quick_insert_string = &quick_insert_string_sse4; 170 #elif defined(ARM_ACLE_CRC_HASH) 171 if (arm_cpu_has_crc32) 172 functable.quick_insert_string = &quick_insert_string_acle; 173 #endif 174 175 return functable.quick_insert_string(s, str); 176 } 177 178 Z_INTERNAL void slide_hash_stub(deflate_state *s) { 179 180 functable.slide_hash = &slide_hash_c; 181 cpu_check_features(); 182 183 #ifdef X86_SSE2 184 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2) 185 if (x86_cpu_has_sse2) 186 # endif 187 functable.slide_hash = &slide_hash_sse2; 188 #elif defined(ARM_NEON_SLIDEHASH) 189 # ifndef ARM_NOCHECK_NEON 190 if (arm_cpu_has_neon) 191 # endif 192 functable.slide_hash = &slide_hash_neon; 193 #endif 194 #ifdef X86_AVX2 195 if (x86_cpu_has_avx2) 196 functable.slide_hash = &slide_hash_avx2; 197 #endif 198 #ifdef POWER8_VSX_SLIDEHASH 199 if (power_cpu_has_arch_2_07) 200 functable.slide_hash = &slide_hash_power8; 201 #endif 202 203 functable.slide_hash(s); 204 } 205 206 Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) { 207 // Initialize default 208 functable.adler32 = &adler32_c; 209 cpu_check_features(); 210 211 #ifdef ARM_NEON_ADLER32 212 # ifndef ARM_NOCHECK_NEON 213 if (arm_cpu_has_neon) 214 # endif 215 functable.adler32 = &adler32_neon; 216 #endif 217 #ifdef X86_SSSE3_ADLER32 218 if (x86_cpu_has_ssse3) 219 functable.adler32 = &adler32_ssse3; 220 #endif 221 #ifdef X86_AVX2_ADLER32 222 if (x86_cpu_has_avx2) 223 functable.adler32 = &adler32_avx2; 224 #endif 225 #ifdef POWER8_VSX_ADLER32 226 if (power_cpu_has_arch_2_07) 227 functable.adler32 = &adler32_power8; 228 #endif 229 230 return functable.adler32(adler, buf, len); 231 } 232 233 Z_INTERNAL uint32_t chunksize_stub(void) { 234 // Initialize default 235 functable.chunksize = &chunksize_c; 236 237 #ifdef X86_SSE2_CHUNKSET 238 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2) 239 if (x86_cpu_has_sse2) 240 # endif 241 functable.chunksize = &chunksize_sse2; 242 #endif 243 #ifdef ARM_NEON_CHUNKSET 244 if (arm_cpu_has_neon) 245 functable.chunksize = &chunksize_neon; 246 #endif 247 248 return functable.chunksize(); 249 } 250 251 Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) { 252 // Initialize default 253 functable.chunkcopy = &chunkcopy_c; 254 255 #ifdef X86_SSE2_CHUNKSET 256 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2) 257 if (x86_cpu_has_sse2) 258 # endif 259 functable.chunkcopy = &chunkcopy_sse2; 260 #endif 261 #ifdef ARM_NEON_CHUNKSET 262 if (arm_cpu_has_neon) 263 functable.chunkcopy = &chunkcopy_neon; 264 #endif 265 266 return functable.chunkcopy(out, from, len); 267 } 268 269 Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) { 270 // Initialize default 271 functable.chunkcopy_safe = &chunkcopy_safe_c; 272 273 #ifdef X86_SSE2_CHUNKSET 274 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2) 275 if (x86_cpu_has_sse2) 276 # endif 277 functable.chunkcopy_safe = &chunkcopy_safe_sse2; 278 #endif 279 #ifdef ARM_NEON_CHUNKSET 280 if (arm_cpu_has_neon) 281 functable.chunkcopy_safe = &chunkcopy_safe_neon; 282 #endif 283 284 return functable.chunkcopy_safe(out, from, len, safe); 285 } 286 287 Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) { 288 // Initialize default 289 functable.chunkunroll = &chunkunroll_c; 290 291 #ifdef X86_SSE2_CHUNKSET 292 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2) 293 if (x86_cpu_has_sse2) 294 # endif 295 functable.chunkunroll = &chunkunroll_sse2; 296 #endif 297 #ifdef ARM_NEON_CHUNKSET 298 if (arm_cpu_has_neon) 299 functable.chunkunroll = &chunkunroll_neon; 300 #endif 301 302 return functable.chunkunroll(out, dist, len); 303 } 304 305 Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) { 306 // Initialize default 307 functable.chunkmemset = &chunkmemset_c; 308 309 #ifdef X86_SSE2_CHUNKSET 310 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2) 311 if (x86_cpu_has_sse2) 312 # endif 313 functable.chunkmemset = &chunkmemset_sse2; 314 #endif 315 #ifdef ARM_NEON_CHUNKSET 316 if (arm_cpu_has_neon) 317 functable.chunkmemset = &chunkmemset_neon; 318 #endif 319 320 return functable.chunkmemset(out, dist, len); 321 } 322 323 Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) { 324 // Initialize default 325 functable.chunkmemset_safe = &chunkmemset_safe_c; 326 327 #ifdef X86_SSE2_CHUNKSET 328 # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2) 329 if (x86_cpu_has_sse2) 330 # endif 331 functable.chunkmemset_safe = &chunkmemset_safe_sse2; 332 #endif 333 #ifdef ARM_NEON_CHUNKSET 334 if (arm_cpu_has_neon) 335 functable.chunkmemset_safe = &chunkmemset_safe_neon; 336 #endif 337 338 return functable.chunkmemset_safe(out, dist, len, left); 339 } 340 341 Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) { 342 343 Assert(sizeof(uint64_t) >= sizeof(size_t), 344 "crc32_z takes size_t but internally we have a uint64_t len"); 345 /* return a function pointer for optimized arches here after a capability test */ 346 347 cpu_check_features(); 348 349 if (sizeof(void *) == sizeof(ptrdiff_t)) { 350 #if BYTE_ORDER == LITTLE_ENDIAN 351 functable.crc32 = crc32_little; 352 # if defined(ARM_ACLE_CRC_HASH) 353 if (arm_cpu_has_crc32) 354 functable.crc32 = crc32_acle; 355 # endif 356 #elif BYTE_ORDER == BIG_ENDIAN 357 functable.crc32 = crc32_big; 358 #else 359 # error No endian defined 360 #endif 361 } else { 362 functable.crc32 = crc32_generic; 363 } 364 365 return functable.crc32(crc, buf, len); 366 } 367 368 Z_INTERNAL uint32_t compare258_stub(const unsigned char *src0, const unsigned char *src1) { 369 370 functable.compare258 = &compare258_c; 371 372 #ifdef UNALIGNED_OK 373 # if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) 374 functable.compare258 = &compare258_unaligned_64; 375 # elif defined(HAVE_BUILTIN_CTZ) 376 functable.compare258 = &compare258_unaligned_32; 377 # else 378 functable.compare258 = &compare258_unaligned_16; 379 # endif 380 # ifdef X86_SSE42_CMP_STR 381 if (x86_cpu_has_sse42) 382 functable.compare258 = &compare258_unaligned_sse4; 383 # endif 384 # if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) 385 if (x86_cpu_has_avx2) 386 functable.compare258 = &compare258_unaligned_avx2; 387 # endif 388 #endif 389 390 return functable.compare258(src0, src1); 391 } 392 393 Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) { 394 395 functable.longest_match = &longest_match_c; 396 397 #ifdef UNALIGNED_OK 398 # if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) 399 functable.longest_match = &longest_match_unaligned_64; 400 # elif defined(HAVE_BUILTIN_CTZ) 401 functable.longest_match = &longest_match_unaligned_32; 402 # else 403 functable.longest_match = &longest_match_unaligned_16; 404 # endif 405 # ifdef X86_SSE42_CMP_STR 406 if (x86_cpu_has_sse42) 407 functable.longest_match = &longest_match_unaligned_sse4; 408 # endif 409 # if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) 410 if (x86_cpu_has_avx2) 411 functable.longest_match = &longest_match_unaligned_avx2; 412 # endif 413 #endif 414 415 return functable.longest_match(s, cur_match); 416 } 417 418 /* functable init */ 419 Z_INTERNAL Z_TLS struct functable_s functable = { 420 insert_string_stub, 421 quick_insert_string_stub, 422 adler32_stub, 423 crc32_stub, 424 slide_hash_stub, 425 compare258_stub, 426 longest_match_stub, 427 chunksize_stub, 428 chunkcopy_stub, 429 chunkcopy_safe_stub, 430 chunkunroll_stub, 431 chunkmemset_stub, 432 chunkmemset_safe_stub 433 }; 434