1 /* SPDX-License-Identifier: MIT 2 * 3 * Permission is hereby granted, free of charge, to any person 4 * obtaining a copy of this software and associated documentation 5 * files (the "Software"), to deal in the Software without 6 * restriction, including without limitation the rights to use, copy, 7 * modify, merge, publish, distribute, sublicense, and/or sell copies 8 * of the Software, and to permit persons to whom the Software is 9 * furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be 12 * included in all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 * 23 * Copyright: 24 * 2017-2020 Evan Nemerson <evan@nemerson.com> 25 */ 26 27 #if !defined(SIMDE_COMMON_H) 28 #define SIMDE_COMMON_H 29 30 #include "hedley.h" 31 32 #define SIMDE_VERSION_MAJOR 0 33 #define SIMDE_VERSION_MINOR 5 34 #define SIMDE_VERSION_MICRO 0 35 #define SIMDE_VERSION HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, SIMDE_VERSION_MICRO) 36 37 #include <stddef.h> 38 #include <stdint.h> 39 40 #include "simde-detect-clang.h" 41 #include "simde-arch.h" 42 #include "simde-features.h" 43 #include "simde-diagnostic.h" 44 #include "simde-math.h" 45 #include "simde-constify.h" 46 47 /* In some situations, SIMDe has to make large performance sacrifices 48 * for small increases in how faithfully it reproduces an API, but 49 * only a relatively small number of users will actually need the API 50 * to be completely accurate. The SIMDE_FAST_* options can be used to 51 * disable these trade-offs. 52 * 53 * They can be enabled by passing -DSIMDE_FAST_MATH to the compiler, or 54 * the individual defines (e.g., -DSIMDE_FAST_NANS) if you only want to 55 * enable some optimizations. Using -ffast-math and/or 56 * -ffinite-math-only will also enable the relevant options. If you 57 * don't want that you can pass -DSIMDE_NO_FAST_* to disable them. */ 58 59 /* Most programs avoid NaNs by never passing values which can result in 60 * a NaN; for example, if you only pass non-negative values to the sqrt 61 * functions, it won't generate a NaN. On some platforms, similar 62 * functions handle NaNs differently; for example, the _mm_min_ps SSE 63 * function will return 0.0 if you pass it (0.0, NaN), but the NEON 64 * vminq_f32 function will return NaN. Making them behave like one 65 * another is expensive; it requires generating a mask of all lanes 66 * with NaNs, then performing the operation (e.g., vminq_f32), then 67 * blending together the result with another vector using the mask. 68 * 69 * If you don't want SIMDe to worry about the differences between how 70 * NaNs are handled on the two platforms, define this (or pass 71 * -ffinite-math-only) */ 72 #if !defined(SIMDE_FAST_MATH) && !defined(SIMDE_NO_FAST_MATH) && defined(__FAST_MATH__) 73 #define SIMDE_FAST_MATH 74 #endif 75 76 #if !defined(SIMDE_FAST_NANS) && !defined(SIMDE_NO_FAST_NANS) 77 #if defined(SIMDE_FAST_MATH) 78 #define SIMDE_FAST_NANS 79 #elif defined(__FINITE_MATH_ONLY__) 80 #if __FINITE_MATH_ONLY__ 81 #define SIMDE_FAST_NANS 82 #endif 83 #endif 84 #endif 85 86 /* Many functions are defined as using the current rounding mode 87 * (i.e., the SIMD version of fegetround()) when converting to 88 * an integer. For example, _mm_cvtpd_epi32. Unfortunately, 89 * on some platforms (such as ARMv8+ where round-to-nearest is 90 * always used, regardless of the FPSCR register) this means we 91 * have to first query the current rounding mode, then choose 92 * the proper function (rounnd 93 , ceil, floor, etc.) */ 94 #if !defined(SIMDE_FAST_ROUND_MODE) && !defined(SIMDE_NO_FAST_ROUND_MODE) && defined(SIMDE_FAST_MATH) 95 #define SIMDE_FAST_ROUND_MODE 96 #endif 97 98 /* This controls how ties are rounded. For example, does 10.5 round to 99 * 10 or 11? IEEE 754 specifies round-towards-even, but on ARMv7 (for 100 * example) doesn't support it and it must be emulated (which is rather 101 * slow). If you're okay with just using the default for whatever arch 102 * you're on, you should definitely define this. */ 103 #if !defined(SIMDE_FAST_ROUND_TIES) && !defined(SIMDE_NO_FAST_ROUND_TIES) && defined(SIMDE_FAST_MATH) 104 #define SIMDE_FAST_ROUND_TIES 105 #endif 106 107 #if \ 108 HEDLEY_HAS_ATTRIBUTE(aligned) || \ 109 HEDLEY_GCC_VERSION_CHECK(2,95,0) || \ 110 HEDLEY_CRAY_VERSION_CHECK(8,4,0) || \ 111 HEDLEY_IBM_VERSION_CHECK(11,1,0) || \ 112 HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ 113 HEDLEY_PGI_VERSION_CHECK(19,4,0) || \ 114 HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ 115 HEDLEY_TINYC_VERSION_CHECK(0,9,24) || \ 116 HEDLEY_TI_VERSION_CHECK(8,1,0) 117 # define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment))) 118 #elif defined(_MSC_VER) && !(defined(_M_ARM) && !defined(_M_ARM64)) 119 # define SIMDE_ALIGN(alignment) __declspec(align(alignment)) 120 #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) 121 # define SIMDE_ALIGN(alignment) _Alignas(alignment) 122 #elif defined(__cplusplus) && (__cplusplus >= 201103L) 123 # define SIMDE_ALIGN(alignment) alignas(alignment) 124 #else 125 # define SIMDE_ALIGN(alignment) 126 #endif 127 128 #if HEDLEY_GNUC_VERSION_CHECK(2,95,0) || \ 129 HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ 130 HEDLEY_IBM_VERSION_CHECK(11,1,0) 131 # define SIMDE_ALIGN_OF(T) (__alignof__(T)) 132 #elif \ 133 (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ 134 HEDLEY_HAS_FEATURE(c11_alignof) 135 # define SIMDE_ALIGN_OF(T) (_Alignof(T)) 136 #elif \ 137 (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ 138 HEDLEY_HAS_FEATURE(cxx_alignof) 139 # define SIMDE_ALIGN_OF(T) (alignof(T)) 140 #endif 141 142 #if defined(SIMDE_ALIGN_OF) 143 # define SIMDE_ALIGN_AS(N, T) SIMDE_ALIGN(SIMDE_ALIGN_OF(T)) 144 #else 145 # define SIMDE_ALIGN_AS(N, T) SIMDE_ALIGN(N) 146 #endif 147 148 #define simde_assert_aligned(alignment, val) \ 149 simde_assert_int(HEDLEY_REINTERPRET_CAST(uintptr_t, HEDLEY_REINTERPRET_CAST(const void*, (val))) % (alignment), ==, 0) 150 151 #if \ 152 HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ 153 HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ 154 HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ 155 HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \ 156 HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ 157 HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ 158 HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ 159 (HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \ 160 HEDLEY_CRAY_VERSION_CHECK(8,1,0) 161 #define SIMDE_CHECK_CONSTANT_(expr) (__builtin_constant_p(expr)) 162 #elif defined(__cplusplus) && (__cplusplus > 201703L) 163 #include <type_traits> 164 #define SIMDE_CHECK_CONSTANT_(expr) (std::is_constant_evaluated()) 165 #endif 166 167 #if !defined(SIMDE_NO_CHECK_IMMEDIATE_CONSTANT) 168 #if defined(SIMDE_CHECK_CONSTANT_) && SIMDE_DETECT_CLANG_VERSION_NOT(9,0,0) 169 #define SIMDE_REQUIRE_CONSTANT(arg) HEDLEY_REQUIRE_MSG(SIMDE_CHECK_CONSTANT_(arg), "`" #arg "' must be constant") 170 #else 171 #define SIMDE_REQUIRE_CONSTANT(arg) 172 #endif 173 #else 174 #define SIMDE_REQUIRE_CONSTANT(arg) 175 #endif 176 177 #define SIMDE_REQUIRE_RANGE(arg, min, max) \ 178 HEDLEY_REQUIRE_MSG((((arg) >= (min)) && ((arg) <= (max))), "'" #arg "' must be in [" #min ", " #max "]") 179 180 #define SIMDE_REQUIRE_CONSTANT_RANGE(arg, min, max) \ 181 SIMDE_REQUIRE_CONSTANT(arg) \ 182 SIMDE_REQUIRE_RANGE(arg, min, max) 183 184 /* A copy of HEDLEY_STATIC_ASSERT, except we don't define an empty 185 * fallback if we can't find an implementation; instead we have to 186 * check if SIMDE_STATIC_ASSERT is defined before using it. */ 187 #if \ 188 !defined(__cplusplus) && ( \ 189 (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ 190 HEDLEY_HAS_FEATURE(c_static_assert) || \ 191 HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ 192 HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ 193 defined(_Static_assert) \ 194 ) 195 # define SIMDE_STATIC_ASSERT(expr, message) _Static_assert(expr, message) 196 #elif \ 197 (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ 198 HEDLEY_MSVC_VERSION_CHECK(16,0,0) 199 # define SIMDE_STATIC_ASSERT(expr, message) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) 200 #endif 201 202 /* SIMDE_ASSUME_ALIGNED allows you to (try to) tell the compiler 203 * that a pointer is aligned to an `alignment`-byte boundary. */ 204 #if \ 205 HEDLEY_HAS_BUILTIN(__builtin_assume_aligned) || \ 206 HEDLEY_GCC_VERSION_CHECK(4,7,0) 207 #define SIMDE_ASSUME_ALIGNED(alignment, v) HEDLEY_REINTERPRET_CAST(__typeof__(v), __builtin_assume_aligned(v, alignment)) 208 #elif defined(__cplusplus) && (__cplusplus > 201703L) 209 #define SIMDE_ASSUME_ALIGNED(alignment, v) std::assume_aligned<alignment>(v) 210 #elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) 211 #define SIMDE_ASSUME_ALIGNED(alignment, v) (__extension__ ({ \ 212 __typeof__(v) simde_assume_aligned_t_ = (v); \ 213 __assume_aligned(simde_assume_aligned_t_, alignment); \ 214 simde_assume_aligned_t_; \ 215 })) 216 #else 217 #define SIMDE_ASSUME_ALIGNED(alignment, v) (v) 218 #endif 219 220 #if defined(SIMDE_ALIGN_OF) 221 #define SIMDE_ASSUME_ALIGNED_AS(T, v) SIMDE_ASSUME_ALIGNED(SIMDE_ALIGN_OF(T), v) 222 #else 223 #define SIMDE_ASSUME_ALIGNED_AS(T, v) (v) 224 #endif 225 226 /* SIMDE_ALIGN_CAST allows you to convert to a type with greater 227 * aligment requirements without triggering a warning. */ 228 #if HEDLEY_HAS_WARNING("-Wcast-align") || defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(3,4,0) 229 #define SIMDE_ALIGN_CAST(T, v) (__extension__({ \ 230 HEDLEY_DIAGNOSTIC_PUSH \ 231 _Pragma("GCC diagnostic ignored \"-Wcast-align\"") \ 232 T simde_r_ = HEDLEY_REINTERPRET_CAST(T, v); \ 233 HEDLEY_DIAGNOSTIC_POP \ 234 simde_r_; \ 235 })) 236 #else 237 #define SIMDE_ALIGN_CAST(T, v) HEDLEY_REINTERPRET_CAST(T, v) 238 #endif 239 240 #if \ 241 (HEDLEY_HAS_ATTRIBUTE(may_alias) && !defined(HEDLEY_SUNPRO_VERSION)) || \ 242 HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ 243 HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ 244 HEDLEY_IBM_VERSION_CHECK(13,1,0) 245 # define SIMDE_MAY_ALIAS __attribute__((__may_alias__)) 246 #else 247 # define SIMDE_MAY_ALIAS 248 #endif 249 250 /* Lots of compilers support GCC-style vector extensions, but many 251 don't support all the features. Define different macros depending 252 on support for 253 254 * SIMDE_VECTOR - Declaring a vector. 255 * SIMDE_VECTOR_OPS - basic operations (binary and unary). 256 * SIMDE_VECTOR_SCALAR - For binary operators, the second argument 257 can be a scalar, in which case the result is as if that scalar 258 had been broadcast to all lanes of a vector. 259 * SIMDE_VECTOR_SUBSCRIPT - Supports array subscript notation for 260 extracting/inserting a single element.= 261 262 SIMDE_VECTOR can be assumed if any others are defined, the 263 others are independent. */ 264 #if !defined(SIMDE_NO_VECTOR) 265 # if \ 266 HEDLEY_GCC_VERSION_CHECK(4,8,0) 267 # define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) 268 # define SIMDE_VECTOR_OPS 269 # define SIMDE_VECTOR_SCALAR 270 # define SIMDE_VECTOR_SUBSCRIPT 271 # elif HEDLEY_INTEL_VERSION_CHECK(16,0,0) 272 # define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) 273 # define SIMDE_VECTOR_OPS 274 /* ICC only supports SIMDE_VECTOR_SCALAR for constants */ 275 # define SIMDE_VECTOR_SUBSCRIPT 276 # elif \ 277 HEDLEY_GCC_VERSION_CHECK(4,1,0) || \ 278 HEDLEY_INTEL_VERSION_CHECK(13,0,0) 279 # define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) 280 # define SIMDE_VECTOR_OPS 281 # elif HEDLEY_SUNPRO_VERSION_CHECK(5,12,0) 282 # define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) 283 # elif HEDLEY_HAS_ATTRIBUTE(vector_size) 284 # define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) 285 # define SIMDE_VECTOR_OPS 286 # define SIMDE_VECTOR_SUBSCRIPT 287 # if HEDLEY_HAS_ATTRIBUTE(diagnose_if) /* clang 4.0 */ 288 # define SIMDE_VECTOR_SCALAR 289 # endif 290 # endif 291 292 /* GCC and clang have built-in functions to handle shuffling and 293 converting of vectors, but the implementations are slightly 294 different. This macro is just an abstraction over them. Note that 295 elem_size is in bits but vec_size is in bytes. */ 296 # if !defined(SIMDE_NO_SHUFFLE_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT) 297 HEDLEY_DIAGNOSTIC_PUSH 298 /* We don't care about -Wvariadic-macros; all compilers that support 299 * shufflevector/shuffle support them. */ 300 # if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") 301 # pragma clang diagnostic ignored "-Wc++98-compat-pedantic" 302 # endif 303 # if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4,0,0) 304 # pragma GCC diagnostic ignored "-Wvariadic-macros" 305 # endif 306 307 # if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) 308 # define SIMDE_SHUFFLE_VECTOR_(elem_size, vec_size, a, b, ...) __builtin_shufflevector(a, b, __VA_ARGS__) 309 # elif HEDLEY_GCC_HAS_BUILTIN(__builtin_shuffle,4,7,0) && !defined(__INTEL_COMPILER) 310 # define SIMDE_SHUFFLE_VECTOR_(elem_size, vec_size, a, b, ...) (__extension__ ({ \ 311 int##elem_size##_t SIMDE_VECTOR(vec_size) simde_shuffle_ = { __VA_ARGS__ }; \ 312 __builtin_shuffle(a, b, simde_shuffle_); \ 313 })) 314 # endif 315 HEDLEY_DIAGNOSTIC_POP 316 # endif 317 318 /* TODO: this actually works on XL C/C++ without SIMDE_VECTOR_SUBSCRIPT 319 but the code needs to be refactored a bit to take advantage. */ 320 # if !defined(SIMDE_NO_CONVERT_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT) 321 # if HEDLEY_HAS_BUILTIN(__builtin_convertvector) || HEDLEY_GCC_VERSION_CHECK(9,0,0) 322 # if HEDLEY_GCC_VERSION_CHECK(9,0,0) && !HEDLEY_GCC_VERSION_CHECK(9,3,0) 323 /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93557 */ 324 # define SIMDE_CONVERT_VECTOR_(to, from) ((to) = (__extension__({ \ 325 __typeof__(from) from_ = (from); \ 326 ((void) from_); \ 327 __builtin_convertvector(from_, __typeof__(to)); \ 328 }))) 329 # else 330 # define SIMDE_CONVERT_VECTOR_(to, from) ((to) = __builtin_convertvector((from), __typeof__(to))) 331 # endif 332 # endif 333 # endif 334 #endif 335 336 /* Since we currently require SUBSCRIPT before using a vector in a 337 union, we define these as dependencies of SUBSCRIPT. They are 338 likely to disappear in the future, once SIMDe learns how to make 339 use of vectors without using the union members. Do not use them 340 in your code unless you're okay with it breaking when SIMDe 341 changes. */ 342 #if defined(SIMDE_VECTOR_SUBSCRIPT) 343 # if defined(SIMDE_VECTOR_OPS) 344 # define SIMDE_VECTOR_SUBSCRIPT_OPS 345 # endif 346 # if defined(SIMDE_VECTOR_SCALAR) 347 # define SIMDE_VECTOR_SUBSCRIPT_SCALAR 348 # endif 349 #endif 350 351 #if !defined(SIMDE_ENABLE_OPENMP) && ((defined(_OPENMP) && (_OPENMP >= 201307L)) || (defined(_OPENMP_SIMD) && (_OPENMP_SIMD >= 201307L))) 352 # define SIMDE_ENABLE_OPENMP 353 #endif 354 355 #if !defined(SIMDE_ENABLE_CILKPLUS) && (defined(__cilk) || defined(HEDLEY_INTEL_VERSION)) 356 # define SIMDE_ENABLE_CILKPLUS 357 #endif 358 359 #if defined(SIMDE_ENABLE_OPENMP) 360 # define SIMDE_VECTORIZE HEDLEY_PRAGMA(omp simd) 361 # define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l)) 362 # define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r)) 363 # define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a)) 364 #elif defined(SIMDE_ENABLE_CILKPLUS) 365 # define SIMDE_VECTORIZE HEDLEY_PRAGMA(simd) 366 # define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l)) 367 # define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r)) 368 # define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a)) 369 #elif defined(__clang__) && !defined(HEDLEY_IBM_VERSION) 370 # define SIMDE_VECTORIZE HEDLEY_PRAGMA(clang loop vectorize(enable)) 371 # define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l)) 372 # define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE 373 # define SIMDE_VECTORIZE_ALIGNED(a) 374 #elif HEDLEY_GCC_VERSION_CHECK(4,9,0) 375 # define SIMDE_VECTORIZE HEDLEY_PRAGMA(GCC ivdep) 376 # define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE 377 # define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE 378 # define SIMDE_VECTORIZE_ALIGNED(a) 379 #elif HEDLEY_CRAY_VERSION_CHECK(5,0,0) 380 # define SIMDE_VECTORIZE HEDLEY_PRAGMA(_CRI ivdep) 381 # define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE 382 # define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE 383 # define SIMDE_VECTORIZE_ALIGNED(a) 384 #else 385 # define SIMDE_VECTORIZE 386 # define SIMDE_VECTORIZE_SAFELEN(l) 387 # define SIMDE_VECTORIZE_REDUCTION(r) 388 # define SIMDE_VECTORIZE_ALIGNED(a) 389 #endif 390 391 #define SIMDE_MASK_NZ_(v, mask) (((v) & (mask)) | !((v) & (mask))) 392 393 /* Intended for checking coverage, you should never use this in 394 production. */ 395 #if defined(SIMDE_NO_INLINE) 396 # define SIMDE_FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE static 397 #else 398 # define SIMDE_FUNCTION_ATTRIBUTES HEDLEY_ALWAYS_INLINE static 399 #endif 400 401 #if \ 402 HEDLEY_HAS_ATTRIBUTE(unused) || \ 403 HEDLEY_GCC_VERSION_CHECK(2,95,0) 404 # define SIMDE_FUNCTION_POSSIBLY_UNUSED_ __attribute__((__unused__)) 405 #else 406 # define SIMDE_FUNCTION_POSSIBLY_UNUSED_ 407 #endif 408 409 #if HEDLEY_HAS_WARNING("-Wused-but-marked-unused") 410 # define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED _Pragma("clang diagnostic ignored \"-Wused-but-marked-unused\"") 411 #else 412 # define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED 413 #endif 414 415 #if defined(_MSC_VER) 416 # define SIMDE_BEGIN_DECLS_ HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(disable:4996 4204)) HEDLEY_BEGIN_C_DECLS 417 # define SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP HEDLEY_END_C_DECLS 418 #else 419 # define SIMDE_BEGIN_DECLS_ \ 420 HEDLEY_DIAGNOSTIC_PUSH \ 421 SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED \ 422 HEDLEY_BEGIN_C_DECLS 423 # define SIMDE_END_DECLS_ \ 424 HEDLEY_END_C_DECLS \ 425 HEDLEY_DIAGNOSTIC_POP 426 #endif 427 428 #if HEDLEY_HAS_WARNING("-Wpedantic") 429 # define SIMDE_DIAGNOSTIC_DISABLE_INT128 _Pragma("clang diagnostic ignored \"-Wpedantic\"") 430 #elif defined(HEDLEY_GCC_VERSION) 431 # define SIMDE_DIAGNOSTIC_DISABLE_INT128 _Pragma("GCC diagnostic ignored \"-Wpedantic\"") 432 #else 433 # define SIMDE_DIAGNOSTIC_DISABLE_INT128 434 #endif 435 436 #if defined(__SIZEOF_INT128__) 437 # define SIMDE_HAVE_INT128_ 438 HEDLEY_DIAGNOSTIC_PUSH 439 SIMDE_DIAGNOSTIC_DISABLE_INT128 440 typedef __int128 simde_int128; 441 typedef unsigned __int128 simde_uint128; 442 HEDLEY_DIAGNOSTIC_POP 443 #endif 444 445 #if !defined(SIMDE_ENDIAN_LITTLE) 446 # define SIMDE_ENDIAN_LITTLE 1234 447 #endif 448 #if !defined(SIMDE_ENDIAN_BIG) 449 # define SIMDE_ENDIAN_BIG 4321 450 #endif 451 452 #if !defined(SIMDE_ENDIAN_ORDER) 453 /* GCC (and compilers masquerading as GCC) define __BYTE_ORDER__. */ 454 # if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) 455 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE 456 # elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) 457 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG 458 /* TI defines _BIG_ENDIAN or _LITTLE_ENDIAN */ 459 # elif defined(_BIG_ENDIAN) 460 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG 461 # elif defined(_LITTLE_ENDIAN) 462 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE 463 /* We know the endianness of some common architectures. Common 464 * architectures not listed (ARM, POWER, MIPS, etc.) here are 465 * bi-endian. */ 466 # elif defined(__amd64) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) 467 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE 468 # elif defined(__s390x__) || defined(__zarch__) 469 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG 470 /* Looks like we'll have to rely on the platform. If we're missing a 471 * platform, please let us know. */ 472 # elif defined(_WIN32) 473 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE 474 # elif defined(sun) || defined(__sun) /* Solaris */ 475 # include <sys/byteorder.h> 476 # if defined(_LITTLE_ENDIAN) 477 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE 478 # elif defined(_BIG_ENDIAN) 479 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG 480 # endif 481 # elif defined(__APPLE__) 482 # include <libkern/OSByteOrder.h> 483 # if defined(__LITTLE_ENDIAN__) 484 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE 485 # elif defined(__BIG_ENDIAN__) 486 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG 487 # endif 488 # elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__bsdi__) || defined(__DragonFly__) || defined(BSD) 489 # include <machine/endian.h> 490 # if defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN) 491 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE 492 # elif defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN) 493 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG 494 # endif 495 # elif defined(__linux__) || defined(__linux) || defined(__gnu_linux__) 496 # include <endian.h> 497 # if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN) 498 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE 499 # elif defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN) 500 # define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG 501 # endif 502 # endif 503 #endif 504 505 #if \ 506 HEDLEY_HAS_BUILTIN(__builtin_bswap64) || \ 507 HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ 508 HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ 509 HEDLEY_INTEL_VERSION_CHECK(13,0,0) 510 #define simde_bswap64(v) __builtin_bswap64(v) 511 #elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) 512 #define simde_bswap64(v) _byteswap_uint64(v) 513 #else 514 SIMDE_FUNCTION_ATTRIBUTES 515 uint64_t 516 simde_bswap64(uint64_t v) { 517 return 518 ((v & (((uint64_t) 0xff) << 56)) >> 56) | 519 ((v & (((uint64_t) 0xff) << 48)) >> 40) | 520 ((v & (((uint64_t) 0xff) << 40)) >> 24) | 521 ((v & (((uint64_t) 0xff) << 32)) >> 8) | 522 ((v & (((uint64_t) 0xff) << 24)) << 8) | 523 ((v & (((uint64_t) 0xff) << 16)) << 24) | 524 ((v & (((uint64_t) 0xff) << 8)) << 40) | 525 ((v & (((uint64_t) 0xff) )) << 56); 526 } 527 #endif 528 529 #if !defined(SIMDE_ENDIAN_ORDER) 530 # error Unknown byte order; please file a bug 531 #else 532 # if SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE 533 # define simde_endian_bswap64_be(value) simde_bswap64(value) 534 # define simde_endian_bswap64_le(value) (value) 535 # elif SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG 536 # define simde_endian_bswap64_be(value) (value) 537 # define simde_endian_bswap64_le(value) simde_bswap64(value) 538 # endif 539 #endif 540 541 /* TODO: we should at least make an attempt to detect the correct 542 types for simde_float32/float64 instead of just assuming float and 543 double. */ 544 545 #if !defined(SIMDE_FLOAT32_TYPE) 546 # define SIMDE_FLOAT32_TYPE float 547 # define SIMDE_FLOAT32_C(value) value##f 548 #else 549 # define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT32_TYPE) value) 550 #endif 551 typedef SIMDE_FLOAT32_TYPE simde_float32; 552 553 #if !defined(SIMDE_FLOAT64_TYPE) 554 # define SIMDE_FLOAT64_TYPE double 555 # define SIMDE_FLOAT64_C(value) value 556 #else 557 # define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT64_TYPE) value) 558 #endif 559 typedef SIMDE_FLOAT64_TYPE simde_float64; 560 561 /* Whether to assume that the compiler can auto-vectorize reasonably 562 well. This will cause SIMDe to attempt to compose vector 563 operations using more simple vector operations instead of minimize 564 serial work. 565 566 As an example, consider the _mm_add_ss(a, b) function from SSE, 567 which returns { a0 + b0, a1, a2, a3 }. This pattern is repeated 568 for other operations (sub, mul, etc.). 569 570 The naïve implementation would result in loading a0 and b0, adding 571 them into a temporary variable, then splicing that value into a new 572 vector with the remaining elements from a. 573 574 On platforms which support vectorization, it's generally faster to 575 simply perform the operation on the entire vector to avoid having 576 to move data between SIMD registers and non-SIMD registers. 577 Basically, instead of the temporary variable being (a0 + b0) it 578 would be a vector of (a + b), which is then combined with a to form 579 the result. 580 581 By default, SIMDe will prefer the pure-vector versions if we detect 582 a vector ISA extension, but this can be overridden by defining 583 SIMDE_NO_ASSUME_VECTORIZATION. You can also define 584 SIMDE_ASSUME_VECTORIZATION if you want to force SIMDe to use the 585 vectorized version. */ 586 #if !defined(SIMDE_NO_ASSUME_VECTORIZATION) && !defined(SIMDE_ASSUME_VECTORIZATION) 587 # if defined(__SSE__) || defined(__ARM_NEON) || defined(__mips_msa) || defined(__ALTIVEC__) || defined(__wasm_simd128__) 588 # define SIMDE_ASSUME_VECTORIZATION 589 # endif 590 #endif 591 592 #if HEDLEY_HAS_WARNING("-Wbad-function-cast") 593 # define SIMDE_CONVERT_FTOI(T,v) \ 594 HEDLEY_DIAGNOSTIC_PUSH \ 595 _Pragma("clang diagnostic ignored \"-Wbad-function-cast\"") \ 596 HEDLEY_STATIC_CAST(T, (v)) \ 597 HEDLEY_DIAGNOSTIC_POP 598 #else 599 # define SIMDE_CONVERT_FTOI(T,v) ((T) (v)) 600 #endif 601 602 /* TODO: detect compilers which support this outside of C11 mode */ 603 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) 604 #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) _Generic((value), to: (value), default: (_Generic((value), from: ((to) (value))))) 605 #define SIMDE_CHECKED_STATIC_CAST(to, from, value) _Generic((value), to: (value), default: (_Generic((value), from: ((to) (value))))) 606 #else 607 #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) HEDLEY_REINTERPRET_CAST(to, value) 608 #define SIMDE_CHECKED_STATIC_CAST(to, from, value) HEDLEY_STATIC_CAST(to, value) 609 #endif 610 611 #if HEDLEY_HAS_WARNING("-Wfloat-equal") 612 # define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL _Pragma("clang diagnostic ignored \"-Wfloat-equal\"") 613 #elif HEDLEY_GCC_VERSION_CHECK(3,0,0) 614 # define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"") 615 #else 616 # define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL 617 #endif 618 619 /* Some functions can trade accuracy for speed. For those functions 620 you can control the trade-off using this macro. Possible values: 621 622 0: prefer speed 623 1: reasonable trade-offs 624 2: prefer accuracy */ 625 #if !defined(SIMDE_ACCURACY_PREFERENCE) 626 # define SIMDE_ACCURACY_PREFERENCE 1 627 #endif 628 629 #if defined(__STDC_HOSTED__) 630 # define SIMDE_STDC_HOSTED __STDC_HOSTED__ 631 #else 632 # if \ 633 defined(HEDLEY_PGI_VERSION) || \ 634 defined(HEDLEY_MSVC_VERSION) 635 # define SIMDE_STDC_HOSTED 1 636 # else 637 # define SIMDE_STDC_HOSTED 0 638 # endif 639 #endif 640 641 /* Try to deal with environments without a standard library. */ 642 #if !defined(simde_memcpy) 643 #if HEDLEY_HAS_BUILTIN(__builtin_memcpy) 644 #define simde_memcpy(dest, src, n) __builtin_memcpy(dest, src, n) 645 #endif 646 #endif 647 #if !defined(simde_memset) 648 #if HEDLEY_HAS_BUILTIN(__builtin_memset) 649 #define simde_memset(s, c, n) __builtin_memset(s, c, n) 650 #endif 651 #endif 652 #if !defined(simde_memcmp) 653 #if HEDLEY_HAS_BUILTIN(__builtin_memcmp) 654 #define simde_memcmp(s1, s2, n) __builtin_memcmp(s1, s2, n) 655 #endif 656 #endif 657 658 #if !defined(simde_memcpy) || !defined(simde_memset) || !defined(simde_memcmp) 659 #if !defined(SIMDE_NO_STRING_H) 660 #if defined(__has_include) 661 #if !__has_include(<string.h>) 662 #define SIMDE_NO_STRING_H 663 #endif 664 #elif (SIMDE_STDC_HOSTED == 0) 665 #define SIMDE_NO_STRING_H 666 #endif 667 #endif 668 669 #if !defined(SIMDE_NO_STRING_H) 670 #include <string.h> 671 #if !defined(simde_memcpy) 672 #define simde_memcpy(dest, src, n) memcpy(dest, src, n) 673 #endif 674 #if !defined(simde_memset) 675 #define simde_memset(s, c, n) memset(s, c, n) 676 #endif 677 #if !defined(simde_memcmp) 678 #define simde_memcmp(s1, s2, n) memcmp(s1, s2, n) 679 #endif 680 #else 681 /* These are meant to be portable, not fast. If you're hitting them you 682 * should think about providing your own (by defining the simde_memcpy 683 * macro prior to including any SIMDe files) or submitting a patch to 684 * SIMDe so we can detect your system-provided memcpy/memset, like by 685 * adding your compiler to the checks for __builtin_memcpy and/or 686 * __builtin_memset. */ 687 #if !defined(simde_memcpy) 688 SIMDE_FUNCTION_ATTRIBUTES 689 void simde_memcpy_(void * dest,const void * src,size_t len)690 simde_memcpy_(void* dest, const void* src, size_t len) { 691 char* dest_ = HEDLEY_STATIC_CAST(char*, dest); 692 char* src_ = HEDLEY_STATIC_CAST(const char*, src); 693 for (size_t i = 0 ; i < len ; i++) { 694 dest_[i] = src_[i]; 695 } 696 } 697 #define simde_memcpy(dest, src, n) simde_memcpy_(dest, src, n) 698 #endif 699 700 #if !defined(simde_memset) 701 SIMDE_FUNCTION_ATTRIBUTES 702 void simde_memset_(void * s,int c,size_t len)703 simde_memset_(void* s, int c, size_t len) { 704 char* s_ = HEDLEY_STATIC_CAST(char*, s); 705 char c_ = HEDLEY_STATIC_CAST(char, c); 706 for (size_t i = 0 ; i < len ; i++) { 707 s_[i] = c_[i]; 708 } 709 } 710 #define simde_memset(s, c, n) simde_memset_(s, c, n) 711 #endif 712 713 #if !defined(simde_memcmp) 714 SIMDE_FUCTION_ATTRIBUTES 715 int simde_memcmp_(const void * s1,const void * s2,size_t n)716 simde_memcmp_(const void *s1, const void *s2, size_t n) { 717 unsigned char* s1_ = HEDLEY_STATIC_CAST(unsigned char*, s1); 718 unsigned char* s2_ = HEDLEY_STATIC_CAST(unsigned char*, s2); 719 for (size_t i = 0 ; i < len ; i++) { 720 if (s1_[i] != s2_[i]) { 721 return (int) (s1_[i] - s2_[i]); 722 } 723 } 724 return 0; 725 } 726 #define simde_memcmp(s1, s2, n) simde_memcmp_(s1, s2, n) 727 #endif 728 #endif 729 #endif 730 731 #if defined(FE_ALL_EXCEPT) 732 #define SIMDE_HAVE_FENV_H 733 #elif defined(__has_include) 734 #if __has_include(<fenv.h>) 735 #include <fenv.h> 736 #define SIMDE_HAVE_FENV_H 737 #endif 738 #elif SIMDE_STDC_HOSTED == 1 739 #include <fenv.h> 740 #define SIMDE_HAVE_FENV_H 741 #endif 742 743 #if defined(EXIT_FAILURE) 744 #define SIMDE_HAVE_STDLIB_H 745 #elif defined(__has_include) 746 #if __has_include(<stdlib.h>) 747 #include <stdlib.h> 748 #define SIMDE_HAVE_STDLIB_H 749 #endif 750 #elif SIMDE_STDC_HOSTED == 1 751 #include <stdlib.h> 752 #define SIMDE_HAVE_STDLIB_H 753 #endif 754 755 #if defined(__has_include) 756 # if defined(__cplusplus) && (__cplusplus >= 201103L) && __has_include(<cfenv>) 757 # include <cfenv> 758 # elif __has_include(<fenv.h>) 759 # include <fenv.h> 760 # endif 761 # if __has_include(<stdlib.h>) 762 # include <stdlib.h> 763 # endif 764 #elif SIMDE_STDC_HOSTED == 1 765 # include <stdlib.h> 766 # include <fenv.h> 767 #endif 768 769 #include "check.h" 770 771 /* Sometimes we run into problems with specific versions of compilers 772 which make the native versions unusable for us. Often this is due 773 to missing functions, sometimes buggy implementations, etc. These 774 macros are how we check for specific bugs. As they are fixed we'll 775 start only defining them for problematic compiler versions. */ 776 777 #if !defined(SIMDE_IGNORE_COMPILER_BUGS) 778 # if defined(HEDLEY_GCC_VERSION) 779 # if !HEDLEY_GCC_VERSION_CHECK(4,9,0) 780 # define SIMDE_BUG_GCC_REV_208793 781 # endif 782 # if !HEDLEY_GCC_VERSION_CHECK(5,0,0) 783 # define SIMDE_BUG_GCC_BAD_MM_SRA_EPI32 /* TODO: find relevant bug or commit */ 784 # endif 785 # if !HEDLEY_GCC_VERSION_CHECK(4,6,0) 786 # define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */ 787 # endif 788 # if !HEDLEY_GCC_VERSION_CHECK(8,0,0) 789 # define SIMDE_BUG_GCC_REV_247851 790 # endif 791 # if !HEDLEY_GCC_VERSION_CHECK(10,0,0) 792 # define SIMDE_BUG_GCC_REV_274313 793 # define SIMDE_BUG_GCC_91341 794 # endif 795 # if !HEDLEY_GCC_VERSION_CHECK(9,0,0) && defined(SIMDE_ARCH_AARCH64) 796 # define SIMDE_BUG_GCC_ARM_SHIFT_SCALAR 797 # endif 798 # if defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64) 799 # define SIMDE_BUG_GCC_94482 800 # endif 801 # if !HEDLEY_GCC_VERSION_CHECK(9,4,0) && defined(SIMDE_ARCH_AARCH64) 802 # define SIMDE_BUG_GCC_94488 803 # endif 804 # if defined(SIMDE_ARCH_ARM) 805 # define SIMDE_BUG_GCC_95399 806 # define SIMDE_BUG_GCC_95471 807 # elif defined(SIMDE_ARCH_POWER) 808 # define SIMDE_BUG_GCC_95227 809 # define SIMDE_BUG_GCC_95782 810 # endif 811 # define SIMDE_BUG_GCC_95399 812 # elif defined(__clang__) 813 # if defined(SIMDE_ARCH_AARCH64) 814 # define SIMDE_BUG_CLANG_45541 815 # endif 816 # elif defined(HEDLEY_MSVC_VERSION) 817 # if defined(SIMDE_ARCH_X86) 818 # define SIMDE_BUG_MSVC_ROUND_EXTRACT 819 # endif 820 # endif 821 # if defined(HEDLEY_EMSCRIPTEN_VERSION) 822 # define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. */ 823 # define SIMDE_BUG_EMSCRIPTEN_5242 824 # endif 825 #endif 826 827 /* GCC and Clang both have the same issue: 828 * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95144 829 * https://bugs.llvm.org/show_bug.cgi?id=45931 830 */ 831 #if HEDLEY_HAS_WARNING("-Wsign-conversion") || HEDLEY_GCC_VERSION_CHECK(4,3,0) 832 # define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr) (__extension__ ({ \ 833 HEDLEY_DIAGNOSTIC_PUSH \ 834 HEDLEY_DIAGNOSTIC_POP \ 835 _Pragma("GCC diagnostic ignored \"-Wsign-conversion\"") \ 836 __typeof__(expr) simde_bug_ignore_sign_conversion_v_= (expr); \ 837 HEDLEY_DIAGNOSTIC_PUSH \ 838 simde_bug_ignore_sign_conversion_v_; \ 839 })) 840 #else 841 # define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr) (expr) 842 #endif 843 844 #endif /* !defined(SIMDE_COMMON_H) */ 845