1/* 2 * Copyright (C) 2014 the FFLAS-FFPACK group 3 * 4 * Written by Bastien Vialla<bastien.vialla@lirmm.fr> 5 * Brice Boyer (briceboyer) <boyer.brice@gmail.com> 6 * 7 * 8 * ========LICENCE======== 9 * This file is part of the library FFLAS-FFPACK. 10 * 11 * FFLAS-FFPACK is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public 13 * License as published by the Free Software Foundation; either 14 * version 2.1 of the License, or (at your option) any later version. 15 * 16 * This library is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 * Lesser General Public License for more details. 20 * 21 * You should have received a copy of the GNU Lesser General Public 22 * License along with this library; if not, write to the Free Software 23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 24 * ========LICENCE======== 25 *. 26 */ 27 28#ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_double_INL 29#define __FFLASFFPACK_fflas_ffpack_utils_simd256_double_INL 30 31#if not (defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) or defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)) 32#error "You need AVX instructions to perform 256bits operations on double" 33#endif 34 35/* 36 * Simd256 specialized for double 37 */ 38template <> struct Simd256_impl<true, false, true, 8> : public Simd256fp_base { 39 /* 40 * alias to 256 bit simd register 41 */ 42 using vect_t = __m256d; 43 44 /* 45 * define the scalar type corresponding to the specialization 46 */ 47 using scalar_t = double; 48 49 /* 50 * number of scalar_t in a simd register 51 */ 52 static const constexpr size_t vect_size = 4; 53 54 /* 55 * alignement required by scalar_t pointer to be loaded in a vect_t 56 */ 57 static const constexpr size_t alignment = 32; 58 59 /* 60 * Check if the pointer p is a multiple of alignemnt 61 */ 62 template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; } 63 64 /* 65 * Check if the number n is a multiple of vect_size 66 */ 67 template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; } 68 69 /* 70 * Return vector of type vect_t with all elements set to zero 71 * Return [0,0,0,0] 72 */ 73 static INLINE CONST vect_t zero() { return _mm256_setzero_pd(); } 74 75 /* 76 * Broadcast double-precision (64-bit) floating-point value x to all elements of vect_t. 77 * Return [x,x,x,x] 78 */ 79 static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_pd(x); } 80 81 /* 82 * Set packed double-precision (64-bit) floating-point elements in vect_t with the supplied values. 83 * Return [x1,x2,x3,x4] 84 */ 85 static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4) { 86 return _mm256_set_pd(x4, x3, x2, x1); 87 } 88 89 /* 90 * Gather double-precision (64-bit) floating-point elements with indexes idx[0], ..., idx[3] from the address p in 91 *vect_t. 92 * Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]] 93 */ 94 template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { 95 return _mm256_set_pd(p[idx[3]], p[idx[2]], p[idx[1]], p[idx[0]]); 96 } 97 98 /* 99 * Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into vect_t. 100 * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. 101 * Return [p[0], p[1], p[2], p[3]] 102 */ 103 static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_pd(p); } 104 105 /* 106 * Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into vect_t. 107 * p does not need to be aligned on any particular boundary. 108 * Return [p[0], p[1], p[2], p[3]] 109 */ 110 static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_pd(p); } 111 112 /* 113 * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from p into memory. 114 * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. 115 */ 116 static INLINE void store(const scalar_t *p, const vect_t v) { _mm256_store_pd(const_cast<scalar_t *>(p), v); } 117 118 /* 119 * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from p into memory. 120 * p does not need to be aligned on any particular boundary. 121 */ 122 static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm256_storeu_pd(const_cast<scalar_t *>(p), v); } 123 124 /* 125 * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a into memory using 126 * a non-temporal memory hint. 127 * p must be aligned on a 32-byte boundary or a general-protection exception may be generated. 128 */ 129 static INLINE void stream(const scalar_t *p, const vect_t v) { _mm256_stream_pd(const_cast<scalar_t *>(p), v); } 130 131 /* 132 * Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, 133 * and store the results in dst. 134 * Args : [a0, a1, a2, a3] double 135 [b0, b1, b2, b3] double 136 * Return : [a[s[0..1]], ..., a[s[6..7]]] double 137 */ 138#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS) 139 template<uint8_t s> 140 static INLINE CONST vect_t shuffle(const vect_t a) { 141 return _mm256_permute4x64_pd(a, s); 142 } 143#endif 144 145 /* 146 * Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, 147 * and store the results in dst. 148 * Args : [a0, a1, a2, a3] double 149 [b0, b1, b2, b3] double 150 * Return : [a0, b0, a2, b2] double 151 */ 152 static INLINE CONST vect_t unpacklo_twice(const vect_t a, const vect_t b) { return _mm256_unpacklo_pd(a, b); } 153 154 /* 155 * Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, 156 * and store the results in dst. 157 * Args : [a0, a1, a2, a3] double 158 [b0, b1, b2, b3] double 159 * Return : [a1, b1, a3, b3] double 160 */ 161 static INLINE CONST vect_t unpackhi_twice(const vect_t a, const vect_t b) { return _mm256_unpackhi_pd(a, b); } 162 163 /* 164 * Blend packed double-precision (64-bit) floating-point elements from a and b using control mask s, 165 * and store the results in dst. 166 * Args : [a0, a1, a2, a3] double 167 [b0, b1, b2, b3] double 168 * Return : [s[0]?a0:b0, ..., s[3]?a3:b3] double 169 */ 170 template<uint8_t s> 171 static INLINE CONST vect_t blend(const vect_t a, const vect_t b) { 172 return _mm256_blend_pd(a, b, s); 173 } 174 175 /* 176 * Blend packed double-precision (64-bit) floating-point elements from a and b using mask, 177 * and store the results in dst. 178 * Args : [a0, a1, a2, a3] double 179 [b0, b1, b2, b3] double 180 * Return : [mask[31]?a0:b0, ..., mask[255]?a3:b3] double 181 */ 182 static INLINE CONST vect_t blendv(const vect_t a, const vect_t b, const vect_t mask) { 183 return _mm256_blendv_pd(a, b, mask); 184 } 185 186 /* 187 * Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t. 188 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 189 * Return : [a0+b0, a1+b1, a2+b2, a3+b3] 190 */ 191 static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_pd(a, b); } 192 193 static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); } 194 195 /* 196 * Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) 197 * floating-point elements in a, and store the results in vect_t. 198 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 199 * Return : [a0-b0, a1-b1, a2-b2, a3-b3] 200 */ 201 static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_pd(a, b); } 202 203 static INLINE CONST vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); } 204 205 /* 206 * Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t. 207 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 208 * Return : [a0*b0, a1*b1, a2*b2, a3*b3] 209 */ 210 static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return _mm256_mul_pd(a, b); } 211 212 static INLINE CONST vect_t mulin(vect_t &a, const vect_t b) { return a = mul(a, b); } 213 214 /* 215 * Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, 216 * and store the results in dst. 217 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 218 * Return : [a0/b0, a1/b1, a2/b2, a3/b3] 219 */ 220 static INLINE CONST vect_t div(const vect_t a, const vect_t b) { return _mm256_div_pd(a, b); } 221 222 /* 223 * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to 224 * packed elements in c, and store the results in vect_t. 225 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] 226 * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3] 227 */ 228 static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { 229#ifdef __FMA__ 230 return _mm256_fmadd_pd(a, b, c); 231#else 232 return add(c, mul(a, b)); 233#endif 234 } 235 236 static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); } 237 238 /* 239 * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result 240 * to packed elements in c, and store the results in vect_t. 241 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] 242 * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3] 243 */ 244 static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { 245#ifdef __FMA__ 246 return _mm256_fnmadd_pd(a, b, c); 247#else 248 return sub(c, mul(a, b)); 249#endif 250 } 251 252 static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); } 253 254 /* 255 * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from 256 * the intermediate result, and store the results in vect_t. 257 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] 258 * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3] 259 */ 260 static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { 261#ifdef __FMA__ 262 return _mm256_fmsub_pd(a, b, c); 263#else 264 return sub(mul(a, b), c); 265#endif 266 } 267 268 static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); } 269 270 /* 271 * Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results 272 in vect_t. 273 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 274 * Return : [(a0==b0) ? 0xFFFFFFFFFFFFFFFF : 0, 275 (a1==b1) ? 0xFFFFFFFFFFFFFFFF : 0, 276 (a2==b2) ? 0xFFFFFFFFFFFFFFFF : 0, 277 (a3==b3) ? 0xFFFFFFFFFFFFFFFF : 0] 278 */ 279 static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } 280 281 /* 282 * Compare packed double-precision (64-bit) floating-point elements in a and b for lesser-than, and store the 283 results in vect_t. 284 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 285 * Return : [(a0<b0) ? 0xFFFFFFFFFFFFFFFF : 0, 286 (a1<b1) ? 0xFFFFFFFFFFFFFFFF : 0, 287 (a2<b2) ? 0xFFFFFFFFFFFFFFFF : 0, 288 (a3<b3) ? 0xFFFFFFFFFFFFFFFF : 0] 289 */ 290 static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } 291 292 /* 293 * Compare packed double-precision (64-bit) floating-point elements in a and b for lesser or equal than, and store 294 the results in vect_t. 295 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 296 * Return : [(a0<=b0) ? 0xFFFFFFFFFFFFFFFF : 0, 297 (a1<=b1) ? 0xFFFFFFFFFFFFFFFF : 0, 298 (a2<=b2) ? 0xFFFFFFFFFFFFFFFF : 0, 299 (a3<=b3) ? 0xFFFFFFFFFFFFFFFF : 0] 300 */ 301 static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } 302 303 /* 304 * Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than, and store the 305 results in vect_t. 306 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 307 * Return : [(a0>b0) ? 0xFFFFFFFFFFFFFFFF : 0, 308 (a1>b1) ? 0xFFFFFFFFFFFFFFFF : 0, 309 (a2>b2) ? 0xFFFFFFFFFFFFFFFF : 0, 310 (a3>b3) ? 0xFFFFFFFFFFFFFFFF : 0] 311 */ 312 static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_GT_OS); } 313 314 /* 315 * Compare packed double-precision (64-bit) floating-point elements in a and b for greater or equal than, and store 316 the results in vect_t. 317 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 318 * Return : [(a0>=b0) ? 0xFFFFFFFFFFFFFFFF : 0, 319 (a1>=b1) ? 0xFFFFFFFFFFFFFFFF : 0, 320 (a2>=b2) ? 0xFFFFFFFFFFFFFFFF : 0, 321 (a3>=b3) ? 0xFFFFFFFFFFFFFFFF : 0] 322 */ 323 static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_GE_OS); } 324 325 /* 326 * Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the 327 * results in vect_t. 328 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 329 * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3] 330 */ 331 static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_pd(a, b); } 332 333 /* 334 * Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the 335 * results in vect_t. 336 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 337 * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3] 338 */ 339 static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_pd(a, b); } 340 341 /* 342 * Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the 343 * results in vect_t. 344 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 345 * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3] 346 */ 347 static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_pd(a, b); } 348 349 /* 350 * Compute the bitwise NOT AND of packed double-precision (64-bit) floating-point elements in a and b, and store the 351 * results in vect_t. 352 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 353 * Return : [NOT(a0) AND b0, NOT(a1) AND b1, NOT(a2) AND b2, NOT(a3) AND b3] 354 */ 355 static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_pd(a, b); } 356 357 /* 358 * Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the 359 * results as packed double-precision floating-point elements in vect_t. 360 * Args : [a0, a1, a2, a3] 361 * Return : [floor(a0), floor(a1), floor(a2), floor(a3)] 362 */ 363 static INLINE CONST vect_t floor(const vect_t a) { return _mm256_floor_pd(a); } 364 365 /* 366 * Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the 367 * results as packed double-precision floating-point elements in vect_t. 368 * Args : [a0, a1, a2, a3] 369 * Return : [ceil(a0), ceil(a1), ceil(a2), ceil(a3)] 370 */ 371 static INLINE CONST vect_t ceil(const vect_t a) { return _mm256_ceil_pd(a); } 372 373 /* 374 * Round the packed double-precision (64-bit) floating-point elements in a, and store the results as packed 375 * double-precision floating-point elements in vect_t. 376 * Args : [a0, a1, a2, a3] 377 * Return : [round(a0), round(a1), round(a2), round(a3)] 378 */ 379 static INLINE CONST vect_t round(const vect_t a) { 380 return _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); 381 } 382 383 /* 384 * Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the 385 * results in vect_t. 386 * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] 387 * Return : [a0+a1, b0+b1, a2+a3, b2+b3] 388 */ 389 static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm256_hadd_pd(a, b); } 390 391 /* 392 * Horizontally add double-precision (64-bit) floating-point elements in a. 393 * Args : [a0, a1, a2, a3] 394 * Return : a0+a1+a2+a3 395 */ 396 static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { 397 return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1] + ((const scalar_t *)&a)[2] + 398 ((const scalar_t *)&a)[3]; 399 } 400 401 static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN, 402 const vect_t &MAX, vect_t &Q, vect_t &T) { 403 FLOAT_MOD(C, P, INVP, Q); 404 NORML_MOD(C, P, NEGP, MIN, MAX, Q, T); 405 406 return C; 407 } 408 409}; 410 411#endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_double_INL 412/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 413// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s 414