1/* 2 * Copyright (C) 2014 the FFLAS-FFPACK group 3 * 4 * Written by Bastien Vialla<bastien.vialla@lirmm.fr> 5 * Brice Boyer (briceboyer) <boyer.brice@gmail.com> 6 * 7 * 8 * ========LICENCE======== 9 * This file is part of the library FFLAS-FFPACK. 10 * 11 * FFLAS-FFPACK is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public 13 * License as published by the Free Software Foundation; either 14 * version 2.1 of the License, or (at your option) any later version. 15 * 16 * This library is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 * Lesser General Public License for more details. 20 * 21 * You should have received a copy of the GNU Lesser General Public 22 * License along with this library; if not, write to the Free Software 23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 24 * ========LICENCE======== 25 *. 26 */ 27 28#ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_INL 29#define __FFLASFFPACK_fflas_ffpack_utils_simd256_INL 30 31struct Simd256fp_base { 32#if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) 33 34 /* Name of the Simd struct */ 35 static inline const std::string type_string () { return "Simd256"; } 36 37 /* 38 * Shuffle 128-bits selected by imm8 from a and b, and store the results in dst. 39 * Args : [a0, a1] 40 * [b0, b1] 41 * Return : [s[0..3]?a0:a1:b0:b1, s[4..7]?a0:a1:b0:b1] 42 */ 43 template<int s> 44 static INLINE CONST __m256d permute128(const __m256d a, const __m256d b) { 45 return _mm256_permute2f128_pd(a, b, s); 46 } 47 48 template<int s> 49 static INLINE CONST __m256 permute128(const __m256 a, const __m256 b) { 50 return _mm256_permute2f128_ps(a, b, s); 51 } 52 53 /* 54 * Unpack and interleave 128-bit integers from the low half of a and b, and store the results in dst. 55 * Args : [a0, a1] int128_t 56 [b0, b1] int128_t 57 * Return : [a0, b0] int128_t 58 */ 59 static INLINE CONST __m256d unpacklo128(const __m256d a, const __m256d b) { return permute128<0x20>(a, b); } 60 static INLINE CONST __m256 unpacklo128(const __m256 a, const __m256 b) { return permute128<0x20>(a, b); } 61 62 /* 63 * Unpack and interleave 128-bit integers from the high half of a and b, and store the results in dst. 64 * Args : [a0, a1] int128_t 65 [b0, b1] int128_t 66 * Return : [a1, b1] int128_t 67 */ 68 static INLINE CONST __m256d unpackhi128(const __m256d a, const __m256d b) { return permute128<0x31>(a, b); } 69 static INLINE CONST __m256 unpackhi128(const __m256 a, const __m256 b) { return permute128<0x31>(a, b); } 70 71#endif 72}; 73 74struct Simd256i_base { 75 76 /* 77 * alias to 256 bit simd register 78 */ 79 using vect_t = __m256i; 80 81 /* Name of the Simd struct */ 82 static inline const std::string type_string () { return "Simd256"; } 83 84 /* 85 * Return vector of type vect_t with all elements set to zero 86 * Return [0, ...,0] 87 */ 88 static INLINE CONST vect_t zero() { return _mm256_setzero_si256(); } 89 90#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS) 91 92 // CLANG < 3.8 does not implement m256_bslli_epi128 nor _mmm256_bsrli_epi128 93#if defined(__clang__) 94#if __clang_major < 3 || (__clang_major__ == 3 && __clang_minor__ < 8) 95#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count)) 96#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count)) 97#endif 98#endif 99 100 /* 101 * Shift packed 128-bit integers in a left by s bits while shifting in zeros, and store the results in vect_t. 102 * Args : [a0, a1] int128_t 103 * Return : [a0 << (s*8), a1 << (s*8)] int128_t 104 */ 105 template<uint8_t s> 106 static INLINE CONST vect_t sll128(const vect_t a) { return _mm256_bslli_epi128(a, s); } 107 108 /* 109 * Shift packed 128-bit integers in a right by s while shifting in zeros, and store the results in vect_t. 110 * Args : [a0, a1] int128_t 111 * Return : [a0 << (s*8), a1 << (s*8)] int128_t 112 */ 113 template<uint8_t s> 114 static INLINE CONST vect_t srl128(const vect_t a) { return _mm256_bsrli_epi128(a, s); } 115 116 /* 117 * Compute the bitwise AND and store the results in vect_t. 118 * Args : [a0, ..., a255] 119 * [b0, ..., b255] 120 * Return : [a0 AND b0, ..., a255 AND b255] 121 */ 122 static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_si256(b, a); } 123 124 /* 125 * Compute the bitwise OR and store the results in vect_t. 126 * Args : [a0, ..., a255] 127 * [b0, ..., b255] 128 * Return : [a0 OR b0, ..., a255 OR b255] 129 */ 130 static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_si256(b, a); } 131 132 /* 133 * Compute the bitwise XOR and store the results in vect_t. 134 * Args : [a0, ..., a255] 135 * [b0, ..., b255] 136 * Return : [a0 XOR b0, ..., a255 XOR b255] 137 */ 138 static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_si256(b, a); } 139 140 /* 141 * Compute the bitwise NOT AND and store the results in vect_t. 142 * Args : [a0, ..., a255] 143 * [b0, ..., b255] 144 * Return : [(NOT a0) AND b0, ..., (NOT a255) AND b255] 145 */ 146 static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_si256(a, b); } 147 148 /* 149 * Shuffle 128-bit integers in a and b using the control in imm8, and store the results in dst. 150 * Args : [a0, a1] int128_t 151 * [b0, b1] int128_t 152 * Return : [s[0..3]?a0:a1:b0:b1, s[4..7]?a0:a1:b0:b1] int128_t 153 */ 154 template<int s> 155 static INLINE CONST vect_t permute128(const vect_t a, const vect_t b) { 156 return _mm256_permute2x128_si256(a, b, s); 157 } 158 159 /* 160 * Unpack and interleave 128-bit integers from the low half of a and b, and store the results in dst. 161 * Args : [a0, a1] int128_t 162 [b0, b1] int128_t 163 * Return : [a0, b0] int128_t 164 */ 165 static INLINE CONST vect_t unpacklo128(const vect_t a, const vect_t b) { return permute128<0x20>(a, b); } 166 167 /* 168 * Unpack and interleave 128-bit integers from the high half of a and b, and store the results in dst. 169 * Args : [a0, a1] int128_t 170 [b0, b1] int128_t 171 * Return : [a1, b1] int128_t 172 */ 173 static INLINE CONST vect_t unpackhi128(const vect_t a, const vect_t b) { return permute128<0x31>(a, b); } 174#endif 175}; 176 177template <bool ArithType, bool Int, bool Signed, int Size> struct Simd256_impl; 178 179template <class T> 180using Simd256 = 181Simd256_impl<std::is_arithmetic<T>::value, std::is_integral<T>::value, std::is_signed<T>::value, sizeof(T)>; 182 183#include "simd256_float.inl" 184#include "simd256_double.inl" 185 186#ifdef SIMD_INT 187// To many missing insctructions on int8_t 188 189#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS) 190#ifdef __x86_64__ 191#include "simd256_int64.inl" 192#endif 193#include "simd256_int32.inl" 194#include "simd256_int16.inl" 195#endif 196 197#endif //#ifdef SIMD_INT 198 199 200#endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_INL 201/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 202// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s 203