1 /* 2 * Simd Library (http://ermig1979.github.io/Simd). 3 * 4 * Copyright (c) 2011-2019 Yermalayeu Ihar. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 #ifndef __SimdExtract_h__ 25 #define __SimdExtract_h__ 26 27 #include "Simd/SimdConst.h" 28 29 namespace Simd 30 { 31 #ifdef SIMD_SSE_ENABLE 32 namespace Sse 33 { ExtractValue(__m128 a,int i)34 SIMD_INLINE float ExtractValue(__m128 a, int i) 35 { 36 float SIMD_ALIGNED(16) _a[4]; 37 _mm_store_ps(_a, a); 38 return _a[i]; 39 } 40 ExtractSum(__m128 a)41 SIMD_INLINE float ExtractSum(__m128 a) 42 { 43 float SIMD_ALIGNED(16) _a[4]; 44 _mm_store_ps(_a, a); 45 return _a[0] + _a[1] + _a[2] + _a[3]; 46 } 47 } 48 #endif//SIMD_SSE_ENABLE 49 50 #ifdef SIMD_SSE2_ENABLE 51 namespace Sse2 52 { ExtractInt8(__m128i a)53 template <int index> SIMD_INLINE int ExtractInt8(__m128i a) 54 { 55 return _mm_extract_epi16(_mm_srli_si128(a, index & 0x1), index >> 1) & 0xFF; 56 } 57 ExtractInt16(__m128i a)58 template <int index> SIMD_INLINE int ExtractInt16(__m128i a) 59 { 60 return _mm_extract_epi16(a, index); 61 } 62 ExtractInt32(__m128i a)63 template <int index> SIMD_INLINE int ExtractInt32(__m128i a) 64 { 65 return _mm_cvtsi128_si32(_mm_srli_si128(a, 4 * index)); 66 } 67 ExtractInt32Sum(__m128i a)68 SIMD_INLINE int ExtractInt32Sum(__m128i a) 69 { 70 int SIMD_ALIGNED(16) _a[4]; 71 _mm_store_si128((__m128i*)_a, a); 72 return _a[0] + _a[1] + _a[2] + _a[3]; 73 } 74 ExtractInt64(__m128i a)75 template <int index> SIMD_INLINE int64_t ExtractInt64(__m128i a) 76 { 77 #if defined(SIMD_X64_ENABLE) && (!defined(_MSC_VER) || (defined(_MSC_VER) && _MSC_VER >= 1600)) 78 return _mm_cvtsi128_si64(_mm_srli_si128(a, 8 * index)); 79 #else 80 return (int64_t)ExtractInt32<2 * index + 1>(a) * 0x100000000 + (uint32_t)ExtractInt32<2 * index>(a); 81 #endif 82 } 83 ExtractInt64Sum(__m128i a)84 SIMD_INLINE int64_t ExtractInt64Sum(__m128i a) 85 { 86 int64_t SIMD_ALIGNED(16) _a[2]; 87 _mm_store_si128((__m128i*)_a, a); 88 return _a[0] + _a[1]; 89 } 90 } 91 #endif// SIMD_SSE2_ENABLE 92 93 #ifdef SIMD_SSE3_ENABLE 94 namespace Sse3 95 { ExtractSum(__m128 a)96 SIMD_INLINE float ExtractSum(__m128 a) 97 { 98 return _mm_cvtss_f32(_mm_hadd_ps(_mm_hadd_ps(a, _mm_setzero_ps()), _mm_setzero_ps())); 99 } 100 Extract4Sums(const __m128 a[4])101 SIMD_INLINE __m128 Extract4Sums(const __m128 a[4]) 102 { 103 return _mm_hadd_ps(_mm_hadd_ps(a[0], a[1]), _mm_hadd_ps(a[2], a[3])); 104 } 105 } 106 #endif//SIMD_SSE3_ENABLE 107 108 #ifdef SIMD_AVX_ENABLE 109 namespace Avx 110 { ExtractValue(__m256 a,int i)111 SIMD_INLINE float ExtractValue(__m256 a, int i) 112 { 113 float SIMD_ALIGNED(32) _a[8]; 114 _mm256_store_ps(_a, a); 115 return _a[i]; 116 } 117 ExtractSum(__m256 a)118 SIMD_INLINE float ExtractSum(__m256 a) 119 { 120 float SIMD_ALIGNED(32) _a[8]; 121 _mm256_store_ps(_a, _mm256_hadd_ps(_mm256_hadd_ps(a, _mm256_setzero_ps()), _mm256_setzero_ps())); 122 return _a[0] + _a[4]; 123 } 124 Extract4Sums(const __m256 a[4])125 SIMD_INLINE __m128 Extract4Sums(const __m256 a[4]) 126 { 127 __m256 b = _mm256_hadd_ps(_mm256_hadd_ps(a[0], a[1]), _mm256_hadd_ps(a[2], a[3])); 128 return _mm_add_ps(_mm256_castps256_ps128(b), _mm256_extractf128_ps(b, 1)); 129 } 130 Extract4Sums(const __m256 & a0,const __m256 & a1,const __m256 & a2,const __m256 & a3)131 SIMD_INLINE __m128 Extract4Sums(const __m256 & a0, const __m256 & a1, const __m256 & a2, const __m256 & a3) 132 { 133 __m256 b = _mm256_hadd_ps(_mm256_hadd_ps(a0, a1), _mm256_hadd_ps(a2, a3)); 134 return _mm_add_ps(_mm256_castps256_ps128(b), _mm256_extractf128_ps(b, 1)); 135 } 136 } 137 #endif//SIMD_AVX_ENABLE 138 139 #ifdef SIMD_AVX2_ENABLE 140 namespace Avx2 141 { Extract(__m256i a,size_t index)142 template <class T> SIMD_INLINE T Extract(__m256i a, size_t index) 143 { 144 const size_t size = A / sizeof(T); 145 assert(index < size); 146 T buffer[size]; 147 _mm256_storeu_si256((__m256i*)buffer, a); 148 return buffer[index]; 149 } 150 ExtractSum(__m256i a)151 template <class T> SIMD_INLINE T ExtractSum(__m256i a) 152 { 153 const size_t size = A / sizeof(T); 154 T buffer[size]; 155 _mm256_storeu_si256((__m256i*)buffer, a); 156 T sum = 0; 157 for (size_t i = 0; i < size; ++i) 158 sum += buffer[i]; 159 return sum; 160 } 161 162 template <> SIMD_INLINE uint32_t ExtractSum<uint32_t>(__m256i a) 163 { 164 __m128i b = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(a, 1)); 165 return _mm_extract_epi32(_mm_hadd_epi32(_mm_hadd_epi32(b, _mm_setzero_si128()), _mm_setzero_si128()), 0); 166 } 167 168 #if defined(SIMD_X64_ENABLE) 169 template <> SIMD_INLINE uint64_t ExtractSum<uint64_t>(__m256i a) 170 { 171 __m128i b = _mm_add_epi64(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(a, 1)); 172 return _mm_extract_epi64(b, 0) + _mm_extract_epi64(b, 1); 173 } 174 #endif 175 Extract64i(__m256i value)176 template <int index> SIMD_INLINE int64_t Extract64i(__m256i value) 177 { 178 assert(index >= 0 && index < 4); 179 #if defined(SIMD_X64_ENABLE) 180 #if (defined(_MSC_VER) && (_MSC_VER <= 1900)) 181 return _mm_extract_epi64(_mm256_extractf128_si256(value, index / 2), index % 2); 182 #else 183 return _mm256_extract_epi64(value, index); 184 #endif 185 #else 186 SIMD_ALIGNED(32) int64_t buffer[4]; 187 _mm256_store_si256((__m256i*)buffer, value); 188 return buffer[index]; 189 #endif 190 } 191 } 192 #endif// SIMD_AVX2_ENABLE 193 194 #ifdef SIMD_NEON_ENABLE 195 namespace Neon 196 { ExtractSum32u(const uint32x4_t & a)197 SIMD_INLINE uint32_t ExtractSum32u(const uint32x4_t & a) 198 { 199 return vgetq_lane_u32(a, 0) + vgetq_lane_u32(a, 1) + vgetq_lane_u32(a, 2) + vgetq_lane_u32(a, 3); 200 } 201 ExtractSum64u(const uint64x2_t & a)202 SIMD_INLINE uint64_t ExtractSum64u(const uint64x2_t & a) 203 { 204 return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); 205 } 206 ExtractSum64i(const int64x2_t & a)207 SIMD_INLINE int64_t ExtractSum64i(const int64x2_t & a) 208 { 209 return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); 210 } 211 ExtractSum32f(const float32x4_t & a)212 SIMD_INLINE float ExtractSum32f(const float32x4_t & a) 213 { 214 return vgetq_lane_f32(a, 0) + vgetq_lane_f32(a, 1) + vgetq_lane_f32(a, 2) + vgetq_lane_f32(a, 3); 215 } 216 Extract4Sums(const float32x4_t a[4])217 SIMD_INLINE float32x4_t Extract4Sums(const float32x4_t a[4]) 218 { 219 float32x4x2_t b0 = vzipq_f32(a[0], a[2]); 220 float32x4x2_t b1 = vzipq_f32(a[1], a[3]); 221 float32x4x2_t c0 = vzipq_f32(b0.val[0], b1.val[0]); 222 float32x4x2_t c1 = vzipq_f32(b0.val[1], b1.val[1]); 223 return vaddq_f32(vaddq_f32(c0.val[0], c0.val[1]), vaddq_f32(c1.val[0], c1.val[1])); 224 } 225 Extract4Sums(const float32x4_t & a0,const float32x4_t & a1,const float32x4_t & a2,const float32x4_t & a3)226 SIMD_INLINE float32x4_t Extract4Sums(const float32x4_t & a0, const float32x4_t & a1, const float32x4_t & a2, const float32x4_t & a3) 227 { 228 float32x4x2_t b0 = vzipq_f32(a0, a2); 229 float32x4x2_t b1 = vzipq_f32(a1, a3); 230 float32x4x2_t c0 = vzipq_f32(b0.val[0], b1.val[0]); 231 float32x4x2_t c1 = vzipq_f32(b0.val[1], b1.val[1]); 232 return vaddq_f32(vaddq_f32(c0.val[0], c0.val[1]), vaddq_f32(c1.val[0], c1.val[1])); 233 } 234 } 235 #endif// SIMD_NEON_ENABLE 236 } 237 238 #endif//__SimdExtract_h__ 239