1 /*
2 * Simd Library (http://ermig1979.github.io/Simd).
3 *
4 * Copyright (c) 2011-2019 Yermalayeu Ihar.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24 #ifndef __SimdExtract_h__
25 #define __SimdExtract_h__
26 
27 #include "Simd/SimdConst.h"
28 
29 namespace Simd
30 {
31 #ifdef SIMD_SSE_ENABLE
32     namespace Sse
33     {
ExtractValue(__m128 a,int i)34         SIMD_INLINE float ExtractValue(__m128 a, int i)
35         {
36             float SIMD_ALIGNED(16) _a[4];
37             _mm_store_ps(_a, a);
38             return _a[i];
39         }
40 
ExtractSum(__m128 a)41         SIMD_INLINE float ExtractSum(__m128 a)
42         {
43             float SIMD_ALIGNED(16) _a[4];
44             _mm_store_ps(_a, a);
45             return _a[0] + _a[1] + _a[2] + _a[3];
46         }
47     }
48 #endif//SIMD_SSE_ENABLE
49 
50 #ifdef SIMD_SSE2_ENABLE
51     namespace Sse2
52     {
ExtractInt8(__m128i a)53         template <int index> SIMD_INLINE int ExtractInt8(__m128i a)
54         {
55             return _mm_extract_epi16(_mm_srli_si128(a, index & 0x1), index >> 1) & 0xFF;
56         }
57 
ExtractInt16(__m128i a)58         template <int index> SIMD_INLINE int ExtractInt16(__m128i a)
59         {
60             return _mm_extract_epi16(a, index);
61         }
62 
ExtractInt32(__m128i a)63         template <int index> SIMD_INLINE int ExtractInt32(__m128i a)
64         {
65             return _mm_cvtsi128_si32(_mm_srli_si128(a, 4 * index));
66         }
67 
ExtractInt32Sum(__m128i a)68         SIMD_INLINE int ExtractInt32Sum(__m128i a)
69         {
70             int SIMD_ALIGNED(16) _a[4];
71             _mm_store_si128((__m128i*)_a, a);
72             return _a[0] + _a[1] + _a[2] + _a[3];
73         }
74 
        // Returns the 64-bit lane <index> (0 or 1) of a as a signed integer.
        template <int index> SIMD_INLINE int64_t ExtractInt64(__m128i a)
        {
#if defined(SIMD_X64_ENABLE) && (!defined(_MSC_VER) || (defined(_MSC_VER) && _MSC_VER >= 1600))
            // 64-bit targets (and MSVC >= 2010): shift the wanted lane down
            // and move it straight to a general-purpose register.
            return _mm_cvtsi128_si64(_mm_srli_si128(a, 8 * index));
#else
            // 32-bit targets / old MSVC without _mm_cvtsi128_si64: rebuild the
            // value from its two 32-bit halves. The low half is taken unsigned
            // so its top bit is not sign-extended into the high half.
            return (int64_t)ExtractInt32<2 * index + 1>(a) * 0x100000000 + (uint32_t)ExtractInt32<2 * index>(a);
#endif
        }
83 
ExtractInt64Sum(__m128i a)84         SIMD_INLINE int64_t ExtractInt64Sum(__m128i a)
85         {
86             int64_t SIMD_ALIGNED(16) _a[2];
87             _mm_store_si128((__m128i*)_a, a);
88             return _a[0] + _a[1];
89         }
90     }
91 #endif// SIMD_SSE2_ENABLE
92 
93 #ifdef SIMD_SSE3_ENABLE
94     namespace Sse3
95     {
ExtractSum(__m128 a)96         SIMD_INLINE float ExtractSum(__m128 a)
97         {
98             return _mm_cvtss_f32(_mm_hadd_ps(_mm_hadd_ps(a, _mm_setzero_ps()), _mm_setzero_ps()));
99         }
100 
Extract4Sums(const __m128 a[4])101         SIMD_INLINE __m128 Extract4Sums(const __m128 a[4])
102         {
103             return _mm_hadd_ps(_mm_hadd_ps(a[0], a[1]), _mm_hadd_ps(a[2], a[3]));
104         }
105     }
106 #endif//SIMD_SSE3_ENABLE
107 
108 #ifdef SIMD_AVX_ENABLE
109     namespace Avx
110     {
ExtractValue(__m256 a,int i)111         SIMD_INLINE float ExtractValue(__m256 a, int i)
112         {
113             float SIMD_ALIGNED(32) _a[8];
114             _mm256_store_ps(_a, a);
115             return _a[i];
116         }
117 
ExtractSum(__m256 a)118         SIMD_INLINE float ExtractSum(__m256 a)
119         {
120             float SIMD_ALIGNED(32) _a[8];
121             _mm256_store_ps(_a, _mm256_hadd_ps(_mm256_hadd_ps(a, _mm256_setzero_ps()), _mm256_setzero_ps()));
122             return _a[0] + _a[4];
123         }
124 
Extract4Sums(const __m256 a[4])125         SIMD_INLINE __m128 Extract4Sums(const __m256 a[4])
126         {
127             __m256 b = _mm256_hadd_ps(_mm256_hadd_ps(a[0], a[1]), _mm256_hadd_ps(a[2], a[3]));
128             return _mm_add_ps(_mm256_castps256_ps128(b), _mm256_extractf128_ps(b, 1));
129         }
130 
Extract4Sums(const __m256 & a0,const __m256 & a1,const __m256 & a2,const __m256 & a3)131         SIMD_INLINE __m128 Extract4Sums(const __m256 & a0, const __m256 & a1, const __m256 & a2, const __m256 & a3)
132         {
133             __m256 b = _mm256_hadd_ps(_mm256_hadd_ps(a0, a1), _mm256_hadd_ps(a2, a3));
134             return _mm_add_ps(_mm256_castps256_ps128(b), _mm256_extractf128_ps(b, 1));
135         }
136     }
137 #endif//SIMD_AVX_ENABLE
138 
139 #ifdef SIMD_AVX2_ENABLE
140     namespace Avx2
141     {
Extract(__m256i a,size_t index)142         template <class T> SIMD_INLINE T Extract(__m256i a, size_t index)
143         {
144             const size_t size = A / sizeof(T);
145             assert(index < size);
146             T buffer[size];
147             _mm256_storeu_si256((__m256i*)buffer, a);
148             return buffer[index];
149         }
150 
ExtractSum(__m256i a)151         template <class T> SIMD_INLINE T ExtractSum(__m256i a)
152         {
153             const size_t size = A / sizeof(T);
154             T buffer[size];
155             _mm256_storeu_si256((__m256i*)buffer, a);
156             T sum = 0;
157             for (size_t i = 0; i < size; ++i)
158                 sum += buffer[i];
159             return sum;
160         }
161 
162         template <> SIMD_INLINE uint32_t ExtractSum<uint32_t>(__m256i a)
163         {
164             __m128i b = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(a, 1));
165             return _mm_extract_epi32(_mm_hadd_epi32(_mm_hadd_epi32(b, _mm_setzero_si128()), _mm_setzero_si128()), 0);
166         }
167 
168 #if defined(SIMD_X64_ENABLE)
169         template <> SIMD_INLINE uint64_t ExtractSum<uint64_t>(__m256i a)
170         {
171             __m128i b = _mm_add_epi64(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(a, 1));
172             return _mm_extract_epi64(b, 0) + _mm_extract_epi64(b, 1);
173         }
174 #endif
175 
        // Returns the 64-bit lane <index> (0..3) of value as a signed integer.
        template <int index> SIMD_INLINE int64_t Extract64i(__m256i value)
        {
            assert(index >= 0 && index < 4);
#if defined(SIMD_X64_ENABLE)
#if (defined(_MSC_VER) && (_MSC_VER <= 1900))
            // MSVC <= 2015 lacks _mm256_extract_epi64: pick the 128-bit half
            // first, then extract one of its two lanes.
            return _mm_extract_epi64(_mm256_extractf128_si256(value, index / 2), index % 2);
#else
            return _mm256_extract_epi64(value, index);
#endif
#else
            // 32-bit targets have no 64-bit extract instruction: go through memory.
            SIMD_ALIGNED(32) int64_t buffer[4];
            _mm256_store_si256((__m256i*)buffer, value);
            return buffer[index];
#endif
        }
191     }
192 #endif// SIMD_AVX2_ENABLE
193 
194 #ifdef SIMD_NEON_ENABLE
195     namespace Neon
196     {
ExtractSum32u(const uint32x4_t & a)197         SIMD_INLINE uint32_t ExtractSum32u(const uint32x4_t & a)
198         {
199             return vgetq_lane_u32(a, 0) + vgetq_lane_u32(a, 1) + vgetq_lane_u32(a, 2) + vgetq_lane_u32(a, 3);
200         }
201 
ExtractSum64u(const uint64x2_t & a)202         SIMD_INLINE uint64_t ExtractSum64u(const uint64x2_t & a)
203         {
204             return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
205         }
206 
ExtractSum64i(const int64x2_t & a)207         SIMD_INLINE int64_t ExtractSum64i(const int64x2_t & a)
208         {
209             return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
210         }
211 
ExtractSum32f(const float32x4_t & a)212         SIMD_INLINE float ExtractSum32f(const float32x4_t & a)
213         {
214             return vgetq_lane_f32(a, 0) + vgetq_lane_f32(a, 1) + vgetq_lane_f32(a, 2) + vgetq_lane_f32(a, 3);
215         }
216 
Extract4Sums(const float32x4_t a[4])217         SIMD_INLINE float32x4_t Extract4Sums(const float32x4_t a[4])
218         {
219             float32x4x2_t b0 = vzipq_f32(a[0], a[2]);
220             float32x4x2_t b1 = vzipq_f32(a[1], a[3]);
221             float32x4x2_t c0 = vzipq_f32(b0.val[0], b1.val[0]);
222             float32x4x2_t c1 = vzipq_f32(b0.val[1], b1.val[1]);
223             return vaddq_f32(vaddq_f32(c0.val[0], c0.val[1]), vaddq_f32(c1.val[0], c1.val[1]));
224         }
225 
Extract4Sums(const float32x4_t & a0,const float32x4_t & a1,const float32x4_t & a2,const float32x4_t & a3)226         SIMD_INLINE float32x4_t Extract4Sums(const float32x4_t & a0, const float32x4_t & a1, const float32x4_t & a2, const float32x4_t & a3)
227         {
228             float32x4x2_t b0 = vzipq_f32(a0, a2);
229             float32x4x2_t b1 = vzipq_f32(a1, a3);
230             float32x4x2_t c0 = vzipq_f32(b0.val[0], b1.val[0]);
231             float32x4x2_t c1 = vzipq_f32(b0.val[1], b1.val[1]);
232             return vaddq_f32(vaddq_f32(c0.val[0], c0.val[1]), vaddq_f32(c1.val[0], c1.val[1]));
233         }
234     }
235 #endif// SIMD_NEON_ENABLE
236 }
237 
238 #endif//__SimdExtract_h__
239