1 /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6 */
7
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_U_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_U_H
10
11 #ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13 #endif
14
15 #include <simdpp/types.h>
16 #include <simdpp/core/transpose.h>
17 #include <simdpp/detail/align.h>
18 #include <simdpp/detail/not_implemented.h>
19 #include <simdpp/detail/insn/mem_unpack.h>
20 #include <simdpp/detail/null/memory.h>
21
22 namespace simdpp {
23 namespace SIMDPP_ARCH_NAMESPACE {
24 namespace detail {
25 namespace insn {
26
27 // -----------------------------------------------------------------------------
28
29 // Each integer type is handled separately because higher aligment guarantees
30 // offer better performance on e.g. ARM. Note, we don't use LDDQU on SSE,
31 // because it has usage restrictions and offers improved performance only on
32 // Pentium 4 era processors.
// Loads 16 bytes from an unaligned address into a uint8x16 vector.
// This overload is the workhorse: several other element widths forward to it
// on ISAs where the unaligned-load sequence is width-independent.
static SIMDPP_INL
void i_load_u(uint8x16& a, const char* p)
{
#if SIMDPP_USE_NULL
    detail::null::load(a, p);  // scalar reference implementation
#elif SIMDPP_USE_SSE2
    a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif SIMDPP_USE_NEON
    a = vld1q_u8(reinterpret_cast<const uint8_t*>(p));
#elif SIMDPP_USE_VSX_206
    // VSX 2.06 provides a true unaligned vector load
    const uint8_t* q = reinterpret_cast<const uint8_t*>(p);
    a = vec_vsx_ld(0, q);
#elif SIMDPP_USE_ALTIVEC
    // Classic Altivec has only aligned loads: fetch the two aligned 16-byte
    // chunks straddling [p, p+16) and merge them with a permute whose
    // control mask (vec_lvsl) encodes the misalignment of p.
    const uint8_t* q = reinterpret_cast<const uint8_t*>(p);
    uint8x16 l1, l2, mask;
    l1 = vec_ld(0, q);
    l2 = vec_ld(16, q);
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated"
    // vec_lvsl is deprecated in newer toolchains but is required on
    // pre-VSX targets; silence the warning locally.
    mask = vec_lvsl(0, q);
#pragma GCC diagnostic pop
    a = vec_perm(l1.native(), l2.native(), mask.native());
#elif SIMDPP_USE_MSA
    a = (v16u8) __msa_ld_b(p, 0);
#endif
}
59
// Loads 16 bytes from an unaligned address into a uint16x8 vector.
static SIMDPP_INL
void i_load_u(uint16x8& a, const char* p)
{
#if SIMDPP_USE_NULL
    detail::null::load(a, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    // The byte-wise unaligned load produces the same bit pattern; reuse it.
    uint8x16 b;
    i_load_u(b, p);
    a = b;
#elif SIMDPP_USE_NEON
    // Element-typed NEON load keeps the higher alignment guarantee visible
    // to the compiler (see note at the top of this section).
    a = vld1q_u16(reinterpret_cast<const uint16_t*>(p));
#elif SIMDPP_USE_MSA
    a = (v8u16) __msa_ld_h(p, 0);
#endif
}
75
// Loads 16 bytes from an unaligned address into a uint32x4 vector.
static SIMDPP_INL
void i_load_u(uint32x4& a, const char* p)
{
#if SIMDPP_USE_NULL
    detail::null::load(a, p);
#elif SIMDPP_USE_VSX_206
    // VSX 2.06 unaligned load, typed at 32-bit granularity.
    a = vec_vsx_ld(0, reinterpret_cast<const uint32_t*>(p));
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    // Reuse the byte-wise unaligned load; the bit pattern is identical.
    uint8x16 b;
    i_load_u(b, p);
    a = b;
#elif SIMDPP_USE_NEON
    a = vld1q_u32(reinterpret_cast<const uint32_t*>(p));
#elif SIMDPP_USE_MSA
    a = (v4u32) __msa_ld_w(p, 0);
#endif
}
93
// Loads 16 bytes from an unaligned address into a uint64x2 vector.
static SIMDPP_INL
void i_load_u(uint64x2& a, const char* p)
{
#if SIMDPP_USE_NULL
    detail::null::load(a, p);
#elif SIMDPP_USE_SSE2
    // Reuse the byte-wise unaligned load; the bit pattern is identical.
    uint8x16 b;
    i_load_u(b, p);
    a = b;
#elif SIMDPP_USE_VSX_207
#if SIMDPP_64_BITS
    a = (__vector uint64_t) vec_vsx_ld(0, reinterpret_cast<const uint64_t*>(p));
#else
    // BUG: GCC does not support vec_vsx_ld in 32-bit mode even when
    // VSX 2.07 is enabled; fall back to the byte-wise load.
    uint8x16 r;
    i_load_u(r, p);
    a = r;
#endif
#elif SIMDPP_USE_ALTIVEC
    // Pre-VSX Altivec has no native 64-bit integer vector support;
    // load element-wise through the scalar fallback.
    detail::null::load(a, p);
#elif SIMDPP_USE_NEON
    a = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
#elif SIMDPP_USE_MSA
    a = (v2u64) __msa_ld_d(p, 0);
#endif
}
121
122 static SIMDPP_INL
i_load_u(float32x4 & a,const char * p)123 void i_load_u(float32x4& a, const char* p)
124 {
125 const float* q = reinterpret_cast<const float*>(p);
126 (void) q;
127 #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
128 detail::null::load(a, p);
129 #elif SIMDPP_USE_SSE2
130 a = _mm_loadu_ps(q);
131 #elif SIMDPP_USE_NEON
132 a = vld1q_f32(q);
133 #elif SIMDPP_USE_VSX_206
134 a = vec_vsx_ld(0, q);
135 #elif SIMDPP_USE_ALTIVEC
136 uint32x4 b; (void) q;
137 i_load_u(b, p);
138 a = b;
139 #elif SIMDPP_USE_MSA
140 a = (v4f32) __msa_ld_w(q, 0);
141 #endif
142 }
143
// Loads 16 bytes from an unaligned address into a float64x2 vector.
static SIMDPP_INL
void i_load_u(float64x2& a, const char* p)
{
    const double* q = reinterpret_cast<const double*>(p);
    (void) q; // unused on the scalar-fallback paths
#if SIMDPP_USE_SSE2
    a = _mm_loadu_pd(q);
#elif SIMDPP_USE_NEON64
    // Double-precision NEON vectors require AArch64.
    a = vld1q_f64(q);
#elif SIMDPP_USE_VSX_206
    a = vec_vsx_ld(0, q);
#elif SIMDPP_USE_MSA
    a = (v2f64) __msa_ld_d(q, 0);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON
    // No native float64 vectors on these targets: scalar fallback.
    detail::null::load(a, p);
#else
    SIMDPP_NOT_IMPLEMENTED2(a, p);
#endif
}
163
#if SIMDPP_USE_AVX2
// 256-bit unaligned integer loads. All element widths share the same
// vmovdqu-based load; only the wrapper type differs.
static SIMDPP_INL
void i_load_u(uint8x32& a, const char* p)
{
    a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
static SIMDPP_INL
void i_load_u(uint16x16& a, const char* p)
{
    a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
static SIMDPP_INL
void i_load_u(uint32x8& a, const char* p)
{
    a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
static SIMDPP_INL
void i_load_u(uint64x4& a, const char* p)
{
    a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
#endif
#if SIMDPP_USE_AVX
// 256-bit unaligned floating-point loads (AVX provides FP vectors even
// without AVX2 integer support).
static SIMDPP_INL
void i_load_u(float32x8& a, const char* p)
{
    a = _mm256_loadu_ps(reinterpret_cast<const float*>(p));
}
static SIMDPP_INL
void i_load_u(float64x4& a, const char* p)
{
    a = _mm256_loadu_pd(reinterpret_cast<const double*>(p));
}
#endif
198
#if __INTEL_COMPILER && SIMDPP_USE_AVX && !SIMDPP_USE_AVX512F
// BUG: Certain versions of ICC don't like vectors larger than native vector
// (e.g. float32<16> and float64<8>) on AVX and AVX2. Two xmm vmovaps aligned
// loads are emitted for each 32-byte load even though the argument is clearly
// unaligned (e.g. p + 1). The code below results in the same output except
// that correct vmovups unaligned load instructions are used.
//
// Each 256-bit sub-vector is assembled from two explicit 128-bit unaligned
// loads: the low half, then the high half inserted into the upper lane.
template<unsigned N> SIMDPP_INL
void i_load_u(float32<N>& a, const char* p)
{
    for (unsigned i = 0; i < float32<N>::vec_length; ++i) {
        __m128 lo, hi;
        lo = _mm_loadu_ps(reinterpret_cast<const float*>(p));
        hi = _mm_loadu_ps(reinterpret_cast<const float*>(p + 16));
        a.vec(i) = _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
        p += 32;
    }
}

template<unsigned N> SIMDPP_INL
void i_load_u(float64<N>& a, const char* p)
{
    for (unsigned i = 0; i < float64<N>::vec_length; ++i) {
        __m128d lo, hi;
        lo = _mm_loadu_pd(reinterpret_cast<const double*>(p));
        hi = _mm_loadu_pd(reinterpret_cast<const double*>(p + 16));
        a.vec(i) = _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
        p += 32;
    }
}
#endif
229
#if __INTEL_COMPILER && SIMDPP_USE_AVX2 && !SIMDPP_USE_AVX512BW
// Same ICC workaround as the float overloads above, for 8-bit and 16-bit
// integer vectors wider than the native register: build each 256-bit
// sub-vector from two explicit 128-bit unaligned loads.
template<unsigned N> SIMDPP_INL
void i_load_u(uint8<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint8<N>::vec_length; ++i) {
        __m128i lo, hi;
        lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        p += 32;
    }
}

template<unsigned N> SIMDPP_INL
void i_load_u(uint16<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint16<N>::vec_length; ++i) {
        __m128i lo, hi;
        lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        p += 32;
    }
}
#endif
255
#if __INTEL_COMPILER && SIMDPP_USE_AVX2 && !SIMDPP_USE_AVX512F
// Same ICC workaround, for 32-bit and 64-bit integer vectors. Guarded by
// !SIMDPP_USE_AVX512F (not AVX512BW) because 32/64-bit elements gain native
// 512-bit support already with AVX512F.
template<unsigned N> SIMDPP_INL
void i_load_u(uint32<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint32<N>::vec_length; ++i) {
        __m128i lo, hi;
        lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        p += 32;
    }
}

template<unsigned N> SIMDPP_INL
void i_load_u(uint64<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint64<N>::vec_length; ++i) {
        __m128i lo, hi;
        lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        p += 32;
    }
}
#endif
281
#if SIMDPP_USE_AVX512BW
// 512-bit unaligned loads for 8/16-bit elements (require AVX512BW).
// Declared 'static SIMDPP_INL' for consistency with every other overload
// in this file.
static SIMDPP_INL
void i_load_u(uint8<64>& a, const char* p)
{
    a = _mm512_loadu_si512(p);
}
static SIMDPP_INL
void i_load_u(uint16<32>& a, const char* p)
{
    a = _mm512_loadu_si512(p);
}
#endif
292
#if SIMDPP_USE_AVX512F
// 512-bit unaligned loads for 32/64-bit integer and floating-point elements.
static SIMDPP_INL
void i_load_u(uint32<16>& a, const char* p)
{
    a = _mm512_loadu_si512(p);
}
static SIMDPP_INL
void i_load_u(uint64<8>& a, const char* p)
{
    a = _mm512_loadu_si512(p);
}
static SIMDPP_INL
void i_load_u(float32<16>& a, const char* p)
{
    a = _mm512_loadu_ps(reinterpret_cast<const float*>(p));
}
static SIMDPP_INL
void i_load_u(float64<8>& a, const char* p)
{
    a = _mm512_loadu_pd(reinterpret_cast<const double*>(p));
}
#endif
315
316 // -----------------------------------------------------------------------------
317
318 template<class V> SIMDPP_INL
i_load_u(V & a,const char * p)319 void i_load_u(V& a, const char* p)
320 {
321 const unsigned veclen = V::base_vector_type::length_bytes;
322 for (unsigned i = 0; i < V::vec_length; ++i) {
323 i_load_u(a.vec(i), p);
324 p += veclen;
325 }
326 }
327
328 template<class V> SIMDPP_INL
i_load_u_any(const char * p)329 V i_load_u_any(const char* p)
330 {
331 typename detail::remove_sign<V>::type r;
332 i_load_u(r, p);
333 return V(r);
334 }
335
336 } // namespace insn
337
// Expression-template hook: evaluates an unaligned-load expression node
// by dispatching to the insn-level implementation for the target type V.
template<class V> SIMDPP_INL
void construct_eval(V& v, const expr_vec_load_u& e)
{
    v = insn::i_load_u_any<V>(e.a);
}
343
344 } // namespace detail
345 } // namespace SIMDPP_ARCH_NAMESPACE
346 } // namespace simdpp
347
348 #endif
349
350