/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_U_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_U_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/transpose.h>
#include <simdpp/detail/align.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/detail/insn/mem_unpack.h>
#include <simdpp/detail/null/memory.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// -----------------------------------------------------------------------------

// Each integer type is handled separately because higher alignment guarantees
// offer better performance on e.g. ARM. Note that we don't use LDDQU on SSE,
// because it has usage restrictions and improves performance only on
// Pentium 4 era processors.
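//
// For context, these overloads back the public load_u API. A minimal usage
// sketch (assuming the simdpp::load_u entry point declared in
// simdpp/core/load_u.h):
//
//     uint8_t buf[17];
//     simdpp::uint8<16> v = simdpp::load_u(buf + 1); // deliberately unaligned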
static SIMDPP_INL
void i_load_u(uint8x16& a, const char* p)
{
#if SIMDPP_USE_NULL
    detail::null::load(a, p);
#elif SIMDPP_USE_SSE2
    a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif SIMDPP_USE_NEON
    a = vld1q_u8(reinterpret_cast<const uint8_t*>(p));
#elif SIMDPP_USE_VSX_206
    const uint8_t* q = reinterpret_cast<const uint8_t*>(p);
    a = vec_vsx_ld(0, q);
#elif SIMDPP_USE_ALTIVEC
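    // On pre-VSX Altivec, vec_ld ignores the low four address bits and can
    // only load from 16-byte aligned locations. Emulate the unaligned load by
    // loading the two aligned 16-byte blocks that straddle the address and
    // combining them with vec_perm, using the permute mask from vec_lvsl.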
    const uint8_t* q = reinterpret_cast<const uint8_t*>(p);
    uint8x16 l1, l2, mask;
    l1 = vec_ld(0, q);
    l2 = vec_ld(16, q);
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated"
    mask = vec_lvsl(0, q);
#pragma GCC diagnostic pop
    a = vec_perm(l1.native(), l2.native(), mask.native());
#elif SIMDPP_USE_MSA
    a = (v16u8) __msa_ld_b(p, 0);
#endif
}

static SIMDPP_INL
void i_load_u(uint16x8& a, const char* p)
{
#if SIMDPP_USE_NULL
    detail::null::load(a, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    uint8x16 b;
    i_load_u(b, p);
    a = b;
#elif SIMDPP_USE_NEON
    a = vld1q_u16(reinterpret_cast<const uint16_t*>(p));
#elif SIMDPP_USE_MSA
    a = (v8u16) __msa_ld_h(p, 0);
#endif
}

static SIMDPP_INL
void i_load_u(uint32x4& a, const char* p)
{
#if SIMDPP_USE_NULL
    detail::null::load(a, p);
#elif SIMDPP_USE_VSX_206
    a = vec_vsx_ld(0, reinterpret_cast<const uint32_t*>(p));
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    uint8x16 b;
    i_load_u(b, p);
    a = b;
#elif SIMDPP_USE_NEON
    a = vld1q_u32(reinterpret_cast<const uint32_t*>(p));
#elif SIMDPP_USE_MSA
    a = (v4u32) __msa_ld_w(p, 0);
#endif
}

static SIMDPP_INL
void i_load_u(uint64x2& a, const char* p)
{
#if SIMDPP_USE_NULL
    detail::null::load(a, p);
#elif SIMDPP_USE_SSE2
    uint8x16 b;
    i_load_u(b, p);
    a = b;
#elif SIMDPP_USE_VSX_207
#if SIMDPP_64_BITS
    a = (__vector uint64_t) vec_vsx_ld(0, reinterpret_cast<const uint64_t*>(p));
#else
    // BUG: GCC does not support vec_vsx_ld in 32-bit mode even when
    // VSX 2.07 is enabled
    uint8x16 r;
    i_load_u(r, p);
    a = r;
#endif
#elif SIMDPP_USE_ALTIVEC
    detail::null::load(a, p);
#elif SIMDPP_USE_NEON
    a = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
#elif SIMDPP_USE_MSA
    a = (v2u64) __msa_ld_d(p, 0);
#endif
}

static SIMDPP_INL
void i_load_u(float32x4& a, const char* p)
{
    const float* q = reinterpret_cast<const float*>(p);
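    // q is unused on some instruction sets; the cast below silences the warning.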
    (void) q;
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    detail::null::load(a, p);
#elif SIMDPP_USE_SSE2
    a = _mm_loadu_ps(q);
#elif SIMDPP_USE_NEON
    a = vld1q_f32(q);
#elif SIMDPP_USE_VSX_206
    a = vec_vsx_ld(0, q);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 b;
    i_load_u(b, p);
    a = b;
#elif SIMDPP_USE_MSA
    a = (v4f32) __msa_ld_w(q, 0);
#endif
}

static SIMDPP_INL
void i_load_u(float64x2& a, const char* p)
{
    const double* q = reinterpret_cast<const double*>(p);
    (void) q;
#if SIMDPP_USE_SSE2
    a = _mm_loadu_pd(q);
#elif SIMDPP_USE_NEON64
    a = vld1q_f64(q);
#elif SIMDPP_USE_VSX_206
    a = vec_vsx_ld(0, q);
#elif SIMDPP_USE_MSA
    a = (v2f64) __msa_ld_d(q, 0);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON
    detail::null::load(a, p);
#else
    SIMDPP_NOT_IMPLEMENTED2(a, p);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_u(uint8x32& a, const char* p)
{
    a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
static SIMDPP_INL
void i_load_u(uint16x16& a, const char* p)
{
    a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
static SIMDPP_INL
void i_load_u(uint32x8& a, const char* p)
{
    a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
static SIMDPP_INL
void i_load_u(uint64x4& a, const char* p)
{
    a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
#endif
#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_load_u(float32x8& a, const char* p)
{
    a = _mm256_loadu_ps(reinterpret_cast<const float*>(p));
}
static SIMDPP_INL
void i_load_u(float64x4& a, const char* p)
{
    a = _mm256_loadu_pd(reinterpret_cast<const double*>(p));
}
#endif

#if __INTEL_COMPILER && SIMDPP_USE_AVX && !SIMDPP_USE_AVX512F
// BUG: Certain versions of ICC mishandle vectors larger than the native
// vector width (e.g. float32<16> and float64<8>) on AVX and AVX2: two aligned
// xmm vmovaps loads are emitted for each 32-byte load even though the
// argument is clearly unaligned (e.g. p + 1). The code below produces the
// same result, except that the correct unaligned vmovups load instructions
// are used.
template<unsigned N> SIMDPP_INL
void i_load_u(float32<N>& a, const char* p)
{
    for (unsigned i = 0; i < float32<N>::vec_length; ++i) {
        __m128 lo, hi;
        lo = _mm_loadu_ps(reinterpret_cast<const float*>(p));
        hi = _mm_loadu_ps(reinterpret_cast<const float*>(p + 16));
        a.vec(i) = _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
        p += 32;
    }
}

template<unsigned N> SIMDPP_INL
void i_load_u(float64<N>& a, const char* p)
{
    for (unsigned i = 0; i < float64<N>::vec_length; ++i) {
        __m128d lo, hi;
        lo = _mm_loadu_pd(reinterpret_cast<const double*>(p));
        hi = _mm_loadu_pd(reinterpret_cast<const double*>(p + 16));
        a.vec(i) = _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
        p += 32;
    }
}
#endif

#if __INTEL_COMPILER && SIMDPP_USE_AVX2 && !SIMDPP_USE_AVX512BW
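// Same workaround as above for 8-bit and 16-bit integer vectors: each 256-bit
// register is built from two unaligned 128-bit loads.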
template<unsigned N> SIMDPP_INL
void i_load_u(uint8<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint8<N>::vec_length; ++i) {
        __m128i lo, hi;
        lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        p += 32;
    }
}

template<unsigned N> SIMDPP_INL
void i_load_u(uint16<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint16<N>::vec_length; ++i) {
        __m128i lo, hi;
        lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        p += 32;
    }
}
#endif

#if __INTEL_COMPILER && SIMDPP_USE_AVX2 && !SIMDPP_USE_AVX512F
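// Same workaround as above for 32-bit and 64-bit integer vectors.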
template<unsigned N> SIMDPP_INL
void i_load_u(uint32<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint32<N>::vec_length; ++i) {
        __m128i lo, hi;
        lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        p += 32;
    }
}

template<unsigned N> SIMDPP_INL
void i_load_u(uint64<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint64<N>::vec_length; ++i) {
        __m128i lo, hi;
        lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        p += 32;
    }
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_load_u(uint8<64>& a, const char* p)
{
    a = _mm512_loadu_si512(p);
}
SIMDPP_INL void i_load_u(uint16<32>& a, const char* p)
{
    a = _mm512_loadu_si512(p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_u(uint32<16>& a, const char* p)
{
    a = _mm512_loadu_si512(p);
}
static SIMDPP_INL
void i_load_u(uint64<8>& a, const char* p)
{
    a = _mm512_loadu_si512(p);
}
static SIMDPP_INL
void i_load_u(float32<16>& a, const char* p)
{
    a = _mm512_loadu_ps(reinterpret_cast<const float*>(p));
}
static SIMDPP_INL
void i_load_u(float64<8>& a, const char* p)
{
    a = _mm512_loadu_pd(reinterpret_cast<const double*>(p));
}
#endif

// -----------------------------------------------------------------------------

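// Generic case: multi-register vectors are loaded one native-width
// sub-vector at a time.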
template<class V> SIMDPP_INL
void i_load_u(V& a, const char* p)
{
    const unsigned veclen = V::base_vector_type::length_bytes;
    for (unsigned i = 0; i < V::vec_length; ++i) {
        i_load_u(a.vec(i), p);
        p += veclen;
    }
}

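// Loads through the unsigned counterpart of V and converts back, so only the
// unsigned overloads above need to be implemented.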
template<class V> SIMDPP_INL
V i_load_u_any(const char* p)
{
    typename detail::remove_sign<V>::type r;
    i_load_u(r, p);
    return V(r);
}

} // namespace insn

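// Invoked by the expression evaluation machinery when an unaligned-load
// expression is materialized into a concrete vector.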
template<class V> SIMDPP_INL
void construct_eval(V& v, const expr_vec_load_u& e)
{
    v = insn::i_load_u_any<V>(e.a);
}

} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif