1 /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6 */
7
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED3_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED3_H
10
11 #ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13 #endif
14
15 #include <simdpp/types.h>
16 #include <simdpp/detail/align.h>
17 #include <simdpp/detail/insn/mem_pack.h>
18 #include <simdpp/core/store.h>
19 #include <simdpp/detail/null/memory.h>
20
21 namespace simdpp {
22 namespace SIMDPP_ARCH_NAMESPACE {
23 namespace detail {
24 namespace insn {
25
26 // collect some boilerplate
27 template<class V> SIMDPP_INL
28 void v128_store_pack3(char* p, const V& ca, const V& cb, const V& cc);
29 template<class V> SIMDPP_INL
30 void v256_store_pack3(char* p, const V& ca, const V& cb, const V& cc);
31 template<class V> SIMDPP_INL
32 void v512_store_pack3(char* p, const V& ca, const V& cb, const V& cc);
33
34 // -----------------------------------------------------------------------------
35
36 static SIMDPP_INL
i_store_packed3(char * p,const uint8x16 & a,const uint8x16 & b,const uint8x16 & c)37 void i_store_packed3(char* p, const uint8x16& a, const uint8x16& b, const uint8x16& c)
38 {
39 p = detail::assume_aligned(p, 16);
40 #if SIMDPP_USE_NULL
41 detail::null::store_packed3(p, a, b, c);
42 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
43 v128_store_pack3(p, a, b, c);
44 #elif SIMDPP_USE_NEON
45 uint8x16x3_t t;
46 t.val[0] = a.native();
47 t.val[1] = b.native();
48 t.val[2] = c.native();
49 vst3q_u8(reinterpret_cast<uint8_t*>(p), t);
50 #endif
51 }
52
#if SIMDPP_USE_AVX2
// 256-bit variant: delegate to the generic pack-then-store helper.
static SIMDPP_INL
void i_store_packed3(char* p, const uint8x32& a, const uint8x32& b, const uint8x32& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
60
#if SIMDPP_USE_AVX512BW
// 512-bit variant: delegate to the generic pack-then-store helper.
static SIMDPP_INL
void i_store_packed3(char* p, const uint8<64>& a, const uint8<64>& b, const uint8<64>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
68
69 // -----------------------------------------------------------------------------
70
71 static SIMDPP_INL
i_store_packed3(char * p,const uint16x8 & a,const uint16x8 & b,const uint16x8 & c)72 void i_store_packed3(char* p, const uint16x8& a, const uint16x8& b, const uint16x8& c)
73 {
74 p = detail::assume_aligned(p, 16);
75 #if SIMDPP_USE_NULL
76 detail::null::store_packed3(p, a, b, c);
77 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
78 v128_store_pack3(p, a, b, c);
79 #elif SIMDPP_USE_NEON
80 uint16x8x3_t t;
81 t.val[0] = a.native();
82 t.val[1] = b.native();
83 t.val[2] = c.native();
84 vst3q_u16(reinterpret_cast<uint16_t*>(p), t);
85 #endif
86 }
87
#if SIMDPP_USE_AVX2
// 256-bit variant: delegate to the generic pack-then-store helper.
static SIMDPP_INL
void i_store_packed3(char* p, const uint16x16& a, const uint16x16& b, const uint16x16& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
95
#if SIMDPP_USE_AVX512BW
// 512-bit variant: delegate to the generic pack-then-store helper.
static SIMDPP_INL
void i_store_packed3(char* p, const uint16<32>& a, const uint16<32>& b, const uint16<32>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
103
104 // -----------------------------------------------------------------------------
105
106 static SIMDPP_INL
i_store_packed3(char * p,const uint32x4 & a,const uint32x4 & b,const uint32x4 & c)107 void i_store_packed3(char* p, const uint32x4& a, const uint32x4& b, const uint32x4& c)
108 {
109 p = detail::assume_aligned(p, 16);
110 #if SIMDPP_USE_NULL
111 detail::null::store_packed3(p, a, b, c);
112 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
113 v128_store_pack3(p, a, b, c);
114 #elif SIMDPP_USE_NEON
115 uint32x4x3_t t;
116 t.val[0] = a.native();
117 t.val[1] = b.native();
118 t.val[2] = c.native();
119 vst3q_u32(reinterpret_cast<uint32_t*>(p), t);
120 #endif
121 }
122
#if SIMDPP_USE_AVX2
// 256-bit variant: delegate to the generic pack-then-store helper.
static SIMDPP_INL
void i_store_packed3(char* p, const uint32x8& a, const uint32x8& b, const uint32x8& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
130
#if SIMDPP_USE_AVX512F
// 512-bit variant: delegate to the generic pack-then-store helper.
static SIMDPP_INL
void i_store_packed3(char* p, const uint32<16>& a, const uint32<16>& b, const uint32<16>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
138
139 // -----------------------------------------------------------------------------
140
141 static SIMDPP_INL
i_store_packed3(char * p,const uint64x2 & a,const uint64x2 & b,const uint64x2 & c)142 void i_store_packed3(char* p, const uint64x2& a, const uint64x2& b, const uint64x2& c)
143 {
144 p = detail::assume_aligned(p, 16);
145 #if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
146 v128_store_pack3(p, a, b, c);
147 #elif SIMDPP_USE_NEON32
148 uint64_t* q = reinterpret_cast<uint64_t*>(p);
149 uint64x1x2_t t1, t2, t3;
150 t1.val[0] = vget_low_u64(a.native()); t1.val[1] = vget_low_u64(b.native());
151 t2.val[0] = vget_low_u64(c.native()); t2.val[1] = vget_high_u64(a.native());
152 t3.val[0] = vget_high_u64(b.native()); t3.val[1] = vget_high_u64(c.native());
153
154 vst2_u64(q, t1);
155 vst2_u64(q+2, t2);
156 vst2_u64(q+4, t3);
157 #elif SIMDPP_USE_NEON64
158 uint64x2x3_t t;
159 t.val[0] = a.native();
160 t.val[1] = b.native();
161 t.val[2] = c.native();
162 vst3q_u64(reinterpret_cast<uint64_t*>(p), t);
163 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
164 detail::null::store_packed3(p, a, b, c);
165 #endif
166 }
167
#if SIMDPP_USE_AVX2
// 256-bit variant: delegate to the generic pack-then-store helper.
static SIMDPP_INL
void i_store_packed3(char* p, const uint64x4& a, const uint64x4& b, const uint64x4& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
175
#if SIMDPP_USE_AVX512F
// 512-bit variant: delegate to the generic pack-then-store helper.
static SIMDPP_INL
void i_store_packed3(char* p, const uint64<8>& a, const uint64<8>& b, const uint64<8>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
183
184 // -----------------------------------------------------------------------------
185
186 static SIMDPP_INL
i_store_packed3(char * p,const float32x4 & a,const float32x4 & b,const float32x4 & c)187 void i_store_packed3(char* p, const float32x4& a, const float32x4& b, const float32x4& c)
188 {
189 p = detail::assume_aligned(p, 16);
190 #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
191 detail::null::store_packed3(p, a, b, c);
192 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
193 v128_store_pack3(p, a, b, c);
194 #elif SIMDPP_USE_NEON
195 float32x4x3_t t;
196 t.val[0] = a.native();
197 t.val[1] = b.native();
198 t.val[2] = c.native();
199 vst3q_f32(reinterpret_cast<float*>(p), t);
200 #endif
201 }
202
#if SIMDPP_USE_AVX
// 256-bit variant: delegate to the generic pack-then-store helper.
static SIMDPP_INL
void i_store_packed3(char* p, const float32x8& a, const float32x8& b, const float32x8& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
210
#if SIMDPP_USE_AVX512F
// 512-bit variant: delegate to the generic pack-then-store helper.
static SIMDPP_INL
void i_store_packed3(char* p, const float32<16>& a, const float32<16>& b, const float32<16>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
218
219 // -----------------------------------------------------------------------------
220
221 static SIMDPP_INL
i_store_packed3(char * p,const float64x2 & a,const float64x2 & b,const float64x2 & c)222 void i_store_packed3(char* p, const float64x2& a, const float64x2& b, const float64x2& c)
223 {
224 p = detail::assume_aligned(p, 16);
225 #if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
226 v128_store_pack3(p, a, b, c);
227 #elif SIMDPP_USE_NEON64
228 float64x2x3_t t;
229 t.val[0] = a.native();
230 t.val[1] = b.native();
231 t.val[2] = c.native();
232 vst3q_f64(reinterpret_cast<double*>(p), t);
233 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON
234 detail::null::store_packed3(p, a, b, c);
235 #endif
236 }
237
#if SIMDPP_USE_AVX
// 256-bit variant: delegate to the generic pack-then-store helper.
static SIMDPP_INL
void i_store_packed3(char* p, const float64x4& a, const float64x4& b, const float64x4& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
245
#if SIMDPP_USE_AVX512F
// 512-bit variant: delegate to the generic pack-then-store helper.
static SIMDPP_INL
void i_store_packed3(char* p, const float64<8>& a, const float64<8>& b, const float64<8>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
253
254 // -----------------------------------------------------------------------------
255
256 template<class V> SIMDPP_INL
v128_store_pack3(char * p,const V & ca,const V & cb,const V & cc)257 void v128_store_pack3(char* p, const V& ca, const V& cb, const V& cc)
258 {
259 p = detail::assume_aligned(p, 16);
260 V a = ca, b = cb, c = cc;
261 mem_pack3(a, b, c);
262 i_store(p, a);
263 i_store(p + 16, b);
264 i_store(p + 32, c);
265 }
266
267 template<class V> SIMDPP_INL
v256_store_pack3(char * p,const V & ca,const V & cb,const V & cc)268 void v256_store_pack3(char* p, const V& ca, const V& cb, const V& cc)
269 {
270 p = detail::assume_aligned(p, 32);
271 V a = ca, b = cb, c = cc;
272 mem_pack3(a, b, c);
273 i_store(p, a);
274 i_store(p + 32, b);
275 i_store(p + 64, c);
276 }
277
278 template<class V> SIMDPP_INL
v512_store_pack3(char * p,const V & ca,const V & cb,const V & cc)279 void v512_store_pack3(char* p, const V& ca, const V& cb, const V& cc)
280 {
281 p = detail::assume_aligned(p, 64);
282 V a = ca, b = cb, c = cc;
283 mem_pack3(a, b, c);
284 i_store(p, a);
285 i_store(p + 64, b);
286 i_store(p + 128, c);
287 }
288
289 template<class V> SIMDPP_INL
i_store_packed3(char * p,const V & ca,const V & cb,const V & cc)290 void i_store_packed3(char* p, const V& ca, const V& cb, const V& cc)
291 {
292 const unsigned veclen = V::base_vector_type::length_bytes;
293 typename detail::remove_sign<V>::type a = ca, b = cb, c = cc;
294
295 p = detail::assume_aligned(p, veclen);
296 for (unsigned i = 0; i < V::vec_length; ++i) {
297 i_store_packed3(p, a.vec(i), b.vec(i), c.vec(i));
298 p += veclen*3;
299 }
300 }
301
302 } // namespace insn
303 } // namespace detail
304 } // namespace SIMDPP_ARCH_NAMESPACE
305 } // namespace simdpp
306
307 #endif
308