1 /*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>
2 
3     Distributed under the Boost Software License, Version 1.0.
4         (See accompanying file LICENSE_1_0.txt or copy at
5             http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED3_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED3_H
10 
11 #ifndef LIBSIMDPP_SIMD_H
12     #error "This file must be included through simd.h"
13 #endif
14 
15 #include <simdpp/types.h>
16 #include <simdpp/detail/align.h>
17 #include <simdpp/detail/insn/mem_pack.h>
18 #include <simdpp/core/store.h>
19 #include <simdpp/detail/null/memory.h>
20 
21 namespace simdpp {
22 namespace SIMDPP_ARCH_NAMESPACE {
23 namespace detail {
24 namespace insn {
25 
26 // collect some boilerplate
27 template<class V> SIMDPP_INL
28 void v128_store_pack3(char* p, const V& ca, const V& cb, const V& cc);
29 template<class V> SIMDPP_INL
30 void v256_store_pack3(char* p, const V& ca, const V& cb, const V& cc);
31 template<class V> SIMDPP_INL
32 void v512_store_pack3(char* p, const V& ca, const V& cb, const V& cc);
33 
34 // -----------------------------------------------------------------------------
35 
36 static SIMDPP_INL
i_store_packed3(char * p,const uint8x16 & a,const uint8x16 & b,const uint8x16 & c)37 void i_store_packed3(char* p, const uint8x16& a, const uint8x16& b, const uint8x16& c)
38 {
39     p = detail::assume_aligned(p, 16);
40 #if SIMDPP_USE_NULL
41     detail::null::store_packed3(p, a, b, c);
42 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
43     v128_store_pack3(p, a, b, c);
44 #elif SIMDPP_USE_NEON
45     uint8x16x3_t t;
46     t.val[0] = a.native();
47     t.val[1] = b.native();
48     t.val[2] = c.native();
49     vst3q_u8(reinterpret_cast<uint8_t*>(p), t);
50 #endif
51 }
52 
#if SIMDPP_USE_AVX2
// 32-byte uint8 overload (only compiled with AVX2): forwards to the shared
// 256-bit helper v256_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p, const uint8x32& a, const uint8x32& b, const uint8x32& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
60 
#if SIMDPP_USE_AVX512BW
// 64-byte uint8 overload (only compiled with AVX512BW): forwards to the
// shared 512-bit helper v512_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p, const uint8<64>& a, const uint8<64>& b, const uint8<64>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
68 
69 // -----------------------------------------------------------------------------
70 
71 static SIMDPP_INL
i_store_packed3(char * p,const uint16x8 & a,const uint16x8 & b,const uint16x8 & c)72 void i_store_packed3(char* p, const uint16x8& a, const uint16x8& b, const uint16x8& c)
73 {
74     p = detail::assume_aligned(p, 16);
75 #if SIMDPP_USE_NULL
76     detail::null::store_packed3(p, a, b, c);
77 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
78     v128_store_pack3(p, a, b, c);
79 #elif SIMDPP_USE_NEON
80     uint16x8x3_t t;
81     t.val[0] = a.native();
82     t.val[1] = b.native();
83     t.val[2] = c.native();
84     vst3q_u16(reinterpret_cast<uint16_t*>(p), t);
85 #endif
86 }
87 
#if SIMDPP_USE_AVX2
// 32-byte uint16 overload (only compiled with AVX2): forwards to the shared
// 256-bit helper v256_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p, const uint16x16& a, const uint16x16& b, const uint16x16& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
95 
#if SIMDPP_USE_AVX512BW
// 64-byte uint16 overload (only compiled with AVX512BW): forwards to the
// shared 512-bit helper v512_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p, const uint16<32>& a, const uint16<32>& b, const uint16<32>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
103 
104 // -----------------------------------------------------------------------------
105 
106 static SIMDPP_INL
i_store_packed3(char * p,const uint32x4 & a,const uint32x4 & b,const uint32x4 & c)107 void i_store_packed3(char* p, const uint32x4& a, const uint32x4& b, const uint32x4& c)
108 {
109     p = detail::assume_aligned(p, 16);
110 #if SIMDPP_USE_NULL
111     detail::null::store_packed3(p, a, b, c);
112 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
113     v128_store_pack3(p, a, b, c);
114 #elif SIMDPP_USE_NEON
115     uint32x4x3_t t;
116     t.val[0] = a.native();
117     t.val[1] = b.native();
118     t.val[2] = c.native();
119     vst3q_u32(reinterpret_cast<uint32_t*>(p), t);
120 #endif
121 }
122 
#if SIMDPP_USE_AVX2
// 32-byte uint32 overload (only compiled with AVX2): forwards to the shared
// 256-bit helper v256_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p, const uint32x8& a, const uint32x8& b, const uint32x8& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
130 
#if SIMDPP_USE_AVX512F
// 64-byte uint32 overload (only compiled with AVX512F): forwards to the
// shared 512-bit helper v512_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p, const uint32<16>& a, const uint32<16>& b, const uint32<16>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
138 
139 // -----------------------------------------------------------------------------
140 
141 static SIMDPP_INL
i_store_packed3(char * p,const uint64x2 & a,const uint64x2 & b,const uint64x2 & c)142 void i_store_packed3(char* p, const uint64x2& a, const uint64x2& b, const uint64x2& c)
143 {
144     p = detail::assume_aligned(p, 16);
145 #if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
146     v128_store_pack3(p, a, b, c);
147 #elif SIMDPP_USE_NEON32
148     uint64_t* q = reinterpret_cast<uint64_t*>(p);
149     uint64x1x2_t t1, t2, t3;
150     t1.val[0] = vget_low_u64(a.native());   t1.val[1] = vget_low_u64(b.native());
151     t2.val[0] = vget_low_u64(c.native());   t2.val[1] = vget_high_u64(a.native());
152     t3.val[0] = vget_high_u64(b.native());  t3.val[1] = vget_high_u64(c.native());
153 
154     vst2_u64(q, t1);
155     vst2_u64(q+2, t2);
156     vst2_u64(q+4, t3);
157 #elif SIMDPP_USE_NEON64
158     uint64x2x3_t t;
159     t.val[0] = a.native();
160     t.val[1] = b.native();
161     t.val[2] = c.native();
162     vst3q_u64(reinterpret_cast<uint64_t*>(p), t);
163 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
164     detail::null::store_packed3(p, a, b, c);
165 #endif
166 }
167 
#if SIMDPP_USE_AVX2
// 32-byte uint64 overload (only compiled with AVX2): forwards to the shared
// 256-bit helper v256_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p, const uint64x4& a, const uint64x4& b, const uint64x4& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
175 
#if SIMDPP_USE_AVX512F
// 64-byte uint64 overload (only compiled with AVX512F): forwards to the
// shared 512-bit helper v512_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p, const uint64<8>& a, const uint64<8>& b, const uint64<8>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
183 
184 // -----------------------------------------------------------------------------
185 
186 static SIMDPP_INL
i_store_packed3(char * p,const float32x4 & a,const float32x4 & b,const float32x4 & c)187 void i_store_packed3(char* p, const float32x4& a, const float32x4& b, const float32x4& c)
188 {
189     p = detail::assume_aligned(p, 16);
190 #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
191     detail::null::store_packed3(p, a, b, c);
192 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
193     v128_store_pack3(p, a, b, c);
194 #elif SIMDPP_USE_NEON
195     float32x4x3_t t;
196     t.val[0] = a.native();
197     t.val[1] = b.native();
198     t.val[2] = c.native();
199     vst3q_f32(reinterpret_cast<float*>(p), t);
200 #endif
201 }
202 
#if SIMDPP_USE_AVX
// 32-byte float32 overload (only compiled with AVX): forwards to the shared
// 256-bit helper v256_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p, const float32x8& a, const float32x8& b, const float32x8& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
210 
#if SIMDPP_USE_AVX512F
// 64-byte float32 overload (only compiled with AVX512F): forwards to the
// shared 512-bit helper v512_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p, const float32<16>& a, const float32<16>& b, const float32<16>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
218 
219 // -----------------------------------------------------------------------------
220 
221 static SIMDPP_INL
i_store_packed3(char * p,const float64x2 & a,const float64x2 & b,const float64x2 & c)222 void i_store_packed3(char* p, const float64x2& a, const float64x2& b, const float64x2& c)
223 {
224     p = detail::assume_aligned(p, 16);
225 #if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
226     v128_store_pack3(p, a, b, c);
227 #elif SIMDPP_USE_NEON64
228     float64x2x3_t t;
229     t.val[0] = a.native();
230     t.val[1] = b.native();
231     t.val[2] = c.native();
232     vst3q_f64(reinterpret_cast<double*>(p), t);
233 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON
234     detail::null::store_packed3(p, a, b, c);
235 #endif
236 }
237 
#if SIMDPP_USE_AVX
// 32-byte float64 overload (only compiled with AVX): forwards to the shared
// 256-bit helper v256_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p, const float64x4& a, const float64x4& b, const float64x4& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
245 
#if SIMDPP_USE_AVX512F
// 64-byte float64 overload (only compiled with AVX512F): forwards to the
// shared 512-bit helper v512_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p, const float64<8>& a, const float64<8>& b, const float64<8>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
253 
254 // -----------------------------------------------------------------------------
255 
256 template<class V> SIMDPP_INL
v128_store_pack3(char * p,const V & ca,const V & cb,const V & cc)257 void v128_store_pack3(char* p, const V& ca, const V& cb, const V& cc)
258 {
259     p = detail::assume_aligned(p, 16);
260     V a = ca, b = cb, c = cc;
261     mem_pack3(a, b, c);
262     i_store(p, a);
263     i_store(p + 16, b);
264     i_store(p + 32, c);
265 }
266 
267 template<class V> SIMDPP_INL
v256_store_pack3(char * p,const V & ca,const V & cb,const V & cc)268 void v256_store_pack3(char* p, const V& ca, const V& cb, const V& cc)
269 {
270     p = detail::assume_aligned(p, 32);
271     V a = ca, b = cb, c = cc;
272     mem_pack3(a, b, c);
273     i_store(p, a);
274     i_store(p + 32, b);
275     i_store(p + 64, c);
276 }
277 
278 template<class V> SIMDPP_INL
v512_store_pack3(char * p,const V & ca,const V & cb,const V & cc)279 void v512_store_pack3(char* p, const V& ca, const V& cb, const V& cc)
280 {
281     p = detail::assume_aligned(p, 64);
282     V a = ca, b = cb, c = cc;
283     mem_pack3(a, b, c);
284     i_store(p, a);
285     i_store(p + 64, b);
286     i_store(p + 128, c);
287 }
288 
289 template<class V> SIMDPP_INL
i_store_packed3(char * p,const V & ca,const V & cb,const V & cc)290 void i_store_packed3(char* p, const V& ca, const V& cb, const V& cc)
291 {
292     const unsigned veclen = V::base_vector_type::length_bytes;
293     typename detail::remove_sign<V>::type a = ca, b = cb, c = cc;
294 
295     p = detail::assume_aligned(p, veclen);
296     for (unsigned i = 0; i < V::vec_length; ++i) {
297         i_store_packed3(p, a.vec(i), b.vec(i), c.vec(i));
298         p += veclen*3;
299     }
300 }
301 
302 } // namespace insn
303 } // namespace detail
304 } // namespace SIMDPP_ARCH_NAMESPACE
305 } // namespace simdpp
306 
307 #endif
308