1 /*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>
2 
3     Distributed under the Boost Software License, Version 1.0.
4         (See accompanying file LICENSE_1_0.txt or copy at
5             http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED2_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED2_H
10 
11 #ifndef LIBSIMDPP_SIMD_H
12     #error "This file must be included through simd.h"
13 #endif
14 
15 #include <simdpp/types.h>
16 #include <simdpp/detail/align.h>
17 #include <simdpp/detail/insn/mem_pack.h>
18 #include <simdpp/core/store.h>
19 #include <simdpp/detail/null/memory.h>
20 
21 namespace simdpp {
22 namespace SIMDPP_ARCH_NAMESPACE {
23 namespace detail {
24 namespace insn {
25 
// Forward declarations of the width-generic helpers defined at the bottom of
// this file. Each packs two vectors into interleaved order (via mem_pack2)
// and stores the result as two consecutive vectors of the given width.
template<class V> SIMDPP_INL
void v128_store_pack2(char* p, const V& ca, const V& cb);
template<class V> SIMDPP_INL
void v256_store_pack2(char* p, const V& ca, const V& cb);
template<class V> SIMDPP_INL
void v512_store_pack2(char* p, const V& ca, const V& cb);
33 
34 // -----------------------------------------------------------------------------
35 
// Stores the elements of a and b in interleaved order into the 32 bytes
// starting at p. p must be aligned to the vector size (16 bytes).
static SIMDPP_INL
void i_store_packed2(char* p, const uint8x16& a, const uint8x16& b)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    // scalar reference implementation
    detail::null::store_packed2(p, a, b);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // no dedicated interleaving store on these ISAs; shuffle then store
    v128_store_pack2(p, a, b);
#elif SIMDPP_USE_NEON
    // NEON provides a native 2-way interleaving store (vst2)
    uint8x16x2_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    vst2q_u8(reinterpret_cast<uint8_t*>(p), t);
#endif
}
51 
#if SIMDPP_USE_AVX2
// 256-bit variant: interleaving is performed by shuffles in v256_store_pack2.
static SIMDPP_INL
void i_store_packed2(char* p, const uint8x32& a, const uint8x32& b)
{
    v256_store_pack2(p, a, b);
}
#endif
59 
#if SIMDPP_USE_AVX512BW
// 512-bit variant: 8-bit element support requires AVX512BW.
SIMDPP_INL void i_store_packed2(char* p, const uint8<64>& a, const uint8<64>& b)
{
    v512_store_pack2(p, a, b);
}
#endif
66 
67 // -----------------------------------------------------------------------------
68 
// Stores the elements of a and b in interleaved order into the 32 bytes
// starting at p. p must be aligned to the vector size (16 bytes).
static SIMDPP_INL
void i_store_packed2(char* p, const uint16x8& a, const uint16x8& b)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_packed2(p, a, b);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_store_pack2(p, a, b);
#elif SIMDPP_USE_NEON
    // NEON native 2-way interleaving store
    uint16x8x2_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    vst2q_u16(reinterpret_cast<uint16_t*>(p), t);
#endif
}
84 
#if SIMDPP_USE_AVX2
// 256-bit variant: interleaving is performed by shuffles in v256_store_pack2.
static SIMDPP_INL
void i_store_packed2(char* p, const uint16x16& a, const uint16x16& b)
{
    v256_store_pack2(p, a, b);
}
#endif
92 
#if SIMDPP_USE_AVX512BW
// 512-bit variant: 16-bit element support requires AVX512BW.
SIMDPP_INL void i_store_packed2(char* p, const uint16<32>& a, const uint16<32>& b)
{
    v512_store_pack2(p, a, b);
}
#endif
99 
100 // -----------------------------------------------------------------------------
101 
// Stores the elements of a and b in interleaved order into the 32 bytes
// starting at p. p must be aligned to the vector size (16 bytes).
static SIMDPP_INL
void i_store_packed2(char* p, const uint32x4& a, const uint32x4& b)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_packed2(p, a, b);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_store_pack2(p, a, b);
#elif SIMDPP_USE_NEON
    // NEON native 2-way interleaving store
    uint32x4x2_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    vst2q_u32(reinterpret_cast<uint32_t*>(p), t);
#endif
}
117 
#if SIMDPP_USE_AVX2
// 256-bit variant: interleaving is performed by shuffles in v256_store_pack2.
static SIMDPP_INL
void i_store_packed2(char* p, const uint32x8& a, const uint32x8& b)
{
    v256_store_pack2(p, a, b);
}
#endif
125 
#if SIMDPP_USE_AVX512F
// 512-bit variant: 32-bit elements only need baseline AVX512F.
static SIMDPP_INL
void i_store_packed2(char* p, const uint32<16>& a, const uint32<16>& b)
{
    v512_store_pack2(p, a, b);
}
#endif
133 
134 // -----------------------------------------------------------------------------
135 
// Stores the elements of a and b in interleaved order into the 32 bytes
// starting at p.
static SIMDPP_INL
void i_store_packed2(char* p, const uint64x2& a, const uint64x2& b)
{
#if SIMDPP_USE_NEON64
    // vst2q_u64 is only available on AArch64
    uint64x2x2_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    vst2q_u64(reinterpret_cast<uint64_t*>(p), t);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
    // 64-bit integer shuffles need VSX 2.07 on POWER; 32-bit NEON uses the
    // generic shuffle path as well
    v128_store_pack2(p, a, b);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    // plain Altivec lacks 64-bit element support; scalar fallback
    detail::null::store_packed2(p, a, b);
#endif
}
150 
#if SIMDPP_USE_AVX2
// 256-bit variant: interleaving is performed by shuffles in v256_store_pack2.
static SIMDPP_INL
void i_store_packed2(char* p, const uint64x4& a, const uint64x4& b)
{
    v256_store_pack2(p, a, b);
}
#endif
158 
#if SIMDPP_USE_AVX512F
// 512-bit variant: 64-bit elements only need baseline AVX512F.
static SIMDPP_INL
void i_store_packed2(char* p, const uint64<8>& a, const uint64<8>& b)
{
    v512_store_pack2(p, a, b);
}
#endif
166 
167 // -----------------------------------------------------------------------------
168 
// Stores the elements of a and b in interleaved order into the 32 bytes
// starting at p. p must be aligned to the vector size (16 bytes).
static SIMDPP_INL
void i_store_packed2(char* p, const float32x4& a, const float32x4& b)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    // scalar fallback; also used when NEON lacks single-precision FP support
    detail::null::store_packed2(p, a, b);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_store_pack2(p, a, b);
#elif SIMDPP_USE_NEON
    // NEON native 2-way interleaving store
    float32x4x2_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    vst2q_f32(reinterpret_cast<float*>(p), t);
#endif
}
184 
#if SIMDPP_USE_AVX
// 256-bit float variant; AVX (not AVX2) suffices for FP vectors.
static SIMDPP_INL
void i_store_packed2(char* p, const float32x8& a, const float32x8& b)
{
    v256_store_pack2(p, a, b);
}
#endif
192 
#if SIMDPP_USE_AVX512F
// 512-bit float variant.
static SIMDPP_INL
void i_store_packed2(char* p, const float32<16>& a, const float32<16>& b)
{
    v512_store_pack2(p, a, b);
}
#endif
200 
201 // -----------------------------------------------------------------------------
202 
// Stores the elements of a and b in interleaved order into the 32 bytes
// starting at p.
static SIMDPP_INL
void i_store_packed2(char* p, const float64x2& a, const float64x2& b)
{
#if SIMDPP_USE_NEON64
    // double-precision NEON vectors exist only on AArch64
    float64x2x2_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    vst2q_f64(reinterpret_cast<double*>(p), t);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    // double support on POWER needs VSX 2.06
    v128_store_pack2(p, a, b);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    // 32-bit NEON and plain Altivec lack f64 vectors; scalar fallback
    detail::null::store_packed2(p, a, b);
#endif
}
217 
#if SIMDPP_USE_AVX
// 256-bit double variant; AVX (not AVX2) suffices for FP vectors.
static SIMDPP_INL
void i_store_packed2(char* p, const float64x4& a, const float64x4& b)
{
    v256_store_pack2(p, a, b);
}
#endif
225 
#if SIMDPP_USE_AVX512F
// 512-bit double variant.
static SIMDPP_INL
void i_store_packed2(char* p, const float64<8>& a, const float64<8>& b)
{
    v512_store_pack2(p, a, b);
}
#endif
233 
234 // -----------------------------------------------------------------------------
235 
236 template<class V> SIMDPP_INL
v128_store_pack2(char * p,const V & ca,const V & cb)237 void v128_store_pack2(char* p, const V& ca, const V& cb)
238 {
239     p = detail::assume_aligned(p, 32);
240     V a = ca, b = cb;
241     mem_pack2(a, b);
242     i_store(p, a);
243     i_store(p + 16, b);
244 }
245 
246 template<class V> SIMDPP_INL
v256_store_pack2(char * p,const V & ca,const V & cb)247 void v256_store_pack2(char* p, const V& ca, const V& cb)
248 {
249     p = detail::assume_aligned(p, 32);
250     V a = ca, b = cb;
251     mem_pack2(a, b);
252     i_store(p, a);
253     i_store(p + 32, b);
254 }
255 
256 template<class V> SIMDPP_INL
v512_store_pack2(char * p,const V & ca,const V & cb)257 void v512_store_pack2(char* p, const V& ca, const V& cb)
258 {
259     p = detail::assume_aligned(p, 32);
260     V a = ca, b = cb;
261     mem_pack2(a, b);
262     i_store(p, a);
263     i_store(p + 64, b);
264 }
265 
266 
267 template<class V> SIMDPP_INL
i_store_packed2(char * p,const V & ca,const V & cb)268 void i_store_packed2(char* p, const V& ca, const V& cb)
269 {
270     const unsigned veclen = V::base_vector_type::length_bytes;
271     typename detail::remove_sign<V>::type a = ca, b = cb;
272 
273     p = detail::assume_aligned(p, veclen);
274     for (unsigned i = 0; i < V::vec_length; ++i) {
275         i_store_packed2(p, a.vec(i), b.vec(i));
276         p += veclen*2;
277     }
278 }
279 
280 } // namespace insn
281 } // namespace detail
282 } // namespace SIMDPP_ARCH_NAMESPACE
283 } // namespace simdpp
284 
285 #endif
286