/*  Copyright (C) 2016  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_MASKED_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_MASKED_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/blend.h>
#include <simdpp/core/load.h>
#include <simdpp/core/store.h>
#include <simdpp/detail/null/memory.h>
#include <simdpp/detail/align.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

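// Each i_store_masked overload below stores to p only those elements of a
// that are selected by mask; these overloads back the public store_masked
// function. Where the target ISA has no native masked store, the operation
// is emulated by loading the destination, blending with a and storing the
// whole vector back, i.e. a read-modify-write of every byte in the vector.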
static SIMDPP_INL
void i_store_masked(char* p, const uint32<4>& a, const mask_int32<4>& mask)
{
#if SIMDPP_USE_NULL
    null::store_masked(p, a, mask);
#elif SIMDPP_USE_AVX512VL
    _mm_mask_store_epi32(p, mask.native(), a.native());
#elif SIMDPP_USE_AVX2
    _mm_maskstore_epi32(reinterpret_cast<int*>(p), mask.native(), a.native());
#elif SIMDPP_USE_AVX
    _mm_maskstore_ps(reinterpret_cast<float*>(p), mask.native(),
                     _mm_castsi128_ps(a.native()));
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
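    // No native masked store: emulate with a load, blend, store sequence.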
    uint32<4> b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_masked(char* p, const uint32<8>& a, const mask_int32<8>& mask)
{
#if SIMDPP_USE_AVX512VL
    _mm256_mask_store_epi32(p, mask.native(), a.native());
#else
    _mm256_maskstore_epi32(reinterpret_cast<int*>(p), mask.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_masked(char* p, const uint32<16>& a, const mask_int32<16>& mask)
{
    _mm512_mask_store_epi32(reinterpret_cast<int*>(p), mask.native(), a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_masked(char* p, const uint64<2>& a, const mask_int64<2>& mask)
{
#if SIMDPP_USE_AVX512VL
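    // The Intel compiler declares the pointer parameter of the 64-bit
    // mask-store intrinsics as __int64*; other compilers use long long*.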
#if __INTEL_COMPILER
    _mm_mask_store_epi64(reinterpret_cast<__int64*>(p), mask.native(),
                         a.native());
#else
    _mm_mask_store_epi64(reinterpret_cast<long long*>(p), mask.native(),
                         a.native());
#endif
#elif SIMDPP_USE_AVX2
#if __INTEL_COMPILER
    _mm_maskstore_epi64(reinterpret_cast<__int64*>(p), mask.native(), a.native());
#else
    _mm_maskstore_epi64(reinterpret_cast<long long*>(p), mask.native(), a.native());
#endif
#elif SIMDPP_USE_AVX
    _mm_maskstore_pd(reinterpret_cast<double*>(p), mask.native(), _mm_castsi128_pd(a.native()));
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
    uint64<2> b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
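    // Altivec without VSX 2.07 has no 64-bit integer vector operations;
    // fall back to element-wise scalar emulation.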
    null::store_masked(p, a, mask);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_masked(char* p, const uint64<4>& a, const mask_int64<4>& mask)
{
#if SIMDPP_USE_AVX512VL
#if __INTEL_COMPILER
    _mm256_mask_store_epi64(reinterpret_cast<__int64*>(p), mask.native(),
                            a.native());
#else
    _mm256_mask_store_epi64(reinterpret_cast<long long*>(p), mask.native(),
                            a.native());
#endif
#else
#if __INTEL_COMPILER
    _mm256_maskstore_epi64(reinterpret_cast<__int64*>(p), mask.native(), a.native());
#else
    _mm256_maskstore_epi64(reinterpret_cast<long long*>(p), mask.native(), a.native());
#endif
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_masked(char* p, const uint64<8>& a, const mask_int64<8>& mask)
{
#if __INTEL_COMPILER
    _mm512_mask_store_epi64(reinterpret_cast<__int64*>(p), mask.native(), a.native());
#else
    _mm512_mask_store_epi64(reinterpret_cast<long long*>(p), mask.native(), a.native());
#endif
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_masked(char* p, const float32<4>& a, const mask_float32<4>& mask)
{
#if SIMDPP_USE_NULL
    null::store_masked(p, a, mask);
#elif SIMDPP_USE_AVX512VL
    _mm_mask_store_ps(reinterpret_cast<float*>(p), mask.native(), a.native());
#elif SIMDPP_USE_AVX
    _mm_maskstore_ps(reinterpret_cast<float*>(p),
                     _mm_castps_si128(mask.native()), a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    float32<4> b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_store_masked(char* p, const float32<8>& a, const mask_float32<8>& mask)
{
#if SIMDPP_USE_AVX512VL
    _mm256_mask_store_ps(reinterpret_cast<float*>(p), mask.native(),
                         a.native());
#else
    _mm256_maskstore_ps(reinterpret_cast<float*>(p),
                        _mm256_castps_si256(mask.native()), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_masked(char* p, const float32<16>& a, const mask_float32<16>& mask)
{
    _mm512_mask_store_ps(reinterpret_cast<float*>(p), mask.native(), a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_masked(char* p, const float64<2>& a, const mask_float64<2>& mask)
{
#if SIMDPP_USE_AVX512VL
    _mm_mask_store_pd(reinterpret_cast<double*>(p), mask.native(), a.native());
#elif SIMDPP_USE_AVX
    _mm_maskstore_pd(reinterpret_cast<double*>(p),
                     _mm_castpd_si128(mask.native()), a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    float64<2> b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
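    // NEON32 and Altivec without VSX 2.06 have no double-precision vectors;
    // fall back to element-wise scalar emulation.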
    null::store_masked(p, a, mask);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_store_masked(char* p, const float64<4>& a, const mask_float64<4>& mask)
{
#if SIMDPP_USE_AVX512VL
    _mm256_mask_store_pd(reinterpret_cast<double*>(p), mask.native(),
                         a.native());
#else
    _mm256_maskstore_pd(reinterpret_cast<double*>(p),
                        _mm256_castpd_si256(mask.native()), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_masked(char* p, const float64<8>& a, const mask_float64<8>& mask)
{
    _mm512_mask_store_pd(reinterpret_cast<double*>(p), mask.native(), a.native());
}
#endif

// -----------------------------------------------------------------------------

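// Generic overload for vectors that span multiple native registers: store
// each sub-vector with the matching sub-vector of the mask.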
template<class V, class M> SIMDPP_INL
void i_store_masked(char* p, const V& a, const M& mask)
{
    const unsigned veclen = V::base_vector_type::length_bytes;

    for (unsigned i = 0; i < a.vec_length; ++i) {
        i_store_masked(p, a.vec(i), mask.vec(i));
        p += veclen;
    }
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif