/* SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2017-2020 Evan Nemerson <evan@nemerson.com>
 */

#if !defined(SIMDE_X86_SSE3_H)
#define SIMDE_X86_SSE3_H

#include "sse2.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

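/* The simde_x_* deinterleave helpers below are not part of the SSE3 API.
 * Each one gathers the even-indexed (or odd-indexed) lanes of `a` into the
 * low half of the result and the corresponding lanes of `b` into the high
 * half; they are used to build the horizontal add/subtract operations
 * further down. */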
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_deinterleaveeven_epi16 (simde__m128i a, simde__m128i b) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_i16 = vuzp1q_s16(a_.neon_i16, b_.neon_i16);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16);
    r_.neon_i16 = t.val[0];
  #elif defined(SIMDE_SHUFFLE_VECTOR_)
    r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 2, 4, 6, 8, 10, 12, 14);
  #else
    const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2;
    for (size_t i = 0 ; i < halfway_point ; i++) {
      r_.i16[i] = a_.i16[2 * i];
      r_.i16[i + halfway_point] = b_.i16[2 * i];
    }
  #endif

  return simde__m128i_from_private(r_);
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_deinterleaveodd_epi16 (simde__m128i a, simde__m128i b) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_i16 = vuzp2q_s16(a_.neon_i16, b_.neon_i16);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16);
    r_.neon_i16 = t.val[1];
  #elif defined(SIMDE_SHUFFLE_VECTOR_)
    r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 1, 3, 5, 7, 9, 11, 13, 15);
  #else
    const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2;
    for (size_t i = 0 ; i < halfway_point ; i++) {
      r_.i16[i] = a_.i16[2 * i + 1];
      r_.i16[i + halfway_point] = b_.i16[2 * i + 1];
    }
  #endif

  return simde__m128i_from_private(r_);
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_deinterleaveeven_epi32 (simde__m128i a, simde__m128i b) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_i32 = vuzp1q_s32(a_.neon_i32, b_.neon_i32);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32);
    r_.neon_i32 = t.val[0];
  #elif defined(SIMDE_SHUFFLE_VECTOR_)
    r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 2, 4, 6);
  #else
    const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2;
    for (size_t i = 0 ; i < halfway_point ; i++) {
      r_.i32[i] = a_.i32[2 * i];
      r_.i32[i + halfway_point] = b_.i32[2 * i];
    }
  #endif

  return simde__m128i_from_private(r_);
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_deinterleaveodd_epi32 (simde__m128i a, simde__m128i b) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_i32 = vuzp2q_s32(a_.neon_i32, b_.neon_i32);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32);
    r_.neon_i32 = t.val[1];
  #elif defined(SIMDE_SHUFFLE_VECTOR_)
    r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 1, 3, 5, 7);
  #else
    const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2;
    for (size_t i = 0 ; i < halfway_point ; i++) {
      r_.i32[i] = a_.i32[2 * i + 1];
      r_.i32[i + halfway_point] = b_.i32[2 * i + 1];
    }
  #endif

  return simde__m128i_from_private(r_);
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_x_mm_deinterleaveeven_ps (simde__m128 a, simde__m128 b) {
  simde__m128_private
    r_,
    a_ = simde__m128_to_private(a),
    b_ = simde__m128_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_f32 = vuzp1q_f32(a_.neon_f32, b_.neon_f32);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32);
    r_.neon_f32 = t.val[0];
  #elif defined(SIMDE_SHUFFLE_VECTOR_)
    r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 2, 4, 6);
  #else
    const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2;
    for (size_t i = 0 ; i < halfway_point ; i++) {
      r_.f32[i] = a_.f32[2 * i];
      r_.f32[i + halfway_point] = b_.f32[2 * i];
    }
  #endif

  return simde__m128_from_private(r_);
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_x_mm_deinterleaveodd_ps (simde__m128 a, simde__m128 b) {
  simde__m128_private
    r_,
    a_ = simde__m128_to_private(a),
    b_ = simde__m128_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_f32 = vuzp2q_f32(a_.neon_f32, b_.neon_f32);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32);
    r_.neon_f32 = t.val[1];
  #elif defined(SIMDE_SHUFFLE_VECTOR_)
    r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 1, 3, 5, 7);
  #else
    const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2;
    for (size_t i = 0 ; i < halfway_point ; i++) {
      r_.f32[i] = a_.f32[2 * i + 1];
      r_.f32[i + halfway_point] = b_.f32[2 * i + 1];
    }
  #endif

  return simde__m128_from_private(r_);
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_x_mm_deinterleaveeven_pd (simde__m128d a, simde__m128d b) {
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_f64 = vuzp1q_f64(a_.neon_f64, b_.neon_f64);
  #elif defined(SIMDE_SHUFFLE_VECTOR_)
    r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
  #else
    const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2;
    for (size_t i = 0 ; i < halfway_point ; i++) {
      r_.f64[i] = a_.f64[2 * i];
      r_.f64[i + halfway_point] = b_.f64[2 * i];
    }
  #endif

  return simde__m128d_from_private(r_);
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_x_mm_deinterleaveodd_pd (simde__m128d a, simde__m128d b) {
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_f64 = vuzp2q_f64(a_.neon_f64, b_.neon_f64);
  #elif defined(SIMDE_SHUFFLE_VECTOR_)
    r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
  #else
    const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2;
    for (size_t i = 0 ; i < halfway_point ; i++) {
      r_.f64[i] = a_.f64[2 * i + 1];
      r_.f64[i + halfway_point] = b_.f64[2 * i + 1];
    }
  #endif

  return simde__m128d_from_private(r_);
}

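/* _mm_addsub_pd: subtract in the even lane, add in the odd lane:
 * r = { a[0] - b[0], a[1] + b[1] }. */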
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_addsub_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_addsub_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      float64x2_t rs = vsubq_f64(a_.neon_f64, b_.neon_f64);
      float64x2_t ra = vaddq_f64(a_.neon_f64, b_.neon_f64);
      return vcombine_f64(vget_low_f64(rs), vget_high_f64(ra));
    #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
      r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64 - b_.f64, a_.f64 + b_.f64, 0, 3);
    #else
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) {
        r_.f64[  i  ] = a_.f64[  i  ] - b_.f64[  i  ];
        r_.f64[1 + i] = a_.f64[1 + i] + b_.f64[1 + i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
#  define _mm_addsub_pd(a, b) simde_mm_addsub_pd(a, b)
#endif

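/* _mm_addsub_ps: subtract in the even lanes, add in the odd lanes:
 * r = { a[0]-b[0], a[1]+b[1], a[2]-b[2], a[3]+b[3] }. */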
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_addsub_ps (simde__m128 a, simde__m128 b) {
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_addsub_ps(a, b);
  #else
    simde__m128_private
      r_,
      a_ = simde__m128_to_private(a),
      b_ = simde__m128_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      float32x4_t rs = vsubq_f32(a_.neon_f32, b_.neon_f32);
      float32x4_t ra = vaddq_f32(a_.neon_f32, b_.neon_f32);
      return vtrn2q_f32(vreinterpretq_f32_s32(vrev64q_s32(vreinterpretq_s32_f32(rs))), ra);
    #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
      r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32 - b_.f32, a_.f32 + b_.f32, 0, 5, 2, 7);
    #else
      for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) {
        r_.f32[  i  ] = a_.f32[  i  ] - b_.f32[  i  ];
        r_.f32[1 + i] = a_.f32[1 + i] + b_.f32[1 + i];
      }
    #endif

    return simde__m128_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
#  define _mm_addsub_ps(a, b) simde_mm_addsub_ps(a, b)
#endif

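/* _mm_hadd_pd (horizontal add): r = { a[0] + a[1], b[0] + b[1] }. */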
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_hadd_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_hadd_pd(a, b);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return simde__m128d_from_neon_f64(vpaddq_f64(simde__m128d_to_neon_f64(a), simde__m128d_to_neon_f64(b)));
  #else
    return simde_mm_add_pd(simde_x_mm_deinterleaveeven_pd(a, b), simde_x_mm_deinterleaveodd_pd(a, b));
  #endif
}
#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
#  define _mm_hadd_pd(a, b) simde_mm_hadd_pd(a, b)
#endif

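/* _mm_hadd_ps: pairwise sums,
 * r = { a[0]+a[1], a[2]+a[3], b[0]+b[1], b[2]+b[3] }. */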
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_hadd_ps (simde__m128 a, simde__m128 b) {
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_hadd_ps(a, b);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return simde__m128_from_neon_f32(vpaddq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b)));
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    float32x4x2_t t = vuzpq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b));
    return simde__m128_from_neon_f32(vaddq_f32(t.val[0], t.val[1]));
  #else
    return simde_mm_add_ps(simde_x_mm_deinterleaveeven_ps(a, b), simde_x_mm_deinterleaveodd_ps(a, b));
  #endif
}
#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
#  define _mm_hadd_ps(a, b) simde_mm_hadd_ps(a, b)
#endif

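/* _mm_hsub_pd (horizontal subtract): r = { a[0] - a[1], b[0] - b[1] }. */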
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_hsub_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_hsub_pd(a, b);
  #else
    return simde_mm_sub_pd(simde_x_mm_deinterleaveeven_pd(a, b), simde_x_mm_deinterleaveodd_pd(a, b));
  #endif
}
#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
#  define _mm_hsub_pd(a, b) simde_mm_hsub_pd(a, b)
#endif

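/* _mm_hsub_ps: pairwise differences,
 * r = { a[0]-a[1], a[2]-a[3], b[0]-b[1], b[2]-b[3] }. */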
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_hsub_ps (simde__m128 a, simde__m128 b) {
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_hsub_ps(a, b);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    float32x4x2_t t = vuzpq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b));
    return simde__m128_from_neon_f32(vaddq_f32(t.val[0], vnegq_f32(t.val[1])));
  #else
    return simde_mm_sub_ps(simde_x_mm_deinterleaveeven_ps(a, b), simde_x_mm_deinterleaveodd_ps(a, b));
  #endif
}
#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
#  define _mm_hsub_ps(a, b) simde_mm_hsub_ps(a, b)
#endif

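/* _mm_lddqu_si128: load 128 bits of integer data from memory; unlike
 * _mm_load_si128 the pointer does not need to be 16-byte aligned. */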
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_lddqu_si128 (simde__m128i const* mem_addr) {
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_lddqu_si128(mem_addr);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
    #else
      simde_memcpy(&r_, mem_addr, sizeof(r_));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
#  define _mm_lddqu_si128(mem_addr) simde_mm_lddqu_si128(mem_addr)
#endif

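/* _mm_loaddup_pd: load a single double from memory and broadcast it to
 * both lanes of the result. */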
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loaddup_pd (simde_float64 const* mem_addr) {
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_loaddup_pd(mem_addr);
  #else
    simde__m128d_private r_;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vdupq_n_f64(*mem_addr);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vdupq_n_s64(*HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr));
    #else
      r_.f64[0] = *mem_addr;
      r_.f64[1] = *mem_addr;
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
#  define _mm_loaddup_pd(mem_addr) simde_mm_loaddup_pd(mem_addr)
#endif

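/* _mm_movedup_pd: duplicate the low lane, r = { a[0], a[0] }. */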
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_movedup_pd (simde__m128d a) {
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_movedup_pd(a);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vdupq_laneq_f64(a_.neon_f64, 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_SHUFFLE_VECTOR_)
      r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 0, 0);
    #else
      r_.f64[0] = a_.f64[0];
      r_.f64[1] = a_.f64[0];
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
#  define _mm_movedup_pd(a) simde_mm_movedup_pd(a)
#endif

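/* _mm_movehdup_ps: duplicate the odd-indexed lanes,
 * r = { a[1], a[1], a[3], a[3] }. */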
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_movehdup_ps (simde__m128 a) {
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_movehdup_ps(a);
  #else
    simde__m128_private
      r_,
      a_ = simde__m128_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f32 = vtrn2q_f32(a_.neon_f32, a_.neon_f32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 1, 1, 3, 3);
    #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
      r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 1, 1, 3, 3);
    #else
      r_.f32[0] = a_.f32[1];
      r_.f32[1] = a_.f32[1];
      r_.f32[2] = a_.f32[3];
      r_.f32[3] = a_.f32[3];
    #endif

    return simde__m128_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
#  define _mm_movehdup_ps(a) simde_mm_movehdup_ps(a)
#endif

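/* _mm_moveldup_ps: duplicate the even-indexed lanes,
 * r = { a[0], a[0], a[2], a[2] }. */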
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_moveldup_ps (simde__m128 a) {
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_moveldup_ps(a);
  #else
    simde__m128_private
      r_,
      a_ = simde__m128_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f32 = vtrn1q_f32(a_.neon_f32, a_.neon_f32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 2, 2);
    #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
      r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 2, 2);
    #else
      r_.f32[0] = a_.f32[0];
      r_.f32[1] = a_.f32[0];
      r_.f32[2] = a_.f32[2];
      r_.f32[3] = a_.f32[2];
    #endif

    return simde__m128_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
#  define _mm_moveldup_ps(a) simde_mm_moveldup_ps(a)
#endif

SIMDE_END_DECLS_

HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_X86_SSE3_H) */