1 /* SPDX-License-Identifier: MIT
2 *
3 * Permission is hereby granted, free of charge, to any person
4 * obtaining a copy of this software and associated documentation
5 * files (the "Software"), to deal in the Software without
6 * restriction, including without limitation the rights to use, copy,
7 * modify, merge, publish, distribute, sublicense, and/or sell copies
8 * of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be
12 * included in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Copyright:
24 * 2017-2020 Evan Nemerson <evan@nemerson.com>
25 */
26
27 #if !defined(SIMDE_X86_SSE3_H)
28 #define SIMDE_X86_SSE3_H
29
30 #include "sse2.h"
31
32 HEDLEY_DIAGNOSTIC_PUSH
33 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
34 SIMDE_BEGIN_DECLS_
35
36 SIMDE_FUNCTION_ATTRIBUTES
37 simde__m128i
simde_x_mm_deinterleaveeven_epi16(simde__m128i a,simde__m128i b)38 simde_x_mm_deinterleaveeven_epi16 (simde__m128i a, simde__m128i b) {
39 simde__m128i_private
40 r_,
41 a_ = simde__m128i_to_private(a),
42 b_ = simde__m128i_to_private(b);
43
44 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
45 r_.neon_i16 = vuzp1q_s16(a_.neon_i16, b_.neon_i16);
46 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
47 int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16);
48 r_.neon_i16 = t.val[0];
49 #elif defined(SIMDE_SHUFFLE_VECTOR_)
50 r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 2, 4, 6, 8, 10, 12, 14);
51 #else
52 const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2;
53 for(size_t i = 0 ; i < halfway_point ; i++) {
54 r_.i16[i] = a_.i16[2 * i];
55 r_.i16[i + halfway_point] = b_.i16[2 * i];
56 }
57 #endif
58
59 return simde__m128i_from_private(r_);
60 }
61
62 SIMDE_FUNCTION_ATTRIBUTES
63 simde__m128i
simde_x_mm_deinterleaveodd_epi16(simde__m128i a,simde__m128i b)64 simde_x_mm_deinterleaveodd_epi16 (simde__m128i a, simde__m128i b) {
65 simde__m128i_private
66 r_,
67 a_ = simde__m128i_to_private(a),
68 b_ = simde__m128i_to_private(b);
69
70 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
71 r_.neon_i16 = vuzp2q_s16(a_.neon_i16, b_.neon_i16);
72 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
73 int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16);
74 r_.neon_i16 = t.val[1];
75 #elif defined(SIMDE_SHUFFLE_VECTOR_)
76 r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 1, 3, 5, 7, 9, 11, 13, 15);
77 #else
78 const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2;
79 for(size_t i = 0 ; i < halfway_point ; i++) {
80 r_.i16[i] = a_.i16[2 * i + 1];
81 r_.i16[i + halfway_point] = b_.i16[2 * i + 1];
82 }
83 #endif
84
85 return simde__m128i_from_private(r_);
86 }
87
88 SIMDE_FUNCTION_ATTRIBUTES
89 simde__m128i
simde_x_mm_deinterleaveeven_epi32(simde__m128i a,simde__m128i b)90 simde_x_mm_deinterleaveeven_epi32 (simde__m128i a, simde__m128i b) {
91 simde__m128i_private
92 r_,
93 a_ = simde__m128i_to_private(a),
94 b_ = simde__m128i_to_private(b);
95
96 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
97 r_.neon_i32 = vuzp1q_s32(a_.neon_i32, b_.neon_i32);
98 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
99 int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32);
100 r_.neon_i32 = t.val[0];
101 #elif defined(SIMDE_SHUFFLE_VECTOR_)
102 r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 2, 4, 6);
103 #else
104 const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2;
105 for(size_t i = 0 ; i < halfway_point ; i++) {
106 r_.i32[i] = a_.i32[2 * i];
107 r_.i32[i + halfway_point] = b_.i32[2 * i];
108 }
109 #endif
110
111 return simde__m128i_from_private(r_);
112 }
113
114 SIMDE_FUNCTION_ATTRIBUTES
115 simde__m128i
simde_x_mm_deinterleaveodd_epi32(simde__m128i a,simde__m128i b)116 simde_x_mm_deinterleaveodd_epi32 (simde__m128i a, simde__m128i b) {
117 simde__m128i_private
118 r_,
119 a_ = simde__m128i_to_private(a),
120 b_ = simde__m128i_to_private(b);
121
122 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
123 r_.neon_i32 = vuzp2q_s32(a_.neon_i32, b_.neon_i32);
124 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
125 int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32);
126 r_.neon_i32 = t.val[1];
127 #elif defined(SIMDE_SHUFFLE_VECTOR_)
128 r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 1, 3, 5, 7);
129 #else
130 const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2;
131 for(size_t i = 0 ; i < halfway_point ; i++) {
132 r_.i32[i] = a_.i32[2 * i + 1];
133 r_.i32[i + halfway_point] = b_.i32[2 * i + 1];
134 }
135 #endif
136
137 return simde__m128i_from_private(r_);
138 }
139
140 SIMDE_FUNCTION_ATTRIBUTES
141 simde__m128
simde_x_mm_deinterleaveeven_ps(simde__m128 a,simde__m128 b)142 simde_x_mm_deinterleaveeven_ps (simde__m128 a, simde__m128 b) {
143 simde__m128_private
144 r_,
145 a_ = simde__m128_to_private(a),
146 b_ = simde__m128_to_private(b);
147
148 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
149 r_.neon_f32 = vuzp1q_f32(a_.neon_f32, b_.neon_f32);
150 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
151 float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32);
152 r_.neon_f32 = t.val[0];
153 #elif defined(SIMDE_SHUFFLE_VECTOR_)
154 r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 2, 4, 6);
155 #else
156 const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2;
157 for(size_t i = 0 ; i < halfway_point ; i++) {
158 r_.f32[i] = a_.f32[2 * i];
159 r_.f32[i + halfway_point] = b_.f32[2 * i];
160 }
161 #endif
162
163 return simde__m128_from_private(r_);
164 }
165
166 SIMDE_FUNCTION_ATTRIBUTES
167 simde__m128
simde_x_mm_deinterleaveodd_ps(simde__m128 a,simde__m128 b)168 simde_x_mm_deinterleaveodd_ps (simde__m128 a, simde__m128 b) {
169 simde__m128_private
170 r_,
171 a_ = simde__m128_to_private(a),
172 b_ = simde__m128_to_private(b);
173
174 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
175 r_.neon_f32 = vuzp2q_f32(a_.neon_f32, b_.neon_f32);
176 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
177 float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32);
178 r_.neon_f32 = t.val[1];
179 #elif defined(SIMDE_SHUFFLE_VECTOR_)
180 r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 1, 3, 5, 7);
181 #else
182 const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2;
183 for(size_t i = 0 ; i < halfway_point ; i++) {
184 r_.f32[i] = a_.f32[2 * i + 1];
185 r_.f32[i + halfway_point] = b_.f32[2 * i + 1];
186 }
187 #endif
188
189 return simde__m128_from_private(r_);
190 }
191
192 SIMDE_FUNCTION_ATTRIBUTES
193 simde__m128d
simde_x_mm_deinterleaveeven_pd(simde__m128d a,simde__m128d b)194 simde_x_mm_deinterleaveeven_pd (simde__m128d a, simde__m128d b) {
195 simde__m128d_private
196 r_,
197 a_ = simde__m128d_to_private(a),
198 b_ = simde__m128d_to_private(b);
199
200 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
201 r_.neon_f64 = vuzp1q_f64(a_.neon_f64, b_.neon_f64);
202 #elif defined(SIMDE_SHUFFLE_VECTOR_)
203 r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
204 #else
205 const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2;
206 for(size_t i = 0 ; i < halfway_point ; i++) {
207 r_.f64[i] = a_.f64[2 * i];
208 r_.f64[i + halfway_point] = b_.f64[2 * i];
209 }
210 #endif
211
212 return simde__m128d_from_private(r_);
213 }
214
215 SIMDE_FUNCTION_ATTRIBUTES
216 simde__m128d
simde_x_mm_deinterleaveodd_pd(simde__m128d a,simde__m128d b)217 simde_x_mm_deinterleaveodd_pd (simde__m128d a, simde__m128d b) {
218 simde__m128d_private
219 r_,
220 a_ = simde__m128d_to_private(a),
221 b_ = simde__m128d_to_private(b);
222
223 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
224 r_.neon_f64 = vuzp2q_f64(a_.neon_f64, b_.neon_f64);
225 #elif defined(SIMDE_SHUFFLE_VECTOR_)
226 r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
227 #else
228 const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2;
229 for(size_t i = 0 ; i < halfway_point ; i++) {
230 r_.f64[i] = a_.f64[2 * i + 1];
231 r_.f64[i + halfway_point] = b_.f64[2 * i + 1];
232 }
233 #endif
234
235 return simde__m128d_from_private(r_);
236 }
237
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_addsub_pd (simde__m128d a, simde__m128d b) {
  /* Polyfill for _mm_addsub_pd: r[0] = a[0] - b[0], r[1] = a[1] + b[1]. */
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_addsub_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Compute full-width difference and sum, then combine the difference's
       * low lane with the sum's high lane.  NOTE(review): the NEON vector is
       * returned directly — presumably simde__m128d aliases float64x2_t on
       * AArch64; confirm against the type definitions in sse2.h. */
      float64x2_t rs = vsubq_f64(a_.neon_f64, b_.neon_f64);
      float64x2_t ra = vaddq_f64(a_.neon_f64, b_.neon_f64);
      return vcombine_f64(vget_low_f64(rs), vget_high_f64(ra));
    #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
      /* Shuffle lane 0 of (a - b) with lane 1 (index 3) of (a + b). */
      r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64 - b_.f64, a_.f64 + b_.f64, 0, 3);
    #else
      /* Portable fallback: subtract in even lanes, add in odd lanes. */
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) {
        r_.f64[ i ] = a_.f64[ i ] - b_.f64[ i ];
        r_.f64[1 + i] = a_.f64[1 + i] + b_.f64[1 + i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
265 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
266 # define _mm_addsub_pd(a, b) simde_mm_addsub_pd(a, b)
267 #endif
268
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_addsub_ps (simde__m128 a, simde__m128 b) {
  /* Polyfill for _mm_addsub_ps: subtract in even lanes, add in odd lanes:
   * r = { a0-b0, a1+b1, a2-b2, a3+b3 }. */
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_addsub_ps(a, b);
  #else
    simde__m128_private
      r_,
      a_ = simde__m128_to_private(a),
      b_ = simde__m128_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* vrev64q swaps lane pairs of the difference so its even lanes land in
       * odd positions, then vtrn2q picks those plus the sum's odd lanes.
       * NOTE(review): the NEON vector is returned directly — presumably
       * simde__m128 aliases float32x4_t on AArch64; confirm against sse2.h. */
      float32x4_t rs = vsubq_f32(a_.neon_f32, b_.neon_f32);
      float32x4_t ra = vaddq_f32(a_.neon_f32, b_.neon_f32);
      return vtrn2q_f32(vreinterpretq_f32_s32(vrev64q_s32(vreinterpretq_s32_f32(rs))), ra);
    #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
      /* Indices 0,2 select even lanes of (a - b); 5,7 select odd lanes of (a + b). */
      r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32 - b_.f32, a_.f32 + b_.f32, 0, 5, 2, 7);
    #else
      /* Portable fallback: process lanes in even/odd pairs. */
      for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) {
        r_.f32[ i ] = a_.f32[ i ] - b_.f32[ i ];
        r_.f32[1 + i] = a_.f32[1 + i] + b_.f32[1 + i];
      }
    #endif

    return simde__m128_from_private(r_);
  #endif
}
296 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
297 # define _mm_addsub_ps(a, b) simde_mm_addsub_ps(a, b)
298 #endif
299
300 SIMDE_FUNCTION_ATTRIBUTES
301 simde__m128d
simde_mm_hadd_pd(simde__m128d a,simde__m128d b)302 simde_mm_hadd_pd (simde__m128d a, simde__m128d b) {
303 #if defined(SIMDE_X86_SSE3_NATIVE)
304 return _mm_hadd_pd(a, b);
305 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
306 return simde__m128d_from_neon_f64(vpaddq_f64(simde__m128d_to_neon_f64(a), simde__m128d_to_neon_f64(b)));
307 #else
308 return simde_mm_add_pd(simde_x_mm_deinterleaveeven_pd(a, b), simde_x_mm_deinterleaveodd_pd(a, b));
309 #endif
310 }
311 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
312 # define _mm_hadd_pd(a, b) simde_mm_hadd_pd(a, b)
313 #endif
314
315 SIMDE_FUNCTION_ATTRIBUTES
316 simde__m128
simde_mm_hadd_ps(simde__m128 a,simde__m128 b)317 simde_mm_hadd_ps (simde__m128 a, simde__m128 b) {
318 #if defined(SIMDE_X86_SSE3_NATIVE)
319 return _mm_hadd_ps(a, b);
320 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
321 return simde__m128_from_neon_f32(vpaddq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b)));
322 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
323 float32x4x2_t t = vuzpq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b));
324 return simde__m128_from_neon_f32(vaddq_f32(t.val[0], t.val[1]));
325 #else
326 return simde_mm_add_ps(simde_x_mm_deinterleaveeven_ps(a, b), simde_x_mm_deinterleaveodd_ps(a, b));
327 #endif
328 }
329 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
330 # define _mm_hadd_ps(a, b) simde_mm_hadd_ps(a, b)
331 #endif
332
333 SIMDE_FUNCTION_ATTRIBUTES
334 simde__m128d
simde_mm_hsub_pd(simde__m128d a,simde__m128d b)335 simde_mm_hsub_pd (simde__m128d a, simde__m128d b) {
336 #if defined(SIMDE_X86_SSE3_NATIVE)
337 return _mm_hsub_pd(a, b);
338 #else
339 return simde_mm_sub_pd(simde_x_mm_deinterleaveeven_pd(a, b), simde_x_mm_deinterleaveodd_pd(a, b));
340 #endif
341 }
342 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
343 # define _mm_hsub_pd(a, b) simde_mm_hsub_pd(a, b)
344 #endif
345
346 SIMDE_FUNCTION_ATTRIBUTES
347 simde__m128
simde_mm_hsub_ps(simde__m128 a,simde__m128 b)348 simde_mm_hsub_ps (simde__m128 a, simde__m128 b) {
349 #if defined(SIMDE_X86_SSE3_NATIVE)
350 return _mm_hsub_ps(a, b);
351 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
352 float32x4x2_t t = vuzpq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b));
353 return simde__m128_from_neon_f32(vaddq_f32(t.val[0], vnegq_f32(t.val[1])));
354 #else
355 return simde_mm_sub_ps(simde_x_mm_deinterleaveeven_ps(a, b), simde_x_mm_deinterleaveodd_ps(a, b));
356 #endif
357 }
358 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
359 # define _mm_hsub_ps(a, b) simde_mm_hsub_ps(a, b)
360 #endif
361
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_lddqu_si128 (simde__m128i const* mem_addr) {
  /* Polyfill for _mm_lddqu_si128: load 128 bits from a possibly-unaligned
   * address. */
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_lddqu_si128(mem_addr);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
    #else
      /* memcpy keeps the unaligned load free of strict-aliasing and
       * alignment UB. */
      simde_memcpy(&r_, mem_addr, sizeof(r_));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
379 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
380 # define _mm_lddqu_si128(mem_addr) simde_mm_lddqu_si128(mem_addr)
381 #endif
382
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loaddup_pd (simde_float64 const* mem_addr) {
  /* Polyfill for _mm_loaddup_pd: load one double and broadcast it to both
   * lanes. */
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_loaddup_pd(mem_addr);
  #else
    simde__m128d_private r_;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vdupq_n_f64(*mem_addr);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* No f64 vectors on ARMv7 NEON; duplicate the raw 64-bit pattern
       * through the i64 view instead. */
      r_.neon_i64 = vdupq_n_s64(*HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr));
    #else
      r_.f64[0] = *mem_addr;
      r_.f64[1] = *mem_addr;
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
403 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
404 # define _mm_loaddup_pd(mem_addr) simde_mm_loaddup_pd(mem_addr)
405 #endif
406
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_movedup_pd (simde__m128d a) {
  /* Polyfill for _mm_movedup_pd: duplicate the low lane, r = { a[0], a[0] }. */
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_movedup_pd(a);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vdupq_laneq_f64(a_.neon_f64, 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_SHUFFLE_VECTOR_)
      r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 0, 0);
    #else
      r_.f64[0] = a_.f64[0];
      r_.f64[1] = a_.f64[0];
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
431 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
432 # define _mm_movedup_pd(a) simde_mm_movedup_pd(a)
433 #endif
434
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_movehdup_ps (simde__m128 a) {
  /* Polyfill for _mm_movehdup_ps: duplicate the odd-indexed lanes,
   * r = { a[1], a[1], a[3], a[3] }. */
  #if defined(SIMDE_X86_SSE3_NATIVE)
    return _mm_movehdup_ps(a);
  #else
    simde__m128_private
      r_,
      a_ = simde__m128_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f32 = vtrn2q_f32(a_.neon_f32, a_.neon_f32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 1, 1, 3, 3);
    #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
      r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 1, 1, 3, 3);
    #else
      r_.f32[0] = a_.f32[1];
      r_.f32[1] = a_.f32[1];
      r_.f32[2] = a_.f32[3];
      r_.f32[3] = a_.f32[3];
    #endif

    return simde__m128_from_private(r_);
  #endif
}
461 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
462 # define _mm_movehdup_ps(a) simde_mm_movehdup_ps(a)
463 #endif
464
465 SIMDE_FUNCTION_ATTRIBUTES
466 simde__m128
simde_mm_moveldup_ps(simde__m128 a)467 simde_mm_moveldup_ps (simde__m128 a) {
468 #if defined(SIMDE__SSE3_NATIVE)
469 return _mm_moveldup_ps(a);
470 #else
471 simde__m128_private
472 r_,
473 a_ = simde__m128_to_private(a);
474
475 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
476 r_.neon_f32 = vtrn1q_f32(a_.neon_f32, a_.neon_f32);
477 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
478 r_.wasm_v128 = wasm_v32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 2, 2);
479 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
480 r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 2, 2);
481 #else
482 r_.f32[0] = a_.f32[0];
483 r_.f32[1] = a_.f32[0];
484 r_.f32[2] = a_.f32[2];
485 r_.f32[3] = a_.f32[2];
486 #endif
487
488 return simde__m128_from_private(r_);
489 #endif
490 }
491 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
492 # define _mm_moveldup_ps(a) simde_mm_moveldup_ps(a)
493 #endif
494
495 SIMDE_END_DECLS_
496
497 HEDLEY_DIAGNOSTIC_POP
498
499 #endif /* !defined(SIMDE_X86_SSE3_H) */
500