/*
 * Copyright 2020 The Emscripten Authors. All rights reserved.
 * Emscripten is available under two separate licenses, the MIT license and the
 * University of Illinois/NCSA Open Source License. Both these licenses can be
 * found in the LICENSE file.
 */
7 #ifndef __emscripten_emmintrin_h__
8 #define __emscripten_emmintrin_h__
9
10 #ifndef __SSE2__
11 #error "SSE2 instruction set not enabled"
12 #endif
13
#include <math.h>
#include <xmmintrin.h>

#include <emscripten/emscripten.h>
16
17 #define __SATURATE(x, Min, Max) ((x) >= Min ? ((x) <= Max ? (x) : Max) : Min)
18 #define __MIN(x, y) ((x) <= (y) ? (x) : (y))
19 #define __MAX(x, y) ((x) >= (y) ? (x) : (y))
20
21 // Alias different (functionally) equivalent intrinsics.
22 #define _mm_set_epi64x _mm_set_epi64
23 #define _mm_cvtsd_si64x _mm_cvtsd_si64
24 #define _mm_cvtsi128_si64x _mm_cvtsi128_si64
25 #define _mm_cvtsi64x_sd _mm_cvtsi64_sd
26 #define _mm_cvtsi64x_si128 _mm_cvtsi64_si128
27 #define _mm_cvttsd_si64x _mm_cvttsd_si64
28 #define _mm_store_pd1 _mm_store1_pd
29
30 typedef __f64x2 __m128d;
31
32 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_move_sd(__m128d __a,__m128d __b)33 _mm_move_sd(__m128d __a, __m128d __b)
34 {
35 return (__m128d){ __b[0], __a[1] };
36 }
37
38 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_pd(__m128d __a,__m128d __b)39 _mm_add_pd(__m128d __a, __m128d __b)
40 {
41 return (__m128d)wasm_f64x2_add((v128_t)__a, (v128_t)__b);
42 }
43
44 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_sd(__m128d __a,__m128d __b)45 _mm_add_sd(__m128d __a, __m128d __b)
46 {
47 return _mm_move_sd(__a, _mm_add_pd(__a, __b));
48 }
49
50 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_pd(__m128d __a,__m128d __b)51 _mm_sub_pd(__m128d __a, __m128d __b)
52 {
53 return (__m128d)wasm_f64x2_sub((v128_t)__a, (v128_t)__b);
54 }
55
56 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_sd(__m128d __a,__m128d __b)57 _mm_sub_sd(__m128d __a, __m128d __b)
58 {
59 return _mm_move_sd(__a, _mm_sub_pd(__a, __b));
60 }
61
62 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_pd(__m128d __a,__m128d __b)63 _mm_mul_pd(__m128d __a, __m128d __b)
64 {
65 return (__m128d)wasm_f64x2_mul((v128_t)__a, (v128_t)__b);
66 }
67
68 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_sd(__m128d __a,__m128d __b)69 _mm_mul_sd(__m128d __a, __m128d __b)
70 {
71 return _mm_move_sd(__a, _mm_mul_pd(__a, __b));
72 }
73
74 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_pd(__m128d __a,__m128d __b)75 _mm_div_pd(__m128d __a, __m128d __b)
76 {
77 return (__m128d)wasm_f64x2_div((v128_t)__a, (v128_t)__b);
78 }
79
80 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_sd(__m128d __a,__m128d __b)81 _mm_div_sd(__m128d __a, __m128d __b)
82 {
83 return _mm_move_sd(__a, _mm_div_pd(__a, __b));
84 }
85
86 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_pd(__m128d __a)87 _mm_sqrt_pd(__m128d __a)
88 {
89 return (__m128d)wasm_f64x2_sqrt((v128_t)__a);
90 }
91
92 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_sd(__m128d __a,__m128d __b)93 _mm_sqrt_sd(__m128d __a, __m128d __b)
94 {
95 return _mm_move_sd(__a, _mm_sqrt_pd(__b));
96 }
97
98 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_pd(__m128d __a,__m128d __b)99 _mm_min_pd(__m128d __a, __m128d __b)
100 {
101 // return (__m128d)wasm_f32x4_pmin((v128_t)__a, (v128_t)__b); // TODO: Migrate to this, once it works in VMs
102 return (__m128d)wasm_v128_bitselect((v128_t)__a, (v128_t)__b, (v128_t)wasm_f64x2_lt((v128_t)__a, (v128_t)__b));
103 }
104
105 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_sd(__m128d __a,__m128d __b)106 _mm_min_sd(__m128d __a, __m128d __b)
107 {
108 return _mm_move_sd(__a, _mm_min_pd(__a, __b));
109 }
110
111 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_pd(__m128d __a,__m128d __b)112 _mm_max_pd(__m128d __a, __m128d __b)
113 {
114 // return (__m128)wasm_f32x4_pmax((v128_t)__a, (v128_t)__b); // TODO: Migrate to this, once it works in VMs
115 return (__m128d)wasm_v128_bitselect((v128_t)__a, (v128_t)__b, (v128_t)wasm_f64x2_gt((v128_t)__a, (v128_t)__b));
116 }
117
118 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_sd(__m128d __a,__m128d __b)119 _mm_max_sd(__m128d __a, __m128d __b)
120 {
121 return _mm_move_sd(__a, _mm_max_pd(__a, __b));
122 }
123
124 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_and_pd(__m128d __a,__m128d __b)125 _mm_and_pd(__m128d __a, __m128d __b)
126 {
127 return (__m128d)wasm_v128_and((v128_t)__a, (v128_t)__b);
128 }
129
130 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_andnot_pd(__m128d __a,__m128d __b)131 _mm_andnot_pd(__m128d __a, __m128d __b)
132 {
133 return (__m128d)wasm_v128_andnot((v128_t)__b, (v128_t)__a);
134 }
135
136 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_or_pd(__m128d __a,__m128d __b)137 _mm_or_pd(__m128d __a, __m128d __b)
138 {
139 return (__m128d)wasm_v128_or((v128_t)__a, (v128_t)__b);
140 }
141
142 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_xor_pd(__m128d __a,__m128d __b)143 _mm_xor_pd(__m128d __a, __m128d __b)
144 {
145 return (__m128d)wasm_v128_xor((v128_t)__a, (v128_t)__b);
146 }
147
148 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pd(__m128d __a,__m128d __b)149 _mm_cmpeq_pd(__m128d __a, __m128d __b)
150 {
151 return (__m128d)wasm_f64x2_eq((v128_t)__a, (v128_t)__b);
152 }
153
154 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_pd(__m128d __a,__m128d __b)155 _mm_cmplt_pd(__m128d __a, __m128d __b)
156 {
157 return (__m128d)wasm_f64x2_lt((v128_t)__a, (v128_t)__b);
158 }
159
160 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_pd(__m128d __a,__m128d __b)161 _mm_cmple_pd(__m128d __a, __m128d __b)
162 {
163 return (__m128d)wasm_f64x2_le((v128_t)__a, (v128_t)__b);
164 }
165
166 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pd(__m128d __a,__m128d __b)167 _mm_cmpgt_pd(__m128d __a, __m128d __b)
168 {
169 return (__m128d)wasm_f64x2_gt((v128_t)__a, (v128_t)__b);
170 }
171
172 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_pd(__m128d __a,__m128d __b)173 _mm_cmpge_pd(__m128d __a, __m128d __b)
174 {
175 return (__m128d)wasm_f64x2_ge((v128_t)__a, (v128_t)__b);
176 }
177
178 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_pd(__m128d __a,__m128d __b)179 _mm_cmpord_pd(__m128d __a, __m128d __b)
180 {
181 return (__m128d)wasm_v128_and(wasm_f64x2_eq((v128_t)__a, (v128_t)__a),
182 wasm_f64x2_eq((v128_t)__b, (v128_t)__b));
183 }
184
185 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_pd(__m128d __a,__m128d __b)186 _mm_cmpunord_pd(__m128d __a, __m128d __b)
187 {
188 return (__m128d)wasm_v128_or(wasm_f64x2_ne((v128_t)__a, (v128_t)__a),
189 wasm_f64x2_ne((v128_t)__b, (v128_t)__b));
190 }
191
192 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_pd(__m128d __a,__m128d __b)193 _mm_cmpneq_pd(__m128d __a, __m128d __b)
194 {
195 return (__m128d)wasm_f64x2_ne((v128_t)__a, (v128_t)__b);
196 }
197
198 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_pd(__m128d __a,__m128d __b)199 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
200 {
201 return (__m128d)wasm_v128_not((v128_t)_mm_cmplt_pd(__a, __b));
202 }
203
204 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_pd(__m128d __a,__m128d __b)205 _mm_cmpnle_pd(__m128d __a, __m128d __b)
206 {
207 return (__m128d)wasm_v128_not((v128_t)_mm_cmple_pd(__a, __b));
208 }
209
210 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_pd(__m128d __a,__m128d __b)211 _mm_cmpngt_pd(__m128d __a, __m128d __b)
212 {
213 return (__m128d)wasm_v128_not((v128_t)_mm_cmpgt_pd(__a, __b));
214 }
215
216 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_pd(__m128d __a,__m128d __b)217 _mm_cmpnge_pd(__m128d __a, __m128d __b)
218 {
219 return (__m128d)wasm_v128_not((v128_t)_mm_cmpge_pd(__a, __b));
220 }
221
222 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_sd(__m128d __a,__m128d __b)223 _mm_cmpeq_sd(__m128d __a, __m128d __b)
224 {
225 return _mm_move_sd(__a, _mm_cmpeq_pd(__a, __b));
226 }
227
228 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_sd(__m128d __a,__m128d __b)229 _mm_cmplt_sd(__m128d __a, __m128d __b)
230 {
231 return _mm_move_sd(__a, _mm_cmplt_pd(__a, __b));
232 }
233
234 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_sd(__m128d __a,__m128d __b)235 _mm_cmple_sd(__m128d __a, __m128d __b)
236 {
237 return _mm_move_sd(__a, _mm_cmple_pd(__a, __b));
238 }
239
240 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_sd(__m128d __a,__m128d __b)241 _mm_cmpgt_sd(__m128d __a, __m128d __b)
242 {
243 return _mm_move_sd(__a, _mm_cmpgt_pd(__a, __b));
244 }
245
246 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_sd(__m128d __a,__m128d __b)247 _mm_cmpge_sd(__m128d __a, __m128d __b)
248 {
249 return _mm_move_sd(__a, _mm_cmpge_pd(__a, __b));
250 }
251
252 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_sd(__m128d __a,__m128d __b)253 _mm_cmpord_sd(__m128d __a, __m128d __b)
254 {
255 return _mm_move_sd(__a, _mm_cmpord_pd(__a, __b));
256 }
257
258 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_sd(__m128d __a,__m128d __b)259 _mm_cmpunord_sd(__m128d __a, __m128d __b)
260 {
261 return _mm_move_sd(__a, _mm_cmpunord_pd(__a, __b));
262 }
263
264 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_sd(__m128d __a,__m128d __b)265 _mm_cmpneq_sd(__m128d __a, __m128d __b)
266 {
267 return _mm_move_sd(__a, _mm_cmpneq_pd(__a, __b));
268 }
269
270 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_sd(__m128d __a,__m128d __b)271 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
272 {
273 return _mm_move_sd(__a, _mm_cmpnlt_pd(__a, __b));
274 }
275
276 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_sd(__m128d __a,__m128d __b)277 _mm_cmpnle_sd(__m128d __a, __m128d __b)
278 {
279 return _mm_move_sd(__a, _mm_cmpnle_pd(__a, __b));
280 }
281
282 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_sd(__m128d __a,__m128d __b)283 _mm_cmpngt_sd(__m128d __a, __m128d __b)
284 {
285 return _mm_move_sd(__a, _mm_cmpngt_pd(__a, __b));
286 }
287
288 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_sd(__m128d __a,__m128d __b)289 _mm_cmpnge_sd(__m128d __a, __m128d __b)
290 {
291 return _mm_move_sd(__a, _mm_cmpnge_pd(__a, __b));
292 }
293
294 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_sd(__m128d __a,__m128d __b)295 _mm_comieq_sd(__m128d __a, __m128d __b)
296 {
297 return wasm_f64x2_extract_lane(__a, 0) == wasm_f64x2_extract_lane(__b, 0);
298 }
299
300 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_sd(__m128d __a,__m128d __b)301 _mm_comilt_sd(__m128d __a, __m128d __b)
302 {
303 return wasm_f64x2_extract_lane(__a, 0) < wasm_f64x2_extract_lane(__b, 0);
304 }
305
306 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_sd(__m128d __a,__m128d __b)307 _mm_comile_sd(__m128d __a, __m128d __b)
308 {
309 return wasm_f64x2_extract_lane(__a, 0) <= wasm_f64x2_extract_lane(__b, 0);
310 }
311
312 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_sd(__m128d __a,__m128d __b)313 _mm_comigt_sd(__m128d __a, __m128d __b)
314 {
315 return wasm_f64x2_extract_lane(__a, 0) > wasm_f64x2_extract_lane(__b, 0);
316 }
317
318 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_sd(__m128d __a,__m128d __b)319 _mm_comige_sd(__m128d __a, __m128d __b)
320 {
321 return wasm_f64x2_extract_lane(__a, 0) >= wasm_f64x2_extract_lane(__b, 0);
322 }
323
324 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_sd(__m128d __a,__m128d __b)325 _mm_comineq_sd(__m128d __a, __m128d __b)
326 {
327 return wasm_f64x2_extract_lane(__a, 0) != wasm_f64x2_extract_lane(__b, 0);
328 }
329
330 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_sd(__m128d __a,__m128d __b)331 _mm_ucomieq_sd(__m128d __a, __m128d __b)
332 {
333 return wasm_f64x2_extract_lane(__a, 0) == wasm_f64x2_extract_lane(__b, 0);
334 }
335
336 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_sd(__m128d __a,__m128d __b)337 _mm_ucomilt_sd(__m128d __a, __m128d __b)
338 {
339 return wasm_f64x2_extract_lane(__a, 0) < wasm_f64x2_extract_lane(__b, 0);
340 }
341
342 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_sd(__m128d __a,__m128d __b)343 _mm_ucomile_sd(__m128d __a, __m128d __b)
344 {
345 return wasm_f64x2_extract_lane(__a, 0) <= wasm_f64x2_extract_lane(__b, 0);
346 }
347
348 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_sd(__m128d __a,__m128d __b)349 _mm_ucomigt_sd(__m128d __a, __m128d __b)
350 {
351 return wasm_f64x2_extract_lane(__a, 0) > wasm_f64x2_extract_lane(__b, 0);
352 }
353
354 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_sd(__m128d __a,__m128d __b)355 _mm_ucomige_sd(__m128d __a, __m128d __b)
356 {
357 return wasm_f64x2_extract_lane(__a, 0) >= wasm_f64x2_extract_lane(__b, 0);
358 }
359
360 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_sd(__m128d __a,__m128d __b)361 _mm_ucomineq_sd(__m128d __a, __m128d __b)
362 {
363 return wasm_f64x2_extract_lane(__a, 0) != wasm_f64x2_extract_lane(__b, 0);
364 }
365
366 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_ps(__m128d __a)367 _mm_cvtpd_ps(__m128d __a)
368 {
369 return (__m128)wasm_f32x4_make((float)__a[0], (float)__a[1], 0, 0);
370 }
371
372 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pd(__m128 __a)373 _mm_cvtps_pd(__m128 __a)
374 {
375 return (__m128d) { (double)__a[0], (double)__a[1] };
376 }
377
378 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_pd(__m128i __a)379 _mm_cvtepi32_pd(__m128i __a)
380 {
381 return (__m128d) { (double)__a[0], (double)__a[1] };
382 }
383
384 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_epi32(__m128d __a)385 _mm_cvtpd_epi32(__m128d __a)
386 {
387 // TODO: OPTIMIZE!
388 int m[2];
389 for(int i = 0; i < 2; ++i)
390 {
391 int x = lrint(__a[i]);
392 if (x != 0 || fabs(__a[i]) < 2.0)
393 m[i] = (int)x;
394 else
395 m[i] = (int)0x80000000;
396 }
397 return (__m128i) { m[0], m[1], 0, 0 };
398 }
399
400 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si32(__m128d __a)401 _mm_cvtsd_si32(__m128d __a)
402 {
403 // TODO: OPTIMIZE!
404 int x = lrint(__a[0]);
405 if (x != 0 || fabs(__a[0]) < 2.0)
406 return (int)x;
407 else
408 return (int)0x80000000;
409 }
410
411 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_ss(__m128 __a,__m128d __b)412 _mm_cvtsd_ss(__m128 __a, __m128d __b)
413 {
414 __a[0] = __b[0];
415 return __a;
416 }
417
418 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_sd(__m128d __a,int __b)419 _mm_cvtsi32_sd(__m128d __a, int __b)
420 {
421 __a[0] = __b;
422 return __a;
423 }
424
425 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_sd(__m128d __a,__m128 __b)426 _mm_cvtss_sd(__m128d __a, __m128 __b)
427 {
428 __a[0] = __b[0];
429 return __a;
430 }
431
432 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_epi32(__m128d __a)433 _mm_cvttpd_epi32(__m128d __a)
434 {
435 // TODO: OPTIMIZE!
436 int m[2];
437 for(int i = 0; i < 2; ++i)
438 {
439 int x = lrint(__a[i]);
440 if (x != 0 || fabs(__a[i]) < 2.0)
441 m[i] = (int)__a[i];
442 else
443 m[i] = (int)0x80000000;
444 }
445 return (__m128i) { m[0], m[1], 0, 0 };
446 }
447
448 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si32(__m128d __a)449 _mm_cvttsd_si32(__m128d __a)
450 {
451 // TODO: OPTIMIZE!
452 int x = lrint(__a[0]);
453 if (x != 0 || fabs(__a[0]) < 2.0)
454 return (int)__a[0];
455 else
456 return (int)0x80000000;
457 }
458
459 static __inline__ double __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_f64(__m128d __a)460 _mm_cvtsd_f64(__m128d __a)
461 {
462 return wasm_f64x2_extract_lane((v128_t)__a, 0);
463 }
464
465 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_pd(double const * __dp)466 _mm_load_pd(double const *__dp)
467 {
468 return *(__m128d*)__dp;
469 }
470
471 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load1_pd(double const * __dp)472 _mm_load1_pd(double const *__dp)
473 {
474 return (__m128d)wasm_v64x2_load_splat(__dp);
475 }
476
477 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
478
479 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadr_pd(double const * __p)480 _mm_loadr_pd(double const *__p)
481 {
482 __m128d __u = *(__m128d*)__p; // aligned load
483 return (__m128d)wasm_v64x2_shuffle((v128_t)__u, (v128_t)__u, 1, 0);
484 }
485
486 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadu_pd(double const * __dp)487 _mm_loadu_pd(double const *__dp)
488 {
489 struct __loadu_pd {
490 __m128d __v;
491 } __attribute__((__packed__, __may_alias__));
492 return ((struct __loadu_pd*)__dp)->__v;
493 }
494
495 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_sd(double const * __p)496 _mm_load_sd(double const *__p)
497 {
498 return (__m128d)wasm_f64x2_make(*__p, 0.0);
499 }
500
501 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pd(__m128d __a,double const * __dp)502 _mm_loadh_pd(__m128d __a, double const *__dp)
503 {
504 struct __mm_loadh_pd_struct {
505 double __u;
506 } __attribute__((__packed__, __may_alias__));
507 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
508 return (__m128d){ __a[0], __u };
509 }
510
511 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pd(__m128d __a,double const * __dp)512 _mm_loadl_pd(__m128d __a, double const *__dp)
513 {
514 struct __mm_loadl_pd_struct {
515 double __u;
516 } __attribute__((__packed__, __may_alias__));
517 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
518 return (__m128d){ __u, __a[1] };
519 }
520
521 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_sd(double __w)522 _mm_set_sd(double __w)
523 {
524 return (__m128d)wasm_f64x2_make(__w, 0);
525 }
526
527 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set1_pd(double __w)528 _mm_set1_pd(double __w)
529 {
530 return (__m128d)wasm_f64x2_splat(__w);
531 }
532
533 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_pd(double __c1,double __c0)534 _mm_set_pd(double __c1, double __c0)
535 {
536 return (__m128d)wasm_f64x2_make(__c0, __c1);
537 }
538
539 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setr_pd(double __c0,double __c1)540 _mm_setr_pd(double __c0, double __c1)
541 {
542 return (__m128d)wasm_f64x2_make(__c0, __c1);
543 }
544
545 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setzero_pd(void)546 _mm_setzero_pd(void)
547 {
548 return (__m128d)wasm_f64x2_const(0.0, 0.0);
549 }
550
551 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_sd(double * __dp,__m128d __a)552 _mm_store_sd(double *__dp, __m128d __a)
553 {
554 struct __mm_store_sd_struct {
555 double __u;
556 } __attribute__((__packed__, __may_alias__));
557 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
558 }
559
560 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_pd(double * __dp,__m128d __a)561 _mm_store1_pd(double *__dp, __m128d __a)
562 {
563 struct __mm_store1_pd_struct {
564 double __u[2];
565 } __attribute__((__packed__, __may_alias__));
566 ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
567 ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
568 }
569
570 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_pd(double * __dp,__m128d __a)571 _mm_store_pd(double *__dp, __m128d __a)
572 {
573 *(__m128d *)__dp = __a;
574 }
575
576 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_pd(double * __dp,__m128d __a)577 _mm_storeu_pd(double *__dp, __m128d __a)
578 {
579 struct __unaligned {
580 __m128d __v;
581 } __attribute__((__packed__, __may_alias__));
582
583 ((struct __unaligned *)__dp)->__v = __a;
584 }
585
586 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_pd(double * __p,__m128d __a)587 _mm_storer_pd(double *__p, __m128d __a)
588 {
589 *(__m128d *)__p = (__m128d)wasm_v64x2_shuffle((v128_t)__a, (v128_t)__a, 1, 0);
590 }
591
592 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pd(double * __dp,__m128d __a)593 _mm_storeh_pd(double *__dp, __m128d __a)
594 {
595 struct __mm_storeh_pd_struct {
596 double __u;
597 } __attribute__((__packed__, __may_alias__));
598 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
599 }
600
601 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pd(double * __dp,__m128d __a)602 _mm_storel_pd(double *__dp, __m128d __a)
603 {
604 struct __mm_storeh_pd_struct {
605 double __u;
606 } __attribute__((__packed__, __may_alias__));
607 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
608 }
609
610 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi8(__m128i __a,__m128i __b)611 _mm_add_epi8(__m128i __a, __m128i __b)
612 {
613 return (__m128i)wasm_i8x16_add((v128_t)__a, (v128_t)__b);
614 }
615
616 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi16(__m128i __a,__m128i __b)617 _mm_add_epi16(__m128i __a, __m128i __b)
618 {
619 return (__m128i)wasm_i16x8_add((v128_t)__a, (v128_t)__b);
620 }
621
622 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi32(__m128i __a,__m128i __b)623 _mm_add_epi32(__m128i __a, __m128i __b)
624 {
625 return (__m128i)wasm_i32x4_add((v128_t)__a, (v128_t)__b);
626 }
627
628 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi64(__m128i __a,__m128i __b)629 _mm_add_epi64(__m128i __a, __m128i __b)
630 {
631 return (__m128i)wasm_i64x2_add((v128_t)__a, (v128_t)__b);
632 }
633
634 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi8(__m128i __a,__m128i __b)635 _mm_adds_epi8(__m128i __a, __m128i __b)
636 {
637 return (__m128i)wasm_i8x16_add_saturate((v128_t)__a, (v128_t)__b);
638 }
639
640 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi16(__m128i __a,__m128i __b)641 _mm_adds_epi16(__m128i __a, __m128i __b)
642 {
643 return (__m128i)wasm_i16x8_add_saturate((v128_t)__a, (v128_t)__b);
644 }
645
646 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu8(__m128i __a,__m128i __b)647 _mm_adds_epu8(__m128i __a, __m128i __b)
648 {
649 return (__m128i)wasm_u8x16_add_saturate((v128_t)__a, (v128_t)__b);
650 }
651
652 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu16(__m128i __a,__m128i __b)653 _mm_adds_epu16(__m128i __a, __m128i __b)
654 {
655 return (__m128i)wasm_u16x8_add_saturate((v128_t)__a, (v128_t)__b);
656 }
657
658 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu8(__m128i __a,__m128i __b)659 _mm_avg_epu8(__m128i __a, __m128i __b)
660 {
661 return (__m128i)wasm_u8x16_avgr((v128_t)__a, (v128_t)__b);
662 }
663
664 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu16(__m128i __a,__m128i __b)665 _mm_avg_epu16(__m128i __a, __m128i __b)
666 {
667 return (__m128i)wasm_u16x8_avgr((v128_t)__a, (v128_t)__b);
668 }
669
670 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_madd_epi16(__m128i __a,__m128i __b)671 _mm_madd_epi16(__m128i __a, __m128i __b)
672 {
673 // TODO: optimize
674 union {
675 signed short x[8];
676 __m128i m;
677 } src, src2;
678 union {
679 signed int x[4];
680 __m128i m;
681 } dst;
682 src.m = __a;
683 src2.m = __b;
684 for(int i = 0; i < 4; ++i)
685 dst.x[i] = src.x[i*2] * src2.x[i*2] + src.x[i*2+1] * src2.x[i*2+1];
686 return dst.m;
687 }
688
689 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi16(__m128i __a,__m128i __b)690 _mm_max_epi16(__m128i __a, __m128i __b)
691 {
692 return (__m128i)wasm_i16x8_max((v128_t)__a, (v128_t)__b);
693 }
694
695 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu8(__m128i __a,__m128i __b)696 _mm_max_epu8(__m128i __a, __m128i __b)
697 {
698 return (__m128i)wasm_u8x16_max((v128_t)__a, (v128_t)__b);
699 }
700
701 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi16(__m128i __a,__m128i __b)702 _mm_min_epi16(__m128i __a, __m128i __b)
703 {
704 return (__m128i)wasm_i16x8_min((v128_t)__a, (v128_t)__b);
705 }
706
707 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu8(__m128i __a,__m128i __b)708 _mm_min_epu8(__m128i __a, __m128i __b)
709 {
710 return (__m128i)wasm_u8x16_min((v128_t)__a, (v128_t)__b);
711 }
712
713 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epi16(__m128i __a,__m128i __b)714 _mm_mulhi_epi16(__m128i __a, __m128i __b)
715 {
716 // TODO: optimize
717 union {
718 signed short x[8];
719 __m128i m;
720 } src, src2, dst;
721 src.m = __a;
722 src2.m = __b;
723 for(int i = 0; i < 8; ++i)
724 dst.x[i] = (signed short)(((int)src.x[i] * (int)src2.x[i]) >> 16);
725 return dst.m;
726 }
727
728 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epu16(__m128i __a,__m128i __b)729 _mm_mulhi_epu16(__m128i __a, __m128i __b)
730 {
731 // TODO: optimize
732 union {
733 unsigned short x[8];
734 __m128i m;
735 } src, src2, dst;
736 src.m = __a;
737 src2.m = __b;
738 for(int i = 0; i < 8; ++i)
739 dst.x[i] = (unsigned short)(((int)src.x[i] * (int)src2.x[i]) >> 16);
740 return dst.m;
741 }
742
743 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi16(__m128i __a,__m128i __b)744 _mm_mullo_epi16(__m128i __a, __m128i __b)
745 {
746 return (__m128i)wasm_i16x8_mul((v128_t)__a, (v128_t)__b);
747 }
748
749 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epu32(__m128i __a,__m128i __b)750 _mm_mul_epu32(__m128i __a, __m128i __b)
751 {
752 // TODO: optimize
753 unsigned long long a0 = (unsigned long long)(unsigned int)__a[0];
754 unsigned long long a2 = (unsigned long long)(unsigned int)__a[2];
755 unsigned long long b0 = (unsigned long long)(unsigned int)__b[0];
756 unsigned long long b2 = (unsigned long long)(unsigned int)__b[2];
757 union {
758 unsigned long long x[2];
759 __m128i m;
760 } u;
761 u.x[0] = a0*b0;
762 u.x[1] = a2*b2;
763 return u.m;
764 }
765
766 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sad_epu8(__m128i __a,__m128i __b)767 _mm_sad_epu8(__m128i __a, __m128i __b)
768 {
769 // TODO: optimize
770 union {
771 unsigned char x[16];
772 __m128i m;
773 } src, src2;
774 src.m = __a;
775 src2.m = __b;
776 union {
777 unsigned short x[8];
778 __m128i m;
779 } dst;
780 #define __ABS(__a) ((__a) < 0 ? -(__a) : (__a))
781 for(int i = 0; i < 8; ++i)
782 dst.x[i] = 0;
783 for(int i = 0; i < 8; ++i)
784 {
785 dst.x[0] += __ABS(src.x[i] - src2.x[i]);
786 dst.x[4] += __ABS(src.x[8+i] - src2.x[8+i]);
787 }
788 return dst.m;
789 #undef __ABS
790 }
791
792 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi8(__m128i __a,__m128i __b)793 _mm_sub_epi8(__m128i __a, __m128i __b)
794 {
795 return (__m128i)wasm_i8x16_sub((v128_t)__a, (v128_t)__b);
796 }
797
798 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi16(__m128i __a,__m128i __b)799 _mm_sub_epi16(__m128i __a, __m128i __b)
800 {
801 return (__m128i)wasm_i16x8_sub((v128_t)__a, (v128_t)__b);
802 }
803
804 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi32(__m128i __a,__m128i __b)805 _mm_sub_epi32(__m128i __a, __m128i __b)
806 {
807 return (__m128i)wasm_i32x4_sub((v128_t)__a, (v128_t)__b);
808 }
809
810 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi64(__m128i __a,__m128i __b)811 _mm_sub_epi64(__m128i __a, __m128i __b)
812 {
813 return (__m128i)wasm_i64x2_sub((v128_t)__a, (v128_t)__b);
814 }
815
816 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi8(__m128i __a,__m128i __b)817 _mm_subs_epi8(__m128i __a, __m128i __b)
818 {
819 return (__m128i)wasm_i8x16_sub_saturate((v128_t)__a, (v128_t)__b);
820 }
821
822 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi16(__m128i __a,__m128i __b)823 _mm_subs_epi16(__m128i __a, __m128i __b)
824 {
825 return (__m128i)wasm_i16x8_sub_saturate((v128_t)__a, (v128_t)__b);
826 }
827
828 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu8(__m128i __a,__m128i __b)829 _mm_subs_epu8(__m128i __a, __m128i __b)
830 {
831 return (__m128i)wasm_u8x16_sub_saturate((v128_t)__a, (v128_t)__b);
832 }
833
834 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu16(__m128i __a,__m128i __b)835 _mm_subs_epu16(__m128i __a, __m128i __b)
836 {
837 return (__m128i)wasm_u16x8_sub_saturate((v128_t)__a, (v128_t)__b);
838 }
839
840 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_and_si128(__m128i __a,__m128i __b)841 _mm_and_si128(__m128i __a, __m128i __b)
842 {
843 return (__m128i)wasm_v128_and((v128_t)__a, (v128_t)__b);
844 }
845
846 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si128(__m128i __a,__m128i __b)847 _mm_andnot_si128(__m128i __a, __m128i __b)
848 {
849 return (__m128i)wasm_v128_andnot((v128_t)__b, (v128_t)__a);
850 }
851
852 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_or_si128(__m128i __a,__m128i __b)853 _mm_or_si128(__m128i __a, __m128i __b)
854 {
855 return (__m128i)wasm_v128_or((v128_t)__b, (v128_t)__a);
856 }
857
858 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_xor_si128(__m128i __a,__m128i __b)859 _mm_xor_si128(__m128i __a, __m128i __b)
860 {
861 return (__m128i)wasm_v128_xor((v128_t)__b, (v128_t)__a);
862 }
863
// PSLLDQ: shift the whole 128-bit value left by __imm BYTES, shifting in
// zeros. Implemented as a constant byte shuffle against a zero vector:
// result byte i picks byte (16+i)-__imm of __a; when any high bit of __imm
// is set (count >= 16) every lane selects byte 0 of the zero vector.
// __imm must be a compile-time constant (shuffle lane indices are literals).
#define _mm_slli_si128(__a, __imm) __extension__ ({        \
  (__m128i)wasm_v8x16_shuffle(_mm_setzero_si128(),         \
                              (__a),                       \
                              ((__imm)&0xF0) ? 0 : 16 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 17 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 18 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 19 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 20 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 21 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 22 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 23 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 24 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 25 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 26 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 27 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 28 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 29 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 30 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 31 - ((__imm)&0xF)); })
#define _mm_bslli_si128(__a, __imm) \
  _mm_slli_si128((__a), (__imm))
885
886 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi16(__m128i __a,int __count)887 _mm_slli_epi16(__m128i __a, int __count)
888 {
889 return (__m128i)((__count < 16) ? wasm_i16x8_shl((v128_t)__a, __count) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
890 }
891
/* Shifts each 16-bit lane of __a left by the count held in the low 64 bits
   of __count (SSE2 PSLLW with a vector count). Reading the count as an
   unsigned 64-bit value makes all out-of-range counts (>= 16) fall into the
   zero branch, as on x86. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi16(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 16) ? wasm_i16x8_shl((v128_t)__a, __c) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
}
898
899 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi32(__m128i __a,int __count)900 _mm_slli_epi32(__m128i __a, int __count)
901 {
902 return (__m128i)((__count < 32) ? wasm_i32x4_shl((v128_t)__a, __count) : wasm_i32x4_const(0,0,0,0));
903 }
904
/* Shifts each 32-bit lane of __a left by the count in the low 64 bits of
   __count (SSE2 PSLLD with a vector count); counts >= 32 yield zero. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi32(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 32) ? wasm_i32x4_shl((v128_t)__a, __c) : wasm_i32x4_const(0,0,0,0));
}
911
912 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi64(__m128i __a,int __count)913 _mm_slli_epi64(__m128i __a, int __count)
914 {
915 return (__m128i)((__count < 64) ? wasm_i64x2_shl((v128_t)__a, __count) : wasm_i64x2_const(0,0));
916 }
917
/* Shifts each 64-bit lane of __a left by the count in the low 64 bits of
   __count (SSE2 PSLLQ with a vector count); counts >= 64 yield zero. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi64(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 64) ? wasm_i64x2_shl((v128_t)__a, __c) : wasm_i64x2_const(0,0));
}
924
925 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi16(__m128i __a,int __count)926 _mm_srai_epi16(__m128i __a, int __count)
927 {
928 __count = __count < 15 ? __count : 15;
929 return (__m128i)wasm_i16x8_shr((v128_t)__a, __count);
930 }
931
/* Arithmetic right shift of each 16-bit lane by the count in the low 64
   bits of __count (SSE2 PSRAW with a vector count). Counts >= 16 saturate
   to 15, i.e. every result bit becomes the sign bit, matching x86. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi16(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  __c = __c < 15 ? __c : 15;
  return (__m128i)wasm_i16x8_shr((v128_t)__a, __c);
}
939
940 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi32(__m128i __a,int __count)941 _mm_srai_epi32(__m128i __a, int __count)
942 {
943 __count = __count < 31 ? __count : 31;
944 return (__m128i)wasm_i32x4_shr((v128_t)__a, __count);
945 }
946
/* Arithmetic right shift of each 32-bit lane by the count in the low 64
   bits of __count (SSE2 PSRAD with a vector count); counts >= 32 saturate
   to 31 (all sign bits), matching x86. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi32(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  __c = __c < 31 ? __c : 31;
  return (__m128i)wasm_i32x4_shr((v128_t)__a, __c);
}
954
/* _mm_srli_si128: shifts the whole 128-bit register right by __imm bytes,
   shifting in zeros (SSE2 PSRLDQ). Byte shuffle against a zero vector:
   lanes 0-15 pick bytes of __a, lane 16+ picks zeros. Any __imm >= 16
   (high nibble set) selects only zeros, matching x86. Must be a macro so
   the lane indices are compile-time constants. */
#define _mm_srli_si128(__a, __imm) __extension__ ({ \
  (__m128i)wasm_v8x16_shuffle((__a), \
                              _mm_setzero_si128(), \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 0, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 1, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 2, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 3, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 4, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 5, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 6, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 7, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 8, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 9, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 10, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 11, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 12, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 13, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 14, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 15); })

/* _mm_bsrli_si128 is the official alias for the byte-wise right shift. */
#define _mm_bsrli_si128(__a, __imm) \
  _mm_srli_si128((__a), (__imm))
977
/* Logical (zero-filling) right shifts (SSE2 PSRLW/PSRLD/PSRLQ).
   The _mm_srli_* forms take an immediate int count; the unsigned cast makes
   every count outside the lane range (including negative) yield all zeros,
   matching x86 — a plain Wasm shift would take the count modulo the lane
   width. The _mm_srl_* forms take the count from the low 64 bits of a
   vector operand; reading it as unsigned 64-bit handles the full range. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi16(__m128i __a, int __count)
{
  return (__m128i)(((unsigned int)__count < 16) ? wasm_u16x8_shr((v128_t)__a, __count) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi16(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 16) ? wasm_u16x8_shr((v128_t)__a, __c) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi32(__m128i __a, int __count)
{
  return (__m128i)(((unsigned int)__count < 32) ? wasm_u32x4_shr((v128_t)__a, __count) : wasm_i32x4_const(0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi32(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 32) ? wasm_u32x4_shr((v128_t)__a, __c) : wasm_i32x4_const(0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi64(__m128i __a, int __count)
{
  return (__m128i)(((unsigned int)__count < 64) ? wasm_u64x2_shr((v128_t)__a, __count) : wasm_i64x2_const(0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi64(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 64) ? wasm_u64x2_shr((v128_t)__a, __c) : wasm_i64x2_const(0,0));
}
1016
/* Lane-wise integer comparisons (SSE2 PCMPEQB/W/D and PCMPGTB/W/D).
   Each result lane is all-ones where the predicate holds and all-zeros
   otherwise. x86 has no PCMPLT instructions — compilers emit PCMPGT with
   swapped operands — but Wasm provides less-than directly, so the
   _mm_cmplt_* wrappers map straight onto the lt operations. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_eq((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_eq((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_eq((v128_t)__a, (v128_t)__b);
}

/* Signed greater-than comparisons. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_gt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_gt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_gt((v128_t)__a, (v128_t)__b);
}

/* Signed less-than comparisons. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_lt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_lt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_lt((v128_t)__a, (v128_t)__b);
}
1070
1071 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_sd(__m128d __a,long long __b)1072 _mm_cvtsi64_sd(__m128d __a, long long __b)
1073 {
1074 // TODO: optimize
1075 union {
1076 double x[2];
1077 __m128d m;
1078 } m;
1079 m.m = __a;
1080 m.x[0] = (double)__b;
1081 return m.m;
1082 }
1083
1084 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si64(__m128d __a)1085 _mm_cvtsd_si64(__m128d __a)
1086 {
1087 // TODO: optimize
1088 if (isnan(__a[0]) || isinf(__a[0])) return 0x8000000000000000LL;
1089 long long x = llrint(__a[0]);
1090 if (x != 0xFFFFFFFF00000000ULL && (x != 0 || fabsf(__a[0]) < 2.f))
1091 return x;
1092 else
1093 return 0x8000000000000000LL;
1094 }
1095
1096 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si64(__m128d __a)1097 _mm_cvttsd_si64(__m128d __a)
1098 {
1099 // TODO: optimize
1100 if (isnan(__a[0]) || isinf(__a[0])) return 0x8000000000000000LL;
1101 long long x = llrint(__a[0]);
1102 if (x != 0xFFFFFFFF00000000ULL && (x != 0 || fabsf(__a[0]) < 2.f))
1103 return (long long)__a[0];
1104 else
1105 return 0x8000000000000000LL;
1106 }
1107
/* Converts four signed 32-bit integers to four floats (SSE2 CVTDQ2PS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_ps(__m128i __a)
{
  return (__m128)wasm_f32x4_convert_i32x4(__a);
}
1113
/* Converts four floats to four signed 32-bit integers using the current
   rounding mode (SSE2 CVTPS2DQ). Out-of-range lanes (and NaN) produce the
   x86 "integer indefinite" value 0x80000000. The "x != 0 || fabs < 2.0"
   test distinguishes a genuine zero result from lrint's behavior on
   out-of-range input; keep the statement order as-is.
   NOTE(review): lrint returns long, stored into int — lossless on wasm32
   where long is 32-bit; confirm if this header is ever built elsewhere. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_epi32(__m128 __a)
{
  // TODO: optimize
  union {
    int x[4];
    __m128i m;
  } u;
  for(int i = 0; i < 4; ++i)
  {
    int x = lrint(__a[i]);
    if (x != 0 || fabs(__a[i]) < 2.0)
      u.x[i] = x;
    else
      u.x[i] = (int)0x80000000;
  }
  return u.m;
}

/* Converts four floats to four signed 32-bit integers with truncation
   (SSE2 CVTTPS2DQ). Same out-of-range handling as _mm_cvtps_epi32; lrint
   is only the range probe, the stored value is the truncating cast. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_epi32(__m128 __a)
{
  // TODO: optimize
  union {
    int x[4];
    __m128i m;
  } u;
  for(int i = 0; i < 4; ++i)
  {
    int x = lrint(__a[i]);
    if (x != 0 || fabs(__a[i]) < 2.0)
      u.x[i] = (int)__a[i];
    else
      u.x[i] = (int)0x80000000;
  }
  return u.m;
}
1151
/* Scalar <-> vector moves (SSE2 MOVD/MOVQ). The _mm_cvtsiN_si128 forms
   place the scalar in lane 0 and zero the remaining lanes; the
   _mm_cvtsi128_siN forms extract lane 0. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_si128(int __a)
{
  return (__m128i)wasm_i32x4_make(__a, 0, 0, 0);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_si128(long long __a)
{
  return (__m128i)wasm_i64x2_make(__a, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si32(__m128i __a)
{
  return wasm_i32x4_extract_lane(__a, 0);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si64(__m128i __a)
{
  return wasm_i64x2_extract_lane(__a, 0);
}
1175
/* Loads 128 bits from __p (SSE2 MOVDQA). The plain dereference requires
   __p to have the vector type's natural alignment — presumably 16 bytes;
   use _mm_loadu_si128 for unaligned data. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_load_si128(__m128i const *__p)
{
  return *__p;
}

/* Loads 128 bits from a possibly unaligned address (SSE2 MOVDQU). The
   packed, may_alias wrapper struct tells the compiler the access may be
   misaligned and may alias other types, making the unaligned read
   well-defined. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si128(__m128i const *__p)
{
  struct __loadu_si128 {
    __m128i __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_si128*)__p)->__v;
}
1190
1191 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si32(void const * __p)1192 _mm_loadu_si32(void const *__p)
1193 {
1194 return (__m128i)wasm_i32x4_make(*(unsigned int*)__p, 0, 0, 0);
1195 }
1196
/* Loads 64 bits from a possibly unaligned address into the low half of the
   result, zeroing the upper half (SSE2 MOVQ). The packed, may_alias struct
   makes the unaligned read well-defined; the two 32-bit halves become the
   first two lanes of the initializer. NOTE(review): the 4-element
   initializer implies __m128i is a 4 x 32-bit vector here — confirm against
   the typedef earlier in the file. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadl_epi64(__m128i const *__p)
{
  struct __mm_loadl_epi64_struct {
    int __u[2];
  } __attribute__((__packed__, __may_alias__));
  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u[0], ((struct __mm_loadl_epi64_struct*)__p)->__u[1], 0, 0};
}
1205
/* _mm_set_* constructors take their arguments highest-lane first (matching
   the Intel intrinsics), so the argument order is reversed relative to the
   lane order the wasm_*_make builders expect. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64(long long q1, long long q0)
{
  return (__m128i)wasm_i64x2_make(q0, q1);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi32(int i3, int i2, int i1, int i0)
{
  return (__m128i)wasm_i32x4_make(i0, i1, i2, i3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
{
  return (__m128i)wasm_i16x8_make(w0, w1, w2, w3, w4, w5, w6, w7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
{
  return (__m128i)wasm_i8x16_make(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
}
1229
/* _mm_set1_* broadcast one scalar into every lane of the result. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64x(long long __q)
{
  return (__m128i)wasm_i64x2_splat(__q);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi32(int __i)
{
  return (__m128i)wasm_i32x4_splat(__i);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi16(short __w)
{
  return (__m128i)wasm_i16x8_splat(__w);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi8(char __b)
{
  return (__m128i)wasm_i8x16_splat(__b);
}
1253
/* _mm_setr_* ("reversed set") constructors take their arguments
   lowest-lane first, which is already the order wasm_*_make expects, so
   the arguments pass straight through. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi32(int i0, int i1, int i2, int i3)
{
  return (__m128i)wasm_i32x4_make(i0, i1, i2, i3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
{
  return (__m128i)wasm_i16x8_make(w0, w1, w2, w3, w4, w5, w6, w7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
{
  return (__m128i)wasm_i8x16_make(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
}
1271
/* Returns a vector with all bits zero (the PXOR xmm,xmm idiom on x86). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setzero_si128(void)
{
  return wasm_i64x2_const(0, 0);
}

/* Stores __b to __p (SSE2 MOVDQA). The plain assignment requires __p to
   have the vector type's natural alignment — presumably 16 bytes; use
   _mm_storeu_si128 for unaligned destinations. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_si128(__m128i *__p, __m128i __b)
{
  *__p = __b;
}
1283
1284 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si32(void * __p,__m128i __a)1285 _mm_storeu_si32(void *__p, __m128i __a)
1286 {
1287 *(unsigned int *)__p = wasm_i32x4_extract_lane((v128_t)__a, 0);
1288 }
1289
/* Stores 128 bits to a possibly unaligned address (SSE2 MOVDQU). The
   packed, may_alias wrapper struct makes the unaligned store well-defined. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si128(__m128i *__p, __m128i __b)
{
  struct __unaligned {
    __m128i __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __unaligned *)__p)->__v = __b;
}
1298
1299 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmoveu_si128(__m128i __d,__m128i __n,char * __p)1300 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
1301 {
1302 // TODO: optimize
1303 union {
1304 unsigned char x[16];
1305 __m128i m;
1306 } mask, data;
1307 mask.m = __n;
1308 data.m = __d;
1309 for(int i = 0; i < 16; ++i)
1310 if (mask.x[i] & 0x80)
1311 __p[i] = data.x[i];
1312 }
1313
1314 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_epi64(__m128i * __p,__m128i __a)1315 _mm_storel_epi64(__m128i *__p, __m128i __a)
1316 {
1317 *(long long *)__p = wasm_i64x2_extract_lane((v128_t)__a, 0);
1318 }
1319
/* Non-temporal ("streaming") stores. Wasm has no cache-bypass hint, so
   each of these degrades to the corresponding ordinary store. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pd(double *__p, __m128d __a)
{
  // Emscripten/SIMD.js does not have cache hinting.
  _mm_store_pd(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si128(__m128i *__p, __m128i __a)
{
  // Emscripten/SIMD.js does not have cache hinting.
  _mm_store_si128(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  // No cache hinting available.
  *__p = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si64(long long *__p, long long __a)
{
  // No cache hinting available.
  *__p = __a;
}
1347
/* Cache-line flush (CLFLUSH). Wasm exposes no cache control, so this is a
   no-op; __p is intentionally unused. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  // Wasm SIMD does not have cache hinting
}

/* Load fence (LFENCE). Implemented with a full barrier — Wasm/SharedArrayBuffer
   has only a full barrier instruction, which gives a stronger guarantee. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __sync_synchronize(); // Wasm/SharedArrayBuffer has only a full barrier instruction, which gives a stronger guarantee.
}

/* Full memory fence (MFENCE). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __sync_synchronize(); // Wasm/SharedArrayBuffer has only a full barrier instruction, which gives a stronger guarantee.
}
1365
1366 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi16(__m128i __a,__m128i __b)1367 _mm_packs_epi16(__m128i __a, __m128i __b)
1368 {
1369 // TODO: optimize
1370 union {
1371 signed short x[8];
1372 __m128i m;
1373 } src, src2;
1374 union {
1375 signed char x[16];
1376 __m128i m;
1377 } dst;
1378 src.m = __a;
1379 src2.m = __b;
1380 for(int i = 0; i < 8; ++i)
1381 {
1382 dst.x[i] = __SATURATE(src.x[i], -128, 127);
1383 dst.x[8+i] = __SATURATE(src2.x[i], -128, 127);
1384 }
1385 return dst.m;
1386 }
1387
1388 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi32(__m128i __a,__m128i __b)1389 _mm_packs_epi32(__m128i __a, __m128i __b)
1390 {
1391 // TODO: optimize
1392 union {
1393 signed int x[4];
1394 __m128i m;
1395 } src, src2;
1396 union {
1397 signed short x[8];
1398 __m128i m;
1399 } dst;
1400 src.m = __a;
1401 src2.m = __b;
1402 for(int i = 0; i < 4; ++i)
1403 {
1404 dst.x[i] = __SATURATE(src.x[i], -32768, 32767);
1405 dst.x[4+i] = __SATURATE(src2.x[i], -32768, 32767);
1406 }
1407 return dst.m;
1408 }
1409
1410 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi16(__m128i __a,__m128i __b)1411 _mm_packus_epi16(__m128i __a, __m128i __b)
1412 {
1413 // TODO: optimize
1414 union {
1415 signed short x[8];
1416 __m128i m;
1417 } src, src2;
1418 union {
1419 unsigned char x[16];
1420 __m128i m;
1421 } dst;
1422 src.m = __a;
1423 src2.m = __b;
1424 for(int i = 0; i < 8; ++i)
1425 {
1426 dst.x[i] = __SATURATE(src.x[i], 0, 255);
1427 dst.x[8+i] = __SATURATE(src2.x[i], 0, 255);
1428 }
1429 return dst.m;
1430 }
1431
/* _mm_extract_epi16: returns 16-bit lane (__imm & 7) of __a zero-extended
   to int (SSE2 PEXTRW). _mm_insert_epi16: returns __a with lane (__imm & 7)
   replaced by __b (SSE2 PINSRW). Macros so __imm remains a compile-time
   immediate for the lane index. */
#define _mm_extract_epi16(__a, __imm) wasm_u16x8_extract_lane((v128_t)(__a), (__imm) & 7)
#define _mm_insert_epi16(__a, __b, __imm) wasm_i16x8_replace_lane((v128_t)(__a), (__imm) & 7, (__b))
1434
1435 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_epi8(__m128i __a)1436 _mm_movemask_epi8(__m128i __a)
1437 {
1438 // TODO: optimize
1439 union {
1440 unsigned char x[16];
1441 __m128i m;
1442 } src;
1443 src.m = __a;
1444 unsigned int x = 0;
1445 for(int i = 0; i < 16; ++i)
1446 x |= ((unsigned int)src.x[i] >> 7) << i;
1447 return (int)x;
1448 }
1449
/* _mm_shuffle_epi32: permutes the four 32-bit lanes of __a by the 2-bit
   fields of __imm (SSE2 PSHUFD). The second shuffle operand is a dummy —
   all lane indices stay below 4, so only __a is ever selected. Macros so
   __imm is a compile-time immediate. */
#define _mm_shuffle_epi32(__a, __imm) __extension__ ({ \
  (__m128i)wasm_v32x4_shuffle((__a), \
                              _mm_set1_epi32(0), \
                              ((__imm) & 0x3), (((__imm) & 0xc) >> 2), \
                              (((__imm) & 0x30) >> 4), (((__imm) & 0xc0) >> 6)); })

/* _mm_shufflelo_epi16: permutes the low four 16-bit lanes by __imm and
   passes the high four lanes (indices 4-7) through (SSE2 PSHUFLW). */
#define _mm_shufflelo_epi16(__a, __imm) __extension__ ({ \
  (__m128i)wasm_v16x8_shuffle((__a), \
                              _mm_set1_epi16(0), \
                              ((__imm) & 0x3), (((__imm) & 0xc) >> 2), \
                              (((__imm) & 0x30) >> 4), (((__imm) & 0xc0) >> 6), \
                              4, 5, 6, 7); })

/* _mm_shufflehi_epi16: passes the low four 16-bit lanes through and
   permutes the high four by __imm, offset by 4 (SSE2 PSHUFHW). */
#define _mm_shufflehi_epi16(__a, __imm) __extension__ ({ \
  (__m128i)wasm_v16x8_shuffle((__a), \
                              _mm_set1_epi16(0), \
                              0, 1, 2, 3, \
                              (4 + (((__imm) & 0x03) >> 0)), \
                              (4 + (((__imm) & 0x0c) >> 2)), \
                              (4 + (((__imm) & 0x30) >> 4)), \
                              (4 + (((__imm) & 0xc0) >> 6))); })
1471
/* Interleave ("unpack") operations (SSE2 PUNPCKH*/PUNPCKL* and
   UNPCKHPD/UNPCKLPD). The *hi forms interleave the upper halves of __a and
   __b, the *lo forms the lower halves; shuffle indices >= lane count select
   from __b. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v8x16_shuffle(__a, __b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v16x8_shuffle(__a, __b, 4, 12, 5, 13, 6, 14, 7, 15);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v32x4_shuffle(__a, __b, 2, 6, 3, 7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v64x2_shuffle(__a, __b, 1, 3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v8x16_shuffle(__a, __b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v16x8_shuffle(__a, __b, 0, 8, 1, 9, 2, 10, 3, 11);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v32x4_shuffle(__a, __b, 0, 4, 1, 5);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v64x2_shuffle(__a, __b, 0, 2);
}

/* Keeps the low 64-bit lane of __a and zeroes the high lane (SSE2 MOVQ
   register form), by shuffling with a zero vector. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_move_epi64(__m128i __a)
{
  return wasm_v64x2_shuffle(__a, wasm_i64x2_const(0, 0), 0, 2);
}

/* Interleaves the high doubles of __a and __b (SSE2 UNPCKHPD). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v64x2_shuffle((v128_t)__a, (v128_t)__b, 1, 3);
}

/* Interleaves the low doubles of __a and __b (SSE2 UNPCKLPD). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v64x2_shuffle((v128_t)__a, (v128_t)__b, 0, 2);
}
1537
1538 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pd(__m128d __a)1539 _mm_movemask_pd(__m128d __a)
1540 {
1541 union {
1542 unsigned long long x[2];
1543 __m128d m;
1544 } __attribute__((__packed__, __may_alias__)) src;
1545 src.m = __a;
1546 return (src.x[0] >> 63) | ((src.x[1] >> 63) << 1);
1547 }
1548
/* Selects the low result lane from __a (by bit 0 of __i) and the high
   result lane from __b (by bit 1 of __i) (SSE2 SHUFPD). Macro so __i is a
   compile-time immediate for the shufflevector indices. */
#define _mm_shuffle_pd(__a, __b, __i) __extension__ ({ \
  (__m128d) __builtin_shufflevector((__u64x2)(__a), (__u64x2)(__b), \
                                    (__i) & 1, \
                                    (((__i) & 2) >> 1) + 2); })
1553
/* Bit-pattern reinterpretation casts between the three 128-bit vector
   types. These compile to nothing — every Wasm v128 value has the same
   representation — and never convert lane values. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castpd_ps(__m128d __a)
{
  return (__m128)__a;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castpd_si128(__m128d __a)
{
  return (__m128i)__a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castps_pd(__m128 __a)
{
  return (__m128d)__a;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castps_si128(__m128 __a)
{
  return (__m128i)__a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_ps(__m128i __a)
{
  return (__m128)__a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_pd(__m128i __a)
{
  return (__m128d)__a;
}
1589
/* Spin-wait hint (SSE2 PAUSE). Wasm has no equivalent instruction, so this
   is a no-op; callers lose only the power/SMT hint, not correctness. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  // No pause/wait instruction in Wasm/SIMD.
}
1595
/* Returns a vector with unspecified contents, mirroring _mm_undefined_pd's
   contract that the caller must not rely on the value.
   NOTE(review): returning an uninitialized local is formally undefined
   behavior and may trigger -Wuninitialized — consider clang's
   __builtin_ia32_undef128-style approach or a zero vector; confirm intent. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_undefined_pd()
{
  __m128d val;
  return val;
}

/* Integer-vector variant of the above; same caveat applies. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_undefined_si128()
{
  __m128i val;
  return val;
}
1609
/* Builds the 2-bit immediate for _mm_shuffle_pd: x selects the high result
   lane, y the low result lane. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
1611
1612 #endif /* __emscripten_emmintrin_h__ */
1613