/*
 * Copyright 2020 The Emscripten Authors.  All rights reserved.
 * Emscripten is available under two separate licenses, the MIT license and the
 * University of Illinois/NCSA Open Source License.  Both these licenses can be
 * found in the LICENSE file.
 */
#ifndef __emscripten_emmintrin_h__
#define __emscripten_emmintrin_h__

#ifndef __SSE2__
#error "SSE2 instruction set not enabled"
#endif

#include <xmmintrin.h>
#include <emscripten/emscripten.h>

#define __SATURATE(x, Min, Max) ((x) >= Min ? ((x) <= Max ? (x) : Max) : Min)
#define __MIN(x, y) ((x) <= (y) ? (x) : (y))
#define __MAX(x, y) ((x) >= (y) ? (x) : (y))

// Alias different (functionally) equivalent intrinsics.
#define _mm_set_epi64x _mm_set_epi64
#define _mm_cvtsd_si64x _mm_cvtsd_si64
#define _mm_cvtsi128_si64x _mm_cvtsi128_si64
#define _mm_cvtsi64x_sd _mm_cvtsi64_sd
#define _mm_cvtsi64x_si128 _mm_cvtsi64_si128
#define _mm_cvttsd_si64x _mm_cvttsd_si64
#define _mm_store_pd1 _mm_store1_pd

typedef __f64x2 __m128d;

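// Note: __m128d is a plain f64x2 vector type, so the double-precision
// intrinsics below typically compile to a single Wasm SIMD instruction when a
// direct equivalent exists; a rough usage sketch:
//
//   __m128d a = _mm_set_pd(2.0, 1.0); // f64x2 vector { 1.0, 2.0 }
//   __m128d b = _mm_add_pd(a, a);     // lowers to one f64x2.add
//
// Operations with no direct Wasm SIMD equivalent are emulated below with
// scalar loops and are marked with TODO comments.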
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_move_sd(__m128d __a, __m128d __b)
{
  return (__m128d){ __b[0], __a[1] };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_add((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_add_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_sub((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_sub_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_mul((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_mul_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_div((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_div_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_pd(__m128d __a)
{
  return (__m128d)wasm_f64x2_sqrt((v128_t)__a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_sqrt_pd(__b));
}

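// The min/max emulations below use compare + bitselect rather than
// wasm_f64x2_min/max: the x86 MINPD/MAXPD rule is that the second operand is
// returned whenever the operands compare unordered (NaN) or equal (e.g.
// -0.0 vs +0.0), whereas the Wasm instructions propagate NaN.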
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_pd(__m128d __a, __m128d __b)
{
//  return (__m128d)wasm_f64x2_pmin((v128_t)__a, (v128_t)__b); // TODO: Migrate to this, once it works in VMs
  return (__m128d)wasm_v128_bitselect((v128_t)__a, (v128_t)__b, (v128_t)wasm_f64x2_lt((v128_t)__a, (v128_t)__b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_min_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_pd(__m128d __a, __m128d __b)
{
//  return (__m128d)wasm_f64x2_pmax((v128_t)__a, (v128_t)__b); // TODO: Migrate to this, once it works in VMs
  return (__m128d)wasm_v128_bitselect((v128_t)__a, (v128_t)__b, (v128_t)wasm_f64x2_gt((v128_t)__a, (v128_t)__b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_max_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_and_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_and((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_andnot_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_andnot((v128_t)__b, (v128_t)__a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_or_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_or((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_xor_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_xor((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_eq((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_lt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_le((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_gt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_ge((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_and(wasm_f64x2_eq((v128_t)__a, (v128_t)__a),
                                wasm_f64x2_eq((v128_t)__b, (v128_t)__b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_or(wasm_f64x2_ne((v128_t)__a, (v128_t)__a),
                               wasm_f64x2_ne((v128_t)__b, (v128_t)__b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_ne((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_not((v128_t)_mm_cmplt_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_not((v128_t)_mm_cmple_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_not((v128_t)_mm_cmpgt_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_not((v128_t)_mm_cmpge_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpeq_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmplt_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmple_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpgt_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpge_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpord_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpunord_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpneq_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpnlt_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpnle_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpngt_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpnge_pd(__a, __b));
}

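// Wasm has no floating-point exceptions, so the "ordered" _mm_comi*_sd and
// "unordered" _mm_ucomi*_sd variants below are implemented identically: plain
// scalar comparisons on lane 0 with standard C NaN semantics (any comparison
// against NaN other than != yields false).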
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) == wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) < wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) <= wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) > wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) >= wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) != wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) == wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) < wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) <= wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) > wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) >= wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) != wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_ps(__m128d __a)
{
  return (__m128)wasm_f32x4_make((float)__a[0], (float)__a[1], 0, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pd(__m128 __a)
{
  return (__m128d) { (double)__a[0], (double)__a[1] };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_pd(__m128i __a)
{
  return (__m128d) { (double)__a[0], (double)__a[1] };
}

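// The double-to-int conversions below emulate the x86 overflow behavior of
// returning the "integer indefinite" value 0x80000000 (INT_MIN): they rely on
// lrint() yielding 0 for inputs it cannot represent, so a result of 0 with
// |input| >= 2.0 is treated as out of range.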
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_epi32(__m128d __a)
{
  // TODO: OPTIMIZE!
  int m[2];
  for(int i = 0; i < 2; ++i)
  {
    int x = lrint(__a[i]);
    if (x != 0 || fabs(__a[i]) < 2.0)
      m[i] = (int)x;
    else
      m[i] = (int)0x80000000;
  }
  return (__m128i) { m[0], m[1], 0, 0 };
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si32(__m128d __a)
{
  // TODO: OPTIMIZE!
  int x = lrint(__a[0]);
  if (x != 0 || fabs(__a[0]) < 2.0)
    return (int)x;
  else
    return (int)0x80000000;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_ss(__m128 __a, __m128d __b)
{
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_sd(__m128d __a, int __b)
{
  __a[0] = __b;
  return __a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_sd(__m128d __a, __m128 __b)
{
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_epi32(__m128d __a)
{
  // TODO: OPTIMIZE!
  int m[2];
  for(int i = 0; i < 2; ++i)
  {
    int x = lrint(__a[i]);
    if (x != 0 || fabs(__a[i]) < 2.0)
      m[i] = (int)__a[i];
    else
      m[i] = (int)0x80000000;
  }
  return (__m128i) { m[0], m[1], 0, 0 };
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si32(__m128d __a)
{
  // TODO: OPTIMIZE!
  int x = lrint(__a[0]);
  if (x != 0 || fabs(__a[0]) < 2.0)
    return (int)__a[0];
  else
    return (int)0x80000000;
}

static __inline__ double __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_f64(__m128d __a)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_pd(double const *__dp)
{
  return *(__m128d*)__dp;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load1_pd(double const *__dp)
{
  return (__m128d)wasm_v64x2_load_splat(__dp);
}

#define _mm_load_pd1(dp) _mm_load1_pd(dp)

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadr_pd(double const *__p)
{
  __m128d __u = *(__m128d*)__p; // aligned load
  return (__m128d)wasm_v64x2_shuffle((v128_t)__u, (v128_t)__u, 1, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadu_pd(double const *__dp)
{
  struct __loadu_pd {
    __m128d __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_pd*)__dp)->__v;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_sd(double const *__p)
{
  return (__m128d)wasm_f64x2_make(*__p, 0.0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pd(__m128d __a, double const *__dp)
{
  struct __mm_loadh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
  return (__m128d){ __a[0], __u };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pd(__m128d __a, double const *__dp)
{
  struct __mm_loadl_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
  return (__m128d){ __u, __a[1] };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_sd(double __w)
{
  return (__m128d)wasm_f64x2_make(__w, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set1_pd(double __w)
{
  return (__m128d)wasm_f64x2_splat(__w);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_pd(double __c1, double __c0)
{
  return (__m128d)wasm_f64x2_make(__c0, __c1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setr_pd(double __c0, double __c1)
{
  return (__m128d)wasm_f64x2_make(__c0, __c1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setzero_pd(void)
{
  return (__m128d)wasm_f64x2_const(0.0, 0.0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_sd(double *__dp, __m128d __a)
{
  struct __mm_store_sd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_pd(double *__dp, __m128d __a)
{
  struct __mm_store1_pd_struct {
    double __u[2];
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_pd(double *__dp, __m128d __a)
{
  *(__m128d *)__dp = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_pd(double *__dp, __m128d __a)
{
  struct __unaligned {
    __m128d __v;
  } __attribute__((__packed__, __may_alias__));

  ((struct __unaligned *)__dp)->__v = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_pd(double *__p, __m128d __a)
{
  *(__m128d *)__p = (__m128d)wasm_v64x2_shuffle((v128_t)__a, (v128_t)__a, 1, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pd(double *__dp, __m128d __a)
{
  struct __mm_storeh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pd(double *__dp, __m128d __a)
{
  struct __mm_storeh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_add((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_add((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_add((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i64x2_add((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_add_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_add_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u8x16_add_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u16x8_add_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u8x16_avgr((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u16x8_avgr((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_madd_epi16(__m128i __a, __m128i __b)
{
  // TODO: optimize
  union {
    signed short x[8];
    __m128i m;
  } src, src2;
  union {
    signed int x[4];
    __m128i m;
  } dst;
  src.m = __a;
  src2.m = __b;
  for(int i = 0; i < 4; ++i)
    dst.x[i] = src.x[i*2] * src2.x[i*2] + src.x[i*2+1] * src2.x[i*2+1];
  return dst.m;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_max((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u8x16_max((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_min((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u8x16_min((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epi16(__m128i __a, __m128i __b)
{
  // TODO: optimize
  union {
    signed short x[8];
    __m128i m;
  } src, src2, dst;
  src.m = __a;
  src2.m = __b;
  for(int i = 0; i < 8; ++i)
    dst.x[i] = (signed short)(((int)src.x[i] * (int)src2.x[i]) >> 16);
  return dst.m;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epu16(__m128i __a, __m128i __b)
{
  // TODO: optimize
  union {
    unsigned short x[8];
    __m128i m;
  } src, src2, dst;
  src.m = __a;
  src2.m = __b;
  for(int i = 0; i < 8; ++i)
    // Multiply in unsigned arithmetic: 0xFFFF * 0xFFFF would overflow a signed int.
    dst.x[i] = (unsigned short)(((unsigned int)src.x[i] * (unsigned int)src2.x[i]) >> 16);
  return dst.m;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_mul((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epu32(__m128i __a, __m128i __b)
{
  // TODO: optimize
  unsigned long long a0 = (unsigned long long)(unsigned int)__a[0];
  unsigned long long a2 = (unsigned long long)(unsigned int)__a[2];
  unsigned long long b0 = (unsigned long long)(unsigned int)__b[0];
  unsigned long long b2 = (unsigned long long)(unsigned int)__b[2];
  union {
    unsigned long long x[2];
    __m128i m;
  } u;
  u.x[0] = a0*b0;
  u.x[1] = a2*b2;
  return u.m;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sad_epu8(__m128i __a, __m128i __b)
{
  // TODO: optimize
  union {
    unsigned char x[16];
    __m128i m;
  } src, src2;
  src.m = __a;
  src2.m = __b;
  union {
    unsigned short x[8];
    __m128i m;
  } dst;
#define __ABS(__a) ((__a) < 0 ? -(__a) : (__a))
  for(int i = 0; i < 8; ++i)
    dst.x[i] = 0;
  for(int i = 0; i < 8; ++i)
  {
    dst.x[0] += __ABS(src.x[i] - src2.x[i]);
    dst.x[4] += __ABS(src.x[8+i] - src2.x[8+i]);
  }
  return dst.m;
#undef __ABS
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_sub((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_sub((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_sub((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i64x2_sub((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_sub_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_sub_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u8x16_sub_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u16x8_sub_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_and_si128(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v128_and((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si128(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v128_andnot((v128_t)__b, (v128_t)__a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_or_si128(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v128_or((v128_t)__b, (v128_t)__a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_xor_si128(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v128_xor((v128_t)__b, (v128_t)__a);
}

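// _mm_slli_si128/_mm_srli_si128 shift the whole 128-bit vector by a constant
// byte count. Wasm SIMD has no whole-vector byte shift, so they are expressed
// as a 16-lane byte shuffle against a zero vector; shift amounts of 16 or
// more select only zero lanes, matching PSLLDQ/PSRLDQ.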
#define _mm_slli_si128(__a, __imm) __extension__ ({               \
  (__m128i)wasm_v8x16_shuffle(_mm_setzero_si128(),                \
                             (__a),                               \
                             ((__imm)&0xF0) ? 0 : 16 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 17 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 18 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 19 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 20 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 21 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 22 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 23 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 24 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 25 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 26 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 27 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 28 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 29 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 30 - ((__imm)&0xF), \
                             ((__imm)&0xF0) ? 0 : 31 - ((__imm)&0xF)); })
#define _mm_bslli_si128(__a, __imm) \
  _mm_slli_si128((__a), (__imm))

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi16(__m128i __a, int __count)
{
  return (__m128i)(((unsigned int)__count < 16) ? wasm_i16x8_shl((v128_t)__a, __count) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi16(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 16) ? wasm_i16x8_shl((v128_t)__a, __c) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi32(__m128i __a, int __count)
{
  return (__m128i)(((unsigned int)__count < 32) ? wasm_i32x4_shl((v128_t)__a, __count) : wasm_i32x4_const(0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi32(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 32) ? wasm_i32x4_shl((v128_t)__a, __c) : wasm_i32x4_const(0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi64(__m128i __a, int __count)
{
  return (__m128i)(((unsigned int)__count < 64) ? wasm_i64x2_shl((v128_t)__a, __count) : wasm_i64x2_const(0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi64(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 64) ? wasm_i64x2_shl((v128_t)__a, __c) : wasm_i64x2_const(0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi16(__m128i __a, int __count)
{
  // x86 clamps arithmetic shift counts (including negative ones) to 15.
  __count = ((unsigned int)__count < 15) ? __count : 15;
  return (__m128i)wasm_i16x8_shr((v128_t)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi16(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  __c = __c < 15 ? __c : 15;
  return (__m128i)wasm_i16x8_shr((v128_t)__a, __c);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi32(__m128i __a, int __count)
{
  // x86 clamps arithmetic shift counts (including negative ones) to 31.
  __count = ((unsigned int)__count < 31) ? __count : 31;
  return (__m128i)wasm_i32x4_shr((v128_t)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi32(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  __c = __c < 31 ? __c : 31;
  return (__m128i)wasm_i32x4_shr((v128_t)__a, __c);
}

#define _mm_srli_si128(__a, __imm) __extension__ ({                     \
  (__m128i)wasm_v8x16_shuffle((__a),                                    \
                              _mm_setzero_si128(),                      \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 0,  \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 1,  \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 2,  \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 3,  \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 4,  \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 5,  \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 6,  \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 7,  \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 8,  \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 9,  \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 10, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 11, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 12, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 13, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 14, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 15); })

#define _mm_bsrli_si128(__a, __imm) \
  _mm_srli_si128((__a), (__imm))

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi16(__m128i __a, int __count)
{
  return (__m128i)(((unsigned int)__count < 16) ? wasm_u16x8_shr((v128_t)__a, __count) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi16(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 16) ? wasm_u16x8_shr((v128_t)__a, __c) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi32(__m128i __a, int __count)
{
  return (__m128i)(((unsigned int)__count < 32) ? wasm_u32x4_shr((v128_t)__a, __count) : wasm_i32x4_const(0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi32(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 32) ? wasm_u32x4_shr((v128_t)__a, __c) : wasm_i32x4_const(0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi64(__m128i __a, int __count)
{
  return (__m128i)(((unsigned int)__count < 64) ? wasm_u64x2_shr((v128_t)__a, __count) : wasm_i64x2_const(0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi64(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 64) ? wasm_u64x2_shr((v128_t)__a, __c) : wasm_i64x2_const(0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_eq((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_eq((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_eq((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_gt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_gt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_gt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_lt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_lt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_lt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_sd(__m128d __a, long long __b)
{
  // TODO: optimize
  union {
    double x[2];
    __m128d m;
  } m;
  m.m = __a;
  m.x[0] = (double)__b;
  return m.m;
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si64(__m128d __a)
{
  // TODO: optimize
  if (isnan(__a[0]) || isinf(__a[0])) return 0x8000000000000000LL;
  long long x = llrint(__a[0]);
  if (x != 0xFFFFFFFF00000000ULL && (x != 0 || fabs(__a[0]) < 2.0))
    return x;
  else
    return 0x8000000000000000LL;
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si64(__m128d __a)
{
  // TODO: optimize
  if (isnan(__a[0]) || isinf(__a[0])) return 0x8000000000000000LL;
  long long x = llrint(__a[0]);
  if (x != 0xFFFFFFFF00000000ULL && (x != 0 || fabs(__a[0]) < 2.0))
    return (long long)__a[0];
  else
    return 0x8000000000000000LL;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_ps(__m128i __a)
{
  return (__m128)wasm_f32x4_convert_i32x4(__a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_epi32(__m128 __a)
{
  // TODO: optimize
  union {
    int x[4];
    __m128i m;
  } u;
  for(int i = 0; i < 4; ++i)
  {
    int x = lrint(__a[i]);
    if (x != 0 || fabs(__a[i]) < 2.0)
      u.x[i] = x;
    else
      u.x[i] = (int)0x80000000;
  }
  return u.m;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_epi32(__m128 __a)
{
  // TODO: optimize
  union {
    int x[4];
    __m128i m;
  } u;
  for(int i = 0; i < 4; ++i)
  {
    int x = lrint(__a[i]);
    if (x != 0 || fabs(__a[i]) < 2.0)
      u.x[i] = (int)__a[i];
    else
      u.x[i] = (int)0x80000000;
  }
  return u.m;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_si128(int __a)
{
  return (__m128i)wasm_i32x4_make(__a, 0, 0, 0);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_si128(long long __a)
{
  return (__m128i)wasm_i64x2_make(__a, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si32(__m128i __a)
{
  return wasm_i32x4_extract_lane(__a, 0);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si64(__m128i __a)
{
  return wasm_i64x2_extract_lane(__a, 0);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_load_si128(__m128i const *__p)
{
  return *__p;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si128(__m128i const *__p)
{
  struct __loadu_si128 {
    __m128i __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_si128*)__p)->__v;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si32(void const *__p)
{
  // Use a packed, may_alias struct so the 32-bit load is valid for unaligned
  // pointers, like the other unaligned loads in this header.
  struct __loadu_si32 {
    unsigned int __v;
  } __attribute__((__packed__, __may_alias__));
  return (__m128i)wasm_i32x4_make((int)((const struct __loadu_si32*)__p)->__v, 0, 0, 0);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadl_epi64(__m128i const *__p)
{
  struct __mm_loadl_epi64_struct {
    int __u[2];
  } __attribute__((__packed__, __may_alias__));
  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u[0], ((struct __mm_loadl_epi64_struct*)__p)->__u[1], 0, 0};
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64(long long q1, long long q0)
{
  return (__m128i)wasm_i64x2_make(q0, q1);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi32(int i3, int i2, int i1, int i0)
{
  return (__m128i)wasm_i32x4_make(i0, i1, i2, i3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
{
  return (__m128i)wasm_i16x8_make(w0, w1, w2, w3, w4, w5, w6, w7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
{
  return (__m128i)wasm_i8x16_make(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64x(long long __q)
{
  return (__m128i)wasm_i64x2_splat(__q);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi32(int __i)
{
  return (__m128i)wasm_i32x4_splat(__i);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi16(short __w)
{
  return (__m128i)wasm_i16x8_splat(__w);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi8(char __b)
{
  return (__m128i)wasm_i8x16_splat(__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi32(int i0, int i1, int i2, int i3)
{
  return (__m128i)wasm_i32x4_make(i0, i1, i2, i3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
{
  return (__m128i)wasm_i16x8_make(w0, w1, w2, w3, w4, w5, w6, w7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
{
  return (__m128i)wasm_i8x16_make(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setzero_si128(void)
{
  return wasm_i64x2_const(0, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_si128(__m128i *__p, __m128i __b)
{
  *__p = __b;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si32(void *__p, __m128i __a)
{
  // Use a packed, may_alias struct so the 32-bit store is valid for unaligned
  // pointers, like the other unaligned stores in this header.
  struct __storeu_si32 {
    unsigned int __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_si32*)__p)->__v = wasm_i32x4_extract_lane((v128_t)__a, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si128(__m128i *__p, __m128i __b)
{
  struct __unaligned {
    __m128i __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __unaligned *)__p)->__v = __b;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
{
  // TODO: optimize
  union {
    unsigned char x[16];
    __m128i m;
  } mask, data;
  mask.m = __n;
  data.m = __d;
  for(int i = 0; i < 16; ++i)
    if (mask.x[i] & 0x80)
      __p[i] = data.x[i];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_epi64(__m128i *__p, __m128i __a)
{
  *(long long *)__p = wasm_i64x2_extract_lane((v128_t)__a, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pd(double *__p, __m128d __a)
{
  // Emscripten/SIMD.js does not have cache hinting.
  _mm_store_pd(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si128(__m128i *__p, __m128i __a)
{
  // Emscripten/SIMD.js does not have cache hinting.
  _mm_store_si128(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  // No cache hinting available.
  *__p = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si64(long long *__p, long long __a)
{
  // No cache hinting available.
  *__p = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  // Wasm SIMD does not have cache hinting.
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __sync_synchronize(); // Wasm/SharedArrayBuffer has only a full barrier instruction, which gives a stronger guarantee.
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __sync_synchronize(); // Wasm/SharedArrayBuffer has only a full barrier instruction, which gives a stronger guarantee.
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi16(__m128i __a, __m128i __b)
{
  // TODO: optimize
  union {
    signed short x[8];
    __m128i m;
  } src, src2;
  union {
    signed char x[16];
    __m128i m;
  } dst;
  src.m = __a;
  src2.m = __b;
  for(int i = 0; i < 8; ++i)
  {
    dst.x[i] = __SATURATE(src.x[i], -128, 127);
    dst.x[8+i] = __SATURATE(src2.x[i], -128, 127);
  }
  return dst.m;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi32(__m128i __a, __m128i __b)
{
  // TODO: optimize
  union {
    signed int x[4];
    __m128i m;
  } src, src2;
  union {
    signed short x[8];
    __m128i m;
  } dst;
  src.m = __a;
  src2.m = __b;
  for(int i = 0; i < 4; ++i)
  {
    dst.x[i] = __SATURATE(src.x[i], -32768, 32767);
    dst.x[4+i] = __SATURATE(src2.x[i], -32768, 32767);
  }
  return dst.m;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi16(__m128i __a, __m128i __b)
{
  // TODO: optimize
  union {
    signed short x[8];
    __m128i m;
  } src, src2;
  union {
    unsigned char x[16];
    __m128i m;
  } dst;
  src.m = __a;
  src2.m = __b;
  for(int i = 0; i < 8; ++i)
  {
    dst.x[i] = __SATURATE(src.x[i], 0, 255);
    dst.x[8+i] = __SATURATE(src2.x[i], 0, 255);
  }
  return dst.m;
}

#define _mm_extract_epi16(__a, __imm) wasm_u16x8_extract_lane((v128_t)(__a), (__imm) & 7)
#define _mm_insert_epi16(__a, __b, __imm) wasm_i16x8_replace_lane((v128_t)(__a), (__imm) & 7, (__b))

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_epi8(__m128i __a)
{
  // TODO: optimize
  union {
    unsigned char x[16];
    __m128i m;
  } src;
  src.m = __a;
  unsigned int x = 0;
  for(int i = 0; i < 16; ++i)
    x |= ((unsigned int)src.x[i] >> 7) << i;
  return (int)x;
}

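// The shuffle macros below decode the SSE2 8-bit immediate into explicit lane
// indices for a Wasm shuffle: each 2-bit field of __imm selects one source
// lane. Only lanes of the first operand are ever selected, so the second
// shuffle operand is just a dummy zero vector.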
#define _mm_shuffle_epi32(__a, __imm) __extension__ ({ \
  (__m128i)wasm_v32x4_shuffle((__a), \
                              _mm_set1_epi32(0), \
                              ((__imm) & 0x3), (((__imm) & 0xc) >> 2), \
                              (((__imm) & 0x30) >> 4), (((__imm) & 0xc0) >> 6)); })

#define _mm_shufflelo_epi16(__a, __imm) __extension__ ({ \
  (__m128i)wasm_v16x8_shuffle((__a), \
                              _mm_set1_epi16(0), \
                              ((__imm) & 0x3), (((__imm) & 0xc) >> 2), \
                              (((__imm) & 0x30) >> 4), (((__imm) & 0xc0) >> 6), \
                              4, 5, 6, 7); })

#define _mm_shufflehi_epi16(__a, __imm) __extension__ ({ \
  (__m128i)wasm_v16x8_shuffle((__a), \
                              _mm_set1_epi16(0), \
                              0, 1, 2, 3, \
                              (4 + (((__imm) & 0x03) >> 0)), \
                              (4 + (((__imm) & 0x0c) >> 2)), \
                              (4 + (((__imm) & 0x30) >> 4)), \
                              (4 + (((__imm) & 0xc0) >> 6))); })

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v8x16_shuffle(__a, __b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v16x8_shuffle(__a, __b, 4, 12, 5, 13, 6, 14, 7, 15);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v32x4_shuffle(__a, __b, 2, 6, 3, 7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v64x2_shuffle(__a, __b, 1, 3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v8x16_shuffle(__a, __b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v16x8_shuffle(__a, __b, 0, 8, 1, 9, 2, 10, 3, 11);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v32x4_shuffle(__a, __b, 0, 4, 1, 5);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v64x2_shuffle(__a, __b, 0, 2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_move_epi64(__m128i __a)
{
  return wasm_v64x2_shuffle(__a, wasm_i64x2_const(0, 0), 0, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v64x2_shuffle((v128_t)__a, (v128_t)__b, 1, 3);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v64x2_shuffle((v128_t)__a, (v128_t)__b, 0, 2);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pd(__m128d __a)
{
  union {
    unsigned long long x[2];
    __m128d m;
  } __attribute__((__packed__, __may_alias__)) src;
  src.m = __a;
  return (src.x[0] >> 63) | ((src.x[1] >> 63) << 1);
}

#define _mm_shuffle_pd(__a, __b, __i) __extension__ ({ \
  (__m128d) __builtin_shufflevector((__u64x2)(__a), (__u64x2)(__b), \
                                    (__i) & 1, \
                                    (((__i) & 2) >> 1) + 2); })

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castpd_ps(__m128d __a)
{
  return (__m128)__a;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castpd_si128(__m128d __a)
{
  return (__m128i)__a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castps_pd(__m128 __a)
{
  return (__m128d)__a;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castps_si128(__m128 __a)
{
  return (__m128i)__a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_ps(__m128i __a)
{
  return (__m128)__a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_pd(__m128i __a)
{
  return (__m128d)__a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  // No pause/wait instruction in Wasm/SIMD.
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_undefined_pd(void)
{
  __m128d val;
  return val;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_undefined_si128(void)
{
  __m128i val;
  return val;
}

#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
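// _MM_SHUFFLE2 packs two lane indices into the immediate for _mm_shuffle_pd,
// e.g. _mm_shuffle_pd(a, b, _MM_SHUFFLE2(1, 0)) selects lane 0 of a and
// lane 1 of b.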

#endif /* __emscripten_emmintrin_h__ */