// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h"

#include "src/codegen/assembler.h"
#include "src/codegen/cpu-features.h"
#include "src/codegen/register-arch.h"

#if V8_TARGET_ARCH_IA32
#include "src/codegen/ia32/register-ia32.h"
#elif V8_TARGET_ARCH_X64
#include "src/codegen/x64/register-x64.h"
#else
#error Unsupported target architecture.
#endif

// On IA32, an Operand can be a wrapper around a single register, in which
// case callers should use the I8x16Splat overload that takes a Register |src|.
#if V8_TARGET_ARCH_IA32
#define DCHECK_OPERAND_IS_NOT_REG(op) DCHECK(!op.is_reg_only());
#else
#define DCHECK_OPERAND_IS_NOT_REG(op)
#endif

namespace v8 {
namespace internal {

void SharedTurboAssembler::Move(Register dst, uint32_t src) {
  // Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
  mov(dst, Immediate(src));
#elif V8_TARGET_ARCH_X64
  movl(dst, Immediate(src));
#else
#error Unsupported target architecture.
#endif
}

void SharedTurboAssembler::Move(Register dst, Register src) {
  // Helper to paper over the different assembler function names.
  if (dst != src) {
#if V8_TARGET_ARCH_IA32
    mov(dst, src);
#elif V8_TARGET_ARCH_X64
    movq(dst, src);
#else
#error Unsupported target architecture.
#endif
  }
}

void SharedTurboAssembler::Add(Register dst, Immediate src) {
  // Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
  add(dst, src);
#elif V8_TARGET_ARCH_X64
  addq(dst, src);
#else
#error Unsupported target architecture.
#endif
}

void SharedTurboAssembler::And(Register dst, Immediate src) {
  // Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
  and_(dst, src);
#elif V8_TARGET_ARCH_X64
  andq(dst, src);
#else
#error Unsupported target architecture.
#endif
}

void SharedTurboAssembler::Movhps(XMMRegister dst, XMMRegister src1,
                                  Operand src2) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vmovhps(dst, src1, src2);
  } else {
    if (dst != src1) {
      movaps(dst, src1);
    }
    movhps(dst, src2);
  }
}

void SharedTurboAssembler::Movlps(XMMRegister dst, XMMRegister src1,
                                  Operand src2) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vmovlps(dst, src1, src2);
  } else {
    if (dst != src1) {
      movaps(dst, src1);
    }
    movlps(dst, src2);
  }
}

void SharedTurboAssembler::Shufps(XMMRegister dst, XMMRegister src1,
                                  XMMRegister src2, uint8_t imm8) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vshufps(dst, src1, src2, imm8);
  } else {
    if (dst != src1) {
      movaps(dst, src1);
    }
    shufps(dst, src2, imm8);
  }
}

void SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src,
                                            uint8_t lane) {
  ASM_CODE_COMMENT(this);
  if (lane == 0) {
    if (dst != src) {
      Movaps(dst, src);
    }
  } else {
    DCHECK_EQ(1, lane);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      // Pass src as operand to avoid false-dependency on dst.
      vmovhlps(dst, src, src);
    } else {
      movhlps(dst, src);
    }
  }
}

void SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src,
                                            DoubleRegister rep, uint8_t lane) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    if (lane == 0) {
      vmovsd(dst, src, rep);
    } else {
      vmovlhps(dst, src, rep);
    }
  } else {
    CpuFeatureScope scope(this, SSE4_1);
    if (dst != src) {
      DCHECK_NE(dst, rep);  // Ensure rep is not overwritten.
      movaps(dst, src);
    }
    if (lane == 0) {
      movsd(dst, rep);
    } else {
      movlhps(dst, rep);
    }
  }
}

void SharedTurboAssembler::F32x4Min(XMMRegister dst, XMMRegister lhs,
                                    XMMRegister rhs, XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // The minps instruction doesn't propagate NaNs and +0's in its first
  // operand. Perform minps in both orders, merge the results, and adjust.
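  // E.g. minps(a, b) returns b whenever either input is NaN or both inputs
  // are zeros of any sign, so running it in both operand orders and OR-ing
  // the two results keeps the sign bit of -0.0 and the bits of any NaN.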
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vminps(scratch, lhs, rhs);
    vminps(dst, rhs, lhs);
  } else if (dst == lhs || dst == rhs) {
    XMMRegister src = dst == lhs ? rhs : lhs;
    movaps(scratch, src);
    minps(scratch, dst);
    minps(dst, src);
  } else {
    movaps(scratch, lhs);
    minps(scratch, rhs);
    movaps(dst, rhs);
    minps(dst, lhs);
  }
  // Propagate -0's and NaNs, which may be non-canonical.
  Orps(scratch, dst);
  // Canonicalize NaNs by quieting and clearing the payload.
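  // (In NaN lanes the unordered compare below yields all ones; OR-ing that
  // into scratch makes the whole lane all ones, and shifting the mask right
  // by 10 (1 sign + 8 exponent + 1 quiet bit) before the andnps clears the
  // payload, leaving the quiet-NaN pattern 0xFFC00000.)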
  Cmpunordps(dst, dst, scratch);
  Orps(scratch, dst);
  Psrld(dst, dst, byte{10});
  Andnps(dst, dst, scratch);
}

void SharedTurboAssembler::F32x4Max(XMMRegister dst, XMMRegister lhs,
                                    XMMRegister rhs, XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // The maxps instruction doesn't propagate NaNs and +0's in its first
  // operand. Perform maxps in both orders, merge the results, and adjust.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vmaxps(scratch, lhs, rhs);
    vmaxps(dst, rhs, lhs);
  } else if (dst == lhs || dst == rhs) {
    XMMRegister src = dst == lhs ? rhs : lhs;
    movaps(scratch, src);
    maxps(scratch, dst);
    maxps(dst, src);
  } else {
    movaps(scratch, lhs);
    maxps(scratch, rhs);
    movaps(dst, rhs);
    maxps(dst, lhs);
  }
  // Find discrepancies.
  Xorps(dst, scratch);
  // Propagate NaNs, which may be non-canonical.
  Orps(scratch, dst);
  // Propagate sign discrepancy and (subtle) quiet NaNs.
  Subps(scratch, scratch, dst);
  // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
  Cmpunordps(dst, dst, scratch);
  Psrld(dst, dst, byte{10});
  Andnps(dst, dst, scratch);
}

void SharedTurboAssembler::F64x2Min(XMMRegister dst, XMMRegister lhs,
                                    XMMRegister rhs, XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    // The minpd instruction doesn't propagate NaNs and +0's in its first
    // operand. Perform minpd in both orders, merge the results, and adjust.
    vminpd(scratch, lhs, rhs);
    vminpd(dst, rhs, lhs);
    // Propagate -0's and NaNs, which may be non-canonical.
    vorpd(scratch, scratch, dst);
    // Canonicalize NaNs by quieting and clearing the payload.
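    // (13 == 1 sign + 11 exponent + 1 quiet bit for float64, so the andnpd
    // below clears the NaN payload and leaves a canonical quiet NaN.)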
    vcmpunordpd(dst, dst, scratch);
    vorpd(scratch, scratch, dst);
    vpsrlq(dst, dst, byte{13});
    vandnpd(dst, dst, scratch);
  } else {
    // Compare lhs with rhs, and rhs with lhs, and have the results in scratch
    // and dst. If dst overlaps with lhs or rhs, we can save a move.
    if (dst == lhs || dst == rhs) {
      XMMRegister src = dst == lhs ? rhs : lhs;
      movaps(scratch, src);
      minpd(scratch, dst);
      minpd(dst, src);
    } else {
      movaps(scratch, lhs);
      movaps(dst, rhs);
      minpd(scratch, rhs);
      minpd(dst, lhs);
    }
    orpd(scratch, dst);
    cmpunordpd(dst, scratch);
    orpd(scratch, dst);
    psrlq(dst, byte{13});
    andnpd(dst, scratch);
  }
}

void SharedTurboAssembler::F64x2Max(XMMRegister dst, XMMRegister lhs,
                                    XMMRegister rhs, XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    // The maxpd instruction doesn't propagate NaNs and +0's in its first
    // operand. Perform maxpd in both orders, merge the results, and adjust.
    vmaxpd(scratch, lhs, rhs);
    vmaxpd(dst, rhs, lhs);
    // Find discrepancies.
    vxorpd(dst, dst, scratch);
    // Propagate NaNs, which may be non-canonical.
    vorpd(scratch, scratch, dst);
    // Propagate sign discrepancy and (subtle) quiet NaNs.
    vsubpd(scratch, scratch, dst);
    // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
    vcmpunordpd(dst, dst, scratch);
    vpsrlq(dst, dst, byte{13});
    vandnpd(dst, dst, scratch);
  } else {
    if (dst == lhs || dst == rhs) {
      XMMRegister src = dst == lhs ? rhs : lhs;
      movaps(scratch, src);
      maxpd(scratch, dst);
      maxpd(dst, src);
    } else {
      movaps(scratch, lhs);
      movaps(dst, rhs);
      maxpd(scratch, rhs);
      maxpd(dst, lhs);
    }
    xorpd(dst, scratch);
    orpd(scratch, dst);
    subpd(scratch, dst);
    cmpunordpd(dst, scratch);
    psrlq(dst, byte{13});
    andnpd(dst, scratch);
  }
}

void SharedTurboAssembler::F32x4Splat(XMMRegister dst, DoubleRegister src) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    vbroadcastss(dst, src);
  } else if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vshufps(dst, src, src, 0);
  } else {
    if (dst == src) {
      // 1 byte shorter than pshufd.
      shufps(dst, src, 0);
    } else {
      pshufd(dst, src, 0);
    }
  }
}

void SharedTurboAssembler::F32x4ExtractLane(FloatRegister dst, XMMRegister src,
                                            uint8_t lane) {
  ASM_CODE_COMMENT(this);
  DCHECK_LT(lane, 4);
  // These instructions are shorter than insertps, but will leave junk in
  // the top lanes of dst.
  if (lane == 0) {
    if (dst != src) {
      Movaps(dst, src);
    }
  } else if (lane == 1) {
    Movshdup(dst, src);
  } else if (lane == 2 && dst == src) {
    // Check dst == src to avoid false dependency on dst.
    Movhlps(dst, src);
  } else if (dst == src) {
    Shufps(dst, src, src, lane);
  } else {
    Pshufd(dst, src, lane);
  }
}

void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
                                           uint8_t laneidx) {
  ASM_CODE_COMMENT(this);
  if (laneidx == 0) {
    Movss(dst, src);
  } else {
    DCHECK_GE(3, laneidx);
    Extractps(dst, src, laneidx);
  }
}

template <typename Op>
void SharedTurboAssembler::I8x16SplatPreAvx2(XMMRegister dst, Op src,
                                             XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  DCHECK(!CpuFeatures::IsSupported(AVX2));
  CpuFeatureScope ssse3_scope(this, SSSE3);
  Movd(dst, src);
  Xorps(scratch, scratch);
  Pshufb(dst, scratch);
}

void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src,
                                      XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    Movd(scratch, src);
    vpbroadcastb(dst, scratch);
  } else {
    I8x16SplatPreAvx2(dst, src, scratch);
  }
}

void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Operand src,
                                      XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  DCHECK_OPERAND_IS_NOT_REG(src);
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    vpbroadcastb(dst, src);
  } else {
    I8x16SplatPreAvx2(dst, src, scratch);
  }
}

void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
                                    uint8_t src2, Register tmp1,
                                    XMMRegister tmp2) {
  ASM_CODE_COMMENT(this);
  DCHECK_NE(dst, tmp2);
  // Perform 16-bit shift, then mask away low bits.
  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
    movaps(dst, src1);
    src1 = dst;
  }

  uint8_t shift = truncate_to_int3(src2);
  Psllw(dst, src1, byte{shift});

  uint8_t bmask = static_cast<uint8_t>(0xff << shift);
  uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
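  // E.g. a shift of 3 gives bmask == 0xF8 and mask == 0xF8F8F8F8, clearing
  // the low three bits of every byte and with them the bits that the 16-bit
  // shift carried across byte boundaries.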
  Move(tmp1, mask);
  Movd(tmp2, tmp1);
  Pshufd(tmp2, tmp2, uint8_t{0});
  Pand(dst, tmp2);
}

void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
                                    Register src2, Register tmp1,
                                    XMMRegister tmp2, XMMRegister tmp3) {
  ASM_CODE_COMMENT(this);
  DCHECK(!AreAliased(dst, tmp2, tmp3));
  DCHECK(!AreAliased(src1, tmp2, tmp3));

  // Take shift value modulo 8.
  Move(tmp1, src2);
  And(tmp1, Immediate(7));
  Add(tmp1, Immediate(8));
  // Create a mask to unset high bits.
  Movd(tmp3, tmp1);
  Pcmpeqd(tmp2, tmp2);
  Psrlw(tmp2, tmp2, tmp3);
  Packuswb(tmp2, tmp2);
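  // E.g. for a shift of 3 the psrlw above leaves 0x001F in every word, which
  // packs to 0x1F in every byte; masking with it keeps only the low five bits
  // of each byte, so the final 16-bit shift cannot carry bits across bytes.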
  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
    movaps(dst, src1);
    src1 = dst;
  }
  // Mask off the unwanted bits before word-shifting.
  Pand(dst, src1, tmp2);
  Add(tmp1, Immediate(-8));
  Movd(tmp3, tmp1);
  Psllw(dst, dst, tmp3);
}

void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
                                     uint8_t src2, XMMRegister tmp) {
  ASM_CODE_COMMENT(this);
  // Unpack bytes into words, do word (16-bit) shifts, and repack.
  DCHECK_NE(dst, tmp);
  uint8_t shift = truncate_to_int3(src2) + 8;

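  // The unpacks below place each source byte in the high half of a 16-bit
  // lane (the low half is junk), so an arithmetic word shift by shift + 8
  // both sign-extends the byte and shifts it; packsswb then narrows the
  // results back to bytes without saturating.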
  Punpckhbw(tmp, src1);
  Punpcklbw(dst, src1);
  Psraw(tmp, shift);
  Psraw(dst, shift);
  Packsswb(dst, tmp);
}

void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
                                     Register src2, Register tmp1,
                                     XMMRegister tmp2, XMMRegister tmp3) {
  ASM_CODE_COMMENT(this);
  DCHECK(!AreAliased(dst, tmp2, tmp3));
  DCHECK_NE(src1, tmp2);

  // Unpack the bytes into words, do arithmetic shifts, and repack.
  Punpckhbw(tmp2, src1);
  Punpcklbw(dst, src1);
  // Prepare shift value.
  Move(tmp1, src2);
  // Take shift value modulo 8.
  And(tmp1, Immediate(7));
  Add(tmp1, Immediate(8));
  Movd(tmp3, tmp1);
  Psraw(tmp2, tmp3);
  Psraw(dst, tmp3);
  Packsswb(dst, tmp2);
}

void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
                                     uint8_t src2, Register tmp1,
                                     XMMRegister tmp2) {
  ASM_CODE_COMMENT(this);
  DCHECK_NE(dst, tmp2);
  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
    movaps(dst, src1);
    src1 = dst;
  }

  // Perform 16-bit shift, then mask away high bits.
  uint8_t shift = truncate_to_int3(src2);
  Psrlw(dst, src1, shift);

  uint8_t bmask = 0xff >> shift;
  uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
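  // E.g. a shift of 3 gives bmask == 0x1F and mask == 0x1F1F1F1F, clearing
  // the bits that the 16-bit shift pulled in from the neighboring byte.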
  Move(tmp1, mask);
  Movd(tmp2, tmp1);
  Pshufd(tmp2, tmp2, byte{0});
  Pand(dst, tmp2);
}

void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
                                     Register src2, Register tmp1,
                                     XMMRegister tmp2, XMMRegister tmp3) {
  ASM_CODE_COMMENT(this);
  DCHECK(!AreAliased(dst, tmp2, tmp3));
  DCHECK_NE(src1, tmp2);

  // Unpack the bytes into words, do logical shifts, and repack.
  Punpckhbw(tmp2, src1);
  Punpcklbw(dst, src1);
  // Prepare shift value.
  Move(tmp1, src2);
  // Take shift value modulo 8.
  And(tmp1, Immediate(7));
  Add(tmp1, Immediate(8));
  Movd(tmp3, tmp1);
  Psrlw(tmp2, tmp3);
  Psrlw(dst, tmp3);
  Packuswb(dst, tmp2);
}

template <typename Op>
void SharedTurboAssembler::I16x8SplatPreAvx2(XMMRegister dst, Op src) {
  DCHECK(!CpuFeatures::IsSupported(AVX2));
  Movd(dst, src);
  Pshuflw(dst, dst, uint8_t{0x0});
  Punpcklqdq(dst, dst);
}

void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Register src) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    Movd(dst, src);
    vpbroadcastw(dst, dst);
  } else {
    I16x8SplatPreAvx2(dst, src);
  }
}

void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Operand src) {
  ASM_CODE_COMMENT(this);
  DCHECK_OPERAND_IS_NOT_REG(src);
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    vpbroadcastw(dst, src);
  } else {
    I16x8SplatPreAvx2(dst, src);
  }
}

void SharedTurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
                                          XMMRegister src2, XMMRegister scratch,
                                          bool is_signed) {
  ASM_CODE_COMMENT(this);
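  // Widen the low eight bytes of each input to 16-bit lanes (sign- or
  // zero-extending as requested); the full product of two extended bytes fits
  // in 16 bits, so a plain 16-bit multiply yields the exact result.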
  is_signed ? Pmovsxbw(scratch, src1) : Pmovzxbw(scratch, src1);
  is_signed ? Pmovsxbw(dst, src2) : Pmovzxbw(dst, src2);
  Pmullw(dst, scratch);
}

void SharedTurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpunpckhbw(scratch, src1, src1);
    vpsraw(scratch, scratch, 8);
    vpunpckhbw(dst, src2, src2);
    vpsraw(dst, dst, 8);
    vpmullw(dst, dst, scratch);
  } else {
    if (dst != src1) {
      movaps(dst, src1);
    }
    movaps(scratch, src2);
    punpckhbw(dst, dst);
    psraw(dst, 8);
    punpckhbw(scratch, scratch);
    psraw(scratch, 8);
    pmullw(dst, scratch);
  }
}

void SharedTurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // The logic here is slightly complicated to handle all the cases of register
  // aliasing. This allows flexibility for callers in TurboFan and Liftoff.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    if (src1 == src2) {
      vpxor(scratch, scratch, scratch);
      vpunpckhbw(dst, src1, scratch);
      vpmullw(dst, dst, dst);
    } else {
      if (dst == src2) {
        // We overwrite dst, then use src2, so swap src1 and src2.
        std::swap(src1, src2);
      }
      vpxor(scratch, scratch, scratch);
      vpunpckhbw(dst, src1, scratch);
      vpunpckhbw(scratch, src2, scratch);
      vpmullw(dst, dst, scratch);
    }
  } else {
    if (src1 == src2) {
      xorps(scratch, scratch);
      if (dst != src1) {
        movaps(dst, src1);
      }
      punpckhbw(dst, scratch);
      pmullw(dst, dst);
    } else {
      // When dst == src1, nothing special needs to be done.
      // When dst == src2, swap src1 and src2, since we overwrite dst.
      // When dst is unique, copy src1 to dst first.
      if (dst == src2) {
        std::swap(src1, src2);
        // Now, dst == src1.
      } else if (dst != src1) {
        // dst != src1 && dst != src2.
        movaps(dst, src1);
      }
      xorps(scratch, scratch);
      punpckhbw(dst, scratch);
      punpckhbw(scratch, src2);
      psrlw(scratch, 8);
      pmullw(dst, scratch);
    }
  }
}

void SharedTurboAssembler::I16x8SConvertI8x16High(XMMRegister dst,
                                                  XMMRegister src) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // src = |a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p| (high)
    // dst = |i|i|j|j|k|k|l|l|m|m|n|n|o|o|p|p|
    vpunpckhbw(dst, src, src);
    vpsraw(dst, dst, 8);
  } else {
    CpuFeatureScope sse_scope(this, SSE4_1);
    if (dst == src) {
      // 2 bytes shorter than pshufd, but has dependency on dst.
      movhlps(dst, src);
      pmovsxbw(dst, dst);
    } else {
      // No dependency on dst.
      pshufd(dst, src, 0xEE);
      pmovsxbw(dst, dst);
    }
  }
}

void SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst,
                                                  XMMRegister src,
                                                  XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // tmp = |0|0|0|0|0|0|0|0 | 0|0|0|0|0|0|0|0|
    // src = |a|b|c|d|e|f|g|h | i|j|k|l|m|n|o|p|
    // dst = |0|a|0|b|0|c|0|d | 0|e|0|f|0|g|0|h|
    XMMRegister tmp = dst == src ? scratch : dst;
    vpxor(tmp, tmp, tmp);
    vpunpckhbw(dst, src, tmp);
  } else {
    CpuFeatureScope sse_scope(this, SSE4_1);
    if (dst == src) {
      // xorps can be executed on more ports than pshufd.
      xorps(scratch, scratch);
      punpckhbw(dst, scratch);
    } else {
      // No dependency on dst.
      pshufd(dst, src, 0xEE);
      pmovzxbw(dst, dst);
    }
  }
}

void SharedTurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // k = i16x8.splat(0x8000)
  Pcmpeqd(scratch, scratch);
  Psllw(scratch, scratch, byte{15});
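  // pmulhrsw computes (((a * b) >> 14) + 1) >> 1 per signed word. The only
  // input pair that overflows the Q15 range is -2^15 * -2^15, which comes out
  // as 0x8000; comparing against the splatted 0x8000 and xor-ing flips
  // exactly those lanes to the saturated result 0x7FFF.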

  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
    movaps(dst, src1);
    src1 = dst;
  }

  Pmulhrsw(dst, src1, src2);
  Pcmpeqw(scratch, dst);
  Pxor(dst, scratch);
}

void SharedTurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
                                                     XMMRegister src,
                                                     XMMRegister tmp) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // src = |a|b|c|d|e|f|g|h| (low)
    // scratch = |0|a|0|c|0|e|0|g|
    vpsrld(tmp, src, 16);
    // dst = |0|b|0|d|0|f|0|h|
    vpblendw(dst, src, tmp, 0xAA);
    // dst = |a+b|c+d|e+f|g+h|
    vpaddd(dst, tmp, dst);
  } else if (CpuFeatures::IsSupported(SSE4_1)) {
    CpuFeatureScope sse_scope(this, SSE4_1);
    // There is a potentially better lowering if we get rip-relative
    // constants, see https://github.com/WebAssembly/simd/pull/380.
    movaps(tmp, src);
    psrld(tmp, 16);
    if (dst != src) {
      movaps(dst, src);
    }
    pblendw(dst, tmp, 0xAA);
    paddd(dst, tmp);
  } else {
    // src = |a|b|c|d|e|f|g|h|
    // tmp = i32x4.splat(0x0000FFFF)
    pcmpeqd(tmp, tmp);
    psrld(tmp, byte{16});
    // tmp =|0|b|0|d|0|f|0|h|
    andps(tmp, src);
    // dst = |0|a|0|c|0|e|0|g|
    if (dst != src) {
      movaps(dst, src);
    }
    psrld(dst, byte{16});
    // dst = |a+b|c+d|e+f|g+h|
    paddd(dst, tmp);
  }
}

// 1. Multiply low word into scratch.
// 2. Multiply high word (can be signed or unsigned) into dst.
// 3. Unpack and interleave scratch and dst into dst.
void SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
                                       XMMRegister src2, XMMRegister scratch,
                                       bool low, bool is_signed) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpmullw(scratch, src1, src2);
    is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
    low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
  } else {
    DCHECK_EQ(dst, src1);
    movaps(scratch, src1);
    pmullw(dst, src2);
    is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2);
    low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch);
  }
}

void SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst,
                                                  XMMRegister src) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // src = |a|b|c|d|e|f|g|h| (high)
    // dst = |e|e|f|f|g|g|h|h|
    vpunpckhwd(dst, src, src);
    vpsrad(dst, dst, 16);
  } else {
    CpuFeatureScope sse_scope(this, SSE4_1);
    if (dst == src) {
      // 2 bytes shorter than pshufd, but has dependency on dst.
      movhlps(dst, src);
      pmovsxwd(dst, dst);
    } else {
      // No dependency on dst.
      pshufd(dst, src, 0xEE);
      pmovsxwd(dst, dst);
    }
  }
}

void SharedTurboAssembler::I32x4UConvertI16x8High(XMMRegister dst,
                                                  XMMRegister src,
                                                  XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // scratch = |0|0|0|0|0|0|0|0|
    // src     = |a|b|c|d|e|f|g|h|
    // dst     = |0|a|0|b|0|c|0|d|
    XMMRegister tmp = dst == src ? scratch : dst;
    vpxor(tmp, tmp, tmp);
    vpunpckhwd(dst, src, tmp);
  } else {
    if (dst == src) {
      // xorps can be executed on more ports than pshufd.
      xorps(scratch, scratch);
      punpckhwd(dst, scratch);
    } else {
      CpuFeatureScope sse_scope(this, SSE4_1);
      // No dependency on dst.
      pshufd(dst, src, 0xEE);
      pmovzxwd(dst, dst);
    }
  }
}

void SharedTurboAssembler::I64x2Neg(XMMRegister dst, XMMRegister src,
                                    XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vpxor(scratch, scratch, scratch);
    vpsubq(dst, scratch, src);
  } else {
    if (dst == src) {
      movaps(scratch, src);
      std::swap(src, scratch);
    }
    pxor(dst, dst);
    psubq(dst, src);
  }
}

void SharedTurboAssembler::I64x2Abs(XMMRegister dst, XMMRegister src,
                                    XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    XMMRegister tmp = dst == src ? scratch : dst;
    vpxor(tmp, tmp, tmp);
    vpsubq(tmp, tmp, src);
    vblendvpd(dst, src, tmp, src);
  } else {
    CpuFeatureScope sse_scope(this, SSE3);
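    // abs(x) == (x ^ m) - m, where m is x's sign bit replicated across the
    // whole lane: movshdup copies the sign-carrying high dword into both
    // halves of each qword and psrad replicates its sign bit.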
    movshdup(scratch, src);
    if (dst != src) {
      movaps(dst, src);
    }
    psrad(scratch, 31);
    xorps(dst, scratch);
    psubq(dst, scratch);
  }
}

void SharedTurboAssembler::I64x2GtS(XMMRegister dst, XMMRegister src0,
                                    XMMRegister src1, XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpcmpgtq(dst, src0, src1);
  } else if (CpuFeatures::IsSupported(SSE4_2)) {
    CpuFeatureScope sse_scope(this, SSE4_2);
    if (dst == src0) {
      pcmpgtq(dst, src1);
    } else if (dst == src1) {
      movaps(scratch, src0);
      pcmpgtq(scratch, src1);
      movaps(dst, scratch);
    } else {
      movaps(dst, src0);
      pcmpgtq(dst, src1);
    }
  } else {
    CpuFeatureScope sse_scope(this, SSE3);
    DCHECK_NE(dst, src0);
    DCHECK_NE(dst, src1);
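    // Emulate the 64-bit signed compare with 32-bit ops: src0 > src1 iff the
    // high dwords compare greater, or they are equal and src1 - src0 borrows
    // out of the low dword. psubq leaves that borrow in the high dword,
    // pcmpeqd/pcmpgtd supply the equality and high-dword comparisons, and
    // movshdup broadcasts the per-qword answer from the odd dwords.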
    movaps(dst, src1);
    movaps(scratch, src0);
    psubq(dst, src0);
    pcmpeqd(scratch, src1);
    andps(dst, scratch);
    movaps(scratch, src0);
    pcmpgtd(scratch, src1);
    orps(dst, scratch);
    movshdup(dst, dst);
  }
}

void SharedTurboAssembler::I64x2GeS(XMMRegister dst, XMMRegister src0,
                                    XMMRegister src1, XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpcmpgtq(dst, src1, src0);
    vpcmpeqd(scratch, scratch, scratch);
    vpxor(dst, dst, scratch);
  } else if (CpuFeatures::IsSupported(SSE4_2)) {
    CpuFeatureScope sse_scope(this, SSE4_2);
    DCHECK_NE(dst, src0);
    if (dst != src1) {
      movaps(dst, src1);
    }
    pcmpgtq(dst, src0);
    pcmpeqd(scratch, scratch);
    xorps(dst, scratch);
  } else {
    CpuFeatureScope sse_scope(this, SSE3);
    DCHECK_NE(dst, src0);
    DCHECK_NE(dst, src1);
    movaps(dst, src0);
    movaps(scratch, src1);
    psubq(dst, src1);
    pcmpeqd(scratch, src0);
    andps(dst, scratch);
    movaps(scratch, src1);
    pcmpgtd(scratch, src0);
    orps(dst, scratch);
    movshdup(dst, dst);
    pcmpeqd(scratch, scratch);
    xorps(dst, scratch);
  }
}

void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
                                     uint8_t shift, XMMRegister xmm_tmp) {
  ASM_CODE_COMMENT(this);
  DCHECK_GT(64, shift);
  DCHECK_NE(xmm_tmp, dst);
  DCHECK_NE(xmm_tmp, src);
  // Use logical right shift to emulate arithmetic right shifts:
  // Given:
  // signed >> c
  //   == (signed + 2^63 - 2^63) >> c
  //   == ((signed + 2^63) >> c) - (2^63 >> c)
  //                                ^^^^^^^^^
  //                                 xmm_tmp
  // signed + 2^63 is an unsigned number, so we can use logical right shifts.
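  // E.g. with 4-bit lanes and c == 1: for signed x == -6 (0b1010),
  // ((x ^ 8) >> 1) - (8 >> 1) == (0b0010 >> 1) - 4 == 1 - 4 == -3, which is
  // the arithmetic right shift of -6 by 1.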

  // xmm_tmp = wasm_i64x2_const(0x80000000'00000000).
  Pcmpeqd(xmm_tmp, xmm_tmp);
  Psllq(xmm_tmp, byte{63});

  if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
    movaps(dst, src);
    src = dst;
  }
  // Add a bias of 2^63 to convert signed to unsigned.
  // Since only highest bit changes, use pxor instead of paddq.
  Pxor(dst, src, xmm_tmp);
  // Logically shift both value and bias.
  Psrlq(dst, shift);
  Psrlq(xmm_tmp, shift);
  // Subtract shifted bias to convert back to signed value.
  Psubq(dst, xmm_tmp);
}

void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
                                     Register shift, XMMRegister xmm_tmp,
                                     XMMRegister xmm_shift,
                                     Register tmp_shift) {
  ASM_CODE_COMMENT(this);
  DCHECK_NE(xmm_tmp, dst);
  DCHECK_NE(xmm_tmp, src);
  DCHECK_NE(xmm_shift, dst);
  DCHECK_NE(xmm_shift, src);
  // tmp_shift can alias shift since we don't use shift after masking it.

  // See I64x2ShrS with constant shift for explanation of this algorithm.
  Pcmpeqd(xmm_tmp, xmm_tmp);
  Psllq(xmm_tmp, byte{63});

  // Shift modulo 64.
  Move(tmp_shift, shift);
  And(tmp_shift, Immediate(0x3F));
  Movd(xmm_shift, tmp_shift);

  if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
    movaps(dst, src);
    src = dst;
  }
  Pxor(dst, src, xmm_tmp);
  Psrlq(dst, xmm_shift);
  Psrlq(xmm_tmp, xmm_shift);
  Psubq(dst, xmm_tmp);
}

void SharedTurboAssembler::I64x2Mul(XMMRegister dst, XMMRegister lhs,
                                    XMMRegister rhs, XMMRegister tmp1,
                                    XMMRegister tmp2) {
  ASM_CODE_COMMENT(this);
  DCHECK(!AreAliased(dst, tmp1, tmp2));
  DCHECK(!AreAliased(lhs, tmp1, tmp2));
  DCHECK(!AreAliased(rhs, tmp1, tmp2));

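  // Per 64-bit lane, with a == a_hi * 2^32 + a_lo and b == b_hi * 2^32 + b_lo:
  //   a * b mod 2^64 == ((a_hi * b_lo + a_lo * b_hi) << 32) + a_lo * b_lo,
  // since the a_hi * b_hi term is shifted out entirely.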
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // 1. Multiply high dword of each qword of left with right.
    vpsrlq(tmp1, lhs, byte{32});
    vpmuludq(tmp1, tmp1, rhs);
    // 2. Multiply high dword of each qword of right with left.
    vpsrlq(tmp2, rhs, byte{32});
    vpmuludq(tmp2, tmp2, lhs);
    // 3. Add 1 and 2, then shift left by 32 (this is the high dword of result).
    vpaddq(tmp2, tmp2, tmp1);
    vpsllq(tmp2, tmp2, byte{32});
    // 4. Multiply low dwords (this is the low dword of result).
    vpmuludq(dst, lhs, rhs);
    // 5. Add 3 and 4.
    vpaddq(dst, dst, tmp2);
  } else {
    // Same algorithm as AVX version, but with moves to not overwrite inputs.
    movaps(tmp1, lhs);
    movaps(tmp2, rhs);
    psrlq(tmp1, byte{32});
    pmuludq(tmp1, rhs);
    psrlq(tmp2, byte{32});
    pmuludq(tmp2, lhs);
    paddq(tmp2, tmp1);
    psllq(tmp2, byte{32});
    if (dst == rhs) {
      // pmuludq is commutative
      pmuludq(dst, lhs);
    } else {
      if (dst != lhs) {
        movaps(dst, lhs);
      }
      pmuludq(dst, rhs);
    }
    paddq(dst, tmp2);
  }
}

// 1. Unpack src1, src1 into even-numbered elements of scratch.
// 2. Unpack src2, src2 into even-numbered elements of dst.
// 3. Multiply 1. with 2.
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
void SharedTurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
                                       XMMRegister src2, XMMRegister scratch,
                                       bool low, bool is_signed) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    if (low) {
      vpunpckldq(scratch, src1, src1);
      vpunpckldq(dst, src2, src2);
    } else {
      vpunpckhdq(scratch, src1, src1);
      vpunpckhdq(dst, src2, src2);
    }
    if (is_signed) {
      vpmuldq(dst, scratch, dst);
    } else {
      vpmuludq(dst, scratch, dst);
    }
  } else {
    uint8_t mask = low ? 0x50 : 0xFA;
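    // 0x50 == 0b01'01'00'00 replicates dwords 0 and 1 of the source, and
    // 0xFA == 0b11'11'10'10 replicates dwords 2 and 3, so the even dword
    // slots read by pmuldq/pmuludq hold the desired inputs.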
    pshufd(scratch, src1, mask);
    pshufd(dst, src2, mask);
    if (is_signed) {
      CpuFeatureScope sse4_scope(this, SSE4_1);
      pmuldq(dst, scratch);
    } else {
      pmuludq(dst, scratch);
    }
  }
}

void SharedTurboAssembler::I64x2SConvertI32x4High(XMMRegister dst,
                                                  XMMRegister src) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpunpckhqdq(dst, src, src);
    vpmovsxdq(dst, dst);
  } else {
    CpuFeatureScope sse_scope(this, SSE4_1);
    if (dst == src) {
      movhlps(dst, src);
    } else {
      pshufd(dst, src, 0xEE);
    }
    pmovsxdq(dst, dst);
  }
}

void SharedTurboAssembler::I64x2UConvertI32x4High(XMMRegister dst,
                                                  XMMRegister src,
                                                  XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpxor(scratch, scratch, scratch);
    vpunpckhdq(dst, src, scratch);
  } else {
    if (dst == src) {
      // xorps can be executed on more ports than pshufd.
      xorps(scratch, scratch);
      punpckhdq(dst, scratch);
    } else {
      CpuFeatureScope sse_scope(this, SSE4_1);
      // No dependency on dst.
      pshufd(dst, src, 0xEE);
      pmovzxdq(dst, dst);
    }
  }
}

void SharedTurboAssembler::S128Not(XMMRegister dst, XMMRegister src,
                                   XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (dst == src) {
    Pcmpeqd(scratch, scratch);
    Pxor(dst, scratch);
  } else {
    Pcmpeqd(dst, dst);
    Pxor(dst, src);
  }
}

void SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
                                      XMMRegister src1, XMMRegister src2,
                                      XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // v128.select = v128.or(v128.and(v1, c), v128.andnot(v2, c)).
  // pandn(x, y) = ~x & y, so we have to flip the mask and input.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpandn(scratch, mask, src2);
    vpand(dst, src1, mask);
    vpor(dst, dst, scratch);
  } else {
    DCHECK_EQ(dst, mask);
    // Use float ops as they are 1 byte shorter than int ops.
    movaps(scratch, mask);
    andnps(scratch, src2);
    andps(dst, src1);
    orps(dst, scratch);
  }
}

void SharedTurboAssembler::S128Load8Splat(XMMRegister dst, Operand src,
                                          XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // The trap handler uses the current pc to create a landing pad, so that it
  // can determine whether a trap occurred in Wasm code due to an OOB load.
  // Make sure the first instruction in each case below is the one that loads.
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    vpbroadcastb(dst, src);
  } else if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Avoid dependency on previous value of dst.
    vpinsrb(dst, scratch, src, uint8_t{0});
    vpxor(scratch, scratch, scratch);
    vpshufb(dst, dst, scratch);
  } else {
    CpuFeatureScope ssse4_scope(this, SSE4_1);
    CpuFeatureScope ssse3_scope(this, SSSE3);
    pinsrb(dst, src, uint8_t{0});
    xorps(scratch, scratch);
    pshufb(dst, scratch);
  }
}

void SharedTurboAssembler::S128Load16Splat(XMMRegister dst, Operand src,
                                           XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // The trap handler uses the current pc to create a landing pad, so that it
  // can determine whether a trap occurred in Wasm code due to an OOB load.
  // Make sure the first instruction in each case below is the one that loads.
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    vpbroadcastw(dst, src);
  } else if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Avoid dependency on previous value of dst.
    vpinsrw(dst, scratch, src, uint8_t{0});
    vpshuflw(dst, dst, uint8_t{0});
    vpunpcklqdq(dst, dst, dst);
  } else {
    pinsrw(dst, src, uint8_t{0});
    pshuflw(dst, dst, uint8_t{0});
    movlhps(dst, dst);
  }
}

void SharedTurboAssembler::S128Load32Splat(XMMRegister dst, Operand src) {
  ASM_CODE_COMMENT(this);
  // The trap handler uses the current pc to create a landing pad, so that it
  // can determine whether a trap occurred in Wasm code due to an OOB load.
  // Make sure the first instruction in each case below is the one that loads.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vbroadcastss(dst, src);
  } else {
    movss(dst, src);
    shufps(dst, dst, byte{0});
  }
}

void SharedTurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
                                           uint8_t laneidx) {
  ASM_CODE_COMMENT(this);
  if (laneidx == 0) {
    Movlps(dst, src);
  } else {
    DCHECK_EQ(1, laneidx);
    Movhps(dst, src);
  }
}

}  // namespace internal
}  // namespace v8

#undef DCHECK_OPERAND_IS_NOT_REG