// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h"

#include "src/codegen/assembler.h"
#include "src/codegen/cpu-features.h"
#include "src/codegen/register-arch.h"

#if V8_TARGET_ARCH_IA32
#include "src/codegen/ia32/register-ia32.h"
#elif V8_TARGET_ARCH_X64
#include "src/codegen/x64/register-x64.h"
#else
#error Unsupported target architecture.
#endif

// An Operand on IA32 can be a wrapper for a single register, in which case
// callers should use the I8x16Splat overload whose |src| is a Register.
#if V8_TARGET_ARCH_IA32
#define DCHECK_OPERAND_IS_NOT_REG(op) DCHECK(!op.is_reg_only());
#else
#define DCHECK_OPERAND_IS_NOT_REG(op)
#endif

namespace v8 {
namespace internal {

void SharedTurboAssembler::Move(Register dst, uint32_t src) {
  // Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
  mov(dst, Immediate(src));
#elif V8_TARGET_ARCH_X64
  movl(dst, Immediate(src));
#else
#error Unsupported target architecture.
#endif
}

void SharedTurboAssembler::Move(Register dst, Register src) {
  // Helper to paper over the different assembler function names.
  if (dst != src) {
#if V8_TARGET_ARCH_IA32
    mov(dst, src);
#elif V8_TARGET_ARCH_X64
    movq(dst, src);
#else
#error Unsupported target architecture.
#endif
  }
}

void SharedTurboAssembler::Add(Register dst, Immediate src) {
  // Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
  add(dst, src);
#elif V8_TARGET_ARCH_X64
  addq(dst, src);
#else
#error Unsupported target architecture.
#endif
}

void SharedTurboAssembler::And(Register dst, Immediate src) {
  // Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
  and_(dst, src);
#elif V8_TARGET_ARCH_X64
  andq(dst, src);
#else
#error Unsupported target architecture.
#endif
}

void SharedTurboAssembler::Movhps(XMMRegister dst, XMMRegister src1,
                                  Operand src2) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vmovhps(dst, src1, src2);
  } else {
    if (dst != src1) {
      movaps(dst, src1);
    }
    movhps(dst, src2);
  }
}

void SharedTurboAssembler::Movlps(XMMRegister dst, XMMRegister src1,
                                  Operand src2) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vmovlps(dst, src1, src2);
  } else {
    if (dst != src1) {
      movaps(dst, src1);
    }
    movlps(dst, src2);
  }
}

void SharedTurboAssembler::Shufps(XMMRegister dst, XMMRegister src1,
                                  XMMRegister src2, uint8_t imm8) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vshufps(dst, src1, src2, imm8);
  } else {
    if (dst != src1) {
      movaps(dst, src1);
    }
    shufps(dst, src2, imm8);
  }
}

void SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src,
                                            uint8_t lane) {
  ASM_CODE_COMMENT(this);
  if (lane == 0) {
    if (dst != src) {
      Movaps(dst, src);
    }
  } else {
    DCHECK_EQ(1, lane);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      // Pass src as operand to avoid false-dependency on dst.
      vmovhlps(dst, src, src);
    } else {
      movhlps(dst, src);
    }
  }
}

void SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src,
                                            DoubleRegister rep, uint8_t lane) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    if (lane == 0) {
      vmovsd(dst, src, rep);
    } else {
      vmovlhps(dst, src, rep);
    }
  } else {
    CpuFeatureScope scope(this, SSE4_1);
    if (dst != src) {
      DCHECK_NE(dst, rep);  // Ensure rep is not overwritten.
      movaps(dst, src);
    }
    if (lane == 0) {
      movsd(dst, rep);
    } else {
      movlhps(dst, rep);
    }
  }
}

void SharedTurboAssembler::F32x4Min(XMMRegister dst, XMMRegister lhs,
                                    XMMRegister rhs, XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // The minps instruction doesn't propagate NaNs and +0's in its first
  // operand. Perform minps in both orders, merge the results, and adjust.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vminps(scratch, lhs, rhs);
    vminps(dst, rhs, lhs);
  } else if (dst == lhs || dst == rhs) {
    XMMRegister src = dst == lhs ? rhs : lhs;
    movaps(scratch, src);
    minps(scratch, dst);
    minps(dst, src);
  } else {
    movaps(scratch, lhs);
    minps(scratch, rhs);
    movaps(dst, rhs);
    minps(dst, lhs);
  }
  // Propagate -0's and NaNs, which may be non-canonical.
  Orps(scratch, dst);
  // Canonicalize NaNs by quieting and clearing the payload.
  Cmpunordps(dst, dst, scratch);
  Orps(scratch, dst);
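  // 10 = 1 sign bit + 8 exponent bits + 1 quiet bit: shifting the all-ones
  // unordered mask right by 10 and and-not-ing below keeps exactly those top
  // bits of each lane, i.e. the canonical quiet NaN.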
  Psrld(dst, dst, byte{10});
  Andnps(dst, dst, scratch);
}

void SharedTurboAssembler::F32x4Max(XMMRegister dst, XMMRegister lhs,
                                    XMMRegister rhs, XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // The maxps instruction doesn't propagate NaNs and +0's in its first
  // operand. Perform maxps in both orders, merge the results, and adjust.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vmaxps(scratch, lhs, rhs);
    vmaxps(dst, rhs, lhs);
  } else if (dst == lhs || dst == rhs) {
    XMMRegister src = dst == lhs ? rhs : lhs;
    movaps(scratch, src);
    maxps(scratch, dst);
    maxps(dst, src);
  } else {
    movaps(scratch, lhs);
    maxps(scratch, rhs);
    movaps(dst, rhs);
    maxps(dst, lhs);
  }
  // Find discrepancies.
  Xorps(dst, scratch);
  // Propagate NaNs, which may be non-canonical.
  Orps(scratch, dst);
  // Propagate sign discrepancy and (subtle) quiet NaNs.
  Subps(scratch, scratch, dst);
  // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
  Cmpunordps(dst, dst, scratch);
  Psrld(dst, dst, byte{10});
  Andnps(dst, dst, scratch);
}

void SharedTurboAssembler::F64x2Min(XMMRegister dst, XMMRegister lhs,
                                    XMMRegister rhs, XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    // The minpd instruction doesn't propagate NaNs and +0's in its first
    // operand. Perform minpd in both orders, merge the results, and adjust.
    vminpd(scratch, lhs, rhs);
    vminpd(dst, rhs, lhs);
    // Propagate -0's and NaNs, which may be non-canonical.
    vorpd(scratch, scratch, dst);
    // Canonicalize NaNs by quieting and clearing the payload.
    vcmpunordpd(dst, dst, scratch);
    vorpd(scratch, scratch, dst);
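    // 13 = 1 sign bit + 11 exponent bits + 1 quiet bit: the and-not below
    // keeps only those top bits of each lane, yielding the canonical quiet
    // NaN for float64.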
    vpsrlq(dst, dst, byte{13});
    vandnpd(dst, dst, scratch);
  } else {
    // Compare lhs with rhs, and rhs with lhs, and have the results in scratch
    // and dst. If dst overlaps with lhs or rhs, we can save a move.
    if (dst == lhs || dst == rhs) {
      XMMRegister src = dst == lhs ? rhs : lhs;
      movaps(scratch, src);
      minpd(scratch, dst);
      minpd(dst, src);
    } else {
      movaps(scratch, lhs);
      movaps(dst, rhs);
      minpd(scratch, rhs);
      minpd(dst, lhs);
    }
    orpd(scratch, dst);
    cmpunordpd(dst, scratch);
    orpd(scratch, dst);
    psrlq(dst, byte{13});
    andnpd(dst, scratch);
  }
}

void SharedTurboAssembler::F64x2Max(XMMRegister dst, XMMRegister lhs,
                                    XMMRegister rhs, XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    // The maxpd instruction doesn't propagate NaNs and +0's in its first
    // operand. Perform maxpd in both orders, merge the results, and adjust.
    vmaxpd(scratch, lhs, rhs);
    vmaxpd(dst, rhs, lhs);
    // Find discrepancies.
    vxorpd(dst, dst, scratch);
    // Propagate NaNs, which may be non-canonical.
    vorpd(scratch, scratch, dst);
    // Propagate sign discrepancy and (subtle) quiet NaNs.
    vsubpd(scratch, scratch, dst);
    // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
    vcmpunordpd(dst, dst, scratch);
    vpsrlq(dst, dst, byte{13});
    vandnpd(dst, dst, scratch);
  } else {
    if (dst == lhs || dst == rhs) {
      XMMRegister src = dst == lhs ? rhs : lhs;
      movaps(scratch, src);
      maxpd(scratch, dst);
      maxpd(dst, src);
    } else {
      movaps(scratch, lhs);
      movaps(dst, rhs);
      maxpd(scratch, rhs);
      maxpd(dst, lhs);
    }
    xorpd(dst, scratch);
    orpd(scratch, dst);
    subpd(scratch, dst);
    cmpunordpd(dst, scratch);
    psrlq(dst, byte{13});
    andnpd(dst, scratch);
  }
}

void SharedTurboAssembler::F32x4Splat(XMMRegister dst, DoubleRegister src) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    vbroadcastss(dst, src);
  } else if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vshufps(dst, src, src, 0);
  } else {
    if (dst == src) {
      // 1 byte shorter than pshufd.
      shufps(dst, src, 0);
    } else {
      pshufd(dst, src, 0);
    }
  }
}

void SharedTurboAssembler::F32x4ExtractLane(FloatRegister dst, XMMRegister src,
                                            uint8_t lane) {
  ASM_CODE_COMMENT(this);
  DCHECK_LT(lane, 4);
  // These instructions are shorter than insertps, but will leave junk in
  // the top lanes of dst.
  if (lane == 0) {
    if (dst != src) {
      Movaps(dst, src);
    }
  } else if (lane == 1) {
    Movshdup(dst, src);
  } else if (lane == 2 && dst == src) {
    // Check dst == src to avoid false dependency on dst.
    Movhlps(dst, src);
  } else if (dst == src) {
    Shufps(dst, src, src, lane);
  } else {
    Pshufd(dst, src, lane);
  }
}

void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
                                           uint8_t laneidx) {
  ASM_CODE_COMMENT(this);
  if (laneidx == 0) {
    Movss(dst, src);
  } else {
    DCHECK_GE(3, laneidx);
    Extractps(dst, src, laneidx);
  }
}

template <typename Op>
void SharedTurboAssembler::I8x16SplatPreAvx2(XMMRegister dst, Op src,
                                             XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  DCHECK(!CpuFeatures::IsSupported(AVX2));
  CpuFeatureScope ssse3_scope(this, SSSE3);
  Movd(dst, src);
  Xorps(scratch, scratch);
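  // Shuffling with an all-zero mask selects byte 0 of dst for every
  // destination byte, i.e. broadcasts the byte moved in above.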
  Pshufb(dst, scratch);
}

void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src,
                                      XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    Movd(scratch, src);
    vpbroadcastb(dst, scratch);
  } else {
    I8x16SplatPreAvx2(dst, src, scratch);
  }
}

void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Operand src,
                                      XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  DCHECK_OPERAND_IS_NOT_REG(src);
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    vpbroadcastb(dst, src);
  } else {
    I8x16SplatPreAvx2(dst, src, scratch);
  }
}

void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
                                    uint8_t src2, Register tmp1,
                                    XMMRegister tmp2) {
  ASM_CODE_COMMENT(this);
  DCHECK_NE(dst, tmp2);
  // Perform 16-bit shift, then mask away low bits.
  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
    movaps(dst, src1);
    src1 = dst;
  }

  uint8_t shift = truncate_to_int3(src2);
  Psllw(dst, src1, byte{shift});

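  // psllw shifts whole words, so each low byte spills its top |shift| bits
  // into the neighboring high byte; the mask (0xff << shift in every byte)
  // clears those leaked bits.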
  uint8_t bmask = static_cast<uint8_t>(0xff << shift);
  uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
  Move(tmp1, mask);
  Movd(tmp2, tmp1);
  Pshufd(tmp2, tmp2, uint8_t{0});
  Pand(dst, tmp2);
}

void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
                                    Register src2, Register tmp1,
                                    XMMRegister tmp2, XMMRegister tmp3) {
  ASM_CODE_COMMENT(this);
  DCHECK(!AreAliased(dst, tmp2, tmp3));
  DCHECK(!AreAliased(src1, tmp2, tmp3));

  // Take shift value modulo 8.
  Move(tmp1, src2);
  And(tmp1, Immediate(7));
  Add(tmp1, Immediate(8));
  // Create a mask to unset high bits.
  Movd(tmp3, tmp1);
  Pcmpeqd(tmp2, tmp2);
  Psrlw(tmp2, tmp2, tmp3);
  Packuswb(tmp2, tmp2);
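  // tmp2 now holds 0xff >> shift in every byte: 0xffff >> (shift + 8) leaves
  // that value in each word, and packuswb replicates it byte-wise. Clearing
  // each byte's top |shift| bits first means the word-wide shift below cannot
  // carry bits across byte boundaries.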
  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
    movaps(dst, src1);
    src1 = dst;
  }
  // Mask off the unwanted bits before word-shifting.
  Pand(dst, src1, tmp2);
  Add(tmp1, Immediate(-8));
  Movd(tmp3, tmp1);
  Psllw(dst, dst, tmp3);
}

void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
                                     uint8_t src2, XMMRegister tmp) {
  ASM_CODE_COMMENT(this);
  // Unpack bytes into words, do word (16-bit) shifts, and repack.
  DCHECK_NE(dst, tmp);
  uint8_t shift = truncate_to_int3(src2) + 8;
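  // The unpacks below place each source byte in the high byte of a word, so
  // shifting by (shift + 8) performs the byte shift with sign extension while
  // discarding the stale low byte.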

  Punpckhbw(tmp, src1);
  Punpcklbw(dst, src1);
  Psraw(tmp, shift);
  Psraw(dst, shift);
  Packsswb(dst, tmp);
}

void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
                                     Register src2, Register tmp1,
                                     XMMRegister tmp2, XMMRegister tmp3) {
  ASM_CODE_COMMENT(this);
  DCHECK(!AreAliased(dst, tmp2, tmp3));
  DCHECK_NE(src1, tmp2);

  // Unpack the bytes into words, do arithmetic shifts, and repack.
  Punpckhbw(tmp2, src1);
  Punpcklbw(dst, src1);
  // Prepare shift value.
  Move(tmp1, src2);
  // Take shift value modulo 8.
  And(tmp1, Immediate(7));
  Add(tmp1, Immediate(8));
  Movd(tmp3, tmp1);
  Psraw(tmp2, tmp3);
  Psraw(dst, tmp3);
  Packsswb(dst, tmp2);
}

void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
                                     uint8_t src2, Register tmp1,
                                     XMMRegister tmp2) {
  ASM_CODE_COMMENT(this);
  DCHECK_NE(dst, tmp2);
  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
    movaps(dst, src1);
    src1 = dst;
  }

  // Perform 16-bit shift, then mask away high bits.
  uint8_t shift = truncate_to_int3(src2);
  Psrlw(dst, src1, shift);

  uint8_t bmask = 0xff >> shift;
  uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
  Move(tmp1, mask);
  Movd(tmp2, tmp1);
  Pshufd(tmp2, tmp2, byte{0});
  Pand(dst, tmp2);
}

void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
                                     Register src2, Register tmp1,
                                     XMMRegister tmp2, XMMRegister tmp3) {
  ASM_CODE_COMMENT(this);
  DCHECK(!AreAliased(dst, tmp2, tmp3));
  DCHECK_NE(src1, tmp2);

  // Unpack the bytes into words, do logical shifts, and repack.
  Punpckhbw(tmp2, src1);
  Punpcklbw(dst, src1);
  // Prepare shift value.
  Move(tmp1, src2);
  // Take shift value modulo 8.
  And(tmp1, Immediate(7));
  Add(tmp1, Immediate(8));
  Movd(tmp3, tmp1);
  Psrlw(tmp2, tmp3);
  Psrlw(dst, tmp3);
  Packuswb(dst, tmp2);
}

template <typename Op>
void SharedTurboAssembler::I16x8SplatPreAvx2(XMMRegister dst, Op src) {
  DCHECK(!CpuFeatures::IsSupported(AVX2));
  Movd(dst, src);
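  // Broadcast word 0: pshuflw with an all-zero selector replicates it across
  // the low four words, and punpcklqdq duplicates the low qword into the
  // high half.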
  Pshuflw(dst, dst, uint8_t{0x0});
  Punpcklqdq(dst, dst);
}

void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Register src) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    Movd(dst, src);
    vpbroadcastw(dst, dst);
  } else {
    I16x8SplatPreAvx2(dst, src);
  }
}

void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Operand src) {
  ASM_CODE_COMMENT(this);
  DCHECK_OPERAND_IS_NOT_REG(src);
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    vpbroadcastw(dst, src);
  } else {
    I16x8SplatPreAvx2(dst, src);
  }
}

void SharedTurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
                                          XMMRegister src2, XMMRegister scratch,
                                          bool is_signed) {
  ASM_CODE_COMMENT(this);
  is_signed ? Pmovsxbw(scratch, src1) : Pmovzxbw(scratch, src1);
  is_signed ? Pmovsxbw(dst, src2) : Pmovzxbw(dst, src2);
  Pmullw(dst, scratch);
}

void SharedTurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpunpckhbw(scratch, src1, src1);
    vpsraw(scratch, scratch, 8);
    vpunpckhbw(dst, src2, src2);
    vpsraw(dst, dst, 8);
    vpmullw(dst, dst, scratch);
  } else {
    if (dst != src1) {
      movaps(dst, src1);
    }
    movaps(scratch, src2);
    punpckhbw(dst, dst);
    psraw(dst, 8);
    punpckhbw(scratch, scratch);
    psraw(scratch, 8);
    pmullw(dst, scratch);
  }
}

void SharedTurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // The logic here is slightly complicated to handle all the cases of register
  // aliasing. This allows flexibility for callers in TurboFan and Liftoff.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    if (src1 == src2) {
      vpxor(scratch, scratch, scratch);
      vpunpckhbw(dst, src1, scratch);
      vpmullw(dst, dst, dst);
    } else {
      if (dst == src2) {
        // We overwrite dst, then use src2, so swap src1 and src2.
        std::swap(src1, src2);
      }
      vpxor(scratch, scratch, scratch);
      vpunpckhbw(dst, src1, scratch);
      vpunpckhbw(scratch, src2, scratch);
      vpmullw(dst, dst, scratch);
    }
  } else {
    if (src1 == src2) {
      xorps(scratch, scratch);
      if (dst != src1) {
        movaps(dst, src1);
      }
      punpckhbw(dst, scratch);
      // Square dst, since src1 == src2; scratch is all zeros here.
      pmullw(dst, dst);
    } else {
      // When dst == src1, nothing special needs to be done.
      // When dst == src2, swap src1 and src2, since we overwrite dst.
      // When dst is unique, copy src1 to dst first.
      if (dst == src2) {
        std::swap(src1, src2);
        // Now, dst == src1.
      } else if (dst != src1) {
        // dst != src1 && dst != src2.
        movaps(dst, src1);
      }
      xorps(scratch, scratch);
      punpckhbw(dst, scratch);
      punpckhbw(scratch, src2);
      psrlw(scratch, 8);
      pmullw(dst, scratch);
    }
  }
}

void SharedTurboAssembler::I16x8SConvertI8x16High(XMMRegister dst,
                                                  XMMRegister src) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // src = |a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p| (high)
    // dst = |i|i|j|j|k|k|l|l|m|m|n|n|o|o|p|p|
    vpunpckhbw(dst, src, src);
    vpsraw(dst, dst, 8);
  } else {
    CpuFeatureScope sse_scope(this, SSE4_1);
    if (dst == src) {
      // 2 bytes shorter than pshufd, but has a dependency on dst.
      movhlps(dst, src);
      pmovsxbw(dst, dst);
    } else {
      // No dependency on dst.
      pshufd(dst, src, 0xEE);
      pmovsxbw(dst, dst);
    }
  }
}

void SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst,
                                                  XMMRegister src,
                                                  XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // tmp = |0|0|0|0|0|0|0|0 | 0|0|0|0|0|0|0|0|
    // src = |a|b|c|d|e|f|g|h | i|j|k|l|m|n|o|p|
    // dst = |0|a|0|b|0|c|0|d | 0|e|0|f|0|g|0|h|
    XMMRegister tmp = dst == src ? scratch : dst;
    vpxor(tmp, tmp, tmp);
    vpunpckhbw(dst, src, tmp);
  } else {
    CpuFeatureScope sse_scope(this, SSE4_1);
    if (dst == src) {
      // xorps can be executed on more ports than pshufd.
      xorps(scratch, scratch);
      punpckhbw(dst, scratch);
    } else {
      // No dependency on dst.
      pshufd(dst, src, 0xEE);
      pmovzxbw(dst, dst);
    }
  }
}

void SharedTurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // k = i16x8.splat(0x8000)
  Pcmpeqd(scratch, scratch);
  Psllw(scratch, scratch, byte{15});
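  // pmulhrsw overflows only for 0x8000 * 0x8000 (-1 * -1 in Q15), returning
  // 0x8000 instead of +1. The compare-and-xor below flips exactly those lanes
  // to 0x7fff, the saturated maximum.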

  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
    movaps(dst, src1);
    src1 = dst;
  }

  Pmulhrsw(dst, src1, src2);
  Pcmpeqw(scratch, dst);
  Pxor(dst, scratch);
}

void SharedTurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
                                                     XMMRegister src,
                                                     XMMRegister tmp) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // src = |a|b|c|d|e|f|g|h| (low)
    // tmp = |0|a|0|c|0|e|0|g|
    vpsrld(tmp, src, 16);
    // dst = |0|b|0|d|0|f|0|h|
    vpblendw(dst, src, tmp, 0xAA);
    // dst = |a+b|c+d|e+f|g+h|
    vpaddd(dst, tmp, dst);
  } else if (CpuFeatures::IsSupported(SSE4_1)) {
    CpuFeatureScope sse_scope(this, SSE4_1);
    // There is a potentially better lowering if we get rip-relative
    // constants, see https://github.com/WebAssembly/simd/pull/380.
    movaps(tmp, src);
    psrld(tmp, 16);
    if (dst != src) {
      movaps(dst, src);
    }
    pblendw(dst, tmp, 0xAA);
    paddd(dst, tmp);
  } else {
    // src = |a|b|c|d|e|f|g|h|
    // tmp = i32x4.splat(0x0000FFFF)
    pcmpeqd(tmp, tmp);
    psrld(tmp, byte{16});
    // tmp = |0|b|0|d|0|f|0|h|
    andps(tmp, src);
    // dst = |0|a|0|c|0|e|0|g|
    if (dst != src) {
      movaps(dst, src);
    }
    psrld(dst, byte{16});
    // dst = |a+b|c+d|e+f|g+h|
    paddd(dst, tmp);
  }
}

// 1. Multiply low word into scratch.
// 2. Multiply high word (can be signed or unsigned) into dst.
// 3. Unpack and interleave scratch and dst into dst.
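// pmullw and pmulhw/pmulhuw compute the low and high 16 bits of each
// 16x16->32 product; interleaving the two halves reassembles the full
// 32-bit results.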
void SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
                                       XMMRegister src2, XMMRegister scratch,
                                       bool low, bool is_signed) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpmullw(scratch, src1, src2);
    is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
    low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
  } else {
    DCHECK_EQ(dst, src1);
    movaps(scratch, src1);
    pmullw(dst, src2);
    is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2);
    low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch);
  }
}

void SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst,
                                                  XMMRegister src) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // src = |a|b|c|d|e|f|g|h| (high)
    // dst = |e|e|f|f|g|g|h|h|
    vpunpckhwd(dst, src, src);
    vpsrad(dst, dst, 16);
  } else {
    CpuFeatureScope sse_scope(this, SSE4_1);
    if (dst == src) {
      // 2 bytes shorter than pshufd, but has a dependency on dst.
      movhlps(dst, src);
      pmovsxwd(dst, dst);
    } else {
      // No dependency on dst.
      pshufd(dst, src, 0xEE);
      pmovsxwd(dst, dst);
    }
  }
}

void SharedTurboAssembler::I32x4UConvertI16x8High(XMMRegister dst,
                                                  XMMRegister src,
                                                  XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // scratch = |0|0|0|0|0|0|0|0|
    // src     = |a|b|c|d|e|f|g|h|
    // dst     = |0|a|0|b|0|c|0|d|
    XMMRegister tmp = dst == src ? scratch : dst;
    vpxor(tmp, tmp, tmp);
    vpunpckhwd(dst, src, tmp);
  } else {
    if (dst == src) {
      // xorps can be executed on more ports than pshufd.
      xorps(scratch, scratch);
      punpckhwd(dst, scratch);
    } else {
      CpuFeatureScope sse_scope(this, SSE4_1);
      // No dependency on dst.
      pshufd(dst, src, 0xEE);
      pmovzxwd(dst, dst);
    }
  }
}

void SharedTurboAssembler::I64x2Neg(XMMRegister dst, XMMRegister src,
                                    XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vpxor(scratch, scratch, scratch);
    vpsubq(dst, scratch, src);
  } else {
    if (dst == src) {
      movaps(scratch, src);
      std::swap(src, scratch);
    }
    pxor(dst, dst);
    psubq(dst, src);
  }
}

void SharedTurboAssembler::I64x2Abs(XMMRegister dst, XMMRegister src,
                                    XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    XMMRegister tmp = dst == src ? scratch : dst;
    vpxor(tmp, tmp, tmp);
    vpsubq(tmp, tmp, src);
    vblendvpd(dst, src, tmp, src);
  } else {
    CpuFeatureScope sse_scope(this, SSE3);
    movshdup(scratch, src);
    if (dst != src) {
      movaps(dst, src);
    }
    psrad(scratch, 31);
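    // scratch now holds each qword's sign bit replicated to all 64 bits;
    // (x ^ mask) - mask negates exactly the negative lanes (two's-complement
    // absolute value).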
    xorps(dst, scratch);
    psubq(dst, scratch);
  }
}

void SharedTurboAssembler::I64x2GtS(XMMRegister dst, XMMRegister src0,
                                    XMMRegister src1, XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpcmpgtq(dst, src0, src1);
  } else if (CpuFeatures::IsSupported(SSE4_2)) {
    CpuFeatureScope sse_scope(this, SSE4_2);
    if (dst == src0) {
      pcmpgtq(dst, src1);
    } else if (dst == src1) {
      movaps(scratch, src0);
      pcmpgtq(scratch, src1);
      movaps(dst, scratch);
    } else {
      movaps(dst, src0);
      pcmpgtq(dst, src1);
    }
  } else {
    CpuFeatureScope sse_scope(this, SSE3);
    DCHECK_NE(dst, src0);
    DCHECK_NE(dst, src1);
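    // Without pcmpgtq, synthesize the signed 64-bit compare from 32-bit ops:
    // src0 > src1 iff src0_hi > src1_hi, or src0_hi == src1_hi and the high
    // dword of (src1 - src0) borrows (i.e. src1_lo < src0_lo unsigned).
    // movshdup at the end copies each qword's high dword into both halves to
    // form the full-lane mask.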
    movaps(dst, src1);
    movaps(scratch, src0);
    psubq(dst, src0);
    pcmpeqd(scratch, src1);
    andps(dst, scratch);
    movaps(scratch, src0);
    pcmpgtd(scratch, src1);
    orps(dst, scratch);
    movshdup(dst, dst);
  }
}

void SharedTurboAssembler::I64x2GeS(XMMRegister dst, XMMRegister src0,
                                    XMMRegister src1, XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpcmpgtq(dst, src1, src0);
    vpcmpeqd(scratch, scratch, scratch);
    vpxor(dst, dst, scratch);
  } else if (CpuFeatures::IsSupported(SSE4_2)) {
    CpuFeatureScope sse_scope(this, SSE4_2);
    DCHECK_NE(dst, src0);
    if (dst != src1) {
      movaps(dst, src1);
    }
    pcmpgtq(dst, src0);
    pcmpeqd(scratch, scratch);
    xorps(dst, scratch);
  } else {
    CpuFeatureScope sse_scope(this, SSE3);
    DCHECK_NE(dst, src0);
    DCHECK_NE(dst, src1);
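    // Compute src0 >= src1 as !(src1 > src0): run the same synthesized
    // compare as in I64x2GtS with the operands swapped, then invert the mask.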
    movaps(dst, src0);
    movaps(scratch, src1);
    psubq(dst, src1);
    pcmpeqd(scratch, src0);
    andps(dst, scratch);
    movaps(scratch, src1);
    pcmpgtd(scratch, src0);
    orps(dst, scratch);
    movshdup(dst, dst);
    pcmpeqd(scratch, scratch);
    xorps(dst, scratch);
  }
}

void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
                                     uint8_t shift, XMMRegister xmm_tmp) {
  ASM_CODE_COMMENT(this);
  DCHECK_GT(64, shift);
  DCHECK_NE(xmm_tmp, dst);
  DCHECK_NE(xmm_tmp, src);
  // Use logical right shift to emulate arithmetic right shifts:
  // Given:
  // signed >> c
  // == (signed + 2^63 - 2^63) >> c
  // == ((signed + 2^63) >> c) - (2^63 >> c)
  //                              ^^^^^^^^^
  //                               xmm_tmp
  // signed + 2^63 is an unsigned number, so we can use logical right shifts.
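  // Worked example for x = -2, c = 1: x ^ 2^63 = 0x7fffffff'fffffffe;
  // shifting gives 0x3fffffff'ffffffff; subtracting 2^63 >> 1 yields
  // 0xffffffff'ffffffff = -1, which is -2 >> 1.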

  // xmm_tmp = wasm_i64x2_const(0x80000000'00000000).
  Pcmpeqd(xmm_tmp, xmm_tmp);
  Psllq(xmm_tmp, byte{63});

  if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
    movaps(dst, src);
    src = dst;
  }
  // Add a bias of 2^63 to convert signed to unsigned.
  // Since only highest bit changes, use pxor instead of paddq.
  Pxor(dst, src, xmm_tmp);
  // Logically shift both value and bias.
  Psrlq(dst, shift);
  Psrlq(xmm_tmp, shift);
  // Subtract shifted bias to convert back to signed value.
  Psubq(dst, xmm_tmp);
}

void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
                                     Register shift, XMMRegister xmm_tmp,
                                     XMMRegister xmm_shift,
                                     Register tmp_shift) {
  ASM_CODE_COMMENT(this);
  DCHECK_NE(xmm_tmp, dst);
  DCHECK_NE(xmm_tmp, src);
  DCHECK_NE(xmm_shift, dst);
  DCHECK_NE(xmm_shift, src);
  // tmp_shift can alias shift since we don't use shift after masking it.

  // See I64x2ShrS with constant shift for an explanation of this algorithm.
  Pcmpeqd(xmm_tmp, xmm_tmp);
  Psllq(xmm_tmp, byte{63});

  // Shift modulo 64.
  Move(tmp_shift, shift);
  And(tmp_shift, Immediate(0x3F));
  Movd(xmm_shift, tmp_shift);

  if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
    movaps(dst, src);
    src = dst;
  }
  Pxor(dst, src, xmm_tmp);
  Psrlq(dst, xmm_shift);
  Psrlq(xmm_tmp, xmm_shift);
  Psubq(dst, xmm_tmp);
}

void SharedTurboAssembler::I64x2Mul(XMMRegister dst, XMMRegister lhs,
                                    XMMRegister rhs, XMMRegister tmp1,
                                    XMMRegister tmp2) {
  ASM_CODE_COMMENT(this);
  DCHECK(!AreAliased(dst, tmp1, tmp2));
  DCHECK(!AreAliased(lhs, tmp1, tmp2));
  DCHECK(!AreAliased(rhs, tmp1, tmp2));

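  // Write each qword lane as hi * 2^32 + lo. Modulo 2^64,
  //   lhs * rhs = ((lhs_hi * rhs_lo + lhs_lo * rhs_hi) << 32) + lhs_lo * rhs_lo,
  // since the lhs_hi * rhs_hi term overflows entirely; pmuludq computes the
  // 32x32->64 partial products.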
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // 1. Multiply high dword of each qword of left with right.
    vpsrlq(tmp1, lhs, byte{32});
    vpmuludq(tmp1, tmp1, rhs);
    // 2. Multiply high dword of each qword of right with left.
    vpsrlq(tmp2, rhs, byte{32});
    vpmuludq(tmp2, tmp2, lhs);
    // 3. Add 1 and 2, then shift left by 32 (this is the high dword of the
    // result).
    vpaddq(tmp2, tmp2, tmp1);
    vpsllq(tmp2, tmp2, byte{32});
    // 4. Multiply low dwords (this is the low dword of the result).
    vpmuludq(dst, lhs, rhs);
    // 5. Add 3 and 4.
    vpaddq(dst, dst, tmp2);
  } else {
    // Same algorithm as AVX version, but with moves to not overwrite inputs.
    movaps(tmp1, lhs);
    movaps(tmp2, rhs);
    psrlq(tmp1, byte{32});
    pmuludq(tmp1, rhs);
    psrlq(tmp2, byte{32});
    pmuludq(tmp2, lhs);
    paddq(tmp2, tmp1);
    psllq(tmp2, byte{32});
    if (dst == rhs) {
      // pmuludq is commutative.
      pmuludq(dst, lhs);
    } else {
      if (dst != lhs) {
        movaps(dst, lhs);
      }
      pmuludq(dst, rhs);
    }
    paddq(dst, tmp2);
  }
}

// 1. Unpack src1 with itself into even-number elements of scratch.
// 2. Unpack src2 with itself into even-number elements of dst.
// 3. Multiply 1. with 2.
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
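// pshufd selector 0x50 duplicates dwords 0 and 1 (the low half); 0xFA
// duplicates dwords 2 and 3 (the high half). pmuldq/pmuludq then read only
// the even dword of each qword.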
void SharedTurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
                                       XMMRegister src2, XMMRegister scratch,
                                       bool low, bool is_signed) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    if (low) {
      vpunpckldq(scratch, src1, src1);
      vpunpckldq(dst, src2, src2);
    } else {
      vpunpckhdq(scratch, src1, src1);
      vpunpckhdq(dst, src2, src2);
    }
    if (is_signed) {
      vpmuldq(dst, scratch, dst);
    } else {
      vpmuludq(dst, scratch, dst);
    }
  } else {
    uint8_t mask = low ? 0x50 : 0xFA;
    pshufd(scratch, src1, mask);
    pshufd(dst, src2, mask);
    if (is_signed) {
      CpuFeatureScope sse4_scope(this, SSE4_1);
      pmuldq(dst, scratch);
    } else {
      pmuludq(dst, scratch);
    }
  }
}

void SharedTurboAssembler::I64x2SConvertI32x4High(XMMRegister dst,
                                                  XMMRegister src) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpunpckhqdq(dst, src, src);
    vpmovsxdq(dst, dst);
  } else {
    CpuFeatureScope sse_scope(this, SSE4_1);
    if (dst == src) {
      movhlps(dst, src);
    } else {
      pshufd(dst, src, 0xEE);
    }
    pmovsxdq(dst, dst);
  }
}

void SharedTurboAssembler::I64x2UConvertI32x4High(XMMRegister dst,
                                                  XMMRegister src,
                                                  XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpxor(scratch, scratch, scratch);
    vpunpckhdq(dst, src, scratch);
  } else {
    if (dst == src) {
      // xorps can be executed on more ports than pshufd.
      xorps(scratch, scratch);
      punpckhdq(dst, scratch);
    } else {
      CpuFeatureScope sse_scope(this, SSE4_1);
      // No dependency on dst.
      pshufd(dst, src, 0xEE);
      pmovzxdq(dst, dst);
    }
  }
}

void SharedTurboAssembler::S128Not(XMMRegister dst, XMMRegister src,
                                   XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  if (dst == src) {
    Pcmpeqd(scratch, scratch);
    Pxor(dst, scratch);
  } else {
    Pcmpeqd(dst, dst);
    Pxor(dst, src);
  }
}

void SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
                                      XMMRegister src1, XMMRegister src2,
                                      XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // v128.select = v128.or(v128.and(v1, c), v128.andnot(v2, c)).
  // pandn(x, y) = ~x & y, so we have to flip the mask and input.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpandn(scratch, mask, src2);
    vpand(dst, src1, mask);
    vpor(dst, dst, scratch);
  } else {
    DCHECK_EQ(dst, mask);
    // Use float ops as they are 1 byte shorter than int ops.
    movaps(scratch, mask);
    andnps(scratch, src2);
    andps(dst, src1);
    orps(dst, scratch);
  }
}

void SharedTurboAssembler::S128Load8Splat(XMMRegister dst, Operand src,
                                          XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // The trap handler uses the current pc to create a landing pad, so that it
  // can determine whether a trap occurred in Wasm code due to an OOB load.
  // Make sure the first instruction in each case below is the one that loads.
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    vpbroadcastb(dst, src);
  } else if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Avoid dependency on previous value of dst.
    vpinsrb(dst, scratch, src, uint8_t{0});
    vpxor(scratch, scratch, scratch);
    vpshufb(dst, dst, scratch);
  } else {
    CpuFeatureScope sse4_scope(this, SSE4_1);
    CpuFeatureScope ssse3_scope(this, SSSE3);
    pinsrb(dst, src, uint8_t{0});
    xorps(scratch, scratch);
    pshufb(dst, scratch);
  }
}

void SharedTurboAssembler::S128Load16Splat(XMMRegister dst, Operand src,
                                           XMMRegister scratch) {
  ASM_CODE_COMMENT(this);
  // The trap handler uses the current pc to create a landing pad, so that it
  // can determine whether a trap occurred in Wasm code due to an OOB load.
  // Make sure the first instruction in each case below is the one that loads.
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    vpbroadcastw(dst, src);
  } else if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Avoid dependency on previous value of dst.
    vpinsrw(dst, scratch, src, uint8_t{0});
    vpshuflw(dst, dst, uint8_t{0});
    vpunpcklqdq(dst, dst, dst);
  } else {
    pinsrw(dst, src, uint8_t{0});
    pshuflw(dst, dst, uint8_t{0});
    movlhps(dst, dst);
  }
}

void SharedTurboAssembler::S128Load32Splat(XMMRegister dst, Operand src) {
  ASM_CODE_COMMENT(this);
  // The trap handler uses the current pc to create a landing pad, so that it
  // can determine whether a trap occurred in Wasm code due to an OOB load.
  // Make sure the first instruction in each case below is the one that loads.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vbroadcastss(dst, src);
  } else {
    movss(dst, src);
    shufps(dst, dst, byte{0});
  }
}

void SharedTurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
                                           uint8_t laneidx) {
  ASM_CODE_COMMENT(this);
  if (laneidx == 0) {
    Movlps(dst, src);
  } else {
    DCHECK_EQ(1, laneidx);
    Movhps(dst, src);
  }
}

}  // namespace internal
}  // namespace v8

#undef DCHECK_OPERAND_IS_NOT_REG