/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "jit/MacroAssembler.h"
#include "jit/x86-shared/MacroAssembler-x86-shared.h"

#include "jit/MacroAssembler-inl.h"

using namespace js;
using namespace js::jit;

using mozilla::DebugOnly;
using mozilla::FloatingPoint;
using mozilla::Maybe;
using mozilla::SpecificNaN;

void MacroAssemblerX86Shared::checkedConvertFloat32x4ToInt32x4(
    FloatRegister src, FloatRegister dest, Register temp, Label* oolEntry,
    Label* rejoin) {
  // Does the conversion and jumps to the OOL entry if the result value
  // is the undefined integer pattern.
  static const SimdConstant InvalidResult =
      SimdConstant::SplatX4(int32_t(-2147483648));
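  // (0x80000000 is the "integer indefinite" value that cvttps2dq writes for
  // NaN and out-of-range lanes; comparing against it below catches every
  // lane that may need the OOL re-check.)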
  convertFloat32x4ToInt32x4(src, dest);

  ScratchSimd128Scope scratch(asMasm());
  asMasm().loadConstantSimd128Int(InvalidResult, scratch);
  packedEqualInt32x4(Operand(dest), scratch);
  // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of
  // the two following instructions.
  vmovmskps(scratch, temp);
  cmp32(temp, Imm32(0));
  j(Assembler::NotEqual, oolEntry);
  bind(rejoin);
}

void MacroAssemblerX86Shared::oolConvertFloat32x4ToInt32x4(
    FloatRegister src, Register temp, Label* rejoin, Label* onConversionError) {
  static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f);
  static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f);

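  // The OOL path re-validates the original lanes: the first compare requires
  // Int32Min <= src in every lane (vmovmskps mask == 15, all four sign bits
  // set), and the second requires Int32Max <= src in no lane (mask == 0).
  // Note that 2147483647.f rounds to 0x1.0p31f, so the second compare rejects
  // values too large for int32; NaN lanes fail the first compare.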
  ScratchSimd128Scope scratch(asMasm());
  asMasm().loadConstantSimd128Float(Int32MinX4, scratch);
  vcmpleps(Operand(src), scratch, scratch);
  vmovmskps(scratch, temp);
  cmp32(temp, Imm32(15));
  j(Assembler::NotEqual, onConversionError);

  asMasm().loadConstantSimd128Float(Int32MaxX4, scratch);
  vcmpleps(Operand(src), scratch, scratch);
  vmovmskps(scratch, temp);
  cmp32(temp, Imm32(0));
  j(Assembler::NotEqual, onConversionError);

  jump(rejoin);
}

void MacroAssemblerX86Shared::checkedConvertFloat32x4ToUint32x4(
    FloatRegister in, FloatRegister out, Register temp, FloatRegister tempF,
    Label* failed) {
  // Classify lane values into 4 disjoint classes:
  //
  //   N-lanes:             in <= -1.0
  //   A-lanes:      -1.0 < in <= 0x0.ffffffp31
  //   B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32
  //   V-lanes: 0x1.0p32 <= in, or isnan(in)
  //
  // We need to bail out to throw a RangeError if we see any N-lanes or
  // V-lanes.
  //
  // For A-lanes and B-lanes, we make two float -> int32 conversions:
  //
  //   A = cvttps2dq(in)
  //   B = cvttps2dq(in - 0x1.0p31f)
  //
  // Note that the subtraction for the B computation is exact for B-lanes.
  // There is no rounding, so B is the low 31 bits of the correctly converted
  // result.
  //
  // The cvttps2dq instruction produces 0x80000000 when the input is NaN or
  // out of range for a signed int32_t. This conveniently provides the missing
  // high bit for B, so the desired result is A for A-lanes and A|B for
  // B-lanes.
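  //
  // Worked example: in = 0x1.8p31f (3221225472.0f) is a B-lane.
  //   A = cvttps2dq(in)            = 0x80000000  (out of signed range)
  //   B = cvttps2dq(in - 0x1.0p31) = cvttps2dq(0x1.0p30) = 0x40000000
  //   A|B = 0xC0000000 = 3221225472 as a uint32_t, as desired.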

  ScratchSimd128Scope scratch(asMasm());

  // TODO: If the majority of lanes are A-lanes, it could be faster to compute
  // A first, use vmovmskps to check for any non-A-lanes and handle them in
  // ool code. OTOH, if we're wrong about the lane distribution, that would be
  // slower.

  // Compute B in |scratch|.
  static const float Adjust = 0x80000000;  // 0x1.0p31f for the benefit of MSVC.
  static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust);
  asMasm().loadConstantSimd128Float(Bias, scratch);
  packedAddFloat32(Operand(in), scratch);
  convertFloat32x4ToInt32x4(scratch, scratch);

  // Compute A in |out|. This is the last time we use |in| and the first time
  // we use |out|, so we can tolerate if they are the same register.
  convertFloat32x4ToInt32x4(in, out);

  // We can identify A-lanes by the sign bits in A: Any A-lanes will be
  // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a
  // mask of non-A-lanes into |tempF|.
  zeroSimd128Float(tempF);
  packedGreaterThanInt32x4(Operand(out), tempF);

  // Clear the A-lanes in B.
  bitwiseAndSimdInt(scratch, Operand(tempF), scratch);

  // Compute the final result: A for A-lanes, A|B for B-lanes.
  bitwiseOrSimdInt(out, Operand(scratch), out);

  // We still need to filter out the V-lanes. They would show up as 0x80000000
  // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are
  // the remaining negative lanes in B.
  vmovmskps(scratch, temp);
  cmp32(temp, Imm32(0));
  j(Assembler::NotEqual, failed);
}

void MacroAssemblerX86Shared::createInt32x4(Register lane0, Register lane1,
                                            Register lane2, Register lane3,
                                            FloatRegister dest) {
  if (AssemblerX86Shared::HasSSE41()) {
    vmovd(lane0, dest);
    vpinsrd(1, lane1, dest, dest);
    vpinsrd(2, lane2, dest, dest);
    vpinsrd(3, lane3, dest, dest);
    return;
  }

  asMasm().reserveStack(Simd128DataSize);
  store32(lane0, Address(StackPointer, 0 * sizeof(int32_t)));
  store32(lane1, Address(StackPointer, 1 * sizeof(int32_t)));
  store32(lane2, Address(StackPointer, 2 * sizeof(int32_t)));
  store32(lane3, Address(StackPointer, 3 * sizeof(int32_t)));
  loadAlignedSimd128Int(Address(StackPointer, 0), dest);
  asMasm().freeStack(Simd128DataSize);
}

void MacroAssemblerX86Shared::createFloat32x4(
    FloatRegister lane0, FloatRegister lane1, FloatRegister lane2,
    FloatRegister lane3, FloatRegister temp, FloatRegister output) {
  FloatRegister lane0Copy = reusedInputSimd128Float(lane0, output);
  FloatRegister lane1Copy = reusedInputSimd128Float(lane1, temp);
  vunpcklps(lane3, lane1Copy, temp);
  vunpcklps(lane2, lane0Copy, output);
  vunpcklps(temp, output, output);
}

void MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) {
  vmovd(input, output);
  if (AssemblerX86Shared::HasSSSE3()) {
    zeroSimd128Int(ScratchSimd128Reg);
    vpshufb(ScratchSimd128Reg, output, output);
  } else {
    // Use two shifts to duplicate the low 8 bits into the low 16 bits.
    vpsllw(Imm32(8), output, output);
    vmovdqa(output, ScratchSimd128Reg);
    vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg);
    vpor(ScratchSimd128Reg, output, output);
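    // (E.g. low word 0x00ab: psllw(8) gives 0xab00 in |output|; the copy
    // shifted back right by 8 gives 0x00ab; por merges them into 0xabab.)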
    // Then do an X8 splat.
    vpshuflw(0, output, output);
    vpshufd(0, output, output);
  }
}

void MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) {
  vmovd(input, output);
  vpshuflw(0, output, output);
  vpshufd(0, output, output);
}

void MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output) {
  vmovd(input, output);
  vpshufd(0, output, output);
}

void MacroAssemblerX86Shared::splatX4(FloatRegister input,
                                      FloatRegister output) {
  FloatRegister inputCopy = reusedInputSimd128Float(input, output);
  vshufps(0, inputCopy, inputCopy, output);
}

void MacroAssemblerX86Shared::splatX2(FloatRegister input,
                                      FloatRegister output) {
  FloatRegister inputCopy = reusedInputSimd128Float(input, output);
  vshufpd(0, inputCopy, inputCopy, output);
}

void MacroAssemblerX86Shared::reinterpretSimd(bool isIntegerLaneType,
                                              FloatRegister input,
                                              FloatRegister output) {
  if (input.aliases(output)) {
    return;
  }
  if (isIntegerLaneType) {
    vmovdqa(input, output);
  } else {
    vmovaps(input, output);
  }
}

void MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input,
                                                 Register output,
                                                 unsigned lane) {
  if (lane == 0) {
    // The value we want to extract is in the low double-word
    moveLowInt32(input, output);
  } else if (AssemblerX86Shared::HasSSE41()) {
    vpextrd(lane, input, output);
  } else {
    uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
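    // (ComputeShuffleMask packs four 2-bit lane indices into a pshufd-style
    // immediate, low lane in the low bits; called with one argument it
    // presumably leaves the other lanes at their identity defaults, which is
    // fine here since only lane 0 of the shuffled result is read below.)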
    shuffleInt32(mask, input, ScratchSimd128Reg);
    moveLowInt32(ScratchSimd128Reg, output);
  }
}

void MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input,
                                                   FloatRegister output,
                                                   unsigned lane) {
  if (lane == 0) {
    // The value we want to extract is in the low double-word
    if (input != output) {
      moveFloat32(input, output);
    }
  } else if (lane == 2) {
    moveHighPairToLowPairFloat32(input, output);
  } else {
    uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
    shuffleFloat32(mask, input, output);
  }
}

void MacroAssemblerX86Shared::extractLaneFloat64x2(FloatRegister input,
                                                   FloatRegister output,
                                                   unsigned lane) {
  if (lane == 0) {
    // The value we want to extract is in the low quadword
    if (input != output) {
      moveDouble(input, output);
    }
  } else {
    vpalignr(Operand(input), output, 8);
  }
}

void MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input,
                                                 Register output,
                                                 unsigned lane,
                                                 SimdSign sign) {
  // Unlike pextrd and pextrb, this is available in SSE2.
  vpextrw(lane, input, output);
  if (sign == SimdSign::Signed) {
    movswl(output, output);
  }
}

void MacroAssemblerX86Shared::extractLaneInt8x16(FloatRegister input,
                                                 Register output,
                                                 unsigned lane,
                                                 SimdSign sign) {
  if (AssemblerX86Shared::HasSSE41()) {
    vpextrb(lane, input, output);
    // vpextrb clears the high bits, so no further extension required.
    if (sign == SimdSign::Unsigned) {
      sign = SimdSign::NotApplicable;
    }
  } else {
    // Extract the relevant 16 bits containing our lane, then shift the
    // correct 8 bits into place.
    extractLaneInt16x8(input, output, lane / 2, SimdSign::Unsigned);
    if (lane % 2) {
      shrl(Imm32(8), output);
      // The shrl handles the zero-extension. Don't repeat it.
      if (sign == SimdSign::Unsigned) {
        sign = SimdSign::NotApplicable;
      }
    }
  }

  // We have the right low 8 bits in |output|, but we may need to fix the high
  // bits. Note that this requires |output| to be one of the %eax-%edx
  // registers.
  switch (sign) {
    case SimdSign::Signed:
      movsbl(output, output);
      break;
    case SimdSign::Unsigned:
      movzbl(output, output);
      break;
    case SimdSign::NotApplicable:
      // No adjustment needed.
      break;
  }
}

void MacroAssemblerX86Shared::extractLaneSimdBool(FloatRegister input,
                                                  Register output,
                                                  unsigned numLanes,
                                                  unsigned lane) {
  switch (numLanes) {
    case 4:
      extractLaneInt32x4(input, output, lane);
      break;
    case 8:
      // Get a lane, don't bother fixing the high bits since we'll mask below.
      extractLaneInt16x8(input, output, lane, SimdSign::NotApplicable);
      break;
    case 16:
      extractLaneInt8x16(input, output, lane, SimdSign::NotApplicable);
      break;
    default:
      MOZ_CRASH("Unhandled SIMD number of lanes");
  }
  // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits.
  asMasm().and32(Imm32(1), output);
}

void MacroAssemblerX86Shared::insertLaneSimdInt(FloatRegister input,
                                                Register value,
                                                FloatRegister output,
                                                unsigned lane,
                                                unsigned numLanes) {
  if (numLanes == 8) {
    // Available in SSE 2.
    vpinsrw(lane, value, input, output);
    return;
  }

  // Note that, contrary to float32x4, we cannot use vmovd if the inserted
  // value goes into the first component, as vmovd clears out the higher lanes
  // of the output.
  if (AssemblerX86Shared::HasSSE41()) {
    // TODO: Teach Lowering that we don't need defineReuseInput if we have AVX.
    switch (numLanes) {
      case 4:
        vpinsrd(lane, value, input, output);
        return;
      case 16:
        vpinsrb(lane, value, input, output);
        return;
    }
  }

  asMasm().reserveStack(Simd128DataSize);
  storeAlignedSimd128Int(input, Address(StackPointer, 0));
  switch (numLanes) {
    case 4:
      store32(value, Address(StackPointer, lane * sizeof(int32_t)));
      break;
    case 16:
      // Note that this requires `value` to be in one of the registers where
      // the low 8 bits are addressable (%eax-%edx on x86, all of them on
      // x86-64).
      store8(value, Address(StackPointer, lane * sizeof(int8_t)));
      break;
    default:
      MOZ_CRASH("Unsupported SIMD numLanes");
  }
  loadAlignedSimd128Int(Address(StackPointer, 0), output);
  asMasm().freeStack(Simd128DataSize);
}

void MacroAssemblerX86Shared::insertLaneFloat32x4(FloatRegister input,
                                                  FloatRegister value,
                                                  FloatRegister output,
                                                  unsigned lane) {
  // This code can't work if this is not true. That's probably a bug.
  MOZ_RELEASE_ASSERT(input == output);

  if (lane == 0) {
    if (value != output) {
      vmovss(value, input, output);
    }
    return;
  }

  if (AssemblerX86Shared::HasSSE41()) {
    // The input value is in the low float32 of the 'value' FloatRegister.
    vinsertps(vinsertpsMask(0, lane), value, output, output);
    return;
  }

  asMasm().reserveStack(Simd128DataSize);
  storeAlignedSimd128Float(input, Address(StackPointer, 0));
  asMasm().storeFloat32(value, Address(StackPointer, lane * sizeof(int32_t)));
  loadAlignedSimd128Float(Address(StackPointer, 0), output);
  asMasm().freeStack(Simd128DataSize);
}

void MacroAssemblerX86Shared::insertLaneFloat64x2(FloatRegister input,
                                                  FloatRegister value,
                                                  FloatRegister output,
                                                  unsigned lane) {
  if (input == output && output == value) {
    // No-op
    return;
  }

  if (input != output && value != output) {
    // Merge input and value into output, so make input==output
    vmovapd(input, output);
    input = output;
  }

  if (input == output) {
    // Merge value into output
    if (lane == 0) {
      // move low qword of value into low qword of output
      vmovsd(value, output, output);
    } else {
      // move low qword of value into high qword of output
      vshufpd(0, value, output, output);
    }
  } else {
    MOZ_ASSERT(value == output);
    // Merge input into output
    if (lane == 0) {
      // move high qword of input into high qword of output
      vshufpd(2, input, output, output);
    } else {
      // move low qword of output into high qword of output
      vmovddup(output, output);
      // move low qword of input into low qword of output
      vmovsd(input, output, output);
    }
  }
}

void MacroAssemblerX86Shared::allTrueSimdBool(FloatRegister input,
                                              Register output) {
  // We know that the input lanes are boolean, so they are either 0 or -1.
  // The all-true vector has all 128 bits set, no matter the lane geometry.
  vpmovmskb(input, output);
  cmp32(output, Imm32(0xffff));
  emitSet(Assembler::Zero, output);
}

void MacroAssemblerX86Shared::anyTrueSimdBool(FloatRegister input,
                                              Register output) {
  vpmovmskb(input, output);
  cmp32(output, Imm32(0x0));
  emitSet(Assembler::NonZero, output);
}

void MacroAssemblerX86Shared::swizzleInt32x4(FloatRegister input,
                                             FloatRegister output,
                                             unsigned lanes[4]) {
  uint32_t mask = MacroAssembler::ComputeShuffleMask(lanes[0], lanes[1],
                                                     lanes[2], lanes[3]);
  shuffleInt32(mask, input, output);
}

// For SIMD.js
void MacroAssemblerX86Shared::oldSwizzleInt8x16(FloatRegister input,
                                                FloatRegister output,
                                                const Maybe<Register>& temp,
                                                int8_t lanes[16]) {
  if (AssemblerX86Shared::HasSSSE3()) {
    ScratchSimd128Scope scratch(asMasm());
    asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(lanes), scratch);
    FloatRegister inputCopy = reusedInputInt32x4(input, output);
    vpshufb(scratch, inputCopy, output);
    return;
  }

  // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
  MOZ_ASSERT(!!temp, "needs a temp for the memory fallback");
  asMasm().reserveStack(2 * Simd128DataSize);
  storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize));
  for (unsigned i = 0; i < 16; i++) {
    load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *temp);
    store8(*temp, Address(StackPointer, i));
  }
  loadAlignedSimd128Int(Address(StackPointer, 0), output);
  asMasm().freeStack(2 * Simd128DataSize);
}

static inline bool LanesMatch(unsigned lanes[4], unsigned x, unsigned y,
                              unsigned z, unsigned w) {
  return lanes[0] == x && lanes[1] == y && lanes[2] == z && lanes[3] == w;
}

void MacroAssemblerX86Shared::swizzleFloat32x4(FloatRegister input,
                                               FloatRegister output,
                                               unsigned lanes[4]) {
  if (AssemblerX86Shared::HasSSE3()) {
    if (LanesMatch(lanes, 0, 0, 2, 2)) {
      vmovsldup(input, output);
      return;
    }
    if (LanesMatch(lanes, 1, 1, 3, 3)) {
      vmovshdup(input, output);
      return;
    }
  }

  // TODO Here and below, arch specific lowering could identify this pattern
  // and use defineReuseInput to avoid this move (bug 1084404)
  if (LanesMatch(lanes, 2, 3, 2, 3)) {
    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
    vmovhlps(input, inputCopy, output);
    return;
  }

  if (LanesMatch(lanes, 0, 1, 0, 1)) {
    if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) {
      vmovddup(input, output);
      return;
    }
    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
    vmovlhps(input, inputCopy, output);
    return;
  }

  if (LanesMatch(lanes, 0, 0, 1, 1)) {
    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
    vunpcklps(input, inputCopy, output);
    return;
  }

  if (LanesMatch(lanes, 2, 2, 3, 3)) {
    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
    vunpckhps(input, inputCopy, output);
    return;
  }

  uint32_t x = lanes[0];
  uint32_t y = lanes[1];
  uint32_t z = lanes[2];
  uint32_t w = lanes[3];

  uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
  shuffleFloat32(mask, input, output);
}

void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
                                           FloatRegister output,
                                           FloatRegister temp,
                                           const uint8_t lanes[16]) {
  MOZ_ASSERT(AssemblerX86Shared::HasSSSE3());
  MOZ_ASSERT(lhs == output);
  MOZ_ASSERT(lhs == rhs || !temp.isInvalid());

  // TODO: For sse4.1, consider whether PBLENDVB would be better, even if it
  // is variable and requires xmm0 to be free and the loading of a mask.

  // Set scratch = lanes to select from lhs.
  int8_t mask[16];
  for (unsigned i = 0; i < 16; i++) {
    mask[i] = ~lanes[i];
  }
  ScratchSimd128Scope scratch(asMasm());
  asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(mask), scratch);
  if (lhs == rhs) {
    asMasm().moveSimd128Int(rhs, temp);
    rhs = temp;
  }
  vpand(Operand(scratch), lhs, lhs);
  vpandn(Operand(rhs), scratch, scratch);
  vpor(scratch, lhs, lhs);
}

void MacroAssemblerX86Shared::blendInt16x8(FloatRegister lhs, FloatRegister rhs,
                                           FloatRegister output,
                                           const uint16_t lanes[8]) {
  MOZ_ASSERT(AssemblerX86Shared::HasSSE41());
  MOZ_ASSERT(lhs == output);

  uint32_t mask = 0;
  for (unsigned i = 0; i < 8; i++) {
    if (lanes[i]) {
      mask |= (1 << i);
    }
  }
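  // (E.g. lanes = {0,1,0,0,1,1,0,0} yields mask = 0b00110010, so vpblendw
  // takes words 1, 4, and 5 from rhs and the remaining words from lhs.)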
  vpblendw(mask, rhs, lhs, lhs);
}

void MacroAssemblerX86Shared::shuffleInt8x16(
    FloatRegister lhs, FloatRegister rhs, FloatRegister output,
    const Maybe<FloatRegister>& maybeFloatTemp,
    const Maybe<Register>& maybeTemp, const uint8_t lanes[16]) {
  DebugOnly<bool> hasSSSE3 = AssemblerX86Shared::HasSSSE3();
  MOZ_ASSERT(hasSSSE3 == !!maybeFloatTemp);
  MOZ_ASSERT(!hasSSSE3 == !!maybeTemp);

  // Use pshufb if it is available.
  if (AssemblerX86Shared::HasSSSE3()) {
    ScratchSimd128Scope scratch(asMasm());

    // Use pshufb instructions to gather the lanes from each source vector.
    // A negative index creates a zero lane, so the two vectors can be combined.

    // Set scratch = lanes from lhs.
    int8_t idx[16];
    for (unsigned i = 0; i < 16; i++) {
      idx[i] = lanes[i] < 16 ? lanes[i] : -1;
    }
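    // (E.g. lanes[0] == 3 gives idx[0] = 3, selecting lane 3 of lhs, while
    // lanes[1] == 20 gives idx[1] = -1, a zero byte that the rhs pass below
    // fills in via the vpor.)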
    asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx),
                                    *maybeFloatTemp);
    FloatRegister lhsCopy = reusedInputInt32x4(lhs, scratch);
    vpshufb(*maybeFloatTemp, lhsCopy, scratch);

    // Set output = lanes from rhs.
    // TODO: The alternative to loading this constant is to complement the one
    // that is already in *maybeFloatTemp, which takes two instructions and a
    // temp register: PCMPEQD tmp, tmp; PXOR *maybeFloatTemp, tmp. Scratch is
    // available here so that's OK, but it's not a given that avoiding the
    // load is a win.
    for (unsigned i = 0; i < 16; i++) {
      idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1;
    }
    asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx),
                                    *maybeFloatTemp);
    FloatRegister rhsCopy = reusedInputInt32x4(rhs, output);
    vpshufb(*maybeFloatTemp, rhsCopy, output);

    // Combine.
    vpor(scratch, output, output);
    return;
  }

  // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
  asMasm().reserveStack(3 * Simd128DataSize);
  storeAlignedSimd128Int(lhs, Address(StackPointer, Simd128DataSize));
  storeAlignedSimd128Int(rhs, Address(StackPointer, 2 * Simd128DataSize));
  for (unsigned i = 0; i < 16; i++) {
    load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]),
                    *maybeTemp);
    store8(*maybeTemp, Address(StackPointer, i));
  }
  loadAlignedSimd128Int(Address(StackPointer, 0), output);
  asMasm().freeStack(3 * Simd128DataSize);
}

void MacroAssemblerX86Shared::shuffleX4(FloatRegister lhs, Operand rhs,
                                        FloatRegister out,
                                        const Maybe<FloatRegister>& maybeTemp,
                                        unsigned lanes[4]) {
  uint32_t x = lanes[0];
  uint32_t y = lanes[1];
  uint32_t z = lanes[2];
  uint32_t w = lanes[3];

  // Check that lanes come from LHS in majority:
  unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
  MOZ_ASSERT(numLanesFromLHS >= 2);

  // When reading this method, remember that vshufps takes the two first
  // inputs of the destination operand (right operand) and the two last
  // inputs of the source operand (left operand).
  //
  // Legend for explanations:
  //   - L: LHS
  //   - R: RHS
  //   - T: temporary
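  //
  // Concretely (assuming the usual shufps lane semantics), with mask lanes
  // (a, b, c, d), vshufps(mask, src, dest, out) computes
  //   out = (dest[a], dest[b], src[c], src[d])
  // writing lane 0 first, which is the notation used in the comments below.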

  uint32_t mask;

  // If all lanes came from a single vector, we should use swizzle instead.
  MOZ_ASSERT(numLanesFromLHS < 4);

  // If all values stay in their lane, this is a blend.
  if (AssemblerX86Shared::HasSSE41()) {
    if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) {
      vblendps(blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out);
      return;
    }
  }

  // One element of the second, all other elements of the first
  if (numLanesFromLHS == 3) {
    unsigned firstMask = -1, secondMask = -1;

    // register-register vmovss preserves the high lanes.
    if (LanesMatch(lanes, 4, 1, 2, 3) && rhs.kind() == Operand::FPREG) {
      vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out);
      return;
    }

    // SSE4.1 vinsertps can handle any single element.
    unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
    if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
      unsigned srcLane;
      unsigned dstLane;
      if (x >= 4) {
        srcLane = x - 4;
        dstLane = 0;
      } else if (y >= 4) {
        srcLane = y - 4;
        dstLane = 1;
      } else if (z >= 4) {
        srcLane = z - 4;
        dstLane = 2;
      } else {
        MOZ_ASSERT(w >= 4);
        srcLane = w - 4;
        dstLane = 3;
      }
      vinsertps(vinsertpsMask(srcLane, dstLane), rhs, lhs, out);
      return;
    }

    MOZ_ASSERT(!!maybeTemp);
    FloatRegister rhsCopy = *maybeTemp;
    loadAlignedSimd128Float(rhs, rhsCopy);

    if (x < 4 && y < 4) {
      if (w >= 4) {
        w %= 4;
        // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy)
        firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
        // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out)
        secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0);
      } else {
        MOZ_ASSERT(z >= 4);
        z %= 4;
        // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy)
        firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
        // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out)
        secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2);
      }

      vshufps(firstMask, lhs, rhsCopy, rhsCopy);
      vshufps(secondMask, rhsCopy, lhs, out);
      return;
    }

    MOZ_ASSERT(z < 4 && w < 4);

    if (y >= 4) {
      y %= 4;
      // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy)
      firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
      // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out)
      secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w);
    } else {
      MOZ_ASSERT(x >= 4);
      x %= 4;
      // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy)
      firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
      // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out)
      secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w);
    }

    vshufps(firstMask, lhs, rhsCopy, rhsCopy);
    if (AssemblerX86Shared::HasAVX()) {
      vshufps(secondMask, lhs, rhsCopy, out);
    } else {
      vshufps(secondMask, lhs, rhsCopy, rhsCopy);
      moveSimd128Float(rhsCopy, out);
    }
    return;
  }

  // Two elements from one vector, two other elements from the other
  MOZ_ASSERT(numLanesFromLHS == 2);

  // TODO Here and below, symmetric case would be more handy to avoid a move,
  // but can't be reached because operands would get swapped (bug 1084404).
  if (LanesMatch(lanes, 2, 3, 6, 7)) {
    ScratchSimd128Scope scratch(asMasm());
    if (AssemblerX86Shared::HasAVX()) {
      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
      vmovhlps(lhs, rhsCopy, out);
    } else {
      loadAlignedSimd128Float(rhs, scratch);
      vmovhlps(lhs, scratch, scratch);
      moveSimd128Float(scratch, out);
    }
    return;
  }

  if (LanesMatch(lanes, 0, 1, 4, 5)) {
    FloatRegister rhsCopy;
    ScratchSimd128Scope scratch(asMasm());
    if (rhs.kind() == Operand::FPREG) {
      // No need to make an actual copy, since the operand is already
      // in a register, and it won't be clobbered by the vmovlhps.
      rhsCopy = FloatRegister::FromCode(rhs.fpu());
    } else {
      loadAlignedSimd128Float(rhs, scratch);
      rhsCopy = scratch;
    }
    vmovlhps(rhsCopy, lhs, out);
    return;
  }

  if (LanesMatch(lanes, 0, 4, 1, 5)) {
    vunpcklps(rhs, lhs, out);
    return;
  }

  // TODO swapped case would be better (bug 1084404)
  if (LanesMatch(lanes, 4, 0, 5, 1)) {
    ScratchSimd128Scope scratch(asMasm());
    if (AssemblerX86Shared::HasAVX()) {
      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
      vunpcklps(lhs, rhsCopy, out);
    } else {
      loadAlignedSimd128Float(rhs, scratch);
      vunpcklps(lhs, scratch, scratch);
      moveSimd128Float(scratch, out);
    }
    return;
  }

  if (LanesMatch(lanes, 2, 6, 3, 7)) {
    vunpckhps(rhs, lhs, out);
    return;
  }

  // TODO swapped case would be better (bug 1084404)
  if (LanesMatch(lanes, 6, 2, 7, 3)) {
    ScratchSimd128Scope scratch(asMasm());
    if (AssemblerX86Shared::HasAVX()) {
      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
      vunpckhps(lhs, rhsCopy, out);
    } else {
      loadAlignedSimd128Float(rhs, scratch);
      vunpckhps(lhs, scratch, scratch);
      moveSimd128Float(scratch, out);
    }
    return;
  }

  // In one vshufps
  if (x < 4 && y < 4) {
    mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
    vshufps(mask, rhs, lhs, out);
    return;
  }

  // At creation, we should have explicitly swapped in this case.
  MOZ_ASSERT(!(z >= 4 && w >= 4));

  // In two vshufps, for the most generic case:
  uint32_t firstMask[4], secondMask[4];
  unsigned i = 0, j = 2, k = 0;

#define COMPUTE_MASK(lane)   \
  if (lane >= 4) {           \
    firstMask[j] = lane % 4; \
    secondMask[k++] = j++;   \
  } else {                   \
    firstMask[i] = lane;     \
    secondMask[k++] = i++;   \
  }

  COMPUTE_MASK(x)
  COMPUTE_MASK(y)
  COMPUTE_MASK(z)
  COMPUTE_MASK(w)
#undef COMPUTE_MASK

  MOZ_ASSERT(i == 2 && j == 4 && k == 4);
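  // Worked example for lanes (4, 1, 5, 2): the macro yields
  // firstMask = (1, 2, 0, 1) and secondMask = (2, 0, 3, 1). The first
  // vshufps makes lhs = (L1, L2, R0, R1); the second picks lanes
  // (2, 0, 3, 1) of that, giving (R0, L1, R1, L2) as requested.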

  mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
                                            firstMask[2], firstMask[3]);
  vshufps(mask, rhs, lhs, lhs);

  mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
                                            secondMask[2], secondMask[3]);
  vshufps(mask, lhs, lhs, lhs);
}

static inline FloatRegister ToSimdFloatRegister(const Operand& op) {
  return FloatRegister(op.fpu(), FloatRegister::Codes::ContentType::Simd128);
}

void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
  ScratchSimd128Scope scratch(asMasm());
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtb(rhs, lhs, output);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqb(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan:
      // src := rhs
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
      } else {
        loadAlignedSimd128Int(rhs, scratch);
      }

      // src := src > lhs (i.e. lhs < rhs)
      // Improve by doing custom lowering (rhs is tied to the output register)
      vpcmpgtb(Operand(lhs), scratch, scratch);
      moveSimd128Int(scratch, output);
      break;
    case Assembler::Condition::NotEqual:
      // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
      // should invert the comparison by, e.g. swapping the arms of a select
      // if that's what it's used in.
      asMasm().loadConstantSimd128Int(allOnes, scratch);
      vpcmpeqb(rhs, lhs, output);
      bitwiseXorSimdInt(output, Operand(scratch), output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
      // src := rhs
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
      } else {
        loadAlignedSimd128Int(rhs, scratch);
      }
      vpcmpgtb(Operand(lhs), scratch, scratch);
      asMasm().loadConstantSimd128Int(allOnes, output);
      bitwiseXorSimdInt(output, Operand(scratch), output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      asMasm().loadConstantSimd128Int(allOnes, scratch);
      vpcmpgtb(rhs, lhs, output);
      bitwiseXorSimdInt(output, Operand(scratch), output);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::unsignedCompareInt8x16(
    FloatRegister lhs, Operand rhs, Assembler::Condition cond,
    FloatRegister output, FloatRegister tmp1, FloatRegister tmp2) {
  // We widen the inputs to 16 bits, transforming them to nonnegative values;
  // then compare them as signed using the logic from compareInt8x16(); then
  // merge the results (which is surprisingly complicated). rhs is left
  // untouched. The logic is open-coded to streamline it.
  //
  // TODO? Rhs could be in memory (for Ion, anyway), in which case loading it
  // into scratch first would be better than loading it twice from memory.
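  //
  // (E.g. comparing bytes 0x80 and 0x01 unsigned: widened to words they are
  // 0x0080 and 0x0001, both nonnegative, so the signed vpcmpgtw now yields
  // the correct unsigned ordering, 0x80 > 0x01.)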

  MOZ_ASSERT(AssemblerX86Shared::HasSSE41());  // PMOVZX, PMOVSX
  MOZ_ASSERT(lhs == output);
  MOZ_ASSERT(lhs != tmp1 && lhs != tmp2);
  MOZ_ASSERT_IF(rhs.kind() == Operand::FPREG,
                ToSimdFloatRegister(rhs) != lhs &&
                    ToSimdFloatRegister(rhs) != tmp1 &&
                    ToSimdFloatRegister(rhs) != tmp2);
  MOZ_ASSERT(tmp1 != ScratchSimd128Reg && tmp2 != ScratchSimd128Reg);

  bool complement = false;
  switch (cond) {
    case Assembler::Above:
    case Assembler::BelowOrEqual:
      complement = cond == Assembler::BelowOrEqual;

      // Low eight bytes of inputs widened to words
      vpmovzxbw(Operand(lhs), tmp1);
      vpmovzxbw(rhs, tmp2);
      // Compare leaving 16-bit results
      vpcmpgtw(Operand(tmp2), tmp1, tmp1);  // lhs > rhs in tmp1

      // High eight bytes of inputs widened to words
      vpalignr(rhs, tmp2, 8);
      vpmovzxbw(Operand(tmp2), tmp2);
      vpalignr(Operand(lhs), output, 8);
      vpmovzxbw(Operand(output), output);
      // Compare leaving 16-bit results
      vpcmpgtw(Operand(tmp2), output, output);  // lhs > rhs in output

      break;
    case Assembler::Below:
    case Assembler::AboveOrEqual:
      complement = cond == Assembler::AboveOrEqual;

      // Same as above but with operands reversed

      // Low eight bytes of inputs widened to words
      vpmovzxbw(Operand(lhs), tmp2);
      vpmovzxbw(rhs, tmp1);
      // Compare leaving 16-bit results
      vpcmpgtw(Operand(tmp2), tmp1, tmp1);  // rhs > lhs in tmp1

      // High eight bytes of inputs widened to words
      vpalignr(Operand(lhs), tmp2, 8);
      vpmovzxbw(Operand(tmp2), tmp2);
      vpalignr(rhs, output, 8);
      vpmovzxbw(Operand(output), output);
      // Compare leaving 16-bit results
      vpcmpgtw(Operand(tmp2), output, output);  // rhs > lhs in output

      break;
    default:
      MOZ_CRASH("Unsupported condition code");
  }

  // Merge output (results of high byte compares) and tmp1 (results of low
  // byte compares) by truncating word results to bytes (to avoid signed
  // saturation), packing, and then concatenating and shifting.
  vpsrlw(Imm32(8), tmp1, tmp1);
  vpackuswb(Operand(tmp1), tmp1, tmp1);
  vpsrlw(Imm32(8), output, output);
  vpackuswb(Operand(output), output, output);
  vpalignr(Operand(tmp1), output, 8);
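  // (Each 0xFFFF/0x0000 word becomes 0x00FF/0x0000 after the shift, then a
  // 0xFF/0x00 byte after the unsigned-saturating pack; the pack leaves each
  // register's eight byte results duplicated in both qwords, and the palignr
  // concatenates the low-lane results into the low qword and the high-lane
  // results into the high qword.)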

  // Complement when needed for opposite sense of the operator.
  if (complement) {
    vpcmpeqd(Operand(tmp1), tmp1, tmp1);
    vpxor(Operand(tmp1), output, output);
  }
}

void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  static const SimdConstant allOnes = SimdConstant::SplatX8(-1);

  ScratchSimd128Scope scratch(asMasm());
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtw(rhs, lhs, output);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqw(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan:
      // src := rhs
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
      } else {
        loadAlignedSimd128Int(rhs, scratch);
      }

      // src := src > lhs (i.e. lhs < rhs)
      // Improve by doing custom lowering (rhs is tied to the output register)
      vpcmpgtw(Operand(lhs), scratch, scratch);
      moveSimd128Int(scratch, output);
      break;
    case Assembler::Condition::NotEqual:
      // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
      // should invert the comparison by, e.g. swapping the arms of a select
      // if that's what it's used in.
      asMasm().loadConstantSimd128Int(allOnes, scratch);
      vpcmpeqw(rhs, lhs, output);
      bitwiseXorSimdInt(output, Operand(scratch), output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
      // src := rhs
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
      } else {
        loadAlignedSimd128Int(rhs, scratch);
      }
      vpcmpgtw(Operand(lhs), scratch, scratch);
      asMasm().loadConstantSimd128Int(allOnes, output);
      bitwiseXorSimdInt(output, Operand(scratch), output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      asMasm().loadConstantSimd128Int(allOnes, scratch);
      vpcmpgtw(rhs, lhs, output);
      bitwiseXorSimdInt(output, Operand(scratch), output);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::unsignedCompareInt16x8(
    FloatRegister lhs, Operand rhs, Assembler::Condition cond,
    FloatRegister output, FloatRegister tmp1, FloatRegister tmp2) {
  // See comments at unsignedCompareInt8x16.

  MOZ_ASSERT(AssemblerX86Shared::HasSSE41());  // PMOVZX, PMOVSX
  MOZ_ASSERT(lhs == output);

  bool complement = false;
  switch (cond) {
    case Assembler::Above:
    case Assembler::BelowOrEqual:
      complement = cond == Assembler::BelowOrEqual;

      vpmovzxwd(Operand(lhs), tmp1);
      vpmovzxwd(rhs, tmp2);
      vpcmpgtd(Operand(tmp2), tmp1, tmp1);

      vpalignr(rhs, tmp2, 8);
      vpmovzxwd(Operand(tmp2), tmp2);
      vpalignr(Operand(lhs), output, 8);
      vpmovzxwd(Operand(output), output);
      vpcmpgtd(Operand(tmp2), output, output);

      break;
    case Assembler::Below:
    case Assembler::AboveOrEqual:
      complement = cond == Assembler::AboveOrEqual;

      vpmovzxwd(Operand(lhs), tmp2);
      vpmovzxwd(rhs, tmp1);
      vpcmpgtd(Operand(tmp2), tmp1, tmp1);

      vpalignr(Operand(lhs), tmp2, 8);
      vpmovzxwd(Operand(tmp2), tmp2);
      vpalignr(rhs, output, 8);
      vpmovzxwd(Operand(output), output);
      vpcmpgtd(Operand(tmp2), output, output);

      break;
    default:
      MOZ_CRASH();
  }

  vpsrld(Imm32(16), tmp1, tmp1);
  vpackusdw(Operand(tmp1), tmp1, tmp1);
  vpsrld(Imm32(16), output, output);
  vpackusdw(Operand(output), output, output);
  vpalignr(Operand(tmp1), output, 8);

  if (complement) {
    vpcmpeqd(Operand(tmp1), tmp1, tmp1);
    vpxor(Operand(tmp1), output, output);
  }
}

void MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
  ScratchSimd128Scope scratch(asMasm());
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      packedGreaterThanInt32x4(rhs, lhs);
      break;
    case Assembler::Condition::Equal:
      packedEqualInt32x4(rhs, lhs);
      break;
    case Assembler::Condition::LessThan:
      // src := rhs
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
      } else {
        loadAlignedSimd128Int(rhs, scratch);
      }

      // src := src > lhs (i.e. lhs < rhs)
      // Improve by doing custom lowering (rhs is tied to the output register)
      packedGreaterThanInt32x4(Operand(lhs), scratch);
      moveSimd128Int(scratch, lhs);
      break;
    case Assembler::Condition::NotEqual:
      // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
      // should invert the comparison by, e.g. swapping the arms of a select
      // if that's what it's used in.
      asMasm().loadConstantSimd128Int(allOnes, scratch);
      packedEqualInt32x4(rhs, lhs);
      bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
      // src := rhs
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
      } else {
        loadAlignedSimd128Int(rhs, scratch);
      }
      packedGreaterThanInt32x4(Operand(lhs), scratch);
      asMasm().loadConstantSimd128Int(allOnes, lhs);
      bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      asMasm().loadConstantSimd128Int(allOnes, scratch);
      packedGreaterThanInt32x4(rhs, lhs);
      bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::unsignedCompareInt32x4(
    FloatRegister lhs, Operand rhs, Assembler::Condition cond,
    FloatRegister output, FloatRegister tmp1, FloatRegister tmp2) {
  // See comments at unsignedCompareInt8x16, the logic is similar. However we
  // only have PCMPGTQ on SSE4.2 or later, so for SSE4.1 we need to use
  // subtract to compute the flags.

  MOZ_ASSERT(AssemblerX86Shared::HasSSE41());  // PMOVZX, PMOVSX
  MOZ_ASSERT(lhs == output);

  bool complement = false;
  switch (cond) {
    case Assembler::Below:
    case Assembler::AboveOrEqual:
      complement = cond == Assembler::AboveOrEqual;

      // The effect of the subtract is that the high doubleword of each
      // quadword becomes either 0 (ge) or -1 (lt).
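      // (E.g. lanes lhs = 5, rhs = 7, zero-extended to quadwords: 5 - 7 =
      // 0xFFFFFFFFFFFFFFFE, whose high doubleword is -1, i.e. below; with
      // lhs = 7, rhs = 5 the difference is 2 and the high doubleword is 0.)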

      vpmovzxdq(Operand(lhs), tmp1);
      vpmovzxdq(rhs, tmp2);
      vpsubq(Operand(tmp2), tmp1, tmp1);  // flag1 junk flag0 junk
      vpsrlq(Imm32(32), tmp1, tmp1);      // zero flag1 zero flag0
      vpshufd(MacroAssembler::ComputeShuffleMask(0, 2, 3, 3), tmp1,
              tmp1);  // zero zero flag1 flag0

      vpalignr(rhs, tmp2, 8);
      vpmovzxdq(Operand(tmp2), tmp2);
      vpalignr(Operand(lhs), output, 8);
      vpmovzxdq(Operand(output), output);
      vpsubq(Operand(tmp2), output, output);  // flag3 junk flag2 junk
      vpsrlq(Imm32(32), output, output);      // zero flag3 zero flag2
      vpshufd(MacroAssembler::ComputeShuffleMask(3, 3, 0, 2), output,
              output);  // flag3 flag2 zero zero

      vpor(Operand(tmp1), output, output);
      break;

    case Assembler::Above:
    case Assembler::BelowOrEqual:
      complement = cond == Assembler::BelowOrEqual;

      // The effect of the subtract is that the high doubleword of each
      // quadword becomes either 0 (le) or -1 (gt).

      vpmovzxdq(Operand(lhs), tmp2);
      vpmovzxdq(rhs, tmp1);
      vpsubq(Operand(tmp2), tmp1, tmp1);  // flag1 junk flag0 junk
      vpsrlq(Imm32(32), tmp1, tmp1);      // zero flag1 zero flag0
      vpshufd(MacroAssembler::ComputeShuffleMask(0, 2, 3, 3), tmp1,
              tmp1);  // zero zero flag1 flag0

      vpalignr(Operand(lhs), tmp2, 8);
      vpmovzxdq(Operand(tmp2), tmp2);
      vpalignr(rhs, output, 8);
      vpmovzxdq(Operand(output), output);
      vpsubq(Operand(tmp2), output, output);  // flag3 junk flag2 junk
      vpsrlq(Imm32(32), output, output);      // zero flag3 zero flag2
      vpshufd(MacroAssembler::ComputeShuffleMask(3, 3, 0, 2), output,
              output);  // flag3 flag2 zero zero

      vpor(Operand(tmp1), output, output);
      break;

    default:
      MOZ_CRASH();
  }

  if (complement) {
    vpcmpeqd(Operand(tmp1), tmp1, tmp1);
    vpxor(Operand(tmp1), output, output);
  }
}

void MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs,
                                               Assembler::Condition cond,
                                               FloatRegister output) {
  switch (cond) {
    case Assembler::Condition::Equal:
      vcmpeqps(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan:
      vcmpltps(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      vcmpleps(rhs, lhs, output);
      break;
    case Assembler::Condition::NotEqual:
      vcmpneqps(rhs, lhs, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
    case Assembler::Condition::GreaterThan:
      // We reverse these before register allocation so that we don't have to
      // copy into and out of temporaries after codegen.
      MOZ_CRASH("should have reversed this");
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareFloat64x2(FloatRegister lhs, Operand rhs,
                                               Assembler::Condition cond,
                                               FloatRegister output) {
  switch (cond) {
    case Assembler::Condition::Equal:
      vcmpeqpd(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan:
      vcmpltpd(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      vcmplepd(rhs, lhs, output);
      break;
    case Assembler::Condition::NotEqual:
      vcmpneqpd(rhs, lhs, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
    case Assembler::Condition::GreaterThan:
      // We reverse these before register allocation so that we don't have to
      // copy into and out of temporaries after codegen.
      MOZ_CRASH("should have reversed this");
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::mulInt32x4(FloatRegister lhs, Operand rhs,
                                         const Maybe<FloatRegister>& temp,
                                         FloatRegister output) {
  if (AssemblerX86Shared::HasSSE41()) {
    vpmulld(rhs, lhs, output);
    return;
  }

  ScratchSimd128Scope scratch(asMasm());
  loadAlignedSimd128Int(rhs, scratch);
  vpmuludq(lhs, scratch, scratch);
  // scratch contains (Rx, _, Rz, _) where R is the resulting vector.

  MOZ_ASSERT(!!temp);
  vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), lhs, lhs);
  vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), rhs, *temp);
  vpmuludq(*temp, lhs, lhs);
  // lhs contains (Ry, _, Rw, _) where R is the resulting vector.

  vshufps(MacroAssembler::ComputeShuffleMask(0, 2, 0, 2), scratch, lhs, lhs);
  // lhs contains (Ry, Rw, Rx, Rz)
  vshufps(MacroAssembler::ComputeShuffleMask(2, 0, 3, 1), lhs, lhs, lhs);
}

/* clang-format off */

// NaN is s111 1111 1qxx ... where the q indicates quiet or
// signaling NaN; q=1 means quiet.
//
// If the values are both zero the second value is returned from min/max; this
// matters when one of them is -0.
//
// If one value is a NaN then the second value is returned.
//
// Generally for min/max, the sign of zero matters (-0 < 0) and NaN inputs are
// always quiet and we want to propagate NaN.

// For min:
// - we compute x=min(a,b) and y=min(b,a) and then OR them together
// - if the values are not NaN but not both zero then x==y, the OR
//   matters not
// - if the values are -0 and 0 then we will get -0 from the OR
// - if one of the values is NaN then x or y will be NaN and we will
//   get a NaN from the OR, with some arbitrary sign, and since
//   the input NaN is quiet the output NaN will be quiet.

// For max:
// - the UNORD comparison will create a mask of ~0 in scratch if at least
//   one value is NaN, otherwise 0
// - we compute x=max(a,b) and y=max(b,a) and then AND them together
// - if the values are not NaN but not both zero then x==y, the
//   AND matters not
// - if the values are -0 and 0 then the AND will create 0
// - if one of the values is NaN then the AND will select the wrong
//   value
// - a final OR of the result with the mask from the UNORD comparison
//   will leave valid results alone and create NaN where the UNORD
//   was true.
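
// Worked example for min with a = -0.0 and b = 0.0: minps returns its second
// operand on ties, so x = min(a,b) = 0.0 and y = min(b,a) = -0.0; x|y has the
// sign bit set and gives -0.0, the IEEE-correct answer. With a = NaN:
// x = min(a,b) = b and y = min(b,a) = NaN, and x|y is some quiet NaN, since
// ORing keeps the all-ones exponent, a nonzero significand, and the quiet bit.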

/* clang-format on */

void MacroAssemblerX86Shared::minFloat32x4(FloatRegister lhs, Operand rhs,
                                           FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
  vminps(Operand(lhs), rhsCopy, scratch);
  vminps(rhs, lhs, output);
  vorps(scratch, output, output);  // NaN or'd with arbitrary bits is NaN
}

void MacroAssemblerX86Shared::maxFloat32x4(FloatRegister lhs, Operand rhs,
                                           FloatRegister temp,
                                           FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, scratch);
  vcmpunordps(rhs, lhsCopy, scratch);

  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, temp);
  vmaxps(Operand(lhs), rhsCopy, temp);
  vmaxps(rhs, lhs, output);

  vandps(temp, output, output);
  vorps(scratch, output, output);  // or in the all-ones NaNs
}

void MacroAssemblerX86Shared::minFloat64x2(FloatRegister lhs, Operand rhs,
                                           FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
  vminpd(Operand(lhs), rhsCopy, scratch);
  vminpd(rhs, lhs, output);
  vorpd(scratch, output, output);  // NaN or'd with arbitrary bits is NaN
}

void MacroAssemblerX86Shared::maxFloat64x2(FloatRegister lhs, Operand rhs,
                                           FloatRegister temp,
                                           FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, scratch);
  vcmpunordpd(rhs, lhsCopy, scratch);

  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, temp);
  vmaxpd(Operand(lhs), rhsCopy, temp);
  vmaxpd(rhs, lhs, output);

  vandpd(temp, output, output);
  vorpd(scratch, output, output);  // or in the all-ones NaNs
}

void MacroAssemblerX86Shared::minNumFloat32x4(FloatRegister lhs, Operand rhs,
                                              FloatRegister temp,
                                              FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)),
                                  temp);

  FloatRegister mask = scratch;
  FloatRegister tmpCopy = reusedInputSimd128Float(temp, scratch);
  vpcmpeqd(Operand(lhs), tmpCopy, mask);
  vandps(temp, mask, mask);

  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
  vminps(rhs, lhsCopy, temp);
  vorps(mask, temp, temp);

  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, mask);
  vcmpneqps(rhs, rhsCopy, mask);

  if (AssemblerX86Shared::HasAVX()) {
    vblendvps(mask, lhs, temp, output);
  } else {
    // Emulate vblendvps.
    // With SSE4.1 we could use blendvps, but it's awkward since
    // it requires the mask to be in xmm0.
    if (lhs != output) {
      moveSimd128Float(lhs, output);
    }
    vandps(Operand(mask), output, output);
    vandnps(Operand(temp), mask, mask);
    vorps(Operand(mask), output, output);
  }
}

void MacroAssemblerX86Shared::maxNumFloat32x4(FloatRegister lhs, Operand rhs,
                                              FloatRegister temp,
                                              FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister mask = scratch;

  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(0), mask);
  vpcmpeqd(Operand(lhs), mask, mask);

  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)),
                                  temp);
  vandps(temp, mask, mask);

  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
  vmaxps(rhs, lhsCopy, temp);
  vandnps(Operand(temp), mask, mask);

  // Ensure temp always contains the temporary result
  mask = temp;
  temp = scratch;

  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, mask);
  vcmpneqps(rhs, rhsCopy, mask);

  if (AssemblerX86Shared::HasAVX()) {
    vblendvps(mask, lhs, temp, output);
  } else {
    // Emulate vblendvps.
    // With SSE4.1 we could use blendvps, but it's awkward since
    // it requires the mask to be in xmm0.
1477 if (lhs != output) {
1478 moveSimd128Float(lhs, output);
1479 }
1480 vandps(Operand(mask), output, output);
1481 vandnps(Operand(temp), mask, mask);
1482 vorps(Operand(mask), output, output);
1483 }
1484 }
1485
negFloat32x4(Operand in,FloatRegister out)1486 void MacroAssemblerX86Shared::negFloat32x4(Operand in, FloatRegister out) {
1487 ScratchSimd128Scope scratch(asMasm());
1488 FloatRegister result = out;
1489 if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
1490 result = scratch;
1491 }
1492 // All zeros but the sign bit
  static const SimdConstant minusZero = SimdConstant::SplatX4(-0.f);
  asMasm().loadConstantSimd128Float(minusZero, result);
  bitwiseXorFloat32x4(result, in, result);
  if (result == scratch) {
    moveSimd128Float(result, out);
  }
}

void MacroAssemblerX86Shared::negFloat64x2(Operand in, FloatRegister out) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister result = out;
  if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
    result = scratch;
  }
  // All zeros but the sign bit
  static const SimdConstant minusZero = SimdConstant::SplatX2(-0.0);
  asMasm().loadConstantSimd128Float(minusZero, result);
  vxorpd(ToSimdFloatRegister(in), result, result);
  if (result == scratch) {
    moveSimd128Float(result, out);
  }
}

void MacroAssemblerX86Shared::notInt8x16(Operand in, FloatRegister out) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister result = out;
  if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
    result = scratch;
  }
  static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
  asMasm().loadConstantSimd128Int(allOnes, result);
  bitwiseXorSimdInt(result, in, result);
  if (result == scratch) {
    moveSimd128Float(result, out);
  }
}

void MacroAssemblerX86Shared::notInt16x8(Operand in, FloatRegister out) {
  // Bug: `in` should be allowed to alias `out`, as in notInt8x16; until
  // then, assert that it does not.
  MOZ_ASSERT_IF(in.kind() == Operand::FPREG, in.fpu() != out.encoding());
  static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
  asMasm().loadConstantSimd128Int(allOnes, out);
  bitwiseXorSimdInt(out, in, out);
}

void MacroAssemblerX86Shared::notInt32x4(Operand in, FloatRegister out) {
  // Bug: `in` should be allowed to alias `out`, as in notInt8x16; until
  // then, assert that it does not.
  MOZ_ASSERT_IF(in.kind() == Operand::FPREG, in.fpu() != out.encoding());
  static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
  asMasm().loadConstantSimd128Int(allOnes, out);
  bitwiseXorSimdInt(out, in, out);
}

void MacroAssemblerX86Shared::notFloat32x4(Operand in, FloatRegister out) {
  // Bug: `in` should be allowed to alias `out`, as in notInt8x16; until
  // then, assert that it does not.
  MOZ_ASSERT_IF(in.kind() == Operand::FPREG, in.fpu() != out.encoding());
  // A float whose bits are all ones: sign = 1, exponent and significand all
  // ones, i.e. the integer pattern FFFFFFFFh.
  float ones = SpecificNaN<float>(1, FloatingPoint<float>::kSignificandBits);
  static const SimdConstant allOnes = SimdConstant::SplatX4(ones);
  asMasm().loadConstantSimd128Float(allOnes, out);
  bitwiseXorFloat32x4(out, in, out);
}

void MacroAssemblerX86Shared::absFloat32x4(Operand in, FloatRegister out) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister result = out;
  if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
    result = scratch;
  }
  // All ones but the sign bit
  float signMask =
      SpecificNaN<float>(0, FloatingPoint<float>::kSignificandBits);
  static const SimdConstant signMasks = SimdConstant::SplatX4(signMask);
  asMasm().loadConstantSimd128Float(signMasks, result);
  bitwiseAndFloat32x4(result, in, result);
  if (result == scratch) {
    moveSimd128Float(result, out);
  }
}

void MacroAssemblerX86Shared::absFloat64x2(Operand in, FloatRegister out) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister result = out;
  if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
    result = scratch;
  }
  // All ones but the sign bit
  double signMask =
      SpecificNaN<double>(0, FloatingPoint<double>::kSignificandBits);
  static const SimdConstant signMasks = SimdConstant::SplatX2(signMask);
  asMasm().loadConstantSimd128Float(signMasks, result);
  vandpd(ToSimdFloatRegister(in), result, result);
  if (result == scratch) {
    moveSimd128Float(result, out);
  }
}

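// SIMD shift-by-scalar semantics take the shift count modulo the lane width,
// but the x86 vector shift instructions saturate: a count >= the lane width
// yields 0 (or all sign bits, for arithmetic right shifts). Masking the count
// first implements the modulo behavior.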
static inline void MaskSimdShiftCount(MacroAssembler& masm, unsigned shiftmask,
                                      Register count, Register temp,
                                      FloatRegister dest) {
  masm.mov(count, temp);
  masm.andl(Imm32(shiftmask), temp);
  masm.vmovd(temp, dest);
}

void MacroAssemblerX86Shared::packedShiftByScalarInt8x16(
    FloatRegister in, Register count, Register temp, FloatRegister xtmp,
    FloatRegister dest,
    void (MacroAssemblerX86Shared::*shift)(FloatRegister, FloatRegister,
                                           FloatRegister),
    void (MacroAssemblerX86Shared::*extend)(const Operand&, FloatRegister)) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 7, count, temp, scratch);
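
  // There are no 8-bit lane shifts on x86. Widen each half of the input to
  // 16-bit lanes, shift those, then mask each result down to its low byte so
  // the concluding pack does not saturate.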

  // High bytes
  vpalignr(Operand(in), xtmp, 8);
  (this->*extend)(Operand(xtmp), xtmp);
  (this->*shift)(scratch, xtmp, xtmp);

  // Low bytes
  (this->*extend)(Operand(dest), dest);
  (this->*shift)(scratch, dest, dest);

  // Mask off garbage to avoid saturation during packing
  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x00FF00FF)),
                                  scratch);
  vpand(Operand(scratch), xtmp, xtmp);
  vpand(Operand(scratch), dest, dest);

  vpackuswb(Operand(xtmp), dest, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
    FloatRegister in, Register count, Register temp, FloatRegister xtmp,
    FloatRegister dest) {
  packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
                             &MacroAssemblerX86Shared::vpsllw,
                             &MacroAssemblerX86Shared::vpmovzxbw);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
    Imm32 count, FloatRegister src, FloatRegister dest) {
  MOZ_ASSERT(count.value <= 7);
  if (src != dest) {
    asMasm().moveSimd128(src, dest);
  }
  // Use the doubling trick for low shift counts, otherwise mask off the bits
  // that are shifted out of the low byte of each word and use word shifts.
  // The optimal cutoff remains to be explored.
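  // E.g. for count == 2, two self-additions compute dest * 4 == dest << 2.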
  if (count.value <= 3) {
    for (int32_t shift = count.value; shift > 0; --shift) {
      asMasm().addInt8x16(dest, dest);
    }
  } else {
    ScratchSimd128Scope scratch(asMasm());
    // Whether SplatX8 or SplatX16 is best probably depends on the constant.
    asMasm().loadConstantSimd128Int(SimdConstant::SplatX16(0xFF >> count.value),
                                    scratch);
    vpand(Operand(scratch), dest, dest);
    vpsllw(count, dest, dest);
  }
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
    FloatRegister in, Register count, Register temp, FloatRegister xtmp,
    FloatRegister dest) {
  packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
                             &MacroAssemblerX86Shared::vpsraw,
                             &MacroAssemblerX86Shared::vpmovsxbw);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
    Imm32 count, FloatRegister src, FloatRegister temp, FloatRegister dest) {
  MOZ_ASSERT(count.value <= 7);
  ScratchSimd128Scope scratch(asMasm());

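  // There is no 8-bit arithmetic shift. For the low byte of each word, shift
  // it into the high byte position (vpslldq) and arithmetic-shift the word
  // by count + 8 so it is sign-extended from its own sign bit; the high
  // bytes are shifted in place by count. The two results are then merged.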
  asMasm().moveSimd128(src, scratch);
  vpslldq(Imm32(1), scratch, scratch);               // Low bytes -> high bytes
  vpsraw(Imm32(count.value + 8), scratch, scratch);  // Shift low bytes
  vpsraw(count, dest, dest);                         // Shift high bytes
  asMasm().loadConstantSimd128Int(SimdConstant::SplatX8(0xFF00), temp);
  bitwiseAndSimdInt(dest, Operand(temp), dest);        // Keep high bytes
  bitwiseAndNotSimdInt(temp, Operand(scratch), temp);  // Keep low bytes
  bitwiseOrSimdInt(dest, Operand(temp), dest);         // Combine
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
    FloatRegister in, Register count, Register temp, FloatRegister xtmp,
    FloatRegister dest) {
  packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
                             &MacroAssemblerX86Shared::vpsrlw,
                             &MacroAssemblerX86Shared::vpmovzxbw);
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
    Imm32 count, FloatRegister src, FloatRegister dest) {
  MOZ_ASSERT(count.value <= 7);
  if (src != dest) {
    asMasm().moveSimd128(src, dest);
  }
  ScratchSimd128Scope scratch(asMasm());
  // Whether SplatX8 or SplatX16 is best probably depends on the constant.
  asMasm().loadConstantSimd128Int(
      SimdConstant::SplatX16((0xFF << count.value) & 0xFF), scratch);
  vpand(Operand(scratch), dest, dest);
  vpsrlw(count, dest, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
  vpsllw(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
  vpsraw(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt16x8(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
  vpsrlw(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt32x4(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
  vpslld(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt32x4(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
  vpsrad(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt32x4(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
  vpsrld(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt64x2(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 63, count, temp, scratch);
  vpsllq(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
    FloatRegister in, Register count, Register temp1, FloatRegister temp2,
    FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
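  // There is no vpsraq before AVX-512: do a logical right shift, then
  // separately compute the missing sign bits (all ones for negative lanes,
  // shifted left by 63 - count) and OR them into the result.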
  movl(count, temp1);                   // temp1 is zero-extended shift count
  andl(Imm32(63), temp1);               // temp1 is masked shift count
  vmovd(temp1, scratch);                // and scratch 64-bit ditto
  vpxor(Operand(temp2), temp2, temp2);  // temp2=0
  vpcmpgtq(Operand(in), temp2, temp2);  // temp2=~0 where `in` negative
  vpsrlq(scratch, in, dest);            // dest shifted, maybe wrong sign
  negl(temp1);                          // temp1 is -(masked count)
  addl(Imm32(63), temp1);               // temp1 is 63 - masked count
  vmovd(temp1, scratch);                // and scratch ditto
  vpsllq(scratch, temp2, temp2);        // temp2 has the sign bits
  vpor(Operand(temp2), dest, dest);     // dest has right sign
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt64x2(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 63, count, temp, scratch);
  vpsrlq(scratch, in, dest);
}

void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
                                            FloatRegister onTrue,
                                            FloatRegister onFalse,
                                            FloatRegister temp,
                                            FloatRegister output) {
  if (onTrue != output) {
    vmovaps(onTrue, output);
  }
  if (mask != temp) {
    vmovaps(mask, temp);
  }

  // SSE4.1 has plain blendvps which can do this, but it is awkward
  // to use because it requires the mask to be in xmm0.
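  // Bit-wise select: output = (onTrue & mask) | (onFalse & ~mask).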

  bitwiseAndSimdInt(output, Operand(temp), output);
  bitwiseAndNotSimdInt(temp, Operand(onFalse), temp);
  bitwiseOrSimdInt(output, Operand(temp), output);
}

// Code sequences for int32x4<->float32x4 culled from v8; commentary added.

void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4(
    FloatRegister src, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  if (src != dest) {
    vmovaps(src, dest);
  }
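  // Split each lane into its low 16 bits and its high bits and convert the
  // two halves exactly; only the final recombining add can round.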
  vpxor(Operand(scratch), scratch, scratch);  // extract low bits
  vpblendw(0x55, dest, scratch, scratch);     // into scratch
  vpsubd(Operand(scratch), dest, dest);       // and high bits into dest
  vcvtdq2ps(scratch, scratch);                // convert low bits
  vpsrld(Imm32(1), dest, dest);               // get high into unsigned range
  vcvtdq2ps(dest, dest);                      // convert
  vaddps(Operand(dest), dest, dest);          // and back into signed
  vaddps(Operand(scratch), dest, dest);       // combine high+low: may round
}

void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
                                                         FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  if (src != dest) {
    vmovaps(src, dest);
  }

  // The cvttps2dq instruction is the workhorse but does not handle NaN or out
  // of range values as we need it to. We want to saturate too-large positive
  // values to 7FFFFFFFh and too-large negative values to 80000000h. NaN and
  // -0 become 0.
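  // E.g. 3.5e9f (above INT32_MAX) ends up as 7FFFFFFFh and -3.5e9f (below
  // INT32_MIN) as 80000000h, while NaN lanes become 0.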

  // Convert NaN to 0 by masking away values that compare unordered to
  // themselves.
  vmovaps(dest, scratch);
  vcmpeqps(Operand(scratch), scratch, scratch);
  vpand(Operand(scratch), dest, dest);

  // Compute the complement of each non-NaN lane's sign bit; we'll need this
  // to correct the result of cvttps2dq. All other output bits are garbage.
  vpxor(Operand(dest), scratch, scratch);

  // Convert. This will make the output 80000000h if the input is out of
  // range.
  vcvttps2dq(dest, dest);

  // Preserve the computed complemented sign bit if the output was 80000000h.
  // The sign bit will be 1 precisely for nonnegative values that overflowed.
  vpand(Operand(dest), scratch, scratch);

  // Create a mask with that sign bit. Now a lane is either FFFFFFFFh if there
  // was a positive overflow, otherwise zero.
  vpsrad(Imm32(31), scratch, scratch);

  // Convert overflow lanes to 0x7FFFFFFF.
  vpxor(Operand(scratch), dest, dest);
}

void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
    FloatRegister src, FloatRegister temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  if (src != dest) {
    vmovaps(src, dest);
  }

  // The cvttps2dq instruction is the workhorse but does not handle NaN or out
  // of range values as we need it to. We want to saturate too-large positive
  // values to FFFFFFFFh and negative values to zero. NaN and -0 become 0.
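  // E.g. for the lane value 3.0e9f (above INT32_MAX but in uint32 range):
  // cvttps2dq(3.0e9f) gives 80000000h, the bias 7FFFFFFFh converts to the
  // float 2^31, the biased lane 3.0e9f - 2^31 converts to 852516352, and the
  // final vpaddd recombines them to 80000000h + 852516352 = 3000000000.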

  // Convert NaN and negative values to zeroes in dest.
  vpxor(Operand(scratch), scratch, scratch);
  vmaxps(Operand(scratch), dest, dest);

  // Compute the float value of 0x7FFFFFFF (the largest positive signed
  // integer value) in all lanes in scratch. We use it to bias the conversion
  // to handle edge cases.
  vpcmpeqd(Operand(scratch), scratch, scratch);
  vpsrld(Imm32(1), scratch, scratch);
  vcvtdq2ps(scratch, scratch);

  // temp = dest - 7FFFFFFFh (as floating). This brings integers that are in
  // the unsigned range but above the signed range into the signed range;
  // 0 => -7FFFFFFFh.
  vmovaps(dest, temp);
  vsubps(Operand(scratch), temp, temp);

  // scratch = mask of biased values that are greater than 7FFFFFFFh.
  vcmpleps(Operand(temp), scratch, scratch);

  // Convert the biased values to integer. Positive values above 7FFFFFFFh
  // will have been converted to 80000000h, all others become the expected
  // integer.
  vcvttps2dq(temp, temp);

  // As lanes of scratch are ~0 where the result overflows, this computes
  // 7FFFFFFF in lanes of temp that are 80000000h, and leaves other lanes
  // untouched as the biased integer.
  vpxor(Operand(scratch), temp, temp);

  // Convert negative biased lanes in temp to zero. After this, temp will be
  // zero where the result should be zero or is less than 80000000h, 7FFFFFFF
  // where the result overflows, and will have the converted biased result in
  // other lanes (for input values >= 80000000h).
  vpxor(Operand(scratch), scratch, scratch);
  vpmaxsd(Operand(scratch), temp, temp);

  // Convert. Overflow lanes above 7FFFFFFFh will be 80000000h, other lanes
  // will be what they should be.
  vcvttps2dq(dest, dest);

  // Add temp to the result. Overflow lanes with 80000000h become FFFFFFFFh,
  // biased high-value unsigned lanes become unbiased, everything else is
  // left unchanged.
  vpaddd(Operand(temp), dest, dest);
}