/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "jit/MacroAssembler.h"
#include "jit/x86-shared/MacroAssembler-x86-shared.h"

#include "jit/MacroAssembler-inl.h"

using namespace js;
using namespace js::jit;

using mozilla::DebugOnly;
using mozilla::FloatingPoint;
using mozilla::Maybe;
using mozilla::SpecificNaN;

// The following routines are from the old asm.js implementation but are UNUSED
// in the wasm implementation currently. They are preserved here because it's
// sad to throw out working code. They are declared in the header file.
//
// Before using these, they should at a minimum be moved to
// MacroAssembler-x86-shared-SIMD.cpp, and it would be wrong to assume
// that they are correct according to the wasm spec.

void MacroAssemblerX86Shared::checkedConvertFloat32x4ToInt32x4(
    FloatRegister src, FloatRegister dest, Register temp, Label* oolEntry,
    Label* rejoin) {
  // Does the conversion and jumps to the OOL entry if the result value
  // is the undefined integer pattern.
  static const SimdConstant InvalidResult =
      SimdConstant::SplatX4(int32_t(-2147483648));
  convertFloat32x4ToInt32x4(src, dest);

  ScratchSimd128Scope scratch(asMasm());
  asMasm().loadConstantSimd128Int(InvalidResult, scratch);
  vpcmpeqd(Operand(dest), scratch, scratch);
  // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of
  // the two following instructions.
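  // vmovmskps gathers the sign bit of each of the four lanes into the low
  // bits of |temp|, so |temp| is nonzero iff some lane compared equal to the
  // invalid pattern and must take the OOL path.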
  vmovmskps(scratch, temp);
  cmp32(temp, Imm32(0));
  j(Assembler::NotEqual, oolEntry);
  bind(rejoin);
}

void MacroAssemblerX86Shared::oolConvertFloat32x4ToInt32x4(
    FloatRegister src, Register temp, Label* rejoin, Label* onConversionError) {
  static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f);
  static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f);

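  // Note that 2147483647.f rounds up to 2147483648.0f (0x1.0p31), so the
  // second check below rejects exactly the lanes with src >= 0x1.0p31, while
  // -2147483648.f is exactly representable. NaN lanes fail the first
  // (ordered) comparison and are rejected there.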
  ScratchSimd128Scope scratch(asMasm());
  asMasm().loadConstantSimd128Float(Int32MinX4, scratch);
  vcmpleps(Operand(src), scratch);
  vmovmskps(scratch, temp);
  cmp32(temp, Imm32(15));
  j(Assembler::NotEqual, onConversionError);

  asMasm().loadConstantSimd128Float(Int32MaxX4, scratch);
  vcmpleps(Operand(src), scratch);
  vmovmskps(scratch, temp);
  cmp32(temp, Imm32(0));
  j(Assembler::NotEqual, onConversionError);

  jump(rejoin);
}

void MacroAssemblerX86Shared::checkedConvertFloat32x4ToUint32x4(
    FloatRegister in, FloatRegister out, Register temp, FloatRegister tempF,
    Label* failed) {
  // Classify lane values into 4 disjoint classes:
  //
  //   N-lanes:             in <= -1.0
  //   A-lanes:      -1.0 < in <= 0x0.ffffffp31
  //   B-lanes:  0x1.0p31 <= in <= 0x0.ffffffp32
  //   V-lanes:  0x1.0p32 <= in, or isnan(in)
  //
  // We need to bail out to throw a RangeError if we see any N-lanes or
  // V-lanes.
  //
  // For A-lanes and B-lanes, we make two float -> int32 conversions:
  //
  //   A = cvttps2dq(in)
  //   B = cvttps2dq(in - 0x1.0p31f)
  //
  // Note that the subtraction for the B computation is exact for B-lanes.
  // There is no rounding, so B is the low 31 bits of the correctly converted
  // result.
  //
  // The cvttps2dq instruction produces 0x80000000 when the input is NaN or
  // out of range for a signed int32_t. This conveniently provides the missing
  // high bit for B, so the desired result is A for A-lanes and A|B for
  // B-lanes.
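  //
  // Worked example for a B-lane: in = 0x1.8p31 (3221225472.0f). Then
  // A = cvttps2dq(in) = 0x80000000 (out of range) and
  // B = cvttps2dq(in - 0x1.0p31f) = cvttps2dq(1073741824.0f) = 0x40000000,
  // so A|B = 0xC0000000 = 3221225472 as a uint32_t, as desired.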

  ScratchSimd128Scope scratch(asMasm());

  // TODO: If the majority of lanes are A-lanes, it could be faster to compute
  // A first, use vmovmskps to check for any non-A-lanes and handle them in
  // ool code. OTOH, if we're wrong about the lane distribution, that would be
  // slower.

  // Compute B in |scratch|.
  static const float Adjust = 0x80000000;  // 0x1.0p31f for the benefit of MSVC.
  static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust);
  asMasm().loadConstantSimd128Float(Bias, scratch);
  packedAddFloat32(Operand(in), scratch);
  convertFloat32x4ToInt32x4(scratch, scratch);

  // Compute A in |out|. This is the last time we use |in| and the first time
  // we use |out|, so we can tolerate if they are the same register.
  convertFloat32x4ToInt32x4(in, out);

  // We can identify A-lanes by the sign bits in A: Any A-lanes will be
  // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a
  // mask of non-A-lanes into |tempF|.
  zeroSimd128Float(tempF);
  vpcmpgtd(Operand(out), tempF, tempF);

  // Clear the A-lanes in B.
  bitwiseAndSimdInt(scratch, Operand(tempF), scratch);

  // Compute the final result: A for A-lanes, A|B for B-lanes.
  bitwiseOrSimdInt(out, Operand(scratch), out);

  // We still need to filter out the V-lanes. They would show up as 0x80000000
  // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are
  // the remaining negative lanes in B.
  vmovmskps(scratch, temp);
  cmp32(temp, Imm32(0));
  j(Assembler::NotEqual, failed);
}

void MacroAssemblerX86Shared::createInt32x4(Register lane0, Register lane1,
                                            Register lane2, Register lane3,
                                            FloatRegister dest) {
  if (AssemblerX86Shared::HasSSE41()) {
    vmovd(lane0, dest);
    vpinsrd(1, lane1, dest, dest);
    vpinsrd(2, lane2, dest, dest);
    vpinsrd(3, lane3, dest, dest);
    return;
  }

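  // Without SSE4.1 there is no vpinsrd, so assemble the vector in a stack
  // slot with four 32-bit stores and read it back with one aligned load.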
  asMasm().reserveStack(Simd128DataSize);
  store32(lane0, Address(StackPointer, 0 * sizeof(int32_t)));
  store32(lane1, Address(StackPointer, 1 * sizeof(int32_t)));
  store32(lane2, Address(StackPointer, 2 * sizeof(int32_t)));
  store32(lane3, Address(StackPointer, 3 * sizeof(int32_t)));
  loadAlignedSimd128Int(Address(StackPointer, 0), dest);
  asMasm().freeStack(Simd128DataSize);
}

void MacroAssemblerX86Shared::createFloat32x4(
    FloatRegister lane0, FloatRegister lane1, FloatRegister lane2,
    FloatRegister lane3, FloatRegister temp, FloatRegister output) {
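  // Interleave in two steps. vunpcklps(src, dst) packs the low lanes as
  // (dst[0], src[0], dst[1], src[1]), so below:
  //   temp   = (lane1[0], lane3[0], lane1[1], lane3[1])
  //   output = (lane0[0], lane2[0], lane0[1], lane2[1])
  // and the final vunpcklps gives (lane0[0], lane1[0], lane2[0], lane3[0]).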
  FloatRegister lane0Copy = reusedInputSimd128Float(lane0, output);
  FloatRegister lane1Copy = reusedInputSimd128Float(lane1, temp);
  vunpcklps(lane3, lane1Copy, temp);
  vunpcklps(lane2, lane0Copy, output);
  vunpcklps(temp, output, output);
}

void MacroAssemblerX86Shared::reinterpretSimd(bool isIntegerLaneType,
                                              FloatRegister input,
                                              FloatRegister output) {
  if (input.aliases(output)) {
    return;
  }
  if (isIntegerLaneType) {
    vmovdqa(input, output);
  } else {
    vmovaps(input, output);
  }
}

void MacroAssemblerX86Shared::extractLaneSimdBool(FloatRegister input,
                                                  Register output,
                                                  unsigned numLanes,
                                                  unsigned lane) {
  switch (numLanes) {
    case 4:
      extractLaneInt32x4(input, output, lane);
      break;
    case 8:
      // Get a lane, don't bother fixing the high bits since we'll mask below.
      extractLaneInt16x8(input, output, lane, SimdSign::NotApplicable);
      break;
    case 16:
      extractLaneInt8x16(input, output, lane, SimdSign::NotApplicable);
      break;
    default:
      MOZ_CRASH("Unhandled SIMD number of lanes");
  }
  // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits.
  asMasm().and32(Imm32(1), output);
}

void MacroAssemblerX86Shared::allTrueSimdBool(FloatRegister input,
                                              Register output) {
  // We know that the input lanes are boolean, so they are either 0 or -1.
  // The all-true vector has all 128 bits set, no matter the lane geometry.
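  // vpmovmskb gathers the sign bit of each of the 16 bytes into the low 16
  // bits of |output|, so the all-true mask is exactly 0xffff.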
  vpmovmskb(input, output);
  cmp32(output, Imm32(0xffff));
  emitSet(Assembler::Zero, output);
}

void MacroAssemblerX86Shared::anyTrueSimdBool(FloatRegister input,
                                              Register output) {
  vpmovmskb(input, output);
  cmp32(output, Imm32(0x0));
  emitSet(Assembler::NonZero, output);
}

void MacroAssemblerX86Shared::swizzleInt32x4(FloatRegister input,
                                             FloatRegister output,
                                             unsigned lanes[4]) {
  uint32_t mask = MacroAssembler::ComputeShuffleMask(lanes[0], lanes[1],
                                                     lanes[2], lanes[3]);
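  // For example, lanes = {3, 2, 1, 0} packs to the mask 0x1b, which reverses
  // the four lanes.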
  shuffleInt32(mask, input, output);
}

// For SIMD.js
void MacroAssemblerX86Shared::oldSwizzleInt8x16(FloatRegister input,
                                                FloatRegister output,
                                                const Maybe<Register>& temp,
                                                int8_t lanes[16]) {
  if (AssemblerX86Shared::HasSSSE3()) {
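    // vpshufb selects each output byte by the low four bits of the
    // corresponding control byte; lanes[] holds values 0..15, so no byte is
    // zeroed by a set high bit.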
    ScratchSimd128Scope scratch(asMasm());
    asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(lanes), scratch);
    FloatRegister inputCopy = reusedInputInt32x4(input, output);
    vpshufb(scratch, inputCopy, output);
    return;
  }

  // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
  MOZ_ASSERT(!!temp, "needs a temp for the memory fallback");
  asMasm().reserveStack(2 * Simd128DataSize);
  storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize));
  for (unsigned i = 0; i < 16; i++) {
    load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *temp);
    store8(*temp, Address(StackPointer, i));
  }
  loadAlignedSimd128Int(Address(StackPointer, 0), output);
  asMasm().freeStack(2 * Simd128DataSize);
}

static inline bool LanesMatch(unsigned lanes[4], unsigned x, unsigned y,
                              unsigned z, unsigned w) {
  return lanes[0] == x && lanes[1] == y && lanes[2] == z && lanes[3] == w;
}

void MacroAssemblerX86Shared::swizzleFloat32x4(FloatRegister input,
                                               FloatRegister output,
                                               unsigned lanes[4]) {
  if (AssemblerX86Shared::HasSSE3()) {
    if (LanesMatch(lanes, 0, 0, 2, 2)) {
      vmovsldup(input, output);
      return;
    }
    if (LanesMatch(lanes, 1, 1, 3, 3)) {
      vmovshdup(input, output);
      return;
    }
  }

  // TODO Here and below, arch specific lowering could identify this pattern
  // and use defineReuseInput to avoid this move (bug 1084404)
  if (LanesMatch(lanes, 2, 3, 2, 3)) {
    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
    vmovhlps(input, inputCopy, output);
    return;
  }

  if (LanesMatch(lanes, 0, 1, 0, 1)) {
    if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) {
      vmovddup(Operand(input), output);
      return;
    }
    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
    vmovlhps(input, inputCopy, output);
    return;
  }

  if (LanesMatch(lanes, 0, 0, 1, 1)) {
    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
    vunpcklps(input, inputCopy, output);
    return;
  }

  if (LanesMatch(lanes, 2, 2, 3, 3)) {
    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
    vunpckhps(input, inputCopy, output);
    return;
  }

  uint32_t x = lanes[0];
  uint32_t y = lanes[1];
  uint32_t z = lanes[2];
  uint32_t w = lanes[3];

  uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
  shuffleFloat32(mask, input, output);
}

void MacroAssemblerX86Shared::shuffleX4(FloatRegister lhs, Operand rhs,
                                        FloatRegister out,
                                        const Maybe<FloatRegister>& maybeTemp,
                                        unsigned lanes[4]) {
  uint32_t x = lanes[0];
  uint32_t y = lanes[1];
  uint32_t z = lanes[2];
  uint32_t w = lanes[3];

  // Check that lanes come from LHS in majority:
  unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
  MOZ_ASSERT(numLanesFromLHS >= 2);

  // When reading this method, remember that vshufps takes its first two
  // output lanes from the destination operand (the right operand) and its
  // last two output lanes from the source operand (the left operand).
  //
  // Legend for explanations:
  // - L: LHS
  // - R: RHS
  // - T: temporary

  uint32_t mask;

  // If all lanes came from a single vector, we should use swizzle instead.
  MOZ_ASSERT(numLanesFromLHS < 4);

  // If all values stay in their lane, this is a blend.
  if (AssemblerX86Shared::HasSSE41()) {
    if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) {
      vblendps(blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out);
      return;
    }
  }

  // One element of the second, all other elements of the first
  if (numLanesFromLHS == 3) {
    unsigned firstMask = -1, secondMask = -1;

    // register-register vmovss preserves the high lanes.
    if (LanesMatch(lanes, 4, 1, 2, 3) && rhs.kind() == Operand::FPREG) {
      vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out);
      return;
    }

    // SSE4.1 vinsertps can handle any single element.
    unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
    if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
      unsigned srcLane;
      unsigned dstLane;
      if (x >= 4) {
        srcLane = x - 4;
        dstLane = 0;
      } else if (y >= 4) {
        srcLane = y - 4;
        dstLane = 1;
      } else if (z >= 4) {
        srcLane = z - 4;
        dstLane = 2;
      } else {
        MOZ_ASSERT(w >= 4);
        srcLane = w - 4;
        dstLane = 3;
      }
      vinsertps(vinsertpsMask(srcLane, dstLane), rhs, lhs, out);
      return;
    }

    MOZ_ASSERT(!!maybeTemp);
    FloatRegister rhsCopy = *maybeTemp;
    loadAlignedSimd128Float(rhs, rhsCopy);

    if (x < 4 && y < 4) {
      if (w >= 4) {
        w %= 4;
        // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy)
        firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
        // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out)
        secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0);
      } else {
        MOZ_ASSERT(z >= 4);
        z %= 4;
        // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy)
        firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
        // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out)
        secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2);
      }

      vshufps(firstMask, lhs, rhsCopy, rhsCopy);
      vshufps(secondMask, rhsCopy, lhs, out);
      return;
    }

    MOZ_ASSERT(z < 4 && w < 4);

    if (y >= 4) {
      y %= 4;
      // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy)
      firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
      // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out)
      secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w);
    } else {
      MOZ_ASSERT(x >= 4);
      x %= 4;
      // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy)
      firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
      // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out)
      secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w);
    }

    vshufps(firstMask, lhs, rhsCopy, rhsCopy);
    if (AssemblerX86Shared::HasAVX()) {
      vshufps(secondMask, lhs, rhsCopy, out);
    } else {
      vshufps(secondMask, lhs, rhsCopy, rhsCopy);
      moveSimd128Float(rhsCopy, out);
    }
    return;
  }

  // Two elements from one vector, two other elements from the other
  MOZ_ASSERT(numLanesFromLHS == 2);

  // TODO Here and below, the symmetric case would be handier since it avoids
  // a move, but it can't be reached because operands would get swapped
  // (bug 1084404).
  if (LanesMatch(lanes, 2, 3, 6, 7)) {
    ScratchSimd128Scope scratch(asMasm());
    if (AssemblerX86Shared::HasAVX()) {
      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
      vmovhlps(lhs, rhsCopy, out);
    } else {
      loadAlignedSimd128Float(rhs, scratch);
      vmovhlps(lhs, scratch, scratch);
      moveSimd128Float(scratch, out);
    }
    return;
  }

  if (LanesMatch(lanes, 0, 1, 4, 5)) {
    FloatRegister rhsCopy;
    ScratchSimd128Scope scratch(asMasm());
    if (rhs.kind() == Operand::FPREG) {
      // No need to make an actual copy, since the operand is already
      // in a register, and it won't be clobbered by the vmovlhps.
      rhsCopy = FloatRegister::FromCode(rhs.fpu());
    } else {
      loadAlignedSimd128Float(rhs, scratch);
      rhsCopy = scratch;
    }
    vmovlhps(rhsCopy, lhs, out);
    return;
  }

  if (LanesMatch(lanes, 0, 4, 1, 5)) {
    vunpcklps(rhs, lhs, out);
    return;
  }

  // TODO swapped case would be better (bug 1084404)
  if (LanesMatch(lanes, 4, 0, 5, 1)) {
    ScratchSimd128Scope scratch(asMasm());
    if (AssemblerX86Shared::HasAVX()) {
      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
      vunpcklps(lhs, rhsCopy, out);
    } else {
      loadAlignedSimd128Float(rhs, scratch);
      vunpcklps(lhs, scratch, scratch);
      moveSimd128Float(scratch, out);
    }
    return;
  }

  if (LanesMatch(lanes, 2, 6, 3, 7)) {
    vunpckhps(rhs, lhs, out);
    return;
  }

  // TODO swapped case would be better (bug 1084404)
  if (LanesMatch(lanes, 6, 2, 7, 3)) {
    ScratchSimd128Scope scratch(asMasm());
    if (AssemblerX86Shared::HasAVX()) {
      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
      vunpckhps(lhs, rhsCopy, out);
    } else {
      loadAlignedSimd128Float(rhs, scratch);
      vunpckhps(lhs, scratch, scratch);
      moveSimd128Float(scratch, out);
    }
    return;
  }

  // In one vshufps
  if (x < 4 && y < 4) {
    mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
    vshufps(mask, rhs, lhs, out);
    return;
  }

  // At creation, we should have explicitly swapped in this case.
  MOZ_ASSERT(!(z >= 4 && w >= 4));

  // In two vshufps, for the most generic case:
  uint32_t firstMask[4], secondMask[4];
  unsigned i = 0, j = 2, k = 0;

#define COMPUTE_MASK(lane)   \
  if (lane >= 4) {           \
    firstMask[j] = lane % 4; \
    secondMask[k++] = j++;   \
  } else {                   \
    firstMask[i] = lane;     \
    secondMask[k++] = i++;   \
  }

  COMPUTE_MASK(x)
  COMPUTE_MASK(y)
  COMPUTE_MASK(z)
  COMPUTE_MASK(w)
#undef COMPUTE_MASK

  MOZ_ASSERT(i == 2 && j == 4 && k == 4);
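  // For example, lanes = (0, 4, 3, 7) yields firstMask = (0, 3, 0, 3) and
  // secondMask = (0, 2, 1, 3): the first vshufps builds (L0, L3, R0, R3) in
  // lhs, and the second reorders it into (L0, R0, L3, R3).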

  mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
                                            firstMask[2], firstMask[3]);
  vshufps(mask, rhs, lhs, lhs);

  mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
                                            secondMask[2], secondMask[3]);
  vshufps(mask, lhs, lhs, lhs);
}

void MacroAssemblerX86Shared::minNumFloat32x4(FloatRegister lhs, Operand rhs,
                                              FloatRegister temp,
                                              FloatRegister output) {
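  // minNum semantics: if exactly one operand is NaN, return the other one,
  // and min(-0, +0) is -0. vminps returns its second source when the inputs
  // are NaN or both zero, so we OR the sign bit back in where lhs is -0.0,
  // and a final blend picks lhs wherever rhs is NaN.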
  ScratchSimd128Scope scratch(asMasm());
  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)),
                                  temp);

  FloatRegister mask = scratch;
  FloatRegister tmpCopy = reusedInputSimd128Float(temp, scratch);
  vpcmpeqd(Operand(lhs), tmpCopy, mask);
  vandps(temp, mask, mask);

  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
  vminps(rhs, lhsCopy, temp);
  vorps(mask, temp, temp);

  if (AssemblerX86Shared::HasAVX()) {
    MOZ_CRASH("Can do better by avoiding the movaps");
  } else {
    vmovaps(rhs, mask);
    vcmpneqps(rhs, mask);
  }

  if (AssemblerX86Shared::HasAVX()) {
    vblendvps(mask, lhs, temp, output);
  } else {
    // Emulate vblendvps. With SSE4.1 we could use blendvps directly, but
    // it is awkward since it requires the mask to be in xmm0.
    if (lhs != output) {
      moveSimd128Float(lhs, output);
    }
    vandps(Operand(mask), output, output);
    vandnps(Operand(temp), mask, mask);
    vorps(Operand(mask), output, output);
  }
}

void MacroAssemblerX86Shared::maxNumFloat32x4(FloatRegister lhs, Operand rhs,
                                              FloatRegister temp,
                                              FloatRegister output) {
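  // maxNum semantics, mirroring minNumFloat32x4 above: if exactly one operand
  // is NaN, return the other one, and max(-0, +0) is +0. The mask tracks the
  // lanes where lhs is +0.0 so the sign bit of the vmaxps result can be
  // cleared there, and a final blend picks lhs wherever rhs is NaN.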
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister mask = scratch;

  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(0), mask);
  vpcmpeqd(Operand(lhs), mask, mask);

  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)),
                                  temp);
  vandps(temp, mask, mask);

  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
  vmaxps(rhs, lhsCopy, temp);
  vandnps(Operand(temp), mask, mask);

  // Ensure temp always contains the temporary result
  mask = temp;
  temp = scratch;

  if (AssemblerX86Shared::HasAVX()) {
    MOZ_CRASH("Can do better by avoiding the movaps");
  } else {
    vmovaps(rhs, mask);
    vcmpneqps(rhs, mask);
  }

  if (AssemblerX86Shared::HasAVX()) {
    vblendvps(mask, lhs, temp, output);
  } else {
    // Emulate vblendvps. With SSE4.1 we could use blendvps directly, but
    // it is awkward since it requires the mask to be in xmm0.
    if (lhs != output) {
      moveSimd128Float(lhs, output);
    }
    vandps(Operand(mask), output, output);
    vandnps(Operand(temp), mask, mask);
    vorps(Operand(mask), output, output);
  }
}

void MacroAssemblerX86Shared::selectX4(FloatRegister mask, FloatRegister onTrue,
                                       FloatRegister onFalse,
                                       FloatRegister temp,
                                       FloatRegister output) {
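  // vblendvps selects each lane from |onTrue| where the sign bit of the
  // corresponding mask lane is set, and from |onFalse| otherwise; boolean
  // lanes are 0 or -1, so the sign bit alone decides.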
  if (AssemblerX86Shared::HasAVX()) {
    vblendvps(mask, onTrue, onFalse, output);
  } else {
    selectSimd128(mask, onTrue, onFalse, temp, output);
  }
}