/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "jit/MacroAssembler.h"
#include "jit/x86-shared/MacroAssembler-x86-shared.h"

#include "jit/MacroAssembler-inl.h"

using namespace js;
using namespace js::jit;

using mozilla::DebugOnly;
using mozilla::FloatingPoint;
using mozilla::Maybe;
using mozilla::SpecificNaN;

// The following routines are from the old asm.js implementation but are UNUSED
// in the wasm implementation currently.  They are preserved here because it's
// sad to throw out working code.  They are declared in the header file.
//
// Before using these, they should minimally be moved to
// MacroAssembler-x86-shared-SIMD.cpp, and it would be a wrong move to assume
// that they are correct according to the wasm spec.

void MacroAssemblerX86Shared::checkedConvertFloat32x4ToInt32x4(
    FloatRegister src, FloatRegister dest, Register temp, Label* oolEntry,
    Label* rejoin) {
  // Does the conversion and jumps to the OOL entry if the result value
  // is the undefined integer pattern.
  static const SimdConstant InvalidResult =
      SimdConstant::SplatX4(int32_t(-2147483648));
  convertFloat32x4ToInt32x4(src, dest);

  ScratchSimd128Scope scratch(asMasm());
  asMasm().loadConstantSimd128Int(InvalidResult, scratch);
  vpcmpeqd(Operand(dest), scratch, scratch);
  // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of
  // the two following instructions.
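  // vmovmskps packs the sign bit of each lane into the low four bits of
  // |temp|, so a nonzero |temp| means some lane matched InvalidResult.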
  vmovmskps(scratch, temp);
  cmp32(temp, Imm32(0));
  j(Assembler::NotEqual, oolEntry);
  bind(rejoin);
}

void MacroAssemblerX86Shared::oolConvertFloat32x4ToInt32x4(
    FloatRegister src, Register temp, Label* rejoin, Label* onConversionError) {
  static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f);
  static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f);

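  // Bail out unless INT32_MIN <= src in every lane; the compare is false for
  // NaN inputs, so NaN lanes fail this check as well. Then reject any lane
  // with src >= 0x1.0p31f (2147483647.f rounds up to that value).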
  ScratchSimd128Scope scratch(asMasm());
  asMasm().loadConstantSimd128Float(Int32MinX4, scratch);
  vcmpleps(Operand(src), scratch);
  vmovmskps(scratch, temp);
  cmp32(temp, Imm32(15));
  j(Assembler::NotEqual, onConversionError);

  asMasm().loadConstantSimd128Float(Int32MaxX4, scratch);
  vcmpleps(Operand(src), scratch);
  vmovmskps(scratch, temp);
  cmp32(temp, Imm32(0));
  j(Assembler::NotEqual, onConversionError);

  jump(rejoin);
}

void MacroAssemblerX86Shared::checkedConvertFloat32x4ToUint32x4(
    FloatRegister in, FloatRegister out, Register temp, FloatRegister tempF,
    Label* failed) {
  // Classify lane values into 4 disjoint classes:
  //
  //   N-lanes:             in <= -1.0
  //   A-lanes:      -1.0 < in <= 0x0.ffffffp31
  //   B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32
  //   V-lanes: 0x1.0p32 <= in, or isnan(in)
  //
  // We need to bail out to throw a RangeError if we see any N-lanes or
  // V-lanes.
  //
  // For A-lanes and B-lanes, we make two float -> int32 conversions:
  //
  //   A = cvttps2dq(in)
  //   B = cvttps2dq(in - 0x1.0p31f)
  //
  // Note that the subtraction for the B computation is exact for B-lanes.
  // There is no rounding, so B is the low 31 bits of the correctly converted
  // result.
  //
  // The cvttps2dq instruction produces 0x80000000 when the input is NaN or
  // out of range for a signed int32_t. This conveniently provides the missing
  // high bit for B, so the desired result is A for A-lanes and A|B for
  // B-lanes.
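  //
  // For example, a B-lane input of 0x1.8p31 (3221225472.0f) gives
  // A = 0x80000000 (out of signed range) and
  // B = cvttps2dq(0x1.8p31 - 0x1.0p31) = 0x40000000, so A|B = 0xC0000000,
  // which is 3221225472 when read as a uint32_t.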

  ScratchSimd128Scope scratch(asMasm());

  // TODO: If the majority of lanes are A-lanes, it could be faster to compute
  // A first, use vmovmskps to check for any non-A-lanes and handle them in
  // ool code. OTOH, if we're wrong about the lane distribution, that would be
  // slower.

  // Compute B in |scratch|.
  static const float Adjust = 0x80000000;  // 0x1.0p31f for the benefit of MSVC.
  static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust);
  asMasm().loadConstantSimd128Float(Bias, scratch);
  packedAddFloat32(Operand(in), scratch);
  convertFloat32x4ToInt32x4(scratch, scratch);

  // Compute A in |out|. This is the last time we use |in| and the first time
  // we use |out|, so we can tolerate if they are the same register.
  convertFloat32x4ToInt32x4(in, out);

  // We can identify A-lanes by the sign bits in A: Any A-lanes will be
  // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a
  // mask of non-A-lanes into |tempF|.
  zeroSimd128Float(tempF);
  vpcmpgtd(Operand(out), tempF, tempF);

  // Clear the A-lanes in B.
  bitwiseAndSimdInt(scratch, Operand(tempF), scratch);

  // Compute the final result: A for A-lanes, A|B for B-lanes.
  bitwiseOrSimdInt(out, Operand(scratch), out);

  // We still need to filter out the V-lanes. They would show up as 0x80000000
  // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are
  // the remaining negative lanes in B.
  vmovmskps(scratch, temp);
  cmp32(temp, Imm32(0));
  j(Assembler::NotEqual, failed);
}

void MacroAssemblerX86Shared::createInt32x4(Register lane0, Register lane1,
                                            Register lane2, Register lane3,
                                            FloatRegister dest) {
  if (AssemblerX86Shared::HasSSE41()) {
    vmovd(lane0, dest);
    vpinsrd(1, lane1, dest, dest);
    vpinsrd(2, lane2, dest, dest);
    vpinsrd(3, lane3, dest, dest);
    return;
  }

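  // Without SSE4.1 there is no vpinsrd, so assemble the vector in memory and
  // load it back with a single aligned 16-byte move.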
  asMasm().reserveStack(Simd128DataSize);
  store32(lane0, Address(StackPointer, 0 * sizeof(int32_t)));
  store32(lane1, Address(StackPointer, 1 * sizeof(int32_t)));
  store32(lane2, Address(StackPointer, 2 * sizeof(int32_t)));
  store32(lane3, Address(StackPointer, 3 * sizeof(int32_t)));
  loadAlignedSimd128Int(Address(StackPointer, 0), dest);
  asMasm().freeStack(Simd128DataSize);
}

void MacroAssemblerX86Shared::createFloat32x4(
    FloatRegister lane0, FloatRegister lane1, FloatRegister lane2,
    FloatRegister lane3, FloatRegister temp, FloatRegister output) {
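  // Interleave the scalars pairwise: vunpcklps leaves (lane1, lane3) in the
  // low half of |temp| and (lane0, lane2) in the low half of |output|; a
  // final interleave of the two produces (lane0, lane1, lane2, lane3).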
  FloatRegister lane0Copy = reusedInputSimd128Float(lane0, output);
  FloatRegister lane1Copy = reusedInputSimd128Float(lane1, temp);
  vunpcklps(lane3, lane1Copy, temp);
  vunpcklps(lane2, lane0Copy, output);
  vunpcklps(temp, output, output);
}

void MacroAssemblerX86Shared::reinterpretSimd(bool isIntegerLaneType,
                                              FloatRegister input,
                                              FloatRegister output) {
  if (input.aliases(output)) {
    return;
  }
  if (isIntegerLaneType) {
    vmovdqa(input, output);
  } else {
    vmovaps(input, output);
  }
}

void MacroAssemblerX86Shared::extractLaneSimdBool(FloatRegister input,
                                                  Register output,
                                                  unsigned numLanes,
                                                  unsigned lane) {
  switch (numLanes) {
    case 4:
      extractLaneInt32x4(input, output, lane);
      break;
    case 8:
      // Get a lane, don't bother fixing the high bits since we'll mask below.
      extractLaneInt16x8(input, output, lane, SimdSign::NotApplicable);
      break;
    case 16:
      extractLaneInt8x16(input, output, lane, SimdSign::NotApplicable);
      break;
    default:
      MOZ_CRASH("Unhandled SIMD number of lanes");
  }
  // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits.
  asMasm().and32(Imm32(1), output);
}

void MacroAssemblerX86Shared::allTrueSimdBool(FloatRegister input,
                                              Register output) {
  // We know that the input lanes are boolean, so they are either 0 or -1.
  // The all-true vector has all 128 bits set, no matter the lane geometry.
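  // vpmovmskb collects one bit per byte, so 0xffff means every byte is 0xff.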
  vpmovmskb(input, output);
  cmp32(output, Imm32(0xffff));
  emitSet(Assembler::Zero, output);
}

void MacroAssemblerX86Shared::anyTrueSimdBool(FloatRegister input,
                                              Register output) {
  vpmovmskb(input, output);
  cmp32(output, Imm32(0x0));
  emitSet(Assembler::NonZero, output);
}

void MacroAssemblerX86Shared::swizzleInt32x4(FloatRegister input,
                                             FloatRegister output,
                                             unsigned lanes[4]) {
  uint32_t mask = MacroAssembler::ComputeShuffleMask(lanes[0], lanes[1],
                                                     lanes[2], lanes[3]);
  shuffleInt32(mask, input, output);
}

// For SIMD.js
void MacroAssemblerX86Shared::oldSwizzleInt8x16(FloatRegister input,
                                                FloatRegister output,
                                                const Maybe<Register>& temp,
                                                int8_t lanes[16]) {
  if (AssemblerX86Shared::HasSSSE3()) {
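    // vpshufb replaces each byte of the input with the byte selected by the
    // corresponding entry of the control mask, which here is the lane list.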
    ScratchSimd128Scope scratch(asMasm());
    asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(lanes), scratch);
    FloatRegister inputCopy = reusedInputInt32x4(input, output);
    vpshufb(scratch, inputCopy, output);
    return;
  }

  // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
  MOZ_ASSERT(!!temp, "needs a temp for the memory fallback");
  asMasm().reserveStack(2 * Simd128DataSize);
  storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize));
  for (unsigned i = 0; i < 16; i++) {
    load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *temp);
    store8(*temp, Address(StackPointer, i));
  }
  loadAlignedSimd128Int(Address(StackPointer, 0), output);
  asMasm().freeStack(2 * Simd128DataSize);
}

static inline bool LanesMatch(unsigned lanes[4], unsigned x, unsigned y,
                              unsigned z, unsigned w) {
  return lanes[0] == x && lanes[1] == y && lanes[2] == z && lanes[3] == w;
}

void MacroAssemblerX86Shared::swizzleFloat32x4(FloatRegister input,
                                               FloatRegister output,
                                               unsigned lanes[4]) {
  if (AssemblerX86Shared::HasSSE3()) {
    if (LanesMatch(lanes, 0, 0, 2, 2)) {
      vmovsldup(input, output);
      return;
    }
    if (LanesMatch(lanes, 1, 1, 3, 3)) {
      vmovshdup(input, output);
      return;
    }
  }

  // TODO Here and below, arch specific lowering could identify this pattern
  // and use defineReuseInput to avoid this move (bug 1084404)
  if (LanesMatch(lanes, 2, 3, 2, 3)) {
    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
    vmovhlps(input, inputCopy, output);
    return;
  }

  if (LanesMatch(lanes, 0, 1, 0, 1)) {
    if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) {
      vmovddup(Operand(input), output);
      return;
    }
    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
    vmovlhps(input, inputCopy, output);
    return;
  }

  if (LanesMatch(lanes, 0, 0, 1, 1)) {
    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
    vunpcklps(input, inputCopy, output);
    return;
  }

  if (LanesMatch(lanes, 2, 2, 3, 3)) {
    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
    vunpckhps(input, inputCopy, output);
    return;
  }

  uint32_t x = lanes[0];
  uint32_t y = lanes[1];
  uint32_t z = lanes[2];
  uint32_t w = lanes[3];

  uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
  shuffleFloat32(mask, input, output);
}

void MacroAssemblerX86Shared::shuffleX4(FloatRegister lhs, Operand rhs,
                                        FloatRegister out,
                                        const Maybe<FloatRegister>& maybeTemp,
                                        unsigned lanes[4]) {
  uint32_t x = lanes[0];
  uint32_t y = lanes[1];
  uint32_t z = lanes[2];
  uint32_t w = lanes[3];

  // Check that lanes come from LHS in majority:
  unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
  MOZ_ASSERT(numLanesFromLHS >= 2);

  // When reading this method, remember that vshufps takes the two first
  // inputs of the destination operand (right operand) and the two last
  // inputs of the source operand (left operand).
  //
  // Legend for explanations:
  // - L: LHS
  // - R: RHS
  // - T: temporary

  uint32_t mask;

  // If all lanes came from a single vector, we should use swizzle instead.
  MOZ_ASSERT(numLanesFromLHS < 4);

  // If all values stay in their lane, this is a blend.
  if (AssemblerX86Shared::HasSSE41()) {
    if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) {
      vblendps(blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out);
      return;
    }
  }

  // One element of the second, all other elements of the first
  if (numLanesFromLHS == 3) {
    unsigned firstMask = -1, secondMask = -1;

    // register-register vmovss preserves the high lanes.
    if (LanesMatch(lanes, 4, 1, 2, 3) && rhs.kind() == Operand::FPREG) {
      vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out);
      return;
    }

    // SSE4.1 vinsertps can handle any single element.
    unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
    if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
      unsigned srcLane;
      unsigned dstLane;
      if (x >= 4) {
        srcLane = x - 4;
        dstLane = 0;
      } else if (y >= 4) {
        srcLane = y - 4;
        dstLane = 1;
      } else if (z >= 4) {
        srcLane = z - 4;
        dstLane = 2;
      } else {
        MOZ_ASSERT(w >= 4);
        srcLane = w - 4;
        dstLane = 3;
      }
      vinsertps(vinsertpsMask(srcLane, dstLane), rhs, lhs, out);
      return;
    }

    MOZ_ASSERT(!!maybeTemp);
    FloatRegister rhsCopy = *maybeTemp;
    loadAlignedSimd128Float(rhs, rhsCopy);

    if (x < 4 && y < 4) {
      if (w >= 4) {
        w %= 4;
        // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy)
        firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
        // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out)
        secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0);
      } else {
        MOZ_ASSERT(z >= 4);
        z %= 4;
        // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy)
        firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
        // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out)
        secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2);
      }

      vshufps(firstMask, lhs, rhsCopy, rhsCopy);
      vshufps(secondMask, rhsCopy, lhs, out);
      return;
    }

    MOZ_ASSERT(z < 4 && w < 4);

    if (y >= 4) {
      y %= 4;
      // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy)
      firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
      // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out)
      secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w);
    } else {
      MOZ_ASSERT(x >= 4);
      x %= 4;
      // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy)
      firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
      // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out)
      secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w);
    }

    vshufps(firstMask, lhs, rhsCopy, rhsCopy);
    if (AssemblerX86Shared::HasAVX()) {
      vshufps(secondMask, lhs, rhsCopy, out);
    } else {
      vshufps(secondMask, lhs, rhsCopy, rhsCopy);
      moveSimd128Float(rhsCopy, out);
    }
    return;
  }

  // Two elements from one vector, two other elements from the other
  MOZ_ASSERT(numLanesFromLHS == 2);

  // TODO Here and below, symmetric case would be more handy to avoid a move,
  // but can't be reached because operands would get swapped (bug 1084404).
  if (LanesMatch(lanes, 2, 3, 6, 7)) {
    ScratchSimd128Scope scratch(asMasm());
    if (AssemblerX86Shared::HasAVX()) {
      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
      vmovhlps(lhs, rhsCopy, out);
    } else {
      loadAlignedSimd128Float(rhs, scratch);
      vmovhlps(lhs, scratch, scratch);
      moveSimd128Float(scratch, out);
    }
    return;
  }

  if (LanesMatch(lanes, 0, 1, 4, 5)) {
    FloatRegister rhsCopy;
    ScratchSimd128Scope scratch(asMasm());
    if (rhs.kind() == Operand::FPREG) {
      // No need to make an actual copy, since the operand is already
      // in a register, and it won't be clobbered by the vmovlhps.
      rhsCopy = FloatRegister::FromCode(rhs.fpu());
    } else {
      loadAlignedSimd128Float(rhs, scratch);
      rhsCopy = scratch;
    }
    vmovlhps(rhsCopy, lhs, out);
    return;
  }

  if (LanesMatch(lanes, 0, 4, 1, 5)) {
    vunpcklps(rhs, lhs, out);
    return;
  }

  // TODO swapped case would be better (bug 1084404)
  if (LanesMatch(lanes, 4, 0, 5, 1)) {
    ScratchSimd128Scope scratch(asMasm());
    if (AssemblerX86Shared::HasAVX()) {
      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
      vunpcklps(lhs, rhsCopy, out);
    } else {
      loadAlignedSimd128Float(rhs, scratch);
      vunpcklps(lhs, scratch, scratch);
      moveSimd128Float(scratch, out);
    }
    return;
  }

  if (LanesMatch(lanes, 2, 6, 3, 7)) {
    vunpckhps(rhs, lhs, out);
    return;
  }

  // TODO swapped case would be better (bug 1084404)
  if (LanesMatch(lanes, 6, 2, 7, 3)) {
    ScratchSimd128Scope scratch(asMasm());
    if (AssemblerX86Shared::HasAVX()) {
      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
      vunpckhps(lhs, rhsCopy, out);
    } else {
      loadAlignedSimd128Float(rhs, scratch);
      vunpckhps(lhs, scratch, scratch);
      moveSimd128Float(scratch, out);
    }
    return;
  }

  // In one vshufps
  if (x < 4 && y < 4) {
    mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
    vshufps(mask, rhs, lhs, out);
    return;
  }

  // At creation, we should have explicitly swapped in this case.
  MOZ_ASSERT(!(z >= 4 && w >= 4));

  // In two vshufps, for the most generic case:
  uint32_t firstMask[4], secondMask[4];
  unsigned i = 0, j = 2, k = 0;

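  // The first vshufps gathers the two LHS lanes into positions 0-1 and the
  // two RHS lanes into positions 2-3 of |lhs|; the second vshufps then
  // permutes those four values into the requested order.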
#define COMPUTE_MASK(lane)   \
  if (lane >= 4) {           \
    firstMask[j] = lane % 4; \
    secondMask[k++] = j++;   \
  } else {                   \
    firstMask[i] = lane;     \
    secondMask[k++] = i++;   \
  }

  COMPUTE_MASK(x)
  COMPUTE_MASK(y)
  COMPUTE_MASK(z)
  COMPUTE_MASK(w)
#undef COMPUTE_MASK

  MOZ_ASSERT(i == 2 && j == 4 && k == 4);

  mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
                                            firstMask[2], firstMask[3]);
  vshufps(mask, rhs, lhs, lhs);

  mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
                                            secondMask[2], secondMask[3]);
  vshufps(mask, lhs, lhs, lhs);
}

void MacroAssemblerX86Shared::minNumFloat32x4(FloatRegister lhs, Operand rhs,
                                              FloatRegister temp,
                                              FloatRegister output) {
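  // minNum semantics: if exactly one operand is NaN, return the other one.
  // vminps already returns its second source (rhs) when either operand is
  // NaN, so only the lanes where rhs is NaN need the final blend back to lhs.
  // The 0x80000000 mask reinstates -0.0 when lhs is -0.0, because
  // vminps(-0.0, +0.0) returns +0.0.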
  ScratchSimd128Scope scratch(asMasm());
  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)),
                                  temp);

  FloatRegister mask = scratch;
  FloatRegister tmpCopy = reusedInputSimd128Float(temp, scratch);
  vpcmpeqd(Operand(lhs), tmpCopy, mask);
  vandps(temp, mask, mask);

  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
  vminps(rhs, lhsCopy, temp);
  vorps(mask, temp, temp);

  if (AssemblerX86Shared::HasAVX()) {
    MOZ_CRASH("Can do better by avoiding the movaps");
  } else {
    vmovaps(rhs, mask);
    vcmpneqps(rhs, mask);
  }

  if (AssemblerX86Shared::HasAVX()) {
    vblendvps(mask, lhs, temp, output);
  } else {
    // Emulate vblendvps.
    // With SSE 4.1 we could use blendvps; however, it's awkward since
    // it requires the mask to be in xmm0.
    if (lhs != output) {
      moveSimd128Float(lhs, output);
    }
    vandps(Operand(mask), output, output);
    vandnps(Operand(temp), mask, mask);
    vorps(Operand(mask), output, output);
  }
}

void MacroAssemblerX86Shared::maxNumFloat32x4(FloatRegister lhs, Operand rhs,
                                              FloatRegister temp,
                                              FloatRegister output) {
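  // maxNum semantics: if exactly one operand is NaN, return the other one.
  // As in minNumFloat32x4, vmaxps handles the case where lhs is NaN and the
  // final blend returns lhs where rhs is NaN. The 0x80000000 mask clears the
  // sign bit of the result where lhs is +0.0, so that vmaxps(+0.0, -0.0)
  // yields +0.0.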
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister mask = scratch;

  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(0), mask);
  vpcmpeqd(Operand(lhs), mask, mask);

  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)),
                                  temp);
  vandps(temp, mask, mask);

  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
  vmaxps(rhs, lhsCopy, temp);
  vandnps(Operand(temp), mask, mask);

  // Ensure temp always contains the temporary result
  mask = temp;
  temp = scratch;

  if (AssemblerX86Shared::HasAVX()) {
    MOZ_CRASH("Can do better by avoiding the movaps");
  } else {
    vmovaps(rhs, mask);
    vcmpneqps(rhs, mask);
  }

  if (AssemblerX86Shared::HasAVX()) {
    vblendvps(mask, lhs, temp, output);
  } else {
    // Emulate vblendvps.
    // With SSE 4.1 we could use blendvps; however, it's awkward since
    // it requires the mask to be in xmm0.
    if (lhs != output) {
      moveSimd128Float(lhs, output);
    }
    vandps(Operand(mask), output, output);
    vandnps(Operand(temp), mask, mask);
    vorps(Operand(mask), output, output);
  }
}

void MacroAssemblerX86Shared::selectX4(FloatRegister mask, FloatRegister onTrue,
                                       FloatRegister onFalse,
                                       FloatRegister temp,
                                       FloatRegister output) {
  if (AssemblerX86Shared::HasAVX()) {
    vblendvps(mask, onTrue, onFalse, output);
  } else {
    selectSimd128(mask, onTrue, onFalse, temp, output);
  }
}