1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2  * vim: set ts=8 sts=2 et sw=2 tw=80:
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #include "jit/MacroAssembler.h"
8 #include "jit/x86-shared/MacroAssembler-x86-shared.h"
9 
10 #include "jit/MacroAssembler-inl.h"
11 
12 using namespace js;
13 using namespace js::jit;
14 
15 using mozilla::DebugOnly;
16 using mozilla::FloatingPoint;
17 using mozilla::Maybe;
18 using mozilla::SpecificNaN;
19 
20 void MacroAssemblerX86Shared::checkedConvertFloat32x4ToInt32x4(
21     FloatRegister src, FloatRegister dest, Register temp, Label* oolEntry,
22     Label* rejoin) {
23   // Does the conversion and jumps to the OOL entry if the result value
24   // is the undefined integer pattern.
25   static const SimdConstant InvalidResult =
26       SimdConstant::SplatX4(int32_t(-2147483648));
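  // (0x80000000 is the "integer indefinite" value that cvttps2dq produces for
  // NaN and out-of-range inputs; see the longer comment in
  // checkedConvertFloat32x4ToUint32x4 below.)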
27   convertFloat32x4ToInt32x4(src, dest);
28 
29   ScratchSimd128Scope scratch(asMasm());
30   asMasm().loadConstantSimd128Int(InvalidResult, scratch);
31   packedEqualInt32x4(Operand(dest), scratch);
32   // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of
33   // the two following instructions.
34   vmovmskps(scratch, temp);
35   cmp32(temp, Imm32(0));
36   j(Assembler::NotEqual, oolEntry);
37   bind(rejoin);
38 }
39 
40 void MacroAssemblerX86Shared::oolConvertFloat32x4ToInt32x4(
41     FloatRegister src, Register temp, Label* rejoin, Label* onConversionError) {
42   static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f);
43   static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f);
44 
45   ScratchSimd128Scope scratch(asMasm());
46   asMasm().loadConstantSimd128Float(Int32MinX4, scratch);
47   vcmpleps(Operand(src), scratch, scratch);
48   vmovmskps(scratch, temp);
49   cmp32(temp, Imm32(15));
50   j(Assembler::NotEqual, onConversionError);
51 
52   asMasm().loadConstantSimd128Float(Int32MaxX4, scratch);
53   vcmpleps(Operand(src), scratch, scratch);
54   vmovmskps(scratch, temp);
55   cmp32(temp, Imm32(0));
56   j(Assembler::NotEqual, onConversionError);
57 
58   jump(rejoin);
59 }
60 
61 void MacroAssemblerX86Shared::checkedConvertFloat32x4ToUint32x4(
62     FloatRegister in, FloatRegister out, Register temp, FloatRegister tempF,
63     Label* failed) {
64   // Classify lane values into 4 disjoint classes:
65   //
66   //   N-lanes:             in <= -1.0
67   //   A-lanes:      -1.0 < in <= 0x0.ffffffp31
68   //   B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32
69   //   V-lanes: 0x1.0p32 <= in, or isnan(in)
70   //
71   // We need to bail out to throw a RangeError if we see any N-lanes or
72   // V-lanes.
73   //
74   // For A-lanes and B-lanes, we make two float -> int32 conversions:
75   //
76   //   A = cvttps2dq(in)
77   //   B = cvttps2dq(in - 0x1.0p31f)
78   //
79   // Note that the subtraction for the B computation is exact for B-lanes.
80   // There is no rounding, so B is the low 31 bits of the correctly converted
81   // result.
82   //
83   // The cvttps2dq instruction produces 0x80000000 when the input is NaN or
84   // out of range for a signed int32_t. This conveniently provides the missing
85   // high bit for B, so the desired result is A for A-lanes and A|B for
86   // B-lanes.
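  //
  // Illustrative example (values chosen for this comment): for a B-lane with
  // in = 0x1.8p31f (3221225472.0f), A = cvttps2dq(in) = 0x80000000 and
  // B = cvttps2dq(in - 0x1.0p31f) = cvttps2dq(0x1.0p30f) = 0x40000000, so
  // A|B = 0xC0000000, which is 3221225472 as a uint32.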
87 
88   ScratchSimd128Scope scratch(asMasm());
89 
90   // TODO: If the majority of lanes are A-lanes, it could be faster to compute
91   // A first, use vmovmskps to check for any non-A-lanes and handle them in
92   // ool code. OTOH, if we're wrong about the lane distribution, that would be
93   // slower.
94 
95   // Compute B in |scratch|.
96   static const float Adjust = 0x80000000;  // 0x1.0p31f for the benefit of MSVC.
97   static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust);
98   asMasm().loadConstantSimd128Float(Bias, scratch);
99   packedAddFloat32(Operand(in), scratch);
100   convertFloat32x4ToInt32x4(scratch, scratch);
101 
102   // Compute A in |out|. This is the last time we use |in| and the first time
103   // we use |out|, so we can tolerate if they are the same register.
104   convertFloat32x4ToInt32x4(in, out);
105 
106   // We can identify A-lanes by the sign bits in A: Any A-lanes will be
107   // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a
108   // mask of non-A-lanes into |tempF|.
109   zeroSimd128Float(tempF);
110   packedGreaterThanInt32x4(Operand(out), tempF);
111 
112   // Clear the A-lanes in B.
113   bitwiseAndSimdInt(scratch, Operand(tempF), scratch);
114 
115   // Compute the final result: A for A-lanes, A|B for B-lanes.
116   bitwiseOrSimdInt(out, Operand(scratch), out);
117 
118   // We still need to filter out the V-lanes. They would show up as 0x80000000
119   // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are
120   // the remaining negative lanes in B.
121   vmovmskps(scratch, temp);
122   cmp32(temp, Imm32(0));
123   j(Assembler::NotEqual, failed);
124 }
125 
126 void MacroAssemblerX86Shared::createInt32x4(Register lane0, Register lane1,
127                                             Register lane2, Register lane3,
128                                             FloatRegister dest) {
129   if (AssemblerX86Shared::HasSSE41()) {
130     vmovd(lane0, dest);
131     vpinsrd(1, lane1, dest, dest);
132     vpinsrd(2, lane2, dest, dest);
133     vpinsrd(3, lane3, dest, dest);
134     return;
135   }
136 
137   asMasm().reserveStack(Simd128DataSize);
138   store32(lane0, Address(StackPointer, 0 * sizeof(int32_t)));
139   store32(lane1, Address(StackPointer, 1 * sizeof(int32_t)));
140   store32(lane2, Address(StackPointer, 2 * sizeof(int32_t)));
141   store32(lane3, Address(StackPointer, 3 * sizeof(int32_t)));
142   loadAlignedSimd128Int(Address(StackPointer, 0), dest);
143   asMasm().freeStack(Simd128DataSize);
144 }
145 
146 void MacroAssemblerX86Shared::createFloat32x4(
147     FloatRegister lane0, FloatRegister lane1, FloatRegister lane2,
148     FloatRegister lane3, FloatRegister temp, FloatRegister output) {
149   FloatRegister lane0Copy = reusedInputSimd128Float(lane0, output);
150   FloatRegister lane1Copy = reusedInputSimd128Float(lane1, temp);
151   vunpcklps(lane3, lane1Copy, temp);
152   vunpcklps(lane2, lane0Copy, output);
153   vunpcklps(temp, output, output);
154 }
155 
156 void MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) {
157   vmovd(input, output);
158   if (AssemblerX86Shared::HasSSSE3()) {
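    // An all-zero pshufb control byte selects source byte 0, so shuffling with
    // a zeroed control register broadcasts the low byte to all 16 lanes.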
159     zeroSimd128Int(ScratchSimd128Reg);
160     vpshufb(ScratchSimd128Reg, output, output);
161   } else {
162     // Use two shifts to duplicate the low 8 bits into the low 16 bits.
163     vpsllw(Imm32(8), output, output);
164     vmovdqa(output, ScratchSimd128Reg);
165     vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg);
166     vpor(ScratchSimd128Reg, output, output);
167     // Then do an X8 splat.
168     vpshuflw(0, output, output);
169     vpshufd(0, output, output);
170   }
171 }
172 
173 void MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) {
174   vmovd(input, output);
175   vpshuflw(0, output, output);
176   vpshufd(0, output, output);
177 }
178 
179 void MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output) {
180   vmovd(input, output);
181   vpshufd(0, output, output);
182 }
183 
184 void MacroAssemblerX86Shared::splatX4(FloatRegister input,
185                                       FloatRegister output) {
186   FloatRegister inputCopy = reusedInputSimd128Float(input, output);
187   vshufps(0, inputCopy, inputCopy, output);
188 }
189 
190 void MacroAssemblerX86Shared::splatX2(FloatRegister input,
191                                       FloatRegister output) {
192   FloatRegister inputCopy = reusedInputSimd128Float(input, output);
193   vshufpd(0, inputCopy, inputCopy, output);
194 }
195 
196 void MacroAssemblerX86Shared::reinterpretSimd(bool isIntegerLaneType,
197                                               FloatRegister input,
198                                               FloatRegister output) {
199   if (input.aliases(output)) {
200     return;
201   }
202   if (isIntegerLaneType) {
203     vmovdqa(input, output);
204   } else {
205     vmovaps(input, output);
206   }
207 }
208 
209 void MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input,
210                                                  Register output,
211                                                  unsigned lane) {
212   if (lane == 0) {
213     // The value we want to extract is in the low double-word
214     moveLowInt32(input, output);
215   } else if (AssemblerX86Shared::HasSSE41()) {
216     vpextrd(lane, input, output);
217   } else {
218     uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
219     shuffleInt32(mask, input, ScratchSimd128Reg);
220     moveLowInt32(ScratchSimd128Reg, output);
221   }
222 }
223 
224 void MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input,
225                                                    FloatRegister output,
226                                                    unsigned lane) {
227   if (lane == 0) {
228     // The value we want to extract is in the low double-word
229     if (input != output) {
230       moveFloat32(input, output);
231     }
232   } else if (lane == 2) {
233     moveHighPairToLowPairFloat32(input, output);
234   } else {
235     uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
236     shuffleFloat32(mask, input, output);
237   }
238 }
239 
240 void MacroAssemblerX86Shared::extractLaneFloat64x2(FloatRegister input,
241                                                    FloatRegister output,
242                                                    unsigned lane) {
243   if (lane == 0) {
244     // The value we want to extract is in the low quadword
245     if (input != output) {
246       moveDouble(input, output);
247     }
248   } else {
249     vpalignr(Operand(input), output, 8);
250   }
251 }
252 
253 void MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input,
254                                                  Register output, unsigned lane,
255                                                  SimdSign sign) {
256   // Unlike pextrd and pextrb, this is available in SSE2.
257   vpextrw(lane, input, output);
258   if (sign == SimdSign::Signed) {
259     movswl(output, output);
260   }
261 }
262 
263 void MacroAssemblerX86Shared::extractLaneInt8x16(FloatRegister input,
264                                                  Register output, unsigned lane,
265                                                  SimdSign sign) {
266   if (AssemblerX86Shared::HasSSE41()) {
267     vpextrb(lane, input, output);
268     // vpextrb clears the high bits, so no further extension required.
269     if (sign == SimdSign::Unsigned) {
270       sign = SimdSign::NotApplicable;
271     }
272   } else {
273     // Extract the 16-bit word containing our lane, then shift right by 8
274     // bits if we want its high byte.
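    // For example, lane 5 lives in word 5/2 = 2 and, being an odd lane, is the
    // high byte of that word, so it needs the 8-bit shift below.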
275     extractLaneInt16x8(input, output, lane / 2, SimdSign::Unsigned);
276     if (lane % 2) {
277       shrl(Imm32(8), output);
278       // The shrl handles the zero-extension. Don't repeat it.
279       if (sign == SimdSign::Unsigned) {
280         sign = SimdSign::NotApplicable;
281       }
282     }
283   }
284 
285   // We have the right low 8 bits in |output|, but we may need to fix the high
286   // bits. Note that this requires |output| to be one of the %eax-%edx
287   // registers.
288   switch (sign) {
289     case SimdSign::Signed:
290       movsbl(output, output);
291       break;
292     case SimdSign::Unsigned:
293       movzbl(output, output);
294       break;
295     case SimdSign::NotApplicable:
296       // No adjustment needed.
297       break;
298   }
299 }
300 
301 void MacroAssemblerX86Shared::extractLaneSimdBool(FloatRegister input,
302                                                   Register output,
303                                                   unsigned numLanes,
304                                                   unsigned lane) {
305   switch (numLanes) {
306     case 4:
307       extractLaneInt32x4(input, output, lane);
308       break;
309     case 8:
310       // Get a lane, don't bother fixing the high bits since we'll mask below.
311       extractLaneInt16x8(input, output, lane, SimdSign::NotApplicable);
312       break;
313     case 16:
314       extractLaneInt8x16(input, output, lane, SimdSign::NotApplicable);
315       break;
316     default:
317       MOZ_CRASH("Unhandled SIMD number of lanes");
318   }
319   // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits.
320   asMasm().and32(Imm32(1), output);
321 }
322 
323 void MacroAssemblerX86Shared::insertLaneSimdInt(FloatRegister input,
324                                                 Register value,
325                                                 FloatRegister output,
326                                                 unsigned lane,
327                                                 unsigned numLanes) {
328   if (numLanes == 8) {
329     // Available in SSE 2.
330     vpinsrw(lane, value, input, output);
331     return;
332   }
333 
334   // Note that, unlike float32x4, we cannot use vmovd if the inserted
335   // value goes into the first component, as vmovd clears out the higher lanes
336   // of the output.
337   if (AssemblerX86Shared::HasSSE41()) {
338     // TODO: Teach Lowering that we don't need defineReuseInput if we have AVX.
339     switch (numLanes) {
340       case 4:
341         vpinsrd(lane, value, input, output);
342         return;
343       case 16:
344         vpinsrb(lane, value, input, output);
345         return;
346     }
347   }
348 
349   asMasm().reserveStack(Simd128DataSize);
350   storeAlignedSimd128Int(input, Address(StackPointer, 0));
351   switch (numLanes) {
352     case 4:
353       store32(value, Address(StackPointer, lane * sizeof(int32_t)));
354       break;
355     case 16:
356       // Note that this requires `value` to be in one of the registers where the
357       // low 8 bits are addressable (%eax - %edx on x86, all of them on x86-64).
358       store8(value, Address(StackPointer, lane * sizeof(int8_t)));
359       break;
360     default:
361       MOZ_CRASH("Unsupported SIMD numLanes");
362   }
363   loadAlignedSimd128Int(Address(StackPointer, 0), output);
364   asMasm().freeStack(Simd128DataSize);
365 }
366 
367 void MacroAssemblerX86Shared::insertLaneFloat32x4(FloatRegister input,
368                                                   FloatRegister value,
369                                                   FloatRegister output,
370                                                   unsigned lane) {
371   // This code can't work if this is not true.  That's probably a bug.
372   MOZ_RELEASE_ASSERT(input == output);
373 
374   if (lane == 0) {
375     if (value != output) {
376       vmovss(value, input, output);
377     }
378     return;
379   }
380 
381   if (AssemblerX86Shared::HasSSE41()) {
382     // The input value is in the low float32 of the 'value' FloatRegister.
383     vinsertps(vinsertpsMask(0, lane), value, output, output);
384     return;
385   }
386 
387   asMasm().reserveStack(Simd128DataSize);
388   storeAlignedSimd128Float(input, Address(StackPointer, 0));
389   asMasm().storeFloat32(value, Address(StackPointer, lane * sizeof(int32_t)));
390   loadAlignedSimd128Float(Address(StackPointer, 0), output);
391   asMasm().freeStack(Simd128DataSize);
392 }
393 
394 void MacroAssemblerX86Shared::insertLaneFloat64x2(FloatRegister input,
395                                                   FloatRegister value,
396                                                   FloatRegister output,
397                                                   unsigned lane) {
398   if (input == output && output == value) {
399     // No-op
400     return;
401   }
402 
403   if (input != output && value != output) {
404     // Merge input and value into output, so make input==output
405     vmovapd(input, output);
406     input = output;
407   }
408 
409   if (input == output) {
410     // Merge value into output
411     if (lane == 0) {
412       // move low qword of value into low qword of output
413       vmovsd(value, output, output);
414     } else {
415       // move low qword of value into high qword of output
416       vshufpd(0, value, output, output);
417     }
418   } else {
419     MOZ_ASSERT(value == output);
420     // Merge input into output
421     if (lane == 0) {
422       // move high qword of input into high qword of output
423       vshufpd(2, input, output, output);
424     } else {
425       // move low qword of output into high qword of output
426       vmovddup(output, output);
427       // move low qword of input into low qword of output
428       vmovsd(input, output, output);
429     }
430   }
431 }
432 
433 void MacroAssemblerX86Shared::allTrueSimdBool(FloatRegister input,
434                                               Register output) {
435   // We know that the input lanes are boolean, so they are either 0 or -1.
436   // The all-true vector has all 128 bits set, no matter the lane geometry.
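  // vpmovmskb gathers the top bit of each of the 16 bytes, so an all-true
  // vector yields exactly 0xffff.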
437   vpmovmskb(input, output);
438   cmp32(output, Imm32(0xffff));
439   emitSet(Assembler::Zero, output);
440 }
441 
442 void MacroAssemblerX86Shared::anyTrueSimdBool(FloatRegister input,
443                                               Register output) {
444   vpmovmskb(input, output);
445   cmp32(output, Imm32(0x0));
446   emitSet(Assembler::NonZero, output);
447 }
448 
449 void MacroAssemblerX86Shared::swizzleInt32x4(FloatRegister input,
450                                              FloatRegister output,
451                                              unsigned lanes[4]) {
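  // ComputeShuffleMask presumably packs each lane index into two bits of the
  // pshufd-style immediate (lanes[0] in bits 0-1 ... lanes[3] in bits 6-7), so
  // e.g. lanes (3, 2, 1, 0) gives the mask 0b00011011.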
452   uint32_t mask = MacroAssembler::ComputeShuffleMask(lanes[0], lanes[1],
453                                                      lanes[2], lanes[3]);
454   shuffleInt32(mask, input, output);
455 }
456 
457 // For SIMD.js
458 void MacroAssemblerX86Shared::oldSwizzleInt8x16(FloatRegister input,
459                                                 FloatRegister output,
460                                                 const Maybe<Register>& temp,
461                                                 int8_t lanes[16]) {
462   if (AssemblerX86Shared::HasSSSE3()) {
463     ScratchSimd128Scope scratch(asMasm());
464     asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(lanes), scratch);
465     FloatRegister inputCopy = reusedInputInt32x4(input, output);
466     vpshufb(scratch, inputCopy, output);
467     return;
468   }
469 
470   // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
471   MOZ_ASSERT(!!temp, "needs a temp for the memory fallback");
472   asMasm().reserveStack(2 * Simd128DataSize);
473   storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize));
474   for (unsigned i = 0; i < 16; i++) {
475     load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *temp);
476     store8(*temp, Address(StackPointer, i));
477   }
478   loadAlignedSimd128Int(Address(StackPointer, 0), output);
479   asMasm().freeStack(2 * Simd128DataSize);
480 }
481 
482 static inline bool LanesMatch(unsigned lanes[4], unsigned x, unsigned y,
483                               unsigned z, unsigned w) {
484   return lanes[0] == x && lanes[1] == y && lanes[2] == z && lanes[3] == w;
485 }
486 
487 void MacroAssemblerX86Shared::swizzleFloat32x4(FloatRegister input,
488                                                FloatRegister output,
489                                                unsigned lanes[4]) {
490   if (AssemblerX86Shared::HasSSE3()) {
491     if (LanesMatch(lanes, 0, 0, 2, 2)) {
492       vmovsldup(input, output);
493       return;
494     }
495     if (LanesMatch(lanes, 1, 1, 3, 3)) {
496       vmovshdup(input, output);
497       return;
498     }
499   }
500 
501   // TODO Here and below, arch specific lowering could identify this pattern
502   // and use defineReuseInput to avoid this move (bug 1084404)
503   if (LanesMatch(lanes, 2, 3, 2, 3)) {
504     FloatRegister inputCopy = reusedInputSimd128Float(input, output);
505     vmovhlps(input, inputCopy, output);
506     return;
507   }
508 
509   if (LanesMatch(lanes, 0, 1, 0, 1)) {
510     if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) {
511       vmovddup(input, output);
512       return;
513     }
514     FloatRegister inputCopy = reusedInputSimd128Float(input, output);
515     vmovlhps(input, inputCopy, output);
516     return;
517   }
518 
519   if (LanesMatch(lanes, 0, 0, 1, 1)) {
520     FloatRegister inputCopy = reusedInputSimd128Float(input, output);
521     vunpcklps(input, inputCopy, output);
522     return;
523   }
524 
525   if (LanesMatch(lanes, 2, 2, 3, 3)) {
526     FloatRegister inputCopy = reusedInputSimd128Float(input, output);
527     vunpckhps(input, inputCopy, output);
528     return;
529   }
530 
531   uint32_t x = lanes[0];
532   uint32_t y = lanes[1];
533   uint32_t z = lanes[2];
534   uint32_t w = lanes[3];
535 
536   uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
537   shuffleFloat32(mask, input, output);
538 }
539 
540 void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
541                                            FloatRegister output,
542                                            FloatRegister temp,
543                                            const uint8_t lanes[16]) {
544   MOZ_ASSERT(AssemblerX86Shared::HasSSSE3());
545   MOZ_ASSERT(lhs == output);
546   MOZ_ASSERT(lhs == rhs || !temp.isInvalid());
547 
548   // TODO: For SSE4.1, consider whether PBLENDVB would be better, even though
549   // it takes a variable mask that must be loaded into xmm0 (which must be free).
550 
551   // Set scratch = lanes to select from lhs.
552   int8_t mask[16];
553   for (unsigned i = 0; i < 16; i++) {
554     mask[i] = ~lanes[i];
555   }
556   ScratchSimd128Scope scratch(asMasm());
557   asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(mask), scratch);
558   if (lhs == rhs) {
559     asMasm().moveSimd128Int(rhs, temp);
560     rhs = temp;
561   }
562   vpand(Operand(scratch), lhs, lhs);
563   vpandn(Operand(rhs), scratch, scratch);
564   vpor(scratch, lhs, lhs);
565 }
566 
567 void MacroAssemblerX86Shared::blendInt16x8(FloatRegister lhs, FloatRegister rhs,
568                                            FloatRegister output,
569                                            const uint16_t lanes[8]) {
570   MOZ_ASSERT(AssemblerX86Shared::HasSSE41());
571   MOZ_ASSERT(lhs == output);
572 
573   uint32_t mask = 0;
574   for (unsigned i = 0; i < 8; i++) {
575     if (lanes[i]) {
576       mask |= (1 << i);
577     }
578   }
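  // For example, lanes = {0, 0, 1, 1, 0, 0, 0, 0} yields mask = 0b00001100,
  // i.e. words 2 and 3 are taken from rhs.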
579   vpblendw(mask, rhs, lhs, lhs);
580 }
581 
582 void MacroAssemblerX86Shared::shuffleInt8x16(
583     FloatRegister lhs, FloatRegister rhs, FloatRegister output,
584     const Maybe<FloatRegister>& maybeFloatTemp,
585     const Maybe<Register>& maybeTemp, const uint8_t lanes[16]) {
586   DebugOnly<bool> hasSSSE3 = AssemblerX86Shared::HasSSSE3();
587   MOZ_ASSERT(hasSSSE3 == !!maybeFloatTemp);
588   MOZ_ASSERT(!hasSSSE3 == !!maybeTemp);
589 
590   // Use pshufb if it is available.
591   if (AssemblerX86Shared::HasSSSE3()) {
592     ScratchSimd128Scope scratch(asMasm());
593 
594     // Use pshufb instructions to gather the lanes from each source vector.
595     // A negative index creates a zero lane, so the two vectors can be combined.
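    // For example, lanes[i] = 20 becomes index -1 (a zeroed byte) in the lhs
    // pass below and index 4 in the rhs pass.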
596 
597     // Set scratch = lanes from lhs.
598     int8_t idx[16];
599     for (unsigned i = 0; i < 16; i++) {
600       idx[i] = lanes[i] < 16 ? lanes[i] : -1;
601     }
602     asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx),
603                                     *maybeFloatTemp);
604     FloatRegister lhsCopy = reusedInputInt32x4(lhs, scratch);
605     vpshufb(*maybeFloatTemp, lhsCopy, scratch);
606 
607     // Set output = lanes from rhs.
608     // TODO: The alternative to loading this constant is to complement the
609     // one that is already in *maybeFloatTemp, which takes two instructions
610     // and a temp register: PCMPEQD tmp, tmp; PXOR *maybeFloatTemp, tmp.
611     // The scratch register is available here for that, but it's not clear
612     // that avoiding the load would be a win.
613     for (unsigned i = 0; i < 16; i++) {
614       idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1;
615     }
616     asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx),
617                                     *maybeFloatTemp);
618     FloatRegister rhsCopy = reusedInputInt32x4(rhs, output);
619     vpshufb(*maybeFloatTemp, rhsCopy, output);
620 
621     // Combine.
622     vpor(scratch, output, output);
623     return;
624   }
625 
626   // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
627   asMasm().reserveStack(3 * Simd128DataSize);
628   storeAlignedSimd128Int(lhs, Address(StackPointer, Simd128DataSize));
629   storeAlignedSimd128Int(rhs, Address(StackPointer, 2 * Simd128DataSize));
630   for (unsigned i = 0; i < 16; i++) {
631     load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]),
632                     *maybeTemp);
633     store8(*maybeTemp, Address(StackPointer, i));
634   }
635   loadAlignedSimd128Int(Address(StackPointer, 0), output);
636   asMasm().freeStack(3 * Simd128DataSize);
637 }
638 
639 void MacroAssemblerX86Shared::shuffleX4(FloatRegister lhs, Operand rhs,
640                                         FloatRegister out,
641                                         const Maybe<FloatRegister>& maybeTemp,
642                                         unsigned lanes[4]) {
643   uint32_t x = lanes[0];
644   uint32_t y = lanes[1];
645   uint32_t z = lanes[2];
646   uint32_t w = lanes[3];
647 
648   // Check that lanes come from LHS in majority:
649   unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
650   MOZ_ASSERT(numLanesFromLHS >= 2);
651 
652   // When reading this method, remember that vshufps takes the first two
653   // lanes of the result from the destination operand (right operand) and the
654   // last two lanes from the source operand (left operand).
655   //
656   // Legend for explanations:
657   // - L: LHS
658   // - R: RHS
659   // - T: temporary
660 
661   uint32_t mask;
662 
663   // If all lanes came from a single vector, we should use swizzle instead.
664   MOZ_ASSERT(numLanesFromLHS < 4);
665 
666   // If all values stay in their lane, this is a blend.
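  // For example, lanes (0, 5, 2, 7) keeps x and z from lhs and takes y and w
  // from rhs.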
667   if (AssemblerX86Shared::HasSSE41()) {
668     if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) {
669       vblendps(blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out);
670       return;
671     }
672   }
673 
674   // One element comes from the second vector, all others from the first.
675   if (numLanesFromLHS == 3) {
676     unsigned firstMask = -1, secondMask = -1;
677 
678     // register-register vmovss preserves the high lanes.
679     if (LanesMatch(lanes, 4, 1, 2, 3) && rhs.kind() == Operand::FPREG) {
680       vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out);
681       return;
682     }
683 
684     // SSE4.1 vinsertps can handle any single element.
685     unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
686     if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
687       unsigned srcLane;
688       unsigned dstLane;
689       if (x >= 4) {
690         srcLane = x - 4;
691         dstLane = 0;
692       } else if (y >= 4) {
693         srcLane = y - 4;
694         dstLane = 1;
695       } else if (z >= 4) {
696         srcLane = z - 4;
697         dstLane = 2;
698       } else {
699         MOZ_ASSERT(w >= 4);
700         srcLane = w - 4;
701         dstLane = 3;
702       }
703       vinsertps(vinsertpsMask(srcLane, dstLane), rhs, lhs, out);
704       return;
705     }
706 
707     MOZ_ASSERT(!!maybeTemp);
708     FloatRegister rhsCopy = *maybeTemp;
709     loadAlignedSimd128Float(rhs, rhsCopy);
710 
711     if (x < 4 && y < 4) {
712       if (w >= 4) {
713         w %= 4;
714         // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy)
715         firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
716         // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out)
717         secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0);
718       } else {
719         MOZ_ASSERT(z >= 4);
720         z %= 4;
721         // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy)
722         firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
723         // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out)
724         secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2);
725       }
726 
727       vshufps(firstMask, lhs, rhsCopy, rhsCopy);
728       vshufps(secondMask, rhsCopy, lhs, out);
729       return;
730     }
731 
732     MOZ_ASSERT(z < 4 && w < 4);
733 
734     if (y >= 4) {
735       y %= 4;
736       // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy)
737       firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
738       // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out)
739       secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w);
740     } else {
741       MOZ_ASSERT(x >= 4);
742       x %= 4;
743       // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy)
744       firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
745       // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out)
746       secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w);
747     }
748 
749     vshufps(firstMask, lhs, rhsCopy, rhsCopy);
750     if (AssemblerX86Shared::HasAVX()) {
751       vshufps(secondMask, lhs, rhsCopy, out);
752     } else {
753       vshufps(secondMask, lhs, rhsCopy, rhsCopy);
754       moveSimd128Float(rhsCopy, out);
755     }
756     return;
757   }
758 
759   // Two elements from one vector, two other elements from the other
760   MOZ_ASSERT(numLanesFromLHS == 2);
761 
762   // TODO Here and below, symmetric case would be more handy to avoid a move,
763   // but can't be reached because operands would get swapped (bug 1084404).
764   if (LanesMatch(lanes, 2, 3, 6, 7)) {
765     ScratchSimd128Scope scratch(asMasm());
766     if (AssemblerX86Shared::HasAVX()) {
767       FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
768       vmovhlps(lhs, rhsCopy, out);
769     } else {
770       loadAlignedSimd128Float(rhs, scratch);
771       vmovhlps(lhs, scratch, scratch);
772       moveSimd128Float(scratch, out);
773     }
774     return;
775   }
776 
777   if (LanesMatch(lanes, 0, 1, 4, 5)) {
778     FloatRegister rhsCopy;
779     ScratchSimd128Scope scratch(asMasm());
780     if (rhs.kind() == Operand::FPREG) {
781       // No need to make an actual copy, since the operand is already
782       // in a register, and it won't be clobbered by the vmovlhps.
783       rhsCopy = FloatRegister::FromCode(rhs.fpu());
784     } else {
785       loadAlignedSimd128Float(rhs, scratch);
786       rhsCopy = scratch;
787     }
788     vmovlhps(rhsCopy, lhs, out);
789     return;
790   }
791 
792   if (LanesMatch(lanes, 0, 4, 1, 5)) {
793     vunpcklps(rhs, lhs, out);
794     return;
795   }
796 
797   // TODO swapped case would be better (bug 1084404)
798   if (LanesMatch(lanes, 4, 0, 5, 1)) {
799     ScratchSimd128Scope scratch(asMasm());
800     if (AssemblerX86Shared::HasAVX()) {
801       FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
802       vunpcklps(lhs, rhsCopy, out);
803     } else {
804       loadAlignedSimd128Float(rhs, scratch);
805       vunpcklps(lhs, scratch, scratch);
806       moveSimd128Float(scratch, out);
807     }
808     return;
809   }
810 
811   if (LanesMatch(lanes, 2, 6, 3, 7)) {
812     vunpckhps(rhs, lhs, out);
813     return;
814   }
815 
816   // TODO swapped case would be better (bug 1084404)
817   if (LanesMatch(lanes, 6, 2, 7, 3)) {
818     ScratchSimd128Scope scratch(asMasm());
819     if (AssemblerX86Shared::HasAVX()) {
820       FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
821       vunpckhps(lhs, rhsCopy, out);
822     } else {
823       loadAlignedSimd128Float(rhs, scratch);
824       vunpckhps(lhs, scratch, scratch);
825       moveSimd128Float(scratch, out);
826     }
827     return;
828   }
829 
830   // In one vshufps
831   if (x < 4 && y < 4) {
832     mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
833     vshufps(mask, rhs, lhs, out);
834     return;
835   }
836 
837   // At creation, we should have explicitly swapped in this case.
838   MOZ_ASSERT(!(z >= 4 && w >= 4));
839 
840   // In two vshufps, for the most generic case:
841   uint32_t firstMask[4], secondMask[4];
842   unsigned i = 0, j = 2, k = 0;
843 
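  // Worked example (not from the original code): for lanes (0, 4, 1, 6) the
  // macro below produces firstMask = (0, 1, 0, 2) and secondMask = (0, 2, 1, 3).
  // The first vshufps then builds T = (L0, L1, R0, R2) in lhs, and the second
  // reorders T into (L0, R0, L1, R2), i.e. lanes (0, 4, 1, 6).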
844 #define COMPUTE_MASK(lane)   \
845   if (lane >= 4) {           \
846     firstMask[j] = lane % 4; \
847     secondMask[k++] = j++;   \
848   } else {                   \
849     firstMask[i] = lane;     \
850     secondMask[k++] = i++;   \
851   }
852 
853   COMPUTE_MASK(x)
854   COMPUTE_MASK(y)
855   COMPUTE_MASK(z)
856   COMPUTE_MASK(w)
857 #undef COMPUTE_MASK
858 
859   MOZ_ASSERT(i == 2 && j == 4 && k == 4);
860 
861   mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
862                                             firstMask[2], firstMask[3]);
863   vshufps(mask, rhs, lhs, lhs);
864 
865   mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
866                                             secondMask[2], secondMask[3]);
867   vshufps(mask, lhs, lhs, lhs);
868 }
869 
870 static inline FloatRegister ToSimdFloatRegister(const Operand& op) {
871   return FloatRegister(op.fpu(), FloatRegister::Codes::ContentType::Simd128);
872 }
873 
874 void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs,
875                                              Assembler::Condition cond,
876                                              FloatRegister output) {
877   static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
878   ScratchSimd128Scope scratch(asMasm());
879   switch (cond) {
880     case Assembler::Condition::GreaterThan:
881       vpcmpgtb(rhs, lhs, output);
882       break;
883     case Assembler::Condition::Equal:
884       vpcmpeqb(rhs, lhs, output);
885       break;
886     case Assembler::Condition::LessThan:
887       // src := rhs
888       if (rhs.kind() == Operand::FPREG) {
889         moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
890       } else {
891         loadAlignedSimd128Int(rhs, scratch);
892       }
893 
894       // src := src > lhs (i.e. lhs < rhs)
895       // Improve by doing custom lowering (rhs is tied to the output register)
896       vpcmpgtb(Operand(lhs), scratch, scratch);
897       moveSimd128Int(scratch, output);
898       break;
899     case Assembler::Condition::NotEqual:
900       // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
901       // should invert the comparison by, e.g. swapping the arms of a select
902       // if that's what it's used in.
903       asMasm().loadConstantSimd128Int(allOnes, scratch);
904       vpcmpeqb(rhs, lhs, output);
905       bitwiseXorSimdInt(output, Operand(scratch), output);
906       break;
907     case Assembler::Condition::GreaterThanOrEqual:
908       // src := rhs
909       if (rhs.kind() == Operand::FPREG) {
910         moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
911       } else {
912         loadAlignedSimd128Int(rhs, scratch);
913       }
914       vpcmpgtb(Operand(lhs), scratch, scratch);
915       asMasm().loadConstantSimd128Int(allOnes, output);
916       bitwiseXorSimdInt(output, Operand(scratch), output);
917       break;
918     case Assembler::Condition::LessThanOrEqual:
919       // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
920       asMasm().loadConstantSimd128Int(allOnes, scratch);
921       vpcmpgtb(rhs, lhs, output);
922       bitwiseXorSimdInt(output, Operand(scratch), output);
923       break;
924     default:
925       MOZ_CRASH("unexpected condition op");
926   }
927 }
928 
929 void MacroAssemblerX86Shared::unsignedCompareInt8x16(
930     FloatRegister lhs, Operand rhs, Assembler::Condition cond,
931     FloatRegister output, FloatRegister tmp1, FloatRegister tmp2) {
932   // We widen the inputs to 16 bits, transforming them to nonnegative values;
933   // then compare them as signed using the logic from compareInt8x16(); then
934   // merge the results (which is surprisingly complicated).  rhs is left
935   // untouched.  The logic is open-coded to streamline it.
936   //
937   // TODO?  Rhs could be in memory (for Ion, anyway), in which case loading it
938   // into scratch first would be better than loading it twice from memory.
939 
940   MOZ_ASSERT(AssemblerX86Shared::HasSSE41());  // PMOVZX, PMOVSX
941   MOZ_ASSERT(lhs == output);
942   MOZ_ASSERT(lhs != tmp1 && lhs != tmp2);
943   MOZ_ASSERT_IF(rhs.kind() == Operand::FPREG,
944                 ToSimdFloatRegister(rhs) != lhs &&
945                     ToSimdFloatRegister(rhs) != tmp1 &&
946                     ToSimdFloatRegister(rhs) != tmp2);
947   MOZ_ASSERT(tmp1 != ScratchSimd128Reg && tmp2 != ScratchSimd128Reg);
948 
949   bool complement = false;
950   switch (cond) {
951     case Assembler::Above:
952     case Assembler::BelowOrEqual:
953       complement = cond == Assembler::BelowOrEqual;
954 
955       // Low eight bytes of inputs widened to words
956       vpmovzxbw(Operand(lhs), tmp1);
957       vpmovzxbw(rhs, tmp2);
958       // Compare leaving 16-bit results
959       vpcmpgtw(Operand(tmp2), tmp1, tmp1);  // lhs < rhs in tmp1
960 
961       // High eight bytes of inputs widened to words
962       vpalignr(rhs, tmp2, 8);
963       vpmovzxbw(Operand(tmp2), tmp2);
964       vpalignr(Operand(lhs), output, 8);
965       vpmovzxbw(Operand(output), output);
966       // Compare leaving 16-bit results
967       vpcmpgtw(Operand(tmp2), output, output);  // lhs < rhs in output
968 
969       break;
970     case Assembler::Below:
971     case Assembler::AboveOrEqual:
972       complement = cond == Assembler::AboveOrEqual;
973 
974       // Same as above but with operands reversed
975 
976       // Low eight bytes of inputs widened to words
977       vpmovzxbw(Operand(lhs), tmp2);
978       vpmovzxbw(rhs, tmp1);
979       // Compare leaving 16-bit results
980       vpcmpgtw(Operand(tmp2), tmp1, tmp1);  // rhs < lhs in tmp1
981 
982       // High eight bytes of inputs widened to words
983       vpalignr(Operand(lhs), tmp2, 8);
984       vpmovzxbw(Operand(tmp2), tmp2);
985       vpalignr(rhs, output, 8);
986       vpmovzxbw(Operand(output), output);
987       // Compare leaving 16-bit results
988       vpcmpgtw(Operand(tmp2), output, output);  // rhs < lhs in output
989 
990       break;
991     default:
992       MOZ_CRASH("Unsupported condition code");
993   }
994 
995   // Merge output (results of high byte compares) and tmp1 (results of low byte
996   // compares) by truncating word results to bytes (to avoid signed saturation),
997   // packing, and then concatenating and shifting.
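  // For example, a word compare result of 0xFFFF becomes 0x00FF after the
  // shift and packs (without saturating) to the byte 0xFF; 0x0000 stays 0x00.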
998   vpsrlw(Imm32(8), tmp1, tmp1);
999   vpackuswb(Operand(tmp1), tmp1, tmp1);
1000   vpsrlw(Imm32(8), output, output);
1001   vpackuswb(Operand(output), output, output);
1002   vpalignr(Operand(tmp1), output, 8);
1003 
1004   // Complement when needed for opposite sense of the operator.
1005   if (complement) {
1006     vpcmpeqd(Operand(tmp1), tmp1, tmp1);
1007     vpxor(Operand(tmp1), output, output);
1008   }
1009 }
1010 
1011 void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs,
1012                                              Assembler::Condition cond,
1013                                              FloatRegister output) {
1014   static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
1015 
1016   ScratchSimd128Scope scratch(asMasm());
1017   switch (cond) {
1018     case Assembler::Condition::GreaterThan:
1019       vpcmpgtw(rhs, lhs, output);
1020       break;
1021     case Assembler::Condition::Equal:
1022       vpcmpeqw(rhs, lhs, output);
1023       break;
1024     case Assembler::Condition::LessThan:
1025       // src := rhs
1026       if (rhs.kind() == Operand::FPREG) {
1027         moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
1028       } else {
1029         loadAlignedSimd128Int(rhs, scratch);
1030       }
1031 
1032       // src := src > lhs (i.e. lhs < rhs)
1033       // Improve by doing custom lowering (rhs is tied to the output register)
1034       vpcmpgtw(Operand(lhs), scratch, scratch);
1035       moveSimd128Int(scratch, output);
1036       break;
1037     case Assembler::Condition::NotEqual:
1038       // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
1039       // should invert the comparison by, e.g. swapping the arms of a select
1040       // if that's what it's used in.
1041       asMasm().loadConstantSimd128Int(allOnes, scratch);
1042       vpcmpeqw(rhs, lhs, output);
1043       bitwiseXorSimdInt(output, Operand(scratch), output);
1044       break;
1045     case Assembler::Condition::GreaterThanOrEqual:
1046       // src := rhs
1047       if (rhs.kind() == Operand::FPREG) {
1048         moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
1049       } else {
1050         loadAlignedSimd128Int(rhs, scratch);
1051       }
1052       vpcmpgtw(Operand(lhs), scratch, scratch);
1053       asMasm().loadConstantSimd128Int(allOnes, output);
1054       bitwiseXorSimdInt(output, Operand(scratch), output);
1055       break;
1056     case Assembler::Condition::LessThanOrEqual:
1057       // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
1058       asMasm().loadConstantSimd128Int(allOnes, scratch);
1059       vpcmpgtw(rhs, lhs, output);
1060       bitwiseXorSimdInt(output, Operand(scratch), output);
1061       break;
1062     default:
1063       MOZ_CRASH("unexpected condition op");
1064   }
1065 }
1066 
1067 void MacroAssemblerX86Shared::unsignedCompareInt16x8(
1068     FloatRegister lhs, Operand rhs, Assembler::Condition cond,
1069     FloatRegister output, FloatRegister tmp1, FloatRegister tmp2) {
1070   // See comments at unsignedCompareInt8x16.
1071 
1072   MOZ_ASSERT(AssemblerX86Shared::HasSSE41());  // PMOVZX, PMOVSX
1073   MOZ_ASSERT(lhs == output);
1074 
1075   bool complement = false;
1076   switch (cond) {
1077     case Assembler::Above:
1078     case Assembler::BelowOrEqual:
1079       complement = cond == Assembler::BelowOrEqual;
1080 
1081       vpmovzxwd(Operand(lhs), tmp1);
1082       vpmovzxwd(rhs, tmp2);
1083       vpcmpgtd(Operand(tmp2), tmp1, tmp1);
1084 
1085       vpalignr(rhs, tmp2, 8);
1086       vpmovzxwd(Operand(tmp2), tmp2);
1087       vpalignr(Operand(lhs), output, 8);
1088       vpmovzxwd(Operand(output), output);
1089       vpcmpgtd(Operand(tmp2), output, output);
1090 
1091       break;
1092     case Assembler::Below:
1093     case Assembler::AboveOrEqual:
1094       complement = cond == Assembler::AboveOrEqual;
1095 
1096       vpmovzxwd(Operand(lhs), tmp2);
1097       vpmovzxwd(rhs, tmp1);
1098       vpcmpgtd(Operand(tmp2), tmp1, tmp1);
1099 
1100       vpalignr(Operand(lhs), tmp2, 8);
1101       vpmovzxwd(Operand(tmp2), tmp2);
1102       vpalignr(rhs, output, 8);
1103       vpmovzxwd(Operand(output), output);
1104       vpcmpgtd(Operand(tmp2), output, output);
1105 
1106       break;
1107     default:
1108       MOZ_CRASH();
1109   }
1110 
1111   vpsrld(Imm32(16), tmp1, tmp1);
1112   vpackusdw(Operand(tmp1), tmp1, tmp1);
1113   vpsrld(Imm32(16), output, output);
1114   vpackusdw(Operand(output), output, output);
1115   vpalignr(Operand(tmp1), output, 8);
1116 
1117   if (complement) {
1118     vpcmpeqd(Operand(tmp1), tmp1, tmp1);
1119     vpxor(Operand(tmp1), output, output);
1120   }
1121 }
1122 
1123 void MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs,
1124                                              Assembler::Condition cond,
1125                                              FloatRegister output) {
1126   static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
1127   ScratchSimd128Scope scratch(asMasm());
1128   switch (cond) {
1129     case Assembler::Condition::GreaterThan:
1130       packedGreaterThanInt32x4(rhs, lhs);
1131       break;
1132     case Assembler::Condition::Equal:
1133       packedEqualInt32x4(rhs, lhs);
1134       break;
1135     case Assembler::Condition::LessThan:
1136       // src := rhs
1137       if (rhs.kind() == Operand::FPREG) {
1138         moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
1139       } else {
1140         loadAlignedSimd128Int(rhs, scratch);
1141       }
1142 
1143       // src := src > lhs (i.e. lhs < rhs)
1144       // Improve by doing custom lowering (rhs is tied to the output register)
1145       packedGreaterThanInt32x4(Operand(lhs), scratch);
1146       moveSimd128Int(scratch, lhs);
1147       break;
1148     case Assembler::Condition::NotEqual:
1149       // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
1150       // should invert the comparison by, e.g. swapping the arms of a select
1151       // if that's what it's used in.
1152       asMasm().loadConstantSimd128Int(allOnes, scratch);
1153       packedEqualInt32x4(rhs, lhs);
1154       bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
1155       break;
1156     case Assembler::Condition::GreaterThanOrEqual:
1157       // src := rhs
1158       if (rhs.kind() == Operand::FPREG) {
1159         moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
1160       } else {
1161         loadAlignedSimd128Int(rhs, scratch);
1162       }
1163       packedGreaterThanInt32x4(Operand(lhs), scratch);
1164       asMasm().loadConstantSimd128Int(allOnes, lhs);
1165       bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
1166       break;
1167     case Assembler::Condition::LessThanOrEqual:
1168       // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
1169       asMasm().loadConstantSimd128Int(allOnes, scratch);
1170       packedGreaterThanInt32x4(rhs, lhs);
1171       bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
1172       break;
1173     default:
1174       MOZ_CRASH("unexpected condition op");
1175   }
1176 }
1177 
1178 void MacroAssemblerX86Shared::unsignedCompareInt32x4(
1179     FloatRegister lhs, Operand rhs, Assembler::Condition cond,
1180     FloatRegister output, FloatRegister tmp1, FloatRegister tmp2) {
1181   // See comments at unsignedCompareInt8x16, the logic is similar.  However we
1182   // only have PCMPGTQ on SSE4.2 or later, so for SSE4.1 we need to use subtract
1183   // to compute the flags.
1184 
1185   MOZ_ASSERT(AssemblerX86Shared::HasSSE41());  // PMOVZX, PMOVSX
1186   MOZ_ASSERT(lhs == output);
1187 
1188   bool complement = false;
1189   switch (cond) {
1190     case Assembler::Below:
1191     case Assembler::AboveOrEqual:
1192       complement = cond == Assembler::AboveOrEqual;
1193 
1194       // The effect of the subtract is that the high doubleword of each quadword
1195       // becomes either 0 (ge) or -1 (lt).
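      // For example (values for this comment only): with unsigned lanes lhs = 1
      // and rhs = 2, zero-extended to 64 bits, 1 - 2 = 0xFFFFFFFFFFFFFFFF and
      // the high doubleword is -1 (lt); with lhs = 3 and rhs = 2 the result is
      // 1 and the high doubleword is 0 (ge).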
1196 
1197       vpmovzxdq(Operand(lhs), tmp1);
1198       vpmovzxdq(rhs, tmp2);
1199       vpsubq(Operand(tmp2), tmp1, tmp1);  // flag1 junk flag0 junk
1200       vpsrlq(Imm32(32), tmp1, tmp1);      // zero flag1 zero flag0
1201       vpshufd(MacroAssembler::ComputeShuffleMask(0, 2, 3, 3), tmp1,
1202               tmp1);  // zero zero flag1 flag0
1203 
1204       vpalignr(rhs, tmp2, 8);
1205       vpmovzxdq(Operand(tmp2), tmp2);
1206       vpalignr(Operand(lhs), output, 8);
1207       vpmovzxdq(Operand(output), output);
1208       vpsubq(Operand(tmp2), output, output);  // flag3 junk flag2 junk
1209       vpsrlq(Imm32(32), output, output);      // zero flag3 zero flag2
1210       vpshufd(MacroAssembler::ComputeShuffleMask(3, 3, 0, 2), output,
1211               output);  // flag3 flag2 zero zero
1212 
1213       vpor(Operand(tmp1), output, output);
1214       break;
1215 
1216     case Assembler::Above:
1217     case Assembler::BelowOrEqual:
1218       complement = cond == Assembler::BelowOrEqual;
1219 
1220       // The effect of the subtract is that the high doubleword of each quadword
1221       // becomes either 0 (le) or -1 (gt).
1222 
1223       vpmovzxdq(Operand(lhs), tmp2);
1224       vpmovzxdq(rhs, tmp1);
1225       vpsubq(Operand(tmp2), tmp1, tmp1);  // flag1 junk flag0 junk
1226       vpsrlq(Imm32(32), tmp1, tmp1);      // zero flag1 zero flag0
1227       vpshufd(MacroAssembler::ComputeShuffleMask(0, 2, 3, 3), tmp1,
1228               tmp1);  // zero zero flag1 flag0
1229 
1230       vpalignr(Operand(lhs), tmp2, 8);
1231       vpmovzxdq(Operand(tmp2), tmp2);
1232       vpalignr(rhs, output, 8);
1233       vpmovzxdq(Operand(output), output);
1234       vpsubq(Operand(tmp2), output, output);  // flag3 junk flag2 junk
1235       vpsrlq(Imm32(32), output, output);      // zero flag3 zero flag2
1236       vpshufd(MacroAssembler::ComputeShuffleMask(3, 3, 0, 2), output,
1237               output);  // flag3 flag2 zero zero
1238 
1239       vpor(Operand(tmp1), output, output);
1240       break;
1241 
1242     default:
1243       MOZ_CRASH();
1244   }
1245 
1246   if (complement) {
1247     vpcmpeqd(Operand(tmp1), tmp1, tmp1);
1248     vpxor(Operand(tmp1), output, output);
1249   }
1250 }
1251 
1252 void MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs,
1253                                                Assembler::Condition cond,
1254                                                FloatRegister output) {
1255   switch (cond) {
1256     case Assembler::Condition::Equal:
1257       vcmpeqps(rhs, lhs, output);
1258       break;
1259     case Assembler::Condition::LessThan:
1260       vcmpltps(rhs, lhs, output);
1261       break;
1262     case Assembler::Condition::LessThanOrEqual:
1263       vcmpleps(rhs, lhs, output);
1264       break;
1265     case Assembler::Condition::NotEqual:
1266       vcmpneqps(rhs, lhs, output);
1267       break;
1268     case Assembler::Condition::GreaterThanOrEqual:
1269     case Assembler::Condition::GreaterThan:
1270       // We reverse these before register allocation so that we don't have to
1271       // copy into and out of temporaries after codegen.
1272       MOZ_CRASH("should have reversed this");
1273     default:
1274       MOZ_CRASH("unexpected condition op");
1275   }
1276 }
1277 
1278 void MacroAssemblerX86Shared::compareFloat64x2(FloatRegister lhs, Operand rhs,
1279                                                Assembler::Condition cond,
1280                                                FloatRegister output) {
1281   switch (cond) {
1282     case Assembler::Condition::Equal:
1283       vcmpeqpd(rhs, lhs, output);
1284       break;
1285     case Assembler::Condition::LessThan:
1286       vcmpltpd(rhs, lhs, output);
1287       break;
1288     case Assembler::Condition::LessThanOrEqual:
1289       vcmplepd(rhs, lhs, output);
1290       break;
1291     case Assembler::Condition::NotEqual:
1292       vcmpneqpd(rhs, lhs, output);
1293       break;
1294     case Assembler::Condition::GreaterThanOrEqual:
1295     case Assembler::Condition::GreaterThan:
1296       // We reverse these before register allocation so that we don't have to
1297       // copy into and out of temporaries after codegen.
1298       MOZ_CRASH("should have reversed this");
1299     default:
1300       MOZ_CRASH("unexpected condition op");
1301   }
1302 }
1303 
1304 void MacroAssemblerX86Shared::mulInt32x4(FloatRegister lhs, Operand rhs,
1305                                          const Maybe<FloatRegister>& temp,
1306                                          FloatRegister output) {
1307   if (AssemblerX86Shared::HasSSE41()) {
1308     vpmulld(rhs, lhs, output);
1309     return;
1310   }
1311 
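  // Without PMULLD, multiply the even and odd lanes separately with PMULUDQ
  // (the low 32 bits of each 64-bit product are the 32-bit products we want)
  // and then shuffle the two partial results back together.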
1312   ScratchSimd128Scope scratch(asMasm());
1313   loadAlignedSimd128Int(rhs, scratch);
1314   vpmuludq(lhs, scratch, scratch);
1315   // scratch contains (Rx, _, Rz, _) where R is the resulting vector.
1316 
1317   MOZ_ASSERT(!!temp);
1318   vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), lhs, lhs);
1319   vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), rhs, *temp);
1320   vpmuludq(*temp, lhs, lhs);
1321   // lhs contains (Ry, _, Rw, _) where R is the resulting vector.
1322 
1323   vshufps(MacroAssembler::ComputeShuffleMask(0, 2, 0, 2), scratch, lhs, lhs);
1324   // lhs contains (Ry, Rw, Rx, Rz)
1325   vshufps(MacroAssembler::ComputeShuffleMask(2, 0, 3, 1), lhs, lhs, lhs);
1326 }
1327 
1328 /* clang-format off */
1329 
1330 // NaN is s111 1111 1qxx ... where the q indicates quiet or
1331 // signaling NaN; q=1 means quiet.
1332 //
1333 // If the values are both zero, the second value is returned from min/max;
1334 // this matters if one of them is a negative zero.
1335 //
1336 // If one value is a NaN then the second value is returned.
1337 //
1338 // Generally for min/max, the sign of zero matters (-0 < 0) and NaN inputs are
1339 // always quiet and we want to propagate NaN.
1340 
1341 // For min:
1342 // - we compute x=min(a,b) and y=min(b,a) and then OR them together
1343 // - if the values are not NaN but not both zero then x==y, the OR
1344 //   matters not
1345 // - if the values are -0 and 0 then we will get -0 from the OR
1346 // - if one of the values is NaN then x or y will be NaN and we will
1347 //   get a NaN from the OR, with some arbitrary sign, and since
1348 //   the input NaN is quiet the output NaN will be quiet.
1349 
1350 // For max:
1351 // - the UNORD comparison will create a mask of ~0 in scratch if at least
1352 //   one value is NaN, otherwise 0
1353 // - we compute x=max(a,b) and y=max(b,a) and then AND them together
1354 // - if the values are not NaN but not both zero then x==y, the
1355 //   AND matters not
1356 // - if the values are -0 and 0 then the AND will create 0
1357 // - if one of the values is NaN then the AND will select the wrong
1358 //   value
1359 // - a final OR of the result with the mask from the UNORD comparison
1360 //   will leave valid results alone and create NaN where the UNORD
1361 //   was true.
1362 
1363 /* clang-format on */
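
// A quick lane-level sketch of the above (not emitted code), using the
// x=min(a,b) / y=min(b,a) notation from the comment:
//
//   min(-0, +0): x = +0, y = -0, and x|y has the sign bit set, i.e. -0,
//                which is the required result.
//   min(NaN, 1): x = 1, y = NaN; OR-ing the NaN's bits with anything can only
//                set bits, so the result is still a quiet NaN.
//   max(-0, +0): x = +0, y = -0, and x&y = +0, as required; no lane is
//                unordered, so the final OR with the UNORD mask is a no-op.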

void MacroAssemblerX86Shared::minFloat32x4(FloatRegister lhs, Operand rhs,
                                           FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
  vminps(Operand(lhs), rhsCopy, scratch);
  vminps(rhs, lhs, output);
  vorps(scratch, output, output);  // NaN or'd with arbitrary bits is NaN
}

void MacroAssemblerX86Shared::maxFloat32x4(FloatRegister lhs, Operand rhs,
                                           FloatRegister temp,
                                           FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, scratch);
  vcmpunordps(rhs, lhsCopy, scratch);

  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, temp);
  vmaxps(Operand(lhs), rhsCopy, temp);
  vmaxps(rhs, lhs, output);

  vandps(temp, output, output);
  vorps(scratch, output, output);  // or in the all-ones NaNs
}

void MacroAssemblerX86Shared::minFloat64x2(FloatRegister lhs, Operand rhs,
                                           FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
  vminpd(Operand(lhs), rhsCopy, scratch);
  vminpd(rhs, lhs, output);
  vorpd(scratch, output, output);  // NaN or'd with arbitrary bits is NaN
}

void MacroAssemblerX86Shared::maxFloat64x2(FloatRegister lhs, Operand rhs,
                                           FloatRegister temp,
                                           FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, scratch);
  vcmpunordpd(rhs, lhsCopy, scratch);

  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, temp);
  vmaxpd(Operand(lhs), rhsCopy, temp);
  vmaxpd(rhs, lhs, output);

  vandpd(temp, output, output);
  vorpd(scratch, output, output);  // or in the all-ones NaNs
}

void MacroAssemblerX86Shared::minNumFloat32x4(FloatRegister lhs, Operand rhs,
                                              FloatRegister temp,
                                              FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)),
                                  temp);

  FloatRegister mask = scratch;
  FloatRegister tmpCopy = reusedInputSimd128Float(temp, scratch);
  vpcmpeqd(Operand(lhs), tmpCopy, mask);
  vandps(temp, mask, mask);

  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
  vminps(rhs, lhsCopy, temp);
  vorps(mask, temp, temp);

  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, mask);
  vcmpneqps(rhs, rhsCopy, mask);

  if (AssemblerX86Shared::HasAVX()) {
    vblendvps(mask, lhs, temp, output);
  } else {
    // Emulate vblendvps.
    // With SSE4.1 we could use blendvps; however, it's awkward since
    // it requires the mask to be in xmm0.
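    // The three instructions below compute
    //   output = (lhs & mask) | (temp & ~mask),
    // i.e. select lhs where the mask bits are set and temp where they are
    // clear.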
    if (lhs != output) {
      moveSimd128Float(lhs, output);
    }
    vandps(Operand(mask), output, output);
    vandnps(Operand(temp), mask, mask);
    vorps(Operand(mask), output, output);
  }
}

void MacroAssemblerX86Shared::maxNumFloat32x4(FloatRegister lhs, Operand rhs,
                                              FloatRegister temp,
                                              FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister mask = scratch;

  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(0), mask);
  vpcmpeqd(Operand(lhs), mask, mask);

  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)),
                                  temp);
  vandps(temp, mask, mask);

  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
  vmaxps(rhs, lhsCopy, temp);
  vandnps(Operand(temp), mask, mask);

  // Ensure temp always contains the temporary result
  mask = temp;
  temp = scratch;

  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, mask);
  vcmpneqps(rhs, rhsCopy, mask);

  if (AssemblerX86Shared::HasAVX()) {
    vblendvps(mask, lhs, temp, output);
  } else {
    // Emulate vblendvps.
    // With SSE4.1 we could use blendvps; however, it's awkward since
    // it requires the mask to be in xmm0.
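    // As above, the three instructions below compute
    //   output = (lhs & mask) | (temp & ~mask).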
    if (lhs != output) {
      moveSimd128Float(lhs, output);
    }
    vandps(Operand(mask), output, output);
    vandnps(Operand(temp), mask, mask);
    vorps(Operand(mask), output, output);
  }
}

void MacroAssemblerX86Shared::negFloat32x4(Operand in, FloatRegister out) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister result = out;
  if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
    result = scratch;
  }
  // All zeros but the sign bit
  static const SimdConstant minusZero = SimdConstant::SplatX4(-0.f);
  asMasm().loadConstantSimd128Float(minusZero, result);
  bitwiseXorFloat32x4(result, in, result);
  if (result == scratch) {
    moveSimd128Float(result, out);
  }
}

void MacroAssemblerX86Shared::negFloat64x2(Operand in, FloatRegister out) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister result = out;
  if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
    result = scratch;
  }
  // All zeros but the sign bit
  static const SimdConstant minusZero = SimdConstant::SplatX2(-0.0);
  asMasm().loadConstantSimd128Float(minusZero, result);
  vxorpd(ToSimdFloatRegister(in), result, result);
  if (result == scratch) {
    moveSimd128Float(result, out);
  }
}

void MacroAssemblerX86Shared::notInt8x16(Operand in, FloatRegister out) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister result = out;
  if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
    result = scratch;
  }
  static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
  asMasm().loadConstantSimd128Int(allOnes, result);
  bitwiseXorSimdInt(result, in, result);
  if (result == scratch) {
    moveSimd128Float(result, out);
  }
}

void MacroAssemblerX86Shared::notInt16x8(Operand in, FloatRegister out) {
  // Bug, really
  MOZ_ASSERT_IF(in.kind() == Operand::FPREG, in.fpu() != out.encoding());
  static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
  asMasm().loadConstantSimd128Int(allOnes, out);
  bitwiseXorSimdInt(out, in, out);
}

void MacroAssemblerX86Shared::notInt32x4(Operand in, FloatRegister out) {
  // Bug, really
  MOZ_ASSERT_IF(in.kind() == Operand::FPREG, in.fpu() != out.encoding());
  static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
  asMasm().loadConstantSimd128Int(allOnes, out);
  bitwiseXorSimdInt(out, in, out);
}

void MacroAssemblerX86Shared::notFloat32x4(Operand in, FloatRegister out) {
  // Bug, really
  MOZ_ASSERT_IF(in.kind() == Operand::FPREG, in.fpu() != out.encoding());
  float ones = SpecificNaN<float>(1, FloatingPoint<float>::kSignificandBits);
  static const SimdConstant allOnes = SimdConstant::SplatX4(ones);
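  // (That SpecificNaN value is just the all-ones bit pattern 0xFFFFFFFF, so
  // the XOR below flips every bit of each lane.)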
  asMasm().loadConstantSimd128Float(allOnes, out);
  bitwiseXorFloat32x4(out, in, out);
}

void MacroAssemblerX86Shared::absFloat32x4(Operand in, FloatRegister out) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister result = out;
  if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
    result = scratch;
  }
  // All ones but the sign bit
  float signMask =
      SpecificNaN<float>(0, FloatingPoint<float>::kSignificandBits);
  static const SimdConstant signMasks = SimdConstant::SplatX4(signMask);
  asMasm().loadConstantSimd128Float(signMasks, result);
  bitwiseAndFloat32x4(result, in, result);
  if (result == scratch) {
    moveSimd128Float(result, out);
  }
}

void MacroAssemblerX86Shared::absFloat64x2(Operand in, FloatRegister out) {
  ScratchSimd128Scope scratch(asMasm());
  FloatRegister result = out;
  if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
    result = scratch;
  }
  // All ones but the sign bit
  double signMask =
      SpecificNaN<double>(0, FloatingPoint<double>::kSignificandBits);
  static const SimdConstant signMasks = SimdConstant::SplatX2(signMask);
  asMasm().loadConstantSimd128Float(signMasks, result);
  vandpd(ToSimdFloatRegister(in), result, result);
  if (result == scratch) {
    moveSimd128Float(result, out);
  }
}

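// Mask the scalar shift count to the lane width and move it into the low
// quadword of `dest`, which is where the variable-count packed shift
// instructions (vpsllw, vpsrld, vpsllq, ...) take their count from.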
static inline void MaskSimdShiftCount(MacroAssembler& masm, unsigned shiftmask,
                                      Register count, Register temp,
                                      FloatRegister dest) {
  masm.mov(count, temp);
  masm.andl(Imm32(shiftmask), temp);
  masm.vmovd(temp, dest);
}

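// x86 has no packed 8-bit shifts.  Roughly: sign- or zero-extend the high and
// low eight bytes into 16-bit lanes, shift those with the given 16-bit shift,
// mask each lane down to its low byte so the unsigned pack cannot saturate,
// and pack the two halves back into bytes.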
void MacroAssemblerX86Shared::packedShiftByScalarInt8x16(
    FloatRegister in, Register count, Register temp, FloatRegister xtmp,
    FloatRegister dest,
    void (MacroAssemblerX86Shared::*shift)(FloatRegister, FloatRegister,
                                           FloatRegister),
    void (MacroAssemblerX86Shared::*extend)(const Operand&, FloatRegister)) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 7, count, temp, scratch);

  // High bytes
  vpalignr(Operand(in), xtmp, 8);
  (this->*extend)(Operand(xtmp), xtmp);
  (this->*shift)(scratch, xtmp, xtmp);

  // Low bytes
  (this->*extend)(Operand(dest), dest);
  (this->*shift)(scratch, dest, dest);

  // Mask off garbage to avoid saturation during packing
  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x00FF00FF)),
                                  scratch);
  vpand(Operand(scratch), xtmp, xtmp);
  vpand(Operand(scratch), dest, dest);

  vpackuswb(Operand(xtmp), dest, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
    FloatRegister in, Register count, Register temp, FloatRegister xtmp,
    FloatRegister dest) {
  packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
                             &MacroAssemblerX86Shared::vpsllw,
                             &MacroAssemblerX86Shared::vpmovzxbw);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
    Imm32 count, FloatRegister src, FloatRegister dest) {
  MOZ_ASSERT(count.value <= 7);
  if (src != dest) {
    asMasm().moveSimd128(src, dest);
  }
  // Use the doubling trick for low shift counts, otherwise mask off the bits
  // that are shifted out of the low byte of each word and use word shifts.  The
  // optimal cutoff remains to be explored.
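  // For example, a constant shift by 2 becomes two addInt8x16(dest, dest)
  // additions, since adding a lane to itself shifts it left by one.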
  if (count.value <= 3) {
    for (int32_t shift = count.value; shift > 0; --shift) {
      asMasm().addInt8x16(dest, dest);
    }
  } else {
    ScratchSimd128Scope scratch(asMasm());
    // Whether SplatX8 or SplatX16 is best here probably depends on the
    // constant.
    asMasm().loadConstantSimd128Int(SimdConstant::SplatX16(0xFF >> count.value),
                                    scratch);
    vpand(Operand(scratch), dest, dest);
    vpsllw(count, dest, dest);
  }
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
    FloatRegister in, Register count, Register temp, FloatRegister xtmp,
    FloatRegister dest) {
  packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
                             &MacroAssemblerX86Shared::vpsraw,
                             &MacroAssemblerX86Shared::vpmovsxbw);
}

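// There is no packed 8-bit arithmetic right shift either.  The constant-count
// variant below shifts each 16-bit lane twice: by `count` to produce correct
// high bytes, and by `count + 8` on a byte-shifted copy (low bytes moved into
// the high-byte position) to produce sign-correct low bytes; the two results
// are then merged under a 0xFF00 mask.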
void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
    Imm32 count, FloatRegister src, FloatRegister temp, FloatRegister dest) {
  MOZ_ASSERT(count.value <= 7);
  ScratchSimd128Scope scratch(asMasm());

  asMasm().moveSimd128(src, scratch);
  vpslldq(Imm32(1), scratch, scratch);               // Low bytes -> high bytes
  vpsraw(Imm32(count.value + 8), scratch, scratch);  // Shift low bytes
  vpsraw(count, dest, dest);                         // Shift high bytes
  asMasm().loadConstantSimd128Int(SimdConstant::SplatX8(0xFF00), temp);
  bitwiseAndSimdInt(dest, Operand(temp), dest);        // Keep high bytes
  bitwiseAndNotSimdInt(temp, Operand(scratch), temp);  // Keep low bytes
  bitwiseOrSimdInt(dest, Operand(temp), dest);         // Combine
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
    FloatRegister in, Register count, Register temp, FloatRegister xtmp,
    FloatRegister dest) {
  packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
                             &MacroAssemblerX86Shared::vpsrlw,
                             &MacroAssemblerX86Shared::vpmovzxbw);
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
    Imm32 count, FloatRegister src, FloatRegister dest) {
  MOZ_ASSERT(count.value <= 7);
  if (src != dest) {
    asMasm().moveSimd128(src, dest);
  }
  ScratchSimd128Scope scratch(asMasm());
  // Whether SplatX8 or SplatX16 is best here probably depends on the constant.
  asMasm().loadConstantSimd128Int(
      SimdConstant::SplatX16((0xFF << count.value) & 0xFF), scratch);
  vpand(Operand(scratch), dest, dest);
  vpsrlw(count, dest, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
  vpsllw(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
  vpsraw(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt16x8(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
  vpsrlw(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt32x4(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
  vpslld(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt32x4(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
  vpsrad(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt32x4(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
  vpsrld(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt64x2(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 63, count, temp, scratch);
  vpsllq(scratch, in, dest);
}

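// There is no packed 64-bit arithmetic right shift below AVX-512 (vpsraq), so
// the signed variant below emulates it: do a logical right shift, compute a
// per-lane all-ones mask for negative lanes with vpcmpgtq, shift that mask
// left by (63 - count), and OR it in to fill the vacated bits with the sign.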
void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
    FloatRegister in, Register count, Register temp1, FloatRegister temp2,
    FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  movl(count, temp1);                   // temp1 is zero-extended shift count
  andl(Imm32(63), temp1);               // temp1 is masked shift count
  vmovd(temp1, scratch);                //   and scratch 64-bit ditto
  vpxor(Operand(temp2), temp2, temp2);  // temp2=0
  vpcmpgtq(Operand(in), temp2, temp2);  // temp2=~0 where `in` negative
  vpsrlq(scratch, in, dest);            // dest shifted, maybe wrong sign
  negl(temp1);                          // temp1 is - masked count
  addl(Imm32(63), temp1);               // temp1 is 63 - masked count
  vmovd(temp1, scratch);                //   and scratch ditto
  vpsllq(scratch, temp2, temp2);        // temp2 has the sign bits
  vpor(Operand(temp2), dest, dest);     // dest has right sign
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt64x2(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 63, count, temp, scratch);
  vpsrlq(scratch, in, dest);
}

void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
                                            FloatRegister onTrue,
                                            FloatRegister onFalse,
                                            FloatRegister temp,
                                            FloatRegister output) {
  if (onTrue != output) {
    vmovaps(onTrue, output);
  }
  if (mask != temp) {
    vmovaps(mask, temp);
  }

  // SSE4.1 has plain blendvps which can do this, but it is awkward
  // to use because it requires the mask to be in xmm0.
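  // The three bitwise ops below compute
  //   output = (onTrue & mask) | (onFalse & ~mask)
  // lane by lane, i.e. a bitwise select.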

  bitwiseAndSimdInt(output, Operand(temp), output);
  bitwiseAndNotSimdInt(temp, Operand(onFalse), temp);
  bitwiseOrSimdInt(output, Operand(temp), output);
}

// Code sequences for int32x4<->float32x4 culled from v8; commentary added.

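// There is no unsigned int32 -> float32 conversion instruction available here,
// so the conversion below splits each lane into its low 16 bits and its
// remaining high bits, converts the two parts exactly, and adds them back
// together; only that final add can round.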
void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4(
    FloatRegister src, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  if (src != dest) {
    vmovaps(src, dest);
  }
  vpxor(Operand(scratch), scratch, scratch);  // extract low bits
  vpblendw(0x55, dest, scratch, scratch);     //   into scratch
  vpsubd(Operand(scratch), dest, dest);       //     and high bits into dest
  vcvtdq2ps(scratch, scratch);                // convert low bits
  vpsrld(Imm32(1), dest, dest);               // get high into unsigned range
  vcvtdq2ps(dest, dest);                      //   convert
  vaddps(Operand(dest), dest, dest);          //     and back into signed
  vaddps(Operand(scratch), dest, dest);       // combine high+low: may round
}

void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
                                                         FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  if (src != dest) {
    vmovaps(src, dest);
  }

  // The cvttps2dq instruction is the workhorse but does not handle NaN or out
  // of range values as we need it to.  We want to saturate too-large positive
  // values to 7FFFFFFFh and too-large negative values to 80000000h.  NaN and -0
  // become 0.

  // Convert NaN to 0 by masking away values that compare unordered to
  // themselves.
  vmovaps(dest, scratch);
  vcmpeqps(Operand(scratch), scratch, scratch);
  vpand(Operand(scratch), dest, dest);

  // Compute the complement of each non-NaN lane's sign bit; we'll need this
  // to correct the result of cvttps2dq.  All other output bits are garbage.
  vpxor(Operand(dest), scratch, scratch);

  // Convert.  This will make the output 80000000h if the input is out of range.
  vcvttps2dq(dest, dest);

  // Preserve the computed complemented sign bit if the output was 80000000h.
  // The sign bit will be 1 precisely for nonnegative values that overflowed.
  vpand(Operand(dest), scratch, scratch);

  // Create a mask from that sign bit.  Now a lane is FFFFFFFFh if there was a
  // positive overflow, and zero otherwise.
  vpsrad(Imm32(31), scratch, scratch);

  // Convert overflow lanes to 0x7FFFFFFF.
  vpxor(Operand(scratch), dest, dest);
}

void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
    FloatRegister src, FloatRegister temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  if (src != dest) {
    vmovaps(src, dest);
  }

  // The cvttps2dq instruction is the workhorse but does not handle NaN or out
  // of range values as we need it to.  We want to saturate too-large positive
  // values to FFFFFFFFh and negative values to zero.  NaN and -0 become 0.

  // Convert NaN and negative values to zeroes in dest.
  vpxor(Operand(scratch), scratch, scratch);
  vmaxps(Operand(scratch), dest, dest);

  // Compute the float value of 0x7FFFFFFF (the largest positive signed integer
  // value) in all lanes in scratch.  We use it to bias the conversion to handle
  // edge cases.
  vpcmpeqd(Operand(scratch), scratch, scratch);
  vpsrld(Imm32(1), scratch, scratch);
  vcvtdq2ps(scratch, scratch);

  // temp = dest - 7FFFFFFFh (as floating); this brings integers that are in
  // the unsigned range but above the signed range into the signed range;
  // 0 => -7FFFFFFFh.
  vmovaps(dest, temp);
  vsubps(Operand(scratch), temp, temp);

  // scratch = mask of biased values that are greater than 7FFFFFFFh.
  vcmpleps(Operand(temp), scratch, scratch);

  // Convert the biased values to integer.  Positive values above 7FFFFFFFh
  // will have been converted to 80000000h; all others become the expected
  // integer.
  vcvttps2dq(temp, temp);

  // As lanes of scratch are ~0 where the result overflows, this computes
  // 7FFFFFFFh in lanes of temp that are 80000000h, and leaves other lanes
  // untouched as the biased integer.
  vpxor(Operand(scratch), temp, temp);

  // Convert negative biased lanes in temp to zero.  After this, temp will be
  // zero where the result should be zero or is less than 80000000h, 7FFFFFFFh
  // where the result overflows, and will have the converted biased result in
  // other lanes (for input values >= 80000000h).
  vpxor(Operand(scratch), scratch, scratch);
  vpmaxsd(Operand(scratch), temp, temp);

  // Convert.  Overflow lanes above 7FFFFFFFh will be 80000000h; other lanes
  // will be what they should be.
  vcvttps2dq(dest, dest);

  // Add temp to the result.  Overflow lanes with 80000000h become FFFFFFFFh,
  // biased high-value unsigned lanes become unbiased, and everything else is
  // left unchanged.
  vpaddd(Operand(temp), dest, dest);
}
