1 #include "Jitter_CodeGen_x86.h"
2
3 using namespace Jitter;
4
5 template <typename MDOP>
Emit_Md_Avx_VarVar(const STATEMENT & statement)6 void CCodeGen_x86::Emit_Md_Avx_VarVar(const STATEMENT& statement)
7 {
8 auto dst = statement.dst->GetSymbol().get();
9 auto src1 = statement.src1->GetSymbol().get();
10
11 auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
12
13 ((m_assembler).*(MDOP::OpVoAvx()))(dstRegister, MakeVariable128SymbolAddress(src1));
14
15 CommitSymbolRegisterMdAvx(dst, dstRegister);
16 }
17
18 template <typename MDOP>
Emit_Md_Avx_VarVarVar(const STATEMENT & statement)19 void CCodeGen_x86::Emit_Md_Avx_VarVarVar(const STATEMENT& statement)
20 {
21 auto dst = statement.dst->GetSymbol().get();
22 auto src1 = statement.src1->GetSymbol().get();
23 auto src2 = statement.src2->GetSymbol().get();
24
25 auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
26 auto src1Register = PrepareSymbolRegisterUseMdAvx(src1, CX86Assembler::xMM1);
27
28 ((m_assembler).*(MDOP::OpVoAvx()))(dstRegister, src1Register, MakeVariable128SymbolAddress(src2));
29
30 CommitSymbolRegisterMdAvx(dst, dstRegister);
31 }
32
33 template <typename MDOP>
Emit_Md_Avx_VarVarVarRev(const STATEMENT & statement)34 void CCodeGen_x86::Emit_Md_Avx_VarVarVarRev(const STATEMENT& statement)
35 {
36 auto dst = statement.dst->GetSymbol().get();
37 auto src1 = statement.src1->GetSymbol().get();
38 auto src2 = statement.src2->GetSymbol().get();
39
40 auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
41 auto src2Register = PrepareSymbolRegisterUseMdAvx(src2, CX86Assembler::xMM1);
42
43 ((m_assembler).*(MDOP::OpVoAvx()))(dstRegister, src2Register, MakeVariable128SymbolAddress(src1));
44
45 CommitSymbolRegisterMdAvx(dst, dstRegister);
46 }
47
48 template <typename MDOPSHIFT, uint8 SAMASK>
Emit_Md_Avx_Shift_VarVarCst(const STATEMENT & statement)49 void CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst(const STATEMENT& statement)
50 {
51 auto dst = statement.dst->GetSymbol().get();
52 auto src1 = statement.src1->GetSymbol().get();
53 auto src2 = statement.src2->GetSymbol().get();
54
55 auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
56 auto src1Register = PrepareSymbolRegisterUseMdAvx(src1, CX86Assembler::xMM1);
57
58 ((m_assembler).*(MDOPSHIFT::OpVoAvx()))(dstRegister, src1Register, static_cast<uint8>(src2->m_valueLow & SAMASK));
59
60 CommitSymbolRegisterMdAvx(dst, dstRegister);
61 }
62
Emit_Md_Avx_Mov_RegVar(const STATEMENT & statement)63 void CCodeGen_x86::Emit_Md_Avx_Mov_RegVar(const STATEMENT& statement)
64 {
65 auto dst = statement.dst->GetSymbol().get();
66 auto src1 = statement.src1->GetSymbol().get();
67
68 m_assembler.VmovapsVo(m_mdRegisters[dst->m_valueLow], MakeVariable128SymbolAddress(src1));
69 }
70
Emit_Md_Avx_Mov_MemReg(const STATEMENT & statement)71 void CCodeGen_x86::Emit_Md_Avx_Mov_MemReg(const STATEMENT& statement)
72 {
73 auto dst = statement.dst->GetSymbol().get();
74 auto src1 = statement.src1->GetSymbol().get();
75
76 m_assembler.VmovapsVo(MakeMemory128SymbolAddress(dst), m_mdRegisters[src1->m_valueLow]);
77 }
78
Emit_Md_Avx_Mov_MemMem(const STATEMENT & statement)79 void CCodeGen_x86::Emit_Md_Avx_Mov_MemMem(const STATEMENT& statement)
80 {
81 auto dst = statement.dst->GetSymbol().get();
82 auto src1 = statement.src1->GetSymbol().get();
83
84 auto tmpRegister = CX86Assembler::xMM0;
85
86 m_assembler.VmovapsVo(tmpRegister, MakeMemory128SymbolAddress(src1));
87 m_assembler.VmovapsVo(MakeMemory128SymbolAddress(dst), tmpRegister);
88 }
89
Emit_Md_Avx_MovMasked_VarVarVar(const STATEMENT & statement)90 void CCodeGen_x86::Emit_Md_Avx_MovMasked_VarVarVar(const STATEMENT& statement)
91 {
92 auto dst = statement.dst->GetSymbol().get();
93 auto src1 = statement.src1->GetSymbol().get();
94 auto src2 = statement.src2->GetSymbol().get();
95 uint8 mask = static_cast<uint8>(statement.jmpCondition);
96
97 auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
98 auto src1Register = PrepareSymbolRegisterUseMdAvx(src1, CX86Assembler::xMM1);
99
100 m_assembler.VblendpsVo(dstRegister, src1Register, MakeVariable128SymbolAddress(src2), mask);
101
102 CommitSymbolRegisterMdAvx(dst, dstRegister);
103 }
104
Emit_Md_Avx_AddSSW_VarVarVar(const STATEMENT & statement)105 void CCodeGen_x86::Emit_Md_Avx_AddSSW_VarVarVar(const STATEMENT& statement)
106 {
107 auto dst = statement.dst->GetSymbol().get();
108 auto src1 = statement.src1->GetSymbol().get();
109 auto src2 = statement.src2->GetSymbol().get();
110
111 auto uxRegister = CX86Assembler::xMM0;
112 auto uyRegister = CX86Assembler::xMM1;
113 auto resRegister = CX86Assembler::xMM2;
114 auto cstRegister = CX86Assembler::xMM3;
115
116 // This is based on code from http://locklessinc.com/articles/sat_arithmetic/ modified to work without cmovns
117 // s32b sat_adds32b(s32b x, s32b y)
118 // {
119 // u32b ux = x;
120 // u32b uy = y;
121 // u32b res = ux + uy;
122 //
123 // /* Calculate overflowed result. (Don't change the sign bit of ux) */
124 // ux = (ux >> 31) + INT_MAX;
125 //
126 // s32b sign = (s32b) ((ux ^ uy) | ~(uy ^ res))
127 // sign >>= 31; /* Arithmetic shift, either 0 or ~0*/
128 // res = (res & sign) | (ux & ~sign);
129 //
130 // return res;
131 // }
132
133 //ux = src1
134 //uy = src2
135 m_assembler.VmovdqaVo(uxRegister, MakeVariable128SymbolAddress(src1));
136 m_assembler.VmovdqaVo(uyRegister, MakeVariable128SymbolAddress(src2));
137
138 //res = ux + uy
139 m_assembler.VpadddVo(resRegister, uxRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
140
141 //cst = 0x7FFFFFFF
142 m_assembler.VpcmpeqdVo(cstRegister, cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
143 m_assembler.VpsrldVo(cstRegister, cstRegister, 1);
144
145 //ux = (ux >> 31)
146 m_assembler.VpsrldVo(uxRegister, uxRegister, 31);
147
148 //ux += 0x7FFFFFFF
149 m_assembler.VpadddVo(uxRegister, uxRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
150
151 //uy = ~(uy ^ res)
152 //------
153 //uy ^ res
154 m_assembler.VpxorVo(uyRegister, uyRegister, CX86Assembler::MakeXmmRegisterAddress(resRegister));
155
156 //~(uy ^ res)
157 m_assembler.VpcmpeqdVo(cstRegister, cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
158 m_assembler.VpxorVo(uyRegister, uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
159
160 //cst = ux ^ uy (reloading uy from src2 because we don't have any registers available)
161 m_assembler.VpxorVo(cstRegister, uxRegister, MakeVariable128SymbolAddress(src2));
162
163 //uy = ((ux ^ uy) | ~(uy ^ res)) >> 31; (signed operation)
164 m_assembler.VporVo(uyRegister, uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
165 m_assembler.VpsradVo(uyRegister, uyRegister, 31);
166
167 //res = (res & uy) (uy is the sign value)
168 m_assembler.VpandVo(resRegister, resRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
169
170 //ux = (ux & ~uy)
171 //------
172 //~uy
173 m_assembler.VpcmpeqdVo(cstRegister, cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
174 m_assembler.VpxorVo(uyRegister, uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
175
176 //ux & ~uy
177 m_assembler.VpandVo(uxRegister, uxRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
178
179 //res = (res & uy) | (ux & ~uy)
180 m_assembler.VporVo(resRegister, resRegister, CX86Assembler::MakeXmmRegisterAddress(uxRegister));
181
182 //Copy final result
183 m_assembler.VmovdqaVo(MakeVariable128SymbolAddress(dst), resRegister);
184 }
185
Emit_Md_Avx_SubSSW_VarVarVar(const STATEMENT & statement)186 void CCodeGen_x86::Emit_Md_Avx_SubSSW_VarVarVar(const STATEMENT& statement)
187 {
188 auto dst = statement.dst->GetSymbol().get();
189 auto src1 = statement.src1->GetSymbol().get();
190 auto src2 = statement.src2->GetSymbol().get();
191
192 auto uxRegister = CX86Assembler::xMM0;
193 auto uyRegister = CX86Assembler::xMM1;
194 auto resRegister = CX86Assembler::xMM2;
195 auto cstRegister = CX86Assembler::xMM3;
196
197 // This is based on code from http://locklessinc.com/articles/sat_arithmetic/ modified to work without cmovns
198 // s32b sat_subs32b(s32b x, s32b y)
199 // {
200 // u32b ux = x;
201 // u32b uy = y;
202 // u32b res = ux - uy;
203 //
204 // ux = (ux >> 31) + INT_MAX;
205 //
206 // s32b sign = (s32b) ((ux ^ uy) & (ux ^ res))
207 // sign >>= 31; /* Arithmetic shift, either 0 or ~0*/
208 // res = (res & ~sign) | (ux & sign);
209 //
210 // return res;
211 // }
212
213 //ux = src1
214 //uy = src2
215 m_assembler.VmovdqaVo(uxRegister, MakeVariable128SymbolAddress(src1));
216
217 //res = ux - uy
218 m_assembler.VpsubdVo(resRegister, uxRegister, MakeVariable128SymbolAddress(src2));
219
220 //cst = 0x7FFFFFFF
221 m_assembler.VpcmpeqdVo(cstRegister, cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
222 m_assembler.VpsrldVo(cstRegister, cstRegister, 1);
223
224 //ux = (ux >> 31)
225 m_assembler.VpsrldVo(uxRegister, uxRegister, 31);
226
227 //ux += 0x7FFFFFFF
228 m_assembler.VpadddVo(uxRegister, uxRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
229
230 //uy = (ux ^ res)
231 //------
232 //ux ^ res
233 m_assembler.VpxorVo(uyRegister, uxRegister, CX86Assembler::MakeXmmRegisterAddress(resRegister));
234
235 //cst = ux ^ uy (reloading uy from src2 because we don't have any registers available)
236 m_assembler.VpxorVo(cstRegister, uxRegister, MakeVariable128SymbolAddress(src2));
237
238 //uy = ((ux ^ uy) & (ux ^ res)) >> 31; (signed operation)
239 m_assembler.VpandVo(uyRegister, uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
240 m_assembler.VpsradVo(uyRegister, uyRegister, 31);
241
242 //ux = (ux & uy) (uy is the sign value)
243 m_assembler.VpandVo(uxRegister, uxRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
244
245 //res = (res & ~uy)
246 //------
247 //~uy
248 m_assembler.VpcmpeqdVo(cstRegister, cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
249 m_assembler.VpxorVo(uyRegister, uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
250
251 //res & ~uy
252 m_assembler.VpandVo(resRegister, resRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
253
254 //res = (res & ~uy) | (ux & uy)
255 m_assembler.VporVo(resRegister, resRegister, CX86Assembler::MakeXmmRegisterAddress(uxRegister));
256
257 //Copy final result
258 m_assembler.VmovdqaVo(MakeVariable128SymbolAddress(dst), resRegister);
259 }
260
Emit_Md_Avx_AddUSW_VarVarVar(const STATEMENT & statement)261 void CCodeGen_x86::Emit_Md_Avx_AddUSW_VarVarVar(const STATEMENT& statement)
262 {
263 auto dst = statement.dst->GetSymbol().get();
264 auto src1 = statement.src1->GetSymbol().get();
265 auto src2 = statement.src2->GetSymbol().get();
266
267 auto xRegister = CX86Assembler::xMM0;
268 auto resRegister = CX86Assembler::xMM1;
269 auto tmpRegister = CX86Assembler::xMM2;
270 auto tmp2Register = CX86Assembler::xMM3;
271
272 // This is based on code from http://locklessinc.com/articles/sat_arithmetic/
273 // u32b sat_addu32b(u32b x, u32b y)
274 // {
275 // u32b res = x + y;
276 // res |= -(res < x);
277 //
278 // return res;
279 // }
280
281 m_assembler.VmovdqaVo(xRegister, MakeVariable128SymbolAddress(src1));
282 m_assembler.VpadddVo(resRegister, xRegister, MakeVariable128SymbolAddress(src2));
283
284 //-(res < x)
285 //PCMPGT will compare two signed integers, but we want unsigned comparison
286 //Thus, we add 0x80000000 to both values to "convert" them to signed
287 m_assembler.VpcmpeqdVo(tmpRegister, tmpRegister, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
288 m_assembler.VpslldVo(tmpRegister, tmpRegister, 31);
289 m_assembler.VpadddVo(tmpRegister, tmpRegister, CX86Assembler::MakeXmmRegisterAddress(resRegister));
290
291 m_assembler.VpcmpeqdVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmp2Register));
292 m_assembler.VpslldVo(tmp2Register, tmp2Register, 31);
293 m_assembler.VpadddVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(xRegister));
294
295 m_assembler.VpcmpgtdVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
296
297 //res |= -(res < x)
298 m_assembler.VporVo(resRegister, resRegister, CX86Assembler::MakeXmmRegisterAddress(tmp2Register));
299
300 //Store result
301 m_assembler.VmovdqaVo(MakeVariable128SymbolAddress(dst), resRegister);
302 }
303
Emit_Md_Avx_SubUSW_VarVarVar(const STATEMENT & statement)304 void CCodeGen_x86::Emit_Md_Avx_SubUSW_VarVarVar(const STATEMENT& statement)
305 {
306 auto dst = statement.dst->GetSymbol().get();
307 auto src1 = statement.src1->GetSymbol().get();
308 auto src2 = statement.src2->GetSymbol().get();
309
310 auto xRegister = CX86Assembler::xMM0;
311 auto resRegister = CX86Assembler::xMM1;
312 auto tmpRegister = CX86Assembler::xMM2;
313 auto tmp2Register = CX86Assembler::xMM3;
314
315 // This is based on code from http://locklessinc.com/articles/sat_arithmetic/
316 // u32b sat_subu32b(u32b x, u32b y)
317 // {
318 // u32b res = x - y;
319 // res &= -(res <= x);
320 //
321 // return res;
322 // }
323
324 m_assembler.VmovdqaVo(xRegister, MakeVariable128SymbolAddress(src1));
325 m_assembler.VpsubdVo(resRegister, xRegister, MakeVariable128SymbolAddress(src2));
326
327 //-(res <= x)
328 //PCMPGT will compare two signed integers, but we want unsigned comparison
329 //Thus, we add 0x80000000 to both values to "convert" them to signed
330 m_assembler.VpcmpeqdVo(tmpRegister, tmpRegister, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
331 m_assembler.VpslldVo(tmpRegister, tmpRegister, 31);
332 m_assembler.VpadddVo(tmpRegister, tmpRegister, CX86Assembler::MakeXmmRegisterAddress(resRegister));
333
334 m_assembler.VpcmpeqdVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmp2Register));
335 m_assembler.VpslldVo(tmp2Register, tmp2Register, 31);
336 m_assembler.VpadddVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(xRegister));
337
338 m_assembler.VpcmpeqdVo(xRegister, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
339 m_assembler.VpcmpgtdVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
340 m_assembler.VporVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(xRegister));
341
342 //res &= -(res <= x);
343 m_assembler.VpandVo(resRegister, resRegister, CX86Assembler::MakeXmmRegisterAddress(tmp2Register));
344
345 //Store result
346 m_assembler.VmovdqaVo(MakeVariable128SymbolAddress(dst), resRegister);
347 }
348
Emit_Md_Avx_PackHB_VarVarVar(const STATEMENT & statement)349 void CCodeGen_x86::Emit_Md_Avx_PackHB_VarVarVar(const STATEMENT& statement)
350 {
351 auto dst = statement.dst->GetSymbol().get();
352 auto src1 = statement.src1->GetSymbol().get();
353 auto src2 = statement.src2->GetSymbol().get();
354
355 auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
356 auto tempRegister = CX86Assembler::xMM1;
357 auto temp2Register = CX86Assembler::xMM2;
358 auto maskRegister = CX86Assembler::xMM3;
359
360 //Generate mask (0x00FF x8)
361 m_assembler.VpcmpeqdVo(maskRegister, maskRegister, CX86Assembler::MakeXmmRegisterAddress(maskRegister));
362 m_assembler.VpsrlwVo(maskRegister, maskRegister, 0x08);
363
364 //Mask both operands
365 m_assembler.VpandVo(temp2Register, maskRegister, MakeVariable128SymbolAddress(src2));
366 m_assembler.VpandVo(tempRegister, maskRegister, MakeVariable128SymbolAddress(src1));
367
368 //Pack
369 m_assembler.VpackuswbVo(dstRegister, temp2Register, CX86Assembler::MakeXmmRegisterAddress(tempRegister));
370
371 CommitSymbolRegisterMdAvx(dst, dstRegister);
372 }
373
Emit_Md_Avx_PackWH_VarVarVar(const STATEMENT & statement)374 void CCodeGen_x86::Emit_Md_Avx_PackWH_VarVarVar(const STATEMENT& statement)
375 {
376 auto dst = statement.dst->GetSymbol().get();
377 auto src1 = statement.src1->GetSymbol().get();
378 auto src2 = statement.src2->GetSymbol().get();
379
380 auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
381 auto resultRegister = CX86Assembler::xMM1;
382 auto tempRegister = CX86Assembler::xMM2;
383
384 m_assembler.VmovapsVo(resultRegister, MakeVariable128SymbolAddress(src2));
385 m_assembler.VmovapsVo(tempRegister, MakeVariable128SymbolAddress(src1));
386
387 //Sign extend the lower half word of our registers
388 m_assembler.VpslldVo(resultRegister, resultRegister, 0x10);
389 m_assembler.VpsradVo(resultRegister, resultRegister, 0x10);
390
391 m_assembler.VpslldVo(tempRegister, tempRegister, 0x10);
392 m_assembler.VpsradVo(tempRegister, tempRegister, 0x10);
393
394 //Pack
395 m_assembler.VpackssdwVo(dstRegister, resultRegister, CX86Assembler::MakeXmmRegisterAddress(tempRegister));
396
397 CommitSymbolRegisterMdAvx(dst, dstRegister);
398 }
399
Emit_Md_Avx_Not_VarVar(const STATEMENT & statement)400 void CCodeGen_x86::Emit_Md_Avx_Not_VarVar(const STATEMENT& statement)
401 {
402 auto dst = statement.dst->GetSymbol().get();
403 auto src1 = statement.src1->GetSymbol().get();
404
405 auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
406 auto cstRegister = CX86Assembler::xMM1;
407
408 assert(dstRegister != cstRegister);
409
410 m_assembler.VpcmpeqdVo(cstRegister, cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
411 m_assembler.VpxorVo(dstRegister, cstRegister, MakeVariable128SymbolAddress(src1));
412
413 CommitSymbolRegisterMdAvx(dst, dstRegister);
414 }
415
Emit_Md_Avx_Abs_VarVar(const STATEMENT & statement)416 void CCodeGen_x86::Emit_Md_Avx_Abs_VarVar(const STATEMENT& statement)
417 {
418 auto dst = statement.dst->GetSymbol().get();
419 auto src1 = statement.src1->GetSymbol().get();
420
421 auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
422 auto maskRegister = CX86Assembler::xMM1;
423
424 assert(dstRegister != maskRegister);
425
426 m_assembler.VpcmpeqdVo(maskRegister, maskRegister, CX86Assembler::MakeXmmRegisterAddress(maskRegister));
427 m_assembler.VpsrldVo(maskRegister, maskRegister, 1);
428 m_assembler.VpandVo(dstRegister, maskRegister, MakeVariable128SymbolAddress(src1));
429
430 CommitSymbolRegisterMdAvx(dst, dstRegister);
431 }
432
Emit_Md_Avx_MakeSz_VarVar(const STATEMENT & statement)433 void CCodeGen_x86::Emit_Md_Avx_MakeSz_VarVar(const STATEMENT& statement)
434 {
435 auto dst = statement.dst->GetSymbol().get();
436 auto src1 = statement.src1->GetSymbol().get();
437
438 auto dstRegister = PrepareSymbolRegisterDef(dst, CX86Assembler::rDX);
439 auto src1Register = PrepareSymbolRegisterUseMdAvx(src1, CX86Assembler::xMM0);
440 auto szRegister = CX86Assembler::xMM1;
441 auto zeroRegister = CX86Assembler::xMM2;
442
443 //Compute sign
444 m_assembler.VpsradVo(szRegister, src1Register, 31);
445
446 //Compute zero
447 m_assembler.VpxorVo(zeroRegister, zeroRegister, CX86Assembler::MakeXmmRegisterAddress(zeroRegister));
448 m_assembler.VcmppsVo(zeroRegister, zeroRegister, CX86Assembler::MakeXmmRegisterAddress(src1Register), CX86Assembler::SSE_CMP_EQ);
449
450 //Pack
451 m_assembler.VpackssdwVo(szRegister, szRegister, CX86Assembler::MakeXmmRegisterAddress(zeroRegister));
452
453 //Extract bits
454 m_assembler.VpshufbVo(szRegister, szRegister, MakeConstant128Address(g_makeSzShufflePattern));
455 m_assembler.VpmovmskbVo(dstRegister, szRegister);
456
457 CommitSymbolRegister(dst, dstRegister);
458 }
459
Emit_Md_Avx_Expand_VarVar(const STATEMENT & statement)460 void CCodeGen_x86::Emit_Md_Avx_Expand_VarVar(const STATEMENT& statement)
461 {
462 auto dst = statement.dst->GetSymbol().get();
463 auto src1 = statement.src1->GetSymbol().get();
464
465 auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
466
467 m_assembler.VmovdVo(dstRegister, MakeVariableSymbolAddress(src1));
468 m_assembler.VshufpsVo(dstRegister, dstRegister, CX86Assembler::MakeXmmRegisterAddress(dstRegister), 0x00);
469
470 CommitSymbolRegisterMdAvx(dst, dstRegister);
471 }
472
Emit_Md_Avx_Expand_VarCst(const STATEMENT & statement)473 void CCodeGen_x86::Emit_Md_Avx_Expand_VarCst(const STATEMENT& statement)
474 {
475 auto dst = statement.dst->GetSymbol().get();
476 auto src1 = statement.src1->GetSymbol().get();
477
478 auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
479 auto cstRegister = CX86Assembler::rAX;
480
481 m_assembler.MovId(cstRegister, src1->m_valueLow);
482 m_assembler.VmovdVo(dstRegister, CX86Assembler::MakeRegisterAddress(cstRegister));
483 m_assembler.VshufpsVo(dstRegister, dstRegister, CX86Assembler::MakeXmmRegisterAddress(dstRegister), 0x00);
484
485 CommitSymbolRegisterMdAvx(dst, dstRegister);
486 }
487
Emit_Avx_MergeTo256_MemVarVar(const STATEMENT & statement)488 void CCodeGen_x86::Emit_Avx_MergeTo256_MemVarVar(const STATEMENT& statement)
489 {
490 auto dst = statement.dst->GetSymbol().get();
491 auto src1 = statement.src1->GetSymbol().get();
492 auto src2 = statement.src2->GetSymbol().get();
493
494 assert(dst->m_type == SYM_TEMPORARY256);
495
496 auto src1Register = CX86Assembler::xMM0;
497 auto src2Register = CX86Assembler::xMM1;
498
499 //TODO: Improve this to write out registers directly to temporary's memory space
500 //instead of passing by temporary registers
501
502 m_assembler.VmovdqaVo(src1Register, MakeVariable128SymbolAddress(src1));
503 m_assembler.VmovdqaVo(src2Register, MakeVariable128SymbolAddress(src2));
504
505 m_assembler.VmovdqaVo(MakeTemporary256SymbolElementAddress(dst, 0x00), src1Register);
506 m_assembler.VmovdqaVo(MakeTemporary256SymbolElementAddress(dst, 0x10), src2Register);
507 }
508
Emit_Md_Avx_Srl256_VarMemVar(const STATEMENT & statement)509 void CCodeGen_x86::Emit_Md_Avx_Srl256_VarMemVar(const STATEMENT& statement)
510 {
511 auto dst = statement.dst->GetSymbol().get();
512 auto src1 = statement.src1->GetSymbol().get();
513 auto src2 = statement.src2->GetSymbol().get();
514
515 auto offsetRegister = CX86Assembler::rAX;
516 auto resultRegister = CX86Assembler::xMM0;
517
518 assert(src1->m_type == SYM_TEMPORARY256);
519
520 m_assembler.MovEd(offsetRegister, MakeVariableSymbolAddress(src2));
521 m_assembler.AndId(CX86Assembler::MakeRegisterAddress(offsetRegister), 0x7F);
522 m_assembler.ShrEd(CX86Assembler::MakeRegisterAddress(offsetRegister), 3);
523 m_assembler.AddId(CX86Assembler::MakeRegisterAddress(offsetRegister), src1->m_stackLocation + m_stackLevel);
524
525 m_assembler.VmovdquVo(resultRegister, CX86Assembler::MakeBaseIndexScaleAddress(CX86Assembler::rSP, offsetRegister, 1));
526 m_assembler.VmovdqaVo(MakeVariable128SymbolAddress(dst), resultRegister);
527 }
528
Emit_Md_Avx_Srl256_VarMemCst(const STATEMENT & statement)529 void CCodeGen_x86::Emit_Md_Avx_Srl256_VarMemCst(const STATEMENT& statement)
530 {
531 auto dst = statement.dst->GetSymbol().get();
532 auto src1 = statement.src1->GetSymbol().get();
533 auto src2 = statement.src2->GetSymbol().get();
534
535 auto resultRegister = CX86Assembler::xMM0;
536
537 assert(src1->m_type == SYM_TEMPORARY256);
538 assert(src2->m_type == SYM_CONSTANT);
539
540 uint32 offset = (src2->m_valueLow & 0x7F) / 8;
541
542 m_assembler.VmovdquVo(resultRegister, MakeTemporary256SymbolElementAddress(src1, offset));
543 m_assembler.VmovdqaVo(MakeVariable128SymbolAddress(dst), resultRegister);
544 }
545
546 CCodeGen_x86::CONSTMATCHER CCodeGen_x86::g_mdAvxConstMatchers[] =
547 {
548 { OP_MD_ADD_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDB> },
549 { OP_MD_ADD_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDH> },
550 { OP_MD_ADD_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDW> },
551
552 { OP_MD_ADDSS_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDSSB> },
553 { OP_MD_ADDSS_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDSSH> },
554 { OP_MD_ADDSS_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_AddSSW_VarVarVar },
555
556 { OP_MD_ADDUS_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDUSB> },
557 { OP_MD_ADDUS_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDUSH> },
558 { OP_MD_ADDUS_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_AddUSW_VarVarVar },
559
560 { OP_MD_SUB_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBB> },
561 { OP_MD_SUB_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBH> },
562 { OP_MD_SUB_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBW> },
563
564 { OP_MD_SUBSS_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBSSH> },
565 { OP_MD_SUBSS_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_SubSSW_VarVarVar },
566
567 { OP_MD_SUBUS_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBUSB> },
568 { OP_MD_SUBUS_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBUSH> },
569 { OP_MD_SUBUS_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_SubUSW_VarVarVar },
570
571 { OP_MD_CMPEQ_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPEQB> },
572 { OP_MD_CMPEQ_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPEQH> },
573 { OP_MD_CMPEQ_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPEQW> },
574
575 { OP_MD_CMPGT_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPGTB> },
576 { OP_MD_CMPGT_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPGTH> },
577 { OP_MD_CMPGT_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPGTW> },
578
579 { OP_MD_MIN_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MINH> },
580 { OP_MD_MIN_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MINW> },
581
582 { OP_MD_MAX_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MAXH> },
583 { OP_MD_MAX_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MAXW> },
584
585 { OP_MD_AND, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_AND> },
586 { OP_MD_OR, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_OR> },
587 { OP_MD_XOR, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_XOR> },
588
589 { OP_MD_NOT, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Not_VarVar },
590
591 { OP_MD_SRLH, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst<MDOP_SRLH, 0x0F> },
592 { OP_MD_SRAH, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst<MDOP_SRAH, 0x0F> },
593 { OP_MD_SLLH, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst<MDOP_SLLH, 0x0F> },
594
595 { OP_MD_SRLW, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst<MDOP_SRLW, 0x1F> },
596 { OP_MD_SRAW, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst<MDOP_SRAW, 0x1F> },
597 { OP_MD_SLLW, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst<MDOP_SLLW, 0x1F> },
598
599 { OP_MD_UNPACK_LOWER_BH, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVarRev<MDOP_UNPACK_LOWER_BH> },
600 { OP_MD_UNPACK_LOWER_HW, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVarRev<MDOP_UNPACK_LOWER_HW> },
601 { OP_MD_UNPACK_LOWER_WD, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVarRev<MDOP_UNPACK_LOWER_WD> },
602
603 { OP_MD_UNPACK_UPPER_BH, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVarRev<MDOP_UNPACK_UPPER_BH> },
604 { OP_MD_UNPACK_UPPER_HW, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVarRev<MDOP_UNPACK_UPPER_HW> },
605 { OP_MD_UNPACK_UPPER_WD, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVarRev<MDOP_UNPACK_UPPER_WD> },
606
607 { OP_MD_ADD_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDS> },
608 { OP_MD_SUB_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBS> },
609 { OP_MD_MUL_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MULS> },
610 { OP_MD_DIV_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_DIVS> },
611
612 { OP_MD_ABS_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Abs_VarVar },
613
614 { OP_MD_CMPLT_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPLTS> },
615 { OP_MD_CMPGT_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPGTS> },
616
617 { OP_MD_MIN_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MINS> },
618 { OP_MD_MAX_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MAXS> },
619
620 { OP_MD_TOWORD_TRUNCATE, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVar<MDOP_TOWORD_TRUNCATE> },
621 { OP_MD_TOSINGLE, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVar<MDOP_TOSINGLE> },
622
623 { OP_MD_EXPAND, MATCH_VARIABLE128, MATCH_VARIABLE, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Expand_VarVar },
624 { OP_MD_EXPAND, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Expand_VarCst },
625
626 { OP_MD_PACK_HB, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_PackHB_VarVarVar, },
627 { OP_MD_PACK_WH, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_PackWH_VarVarVar, },
628
629 { OP_MD_MAKESZ, MATCH_VARIABLE, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_MakeSz_VarVar },
630
631 { OP_MOV, MATCH_REGISTER128, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Mov_RegVar, },
632 { OP_MOV, MATCH_MEMORY128, MATCH_REGISTER128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Mov_MemReg, },
633 { OP_MOV, MATCH_MEMORY128, MATCH_MEMORY128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Mov_MemMem, },
634
635 { OP_MD_MOV_MASKED, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_MovMasked_VarVarVar },
636
637 { OP_MERGETO256, MATCH_MEMORY256, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Avx_MergeTo256_MemVarVar },
638 { OP_MD_SRL256, MATCH_VARIABLE128, MATCH_MEMORY256, MATCH_VARIABLE, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Srl256_VarMemVar },
639 { OP_MD_SRL256, MATCH_VARIABLE128, MATCH_MEMORY256, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Srl256_VarMemCst },
640
641 { OP_MOV, MATCH_NIL, MATCH_NIL, MATCH_NIL, MATCH_NIL, nullptr },
642 };
643