1 #include "Jitter_CodeGen_x86.h"
2 
3 using namespace Jitter;
4 
5 template <typename MDOP>
Emit_Md_Avx_VarVar(const STATEMENT & statement)6 void CCodeGen_x86::Emit_Md_Avx_VarVar(const STATEMENT& statement)
7 {
8 	auto dst = statement.dst->GetSymbol().get();
9 	auto src1 = statement.src1->GetSymbol().get();
10 
11 	auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
12 
13 	((m_assembler).*(MDOP::OpVoAvx()))(dstRegister, MakeVariable128SymbolAddress(src1));
14 
15 	CommitSymbolRegisterMdAvx(dst, dstRegister);
16 }
17 
18 template <typename MDOP>
Emit_Md_Avx_VarVarVar(const STATEMENT & statement)19 void CCodeGen_x86::Emit_Md_Avx_VarVarVar(const STATEMENT& statement)
20 {
21 	auto dst = statement.dst->GetSymbol().get();
22 	auto src1 = statement.src1->GetSymbol().get();
23 	auto src2 = statement.src2->GetSymbol().get();
24 
25 	auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
26 	auto src1Register = PrepareSymbolRegisterUseMdAvx(src1, CX86Assembler::xMM1);
27 
28 	((m_assembler).*(MDOP::OpVoAvx()))(dstRegister, src1Register, MakeVariable128SymbolAddress(src2));
29 
30 	CommitSymbolRegisterMdAvx(dst, dstRegister);
31 }
32 
33 template <typename MDOP>
Emit_Md_Avx_VarVarVarRev(const STATEMENT & statement)34 void CCodeGen_x86::Emit_Md_Avx_VarVarVarRev(const STATEMENT& statement)
35 {
36 	auto dst = statement.dst->GetSymbol().get();
37 	auto src1 = statement.src1->GetSymbol().get();
38 	auto src2 = statement.src2->GetSymbol().get();
39 
40 	auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
41 	auto src2Register = PrepareSymbolRegisterUseMdAvx(src2, CX86Assembler::xMM1);
42 
43 	((m_assembler).*(MDOP::OpVoAvx()))(dstRegister, src2Register, MakeVariable128SymbolAddress(src1));
44 
45 	CommitSymbolRegisterMdAvx(dst, dstRegister);
46 }
47 
48 template <typename MDOPSHIFT, uint8 SAMASK>
Emit_Md_Avx_Shift_VarVarCst(const STATEMENT & statement)49 void CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst(const STATEMENT& statement)
50 {
51 	auto dst = statement.dst->GetSymbol().get();
52 	auto src1 = statement.src1->GetSymbol().get();
53 	auto src2 = statement.src2->GetSymbol().get();
54 
55 	auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
56 	auto src1Register = PrepareSymbolRegisterUseMdAvx(src1, CX86Assembler::xMM1);
57 
58 	((m_assembler).*(MDOPSHIFT::OpVoAvx()))(dstRegister, src1Register, static_cast<uint8>(src2->m_valueLow & SAMASK));
59 
60 	CommitSymbolRegisterMdAvx(dst, dstRegister);
61 }
62 
Emit_Md_Avx_Mov_RegVar(const STATEMENT & statement)63 void CCodeGen_x86::Emit_Md_Avx_Mov_RegVar(const STATEMENT& statement)
64 {
65 	auto dst = statement.dst->GetSymbol().get();
66 	auto src1 = statement.src1->GetSymbol().get();
67 
68 	m_assembler.VmovapsVo(m_mdRegisters[dst->m_valueLow], MakeVariable128SymbolAddress(src1));
69 }
70 
Emit_Md_Avx_Mov_MemReg(const STATEMENT & statement)71 void CCodeGen_x86::Emit_Md_Avx_Mov_MemReg(const STATEMENT& statement)
72 {
73 	auto dst = statement.dst->GetSymbol().get();
74 	auto src1 = statement.src1->GetSymbol().get();
75 
76 	m_assembler.VmovapsVo(MakeMemory128SymbolAddress(dst), m_mdRegisters[src1->m_valueLow]);
77 }
78 
Emit_Md_Avx_Mov_MemMem(const STATEMENT & statement)79 void CCodeGen_x86::Emit_Md_Avx_Mov_MemMem(const STATEMENT& statement)
80 {
81 	auto dst = statement.dst->GetSymbol().get();
82 	auto src1 = statement.src1->GetSymbol().get();
83 
84 	auto tmpRegister = CX86Assembler::xMM0;
85 
86 	m_assembler.VmovapsVo(tmpRegister, MakeMemory128SymbolAddress(src1));
87 	m_assembler.VmovapsVo(MakeMemory128SymbolAddress(dst), tmpRegister);
88 }
89 
Emit_Md_Avx_MovMasked_VarVarVar(const STATEMENT & statement)90 void CCodeGen_x86::Emit_Md_Avx_MovMasked_VarVarVar(const STATEMENT& statement)
91 {
92 	auto dst = statement.dst->GetSymbol().get();
93 	auto src1 = statement.src1->GetSymbol().get();
94 	auto src2 = statement.src2->GetSymbol().get();
95 	uint8 mask = static_cast<uint8>(statement.jmpCondition);
96 
97 	auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
98 	auto src1Register = PrepareSymbolRegisterUseMdAvx(src1, CX86Assembler::xMM1);
99 
100 	m_assembler.VblendpsVo(dstRegister, src1Register, MakeVariable128SymbolAddress(src2), mask);
101 
102 	CommitSymbolRegisterMdAvx(dst, dstRegister);
103 }
104 
Emit_Md_Avx_AddSSW_VarVarVar(const STATEMENT & statement)105 void CCodeGen_x86::Emit_Md_Avx_AddSSW_VarVarVar(const STATEMENT& statement)
106 {
107 	auto dst = statement.dst->GetSymbol().get();
108 	auto src1 = statement.src1->GetSymbol().get();
109 	auto src2 = statement.src2->GetSymbol().get();
110 
111 	auto uxRegister = CX86Assembler::xMM0;
112 	auto uyRegister = CX86Assembler::xMM1;
113 	auto resRegister = CX86Assembler::xMM2;
114 	auto cstRegister = CX86Assembler::xMM3;
115 
116 //	This is based on code from http://locklessinc.com/articles/sat_arithmetic/ modified to work without cmovns
117 //	s32b sat_adds32b(s32b x, s32b y)
118 //	{
119 //		u32b ux = x;
120 //		u32b uy = y;
121 //		u32b res = ux + uy;
122 //
123 //		/* Calculate overflowed result. (Don't change the sign bit of ux) */
124 //		ux = (ux >> 31) + INT_MAX;
125 //
126 //		s32b sign = (s32b) ((ux ^ uy) | ~(uy ^ res))
127 //		sign >>= 31;		/* Arithmetic shift, either 0 or ~0*/
128 //		res = (res & sign) | (ux & ~sign);
129 //
130 //		return res;
131 //	}
132 
133 	//ux = src1
134 	//uy = src2
135 	m_assembler.VmovdqaVo(uxRegister, MakeVariable128SymbolAddress(src1));
136 	m_assembler.VmovdqaVo(uyRegister, MakeVariable128SymbolAddress(src2));
137 
138 	//res = ux + uy
139 	m_assembler.VpadddVo(resRegister, uxRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
140 
141 	//cst = 0x7FFFFFFF
142 	m_assembler.VpcmpeqdVo(cstRegister, cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
143 	m_assembler.VpsrldVo(cstRegister, cstRegister, 1);
144 
145 	//ux = (ux >> 31)
146 	m_assembler.VpsrldVo(uxRegister, uxRegister, 31);
147 
148 	//ux += 0x7FFFFFFF
149 	m_assembler.VpadddVo(uxRegister, uxRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
150 
151 	//uy = ~(uy ^ res)
152 	//------
153 	//uy ^ res
154 	m_assembler.VpxorVo(uyRegister, uyRegister, CX86Assembler::MakeXmmRegisterAddress(resRegister));
155 
156 	//~(uy ^ res)
157 	m_assembler.VpcmpeqdVo(cstRegister, cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
158 	m_assembler.VpxorVo(uyRegister, uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
159 
160 	//cst = ux ^ uy (reloading uy from src2 because we don't have any registers available)
161 	m_assembler.VpxorVo(cstRegister, uxRegister, MakeVariable128SymbolAddress(src2));
162 
163 	//uy = ((ux ^ uy) | ~(uy ^ res)) >> 31; (signed operation)
164 	m_assembler.VporVo(uyRegister, uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
165 	m_assembler.VpsradVo(uyRegister, uyRegister, 31);
166 
167 	//res = (res & uy)	(uy is the sign value)
168 	m_assembler.VpandVo(resRegister, resRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
169 
170 	//ux = (ux & ~uy)
171 	//------
172 	//~uy
173 	m_assembler.VpcmpeqdVo(cstRegister, cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
174 	m_assembler.VpxorVo(uyRegister, uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
175 
176 	//ux & ~uy
177 	m_assembler.VpandVo(uxRegister, uxRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
178 
179 	//res = (res & uy) | (ux & ~uy)
180 	m_assembler.VporVo(resRegister, resRegister, CX86Assembler::MakeXmmRegisterAddress(uxRegister));
181 
182 	//Copy final result
183 	m_assembler.VmovdqaVo(MakeVariable128SymbolAddress(dst), resRegister);
184 }
185 
Emit_Md_Avx_SubSSW_VarVarVar(const STATEMENT & statement)186 void CCodeGen_x86::Emit_Md_Avx_SubSSW_VarVarVar(const STATEMENT& statement)
187 {
188 	auto dst = statement.dst->GetSymbol().get();
189 	auto src1 = statement.src1->GetSymbol().get();
190 	auto src2 = statement.src2->GetSymbol().get();
191 
192 	auto uxRegister = CX86Assembler::xMM0;
193 	auto uyRegister = CX86Assembler::xMM1;
194 	auto resRegister = CX86Assembler::xMM2;
195 	auto cstRegister = CX86Assembler::xMM3;
196 
197 //	This is based on code from http://locklessinc.com/articles/sat_arithmetic/ modified to work without cmovns
198 //	s32b sat_subs32b(s32b x, s32b y)
199 //	{
200 //		u32b ux = x;
201 //		u32b uy = y;
202 //		u32b res = ux - uy;
203 //
204 //		ux = (ux >> 31) + INT_MAX;
205 //
206 //		s32b sign = (s32b) ((ux ^ uy) & (ux ^ res))
207 //		sign >>= 31;		/* Arithmetic shift, either 0 or ~0*/
208 //		res = (res & ~sign) | (ux & sign);
209 //
210 //		return res;
211 //	}
212 
213 	//ux = src1
214 	//uy = src2
215 	m_assembler.VmovdqaVo(uxRegister, MakeVariable128SymbolAddress(src1));
216 
217 	//res = ux - uy
218 	m_assembler.VpsubdVo(resRegister, uxRegister, MakeVariable128SymbolAddress(src2));
219 
220 	//cst = 0x7FFFFFFF
221 	m_assembler.VpcmpeqdVo(cstRegister, cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
222 	m_assembler.VpsrldVo(cstRegister, cstRegister, 1);
223 
224 	//ux = (ux >> 31)
225 	m_assembler.VpsrldVo(uxRegister, uxRegister, 31);
226 
227 	//ux += 0x7FFFFFFF
228 	m_assembler.VpadddVo(uxRegister, uxRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
229 
230 	//uy = (ux ^ res)
231 	//------
232 	//ux ^ res
233 	m_assembler.VpxorVo(uyRegister, uxRegister, CX86Assembler::MakeXmmRegisterAddress(resRegister));
234 
235 	//cst = ux ^ uy (reloading uy from src2 because we don't have any registers available)
236 	m_assembler.VpxorVo(cstRegister, uxRegister, MakeVariable128SymbolAddress(src2));
237 
238 	//uy = ((ux ^ uy) & (ux ^ res)) >> 31; (signed operation)
239 	m_assembler.VpandVo(uyRegister, uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
240 	m_assembler.VpsradVo(uyRegister, uyRegister, 31);
241 
242 	//ux = (ux & uy)	(uy is the sign value)
243 	m_assembler.VpandVo(uxRegister, uxRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
244 
245 	//res = (res & ~uy)
246 	//------
247 	//~uy
248 	m_assembler.VpcmpeqdVo(cstRegister, cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
249 	m_assembler.VpxorVo(uyRegister, uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
250 
251 	//res & ~uy
252 	m_assembler.VpandVo(resRegister, resRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
253 
254 	//res = (res & ~uy) | (ux & uy)
255 	m_assembler.VporVo(resRegister, resRegister, CX86Assembler::MakeXmmRegisterAddress(uxRegister));
256 
257 	//Copy final result
258 	m_assembler.VmovdqaVo(MakeVariable128SymbolAddress(dst), resRegister);
259 }
260 
Emit_Md_Avx_AddUSW_VarVarVar(const STATEMENT & statement)261 void CCodeGen_x86::Emit_Md_Avx_AddUSW_VarVarVar(const STATEMENT& statement)
262 {
263 	auto dst = statement.dst->GetSymbol().get();
264 	auto src1 = statement.src1->GetSymbol().get();
265 	auto src2 = statement.src2->GetSymbol().get();
266 
267 	auto xRegister = CX86Assembler::xMM0;
268 	auto resRegister = CX86Assembler::xMM1;
269 	auto tmpRegister = CX86Assembler::xMM2;
270 	auto tmp2Register = CX86Assembler::xMM3;
271 
272 //	This is based on code from http://locklessinc.com/articles/sat_arithmetic/
273 //	u32b sat_addu32b(u32b x, u32b y)
274 //	{
275 //		u32b res = x + y;
276 //		res |= -(res < x);
277 //
278 //		return res;
279 //	}
280 
281 	m_assembler.VmovdqaVo(xRegister, MakeVariable128SymbolAddress(src1));
282 	m_assembler.VpadddVo(resRegister, xRegister, MakeVariable128SymbolAddress(src2));
283 
284 	//-(res < x)
285 	//PCMPGT will compare two signed integers, but we want unsigned comparison
286 	//Thus, we add 0x80000000 to both values to "convert" them to signed
287 	m_assembler.VpcmpeqdVo(tmpRegister, tmpRegister, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
288 	m_assembler.VpslldVo(tmpRegister, tmpRegister, 31);
289 	m_assembler.VpadddVo(tmpRegister, tmpRegister, CX86Assembler::MakeXmmRegisterAddress(resRegister));
290 
291 	m_assembler.VpcmpeqdVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmp2Register));
292 	m_assembler.VpslldVo(tmp2Register, tmp2Register, 31);
293 	m_assembler.VpadddVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(xRegister));
294 
295 	m_assembler.VpcmpgtdVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
296 
297 	//res |= -(res < x)
298 	m_assembler.VporVo(resRegister, resRegister, CX86Assembler::MakeXmmRegisterAddress(tmp2Register));
299 
300 	//Store result
301 	m_assembler.VmovdqaVo(MakeVariable128SymbolAddress(dst), resRegister);
302 }
303 
Emit_Md_Avx_SubUSW_VarVarVar(const STATEMENT & statement)304 void CCodeGen_x86::Emit_Md_Avx_SubUSW_VarVarVar(const STATEMENT& statement)
305 {
306 	auto dst = statement.dst->GetSymbol().get();
307 	auto src1 = statement.src1->GetSymbol().get();
308 	auto src2 = statement.src2->GetSymbol().get();
309 
310 	auto xRegister = CX86Assembler::xMM0;
311 	auto resRegister = CX86Assembler::xMM1;
312 	auto tmpRegister = CX86Assembler::xMM2;
313 	auto tmp2Register = CX86Assembler::xMM3;
314 
315 //	This is based on code from http://locklessinc.com/articles/sat_arithmetic/
316 //	u32b sat_subu32b(u32b x, u32b y)
317 //	{
318 //		u32b res = x - y;
319 //		res &= -(res <= x);
320 //
321 //		return res;
322 //	}
323 
324 	m_assembler.VmovdqaVo(xRegister, MakeVariable128SymbolAddress(src1));
325 	m_assembler.VpsubdVo(resRegister, xRegister, MakeVariable128SymbolAddress(src2));
326 
327 	//-(res <= x)
328 	//PCMPGT will compare two signed integers, but we want unsigned comparison
329 	//Thus, we add 0x80000000 to both values to "convert" them to signed
330 	m_assembler.VpcmpeqdVo(tmpRegister, tmpRegister, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
331 	m_assembler.VpslldVo(tmpRegister, tmpRegister, 31);
332 	m_assembler.VpadddVo(tmpRegister, tmpRegister, CX86Assembler::MakeXmmRegisterAddress(resRegister));
333 
334 	m_assembler.VpcmpeqdVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmp2Register));
335 	m_assembler.VpslldVo(tmp2Register, tmp2Register, 31);
336 	m_assembler.VpadddVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(xRegister));
337 
338 	m_assembler.VpcmpeqdVo(xRegister, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
339 	m_assembler.VpcmpgtdVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
340 	m_assembler.VporVo(tmp2Register, tmp2Register, CX86Assembler::MakeXmmRegisterAddress(xRegister));
341 
342 	//res &= -(res <= x);
343 	m_assembler.VpandVo(resRegister, resRegister, CX86Assembler::MakeXmmRegisterAddress(tmp2Register));
344 
345 	//Store result
346 	m_assembler.VmovdqaVo(MakeVariable128SymbolAddress(dst), resRegister);
347 }
348 
Emit_Md_Avx_PackHB_VarVarVar(const STATEMENT & statement)349 void CCodeGen_x86::Emit_Md_Avx_PackHB_VarVarVar(const STATEMENT& statement)
350 {
351 	auto dst = statement.dst->GetSymbol().get();
352 	auto src1 = statement.src1->GetSymbol().get();
353 	auto src2 = statement.src2->GetSymbol().get();
354 
355 	auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
356 	auto tempRegister = CX86Assembler::xMM1;
357 	auto temp2Register = CX86Assembler::xMM2;
358 	auto maskRegister = CX86Assembler::xMM3;
359 
360 	//Generate mask (0x00FF x8)
361 	m_assembler.VpcmpeqdVo(maskRegister, maskRegister, CX86Assembler::MakeXmmRegisterAddress(maskRegister));
362 	m_assembler.VpsrlwVo(maskRegister, maskRegister, 0x08);
363 
364 	//Mask both operands
365 	m_assembler.VpandVo(temp2Register, maskRegister, MakeVariable128SymbolAddress(src2));
366 	m_assembler.VpandVo(tempRegister, maskRegister, MakeVariable128SymbolAddress(src1));
367 
368 	//Pack
369 	m_assembler.VpackuswbVo(dstRegister, temp2Register, CX86Assembler::MakeXmmRegisterAddress(tempRegister));
370 
371 	CommitSymbolRegisterMdAvx(dst, dstRegister);
372 }
373 
Emit_Md_Avx_PackWH_VarVarVar(const STATEMENT & statement)374 void CCodeGen_x86::Emit_Md_Avx_PackWH_VarVarVar(const STATEMENT& statement)
375 {
376 	auto dst = statement.dst->GetSymbol().get();
377 	auto src1 = statement.src1->GetSymbol().get();
378 	auto src2 = statement.src2->GetSymbol().get();
379 
380 	auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
381 	auto resultRegister = CX86Assembler::xMM1;
382 	auto tempRegister = CX86Assembler::xMM2;
383 
384 	m_assembler.VmovapsVo(resultRegister, MakeVariable128SymbolAddress(src2));
385 	m_assembler.VmovapsVo(tempRegister, MakeVariable128SymbolAddress(src1));
386 
387 	//Sign extend the lower half word of our registers
388 	m_assembler.VpslldVo(resultRegister, resultRegister, 0x10);
389 	m_assembler.VpsradVo(resultRegister, resultRegister, 0x10);
390 
391 	m_assembler.VpslldVo(tempRegister, tempRegister, 0x10);
392 	m_assembler.VpsradVo(tempRegister, tempRegister, 0x10);
393 
394 	//Pack
395 	m_assembler.VpackssdwVo(dstRegister, resultRegister, CX86Assembler::MakeXmmRegisterAddress(tempRegister));
396 
397 	CommitSymbolRegisterMdAvx(dst, dstRegister);
398 }
399 
Emit_Md_Avx_Not_VarVar(const STATEMENT & statement)400 void CCodeGen_x86::Emit_Md_Avx_Not_VarVar(const STATEMENT& statement)
401 {
402 	auto dst = statement.dst->GetSymbol().get();
403 	auto src1 = statement.src1->GetSymbol().get();
404 
405 	auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
406 	auto cstRegister = CX86Assembler::xMM1;
407 
408 	assert(dstRegister != cstRegister);
409 
410 	m_assembler.VpcmpeqdVo(cstRegister, cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
411 	m_assembler.VpxorVo(dstRegister, cstRegister, MakeVariable128SymbolAddress(src1));
412 
413 	CommitSymbolRegisterMdAvx(dst, dstRegister);
414 }
415 
Emit_Md_Avx_Abs_VarVar(const STATEMENT & statement)416 void CCodeGen_x86::Emit_Md_Avx_Abs_VarVar(const STATEMENT& statement)
417 {
418 	auto dst = statement.dst->GetSymbol().get();
419 	auto src1 = statement.src1->GetSymbol().get();
420 
421 	auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
422 	auto maskRegister = CX86Assembler::xMM1;
423 
424 	assert(dstRegister != maskRegister);
425 
426 	m_assembler.VpcmpeqdVo(maskRegister, maskRegister, CX86Assembler::MakeXmmRegisterAddress(maskRegister));
427 	m_assembler.VpsrldVo(maskRegister, maskRegister, 1);
428 	m_assembler.VpandVo(dstRegister, maskRegister, MakeVariable128SymbolAddress(src1));
429 
430 	CommitSymbolRegisterMdAvx(dst, dstRegister);
431 }
432 
Emit_Md_Avx_MakeSz_VarVar(const STATEMENT & statement)433 void CCodeGen_x86::Emit_Md_Avx_MakeSz_VarVar(const STATEMENT& statement)
434 {
435 	auto dst = statement.dst->GetSymbol().get();
436 	auto src1 = statement.src1->GetSymbol().get();
437 
438 	auto dstRegister = PrepareSymbolRegisterDef(dst, CX86Assembler::rDX);
439 	auto src1Register = PrepareSymbolRegisterUseMdAvx(src1, CX86Assembler::xMM0);
440 	auto szRegister = CX86Assembler::xMM1;
441 	auto zeroRegister = CX86Assembler::xMM2;
442 
443 	//Compute sign
444 	m_assembler.VpsradVo(szRegister, src1Register, 31);
445 
446 	//Compute zero
447 	m_assembler.VpxorVo(zeroRegister, zeroRegister, CX86Assembler::MakeXmmRegisterAddress(zeroRegister));
448 	m_assembler.VcmppsVo(zeroRegister, zeroRegister, CX86Assembler::MakeXmmRegisterAddress(src1Register), CX86Assembler::SSE_CMP_EQ);
449 
450 	//Pack
451 	m_assembler.VpackssdwVo(szRegister, szRegister, CX86Assembler::MakeXmmRegisterAddress(zeroRegister));
452 
453 	//Extract bits
454 	m_assembler.VpshufbVo(szRegister, szRegister, MakeConstant128Address(g_makeSzShufflePattern));
455 	m_assembler.VpmovmskbVo(dstRegister, szRegister);
456 
457 	CommitSymbolRegister(dst, dstRegister);
458 }
459 
Emit_Md_Avx_Expand_VarVar(const STATEMENT & statement)460 void CCodeGen_x86::Emit_Md_Avx_Expand_VarVar(const STATEMENT& statement)
461 {
462 	auto dst = statement.dst->GetSymbol().get();
463 	auto src1 = statement.src1->GetSymbol().get();
464 
465 	auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
466 
467 	m_assembler.VmovdVo(dstRegister, MakeVariableSymbolAddress(src1));
468 	m_assembler.VshufpsVo(dstRegister, dstRegister, CX86Assembler::MakeXmmRegisterAddress(dstRegister), 0x00);
469 
470 	CommitSymbolRegisterMdAvx(dst, dstRegister);
471 }
472 
Emit_Md_Avx_Expand_VarCst(const STATEMENT & statement)473 void CCodeGen_x86::Emit_Md_Avx_Expand_VarCst(const STATEMENT& statement)
474 {
475 	auto dst = statement.dst->GetSymbol().get();
476 	auto src1 = statement.src1->GetSymbol().get();
477 
478 	auto dstRegister = PrepareSymbolRegisterDefMd(dst, CX86Assembler::xMM0);
479 	auto cstRegister = CX86Assembler::rAX;
480 
481 	m_assembler.MovId(cstRegister, src1->m_valueLow);
482 	m_assembler.VmovdVo(dstRegister, CX86Assembler::MakeRegisterAddress(cstRegister));
483 	m_assembler.VshufpsVo(dstRegister, dstRegister, CX86Assembler::MakeXmmRegisterAddress(dstRegister), 0x00);
484 
485 	CommitSymbolRegisterMdAvx(dst, dstRegister);
486 }
487 
Emit_Avx_MergeTo256_MemVarVar(const STATEMENT & statement)488 void CCodeGen_x86::Emit_Avx_MergeTo256_MemVarVar(const STATEMENT& statement)
489 {
490 	auto dst = statement.dst->GetSymbol().get();
491 	auto src1 = statement.src1->GetSymbol().get();
492 	auto src2 = statement.src2->GetSymbol().get();
493 
494 	assert(dst->m_type == SYM_TEMPORARY256);
495 
496 	auto src1Register = CX86Assembler::xMM0;
497 	auto src2Register = CX86Assembler::xMM1;
498 
499 	//TODO: Improve this to write out registers directly to temporary's memory space
500 	//instead of passing by temporary registers
501 
502 	m_assembler.VmovdqaVo(src1Register, MakeVariable128SymbolAddress(src1));
503 	m_assembler.VmovdqaVo(src2Register, MakeVariable128SymbolAddress(src2));
504 
505 	m_assembler.VmovdqaVo(MakeTemporary256SymbolElementAddress(dst, 0x00), src1Register);
506 	m_assembler.VmovdqaVo(MakeTemporary256SymbolElementAddress(dst, 0x10), src2Register);
507 }
508 
Emit_Md_Avx_Srl256_VarMemVar(const STATEMENT & statement)509 void CCodeGen_x86::Emit_Md_Avx_Srl256_VarMemVar(const STATEMENT& statement)
510 {
511 	auto dst = statement.dst->GetSymbol().get();
512 	auto src1 = statement.src1->GetSymbol().get();
513 	auto src2 = statement.src2->GetSymbol().get();
514 
515 	auto offsetRegister = CX86Assembler::rAX;
516 	auto resultRegister = CX86Assembler::xMM0;
517 
518 	assert(src1->m_type == SYM_TEMPORARY256);
519 
520 	m_assembler.MovEd(offsetRegister, MakeVariableSymbolAddress(src2));
521 	m_assembler.AndId(CX86Assembler::MakeRegisterAddress(offsetRegister), 0x7F);
522 	m_assembler.ShrEd(CX86Assembler::MakeRegisterAddress(offsetRegister), 3);
523 	m_assembler.AddId(CX86Assembler::MakeRegisterAddress(offsetRegister), src1->m_stackLocation + m_stackLevel);
524 
525 	m_assembler.VmovdquVo(resultRegister, CX86Assembler::MakeBaseIndexScaleAddress(CX86Assembler::rSP, offsetRegister, 1));
526 	m_assembler.VmovdqaVo(MakeVariable128SymbolAddress(dst), resultRegister);
527 }
528 
Emit_Md_Avx_Srl256_VarMemCst(const STATEMENT & statement)529 void CCodeGen_x86::Emit_Md_Avx_Srl256_VarMemCst(const STATEMENT& statement)
530 {
531 	auto dst = statement.dst->GetSymbol().get();
532 	auto src1 = statement.src1->GetSymbol().get();
533 	auto src2 = statement.src2->GetSymbol().get();
534 
535 	auto resultRegister = CX86Assembler::xMM0;
536 
537 	assert(src1->m_type == SYM_TEMPORARY256);
538 	assert(src2->m_type == SYM_CONSTANT);
539 
540 	uint32 offset = (src2->m_valueLow & 0x7F) / 8;
541 
542 	m_assembler.VmovdquVo(resultRegister, MakeTemporary256SymbolElementAddress(src1, offset));
543 	m_assembler.VmovdqaVo(MakeVariable128SymbolAddress(dst), resultRegister);
544 }
545 
546 CCodeGen_x86::CONSTMATCHER CCodeGen_x86::g_mdAvxConstMatchers[] =
547 {
548 	{ OP_MD_ADD_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDB> },
549 	{ OP_MD_ADD_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDH> },
550 	{ OP_MD_ADD_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDW> },
551 
552 	{ OP_MD_ADDSS_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDSSB> },
553 	{ OP_MD_ADDSS_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDSSH> },
554 	{ OP_MD_ADDSS_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_AddSSW_VarVarVar       },
555 
556 	{ OP_MD_ADDUS_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDUSB> },
557 	{ OP_MD_ADDUS_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDUSH> },
558 	{ OP_MD_ADDUS_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_AddUSW_VarVarVar       },
559 
560 	{ OP_MD_SUB_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBB> },
561 	{ OP_MD_SUB_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBH> },
562 	{ OP_MD_SUB_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBW> },
563 
564 	{ OP_MD_SUBSS_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBSSH> },
565 	{ OP_MD_SUBSS_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_SubSSW_VarVarVar       },
566 
567 	{ OP_MD_SUBUS_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBUSB> },
568 	{ OP_MD_SUBUS_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBUSH> },
569 	{ OP_MD_SUBUS_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_SubUSW_VarVarVar       },
570 
571 	{ OP_MD_CMPEQ_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPEQB> },
572 	{ OP_MD_CMPEQ_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPEQH> },
573 	{ OP_MD_CMPEQ_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPEQW> },
574 
575 	{ OP_MD_CMPGT_B, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPGTB> },
576 	{ OP_MD_CMPGT_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPGTH> },
577 	{ OP_MD_CMPGT_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPGTW> },
578 
579 	{ OP_MD_MIN_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MINH> },
580 	{ OP_MD_MIN_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MINW> },
581 
582 	{ OP_MD_MAX_H, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MAXH> },
583 	{ OP_MD_MAX_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MAXW> },
584 
585 	{ OP_MD_AND, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_AND> },
586 	{ OP_MD_OR,  MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_OR>  },
587 	{ OP_MD_XOR, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_XOR> },
588 
589 	{ OP_MD_NOT, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Not_VarVar },
590 
591 	{ OP_MD_SRLH, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst<MDOP_SRLH, 0x0F> },
592 	{ OP_MD_SRAH, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst<MDOP_SRAH, 0x0F> },
593 	{ OP_MD_SLLH, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst<MDOP_SLLH, 0x0F> },
594 
595 	{ OP_MD_SRLW, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst<MDOP_SRLW, 0x1F> },
596 	{ OP_MD_SRAW, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst<MDOP_SRAW, 0x1F> },
597 	{ OP_MD_SLLW, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Shift_VarVarCst<MDOP_SLLW, 0x1F> },
598 
599 	{ OP_MD_UNPACK_LOWER_BH, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVarRev<MDOP_UNPACK_LOWER_BH> },
600 	{ OP_MD_UNPACK_LOWER_HW, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVarRev<MDOP_UNPACK_LOWER_HW> },
601 	{ OP_MD_UNPACK_LOWER_WD, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVarRev<MDOP_UNPACK_LOWER_WD> },
602 
603 	{ OP_MD_UNPACK_UPPER_BH, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVarRev<MDOP_UNPACK_UPPER_BH> },
604 	{ OP_MD_UNPACK_UPPER_HW, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVarRev<MDOP_UNPACK_UPPER_HW> },
605 	{ OP_MD_UNPACK_UPPER_WD, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVarRev<MDOP_UNPACK_UPPER_WD> },
606 
607 	{ OP_MD_ADD_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_ADDS> },
608 	{ OP_MD_SUB_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_SUBS> },
609 	{ OP_MD_MUL_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MULS> },
610 	{ OP_MD_DIV_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_DIVS> },
611 
612 	{ OP_MD_ABS_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Abs_VarVar },
613 
614 	{ OP_MD_CMPLT_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPLTS> },
615 	{ OP_MD_CMPGT_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_CMPGTS> },
616 
617 	{ OP_MD_MIN_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MINS> },
618 	{ OP_MD_MAX_S, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVarVar<MDOP_MAXS> },
619 
620 	{ OP_MD_TOWORD_TRUNCATE, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVar<MDOP_TOWORD_TRUNCATE> },
621 	{ OP_MD_TOSINGLE,        MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_VarVar<MDOP_TOSINGLE>        },
622 
623 	{ OP_MD_EXPAND, MATCH_VARIABLE128, MATCH_VARIABLE, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Expand_VarVar },
624 	{ OP_MD_EXPAND, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Expand_VarCst },
625 
626 	{ OP_MD_PACK_HB, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_PackHB_VarVarVar, },
627 	{ OP_MD_PACK_WH, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_PackWH_VarVarVar, },
628 
629 	{ OP_MD_MAKESZ, MATCH_VARIABLE, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_MakeSz_VarVar },
630 
631 	{ OP_MOV, MATCH_REGISTER128, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Mov_RegVar, },
632 	{ OP_MOV, MATCH_MEMORY128,   MATCH_REGISTER128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Mov_MemReg, },
633 	{ OP_MOV, MATCH_MEMORY128,   MATCH_MEMORY128,   MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Mov_MemMem, },
634 
635 	{ OP_MD_MOV_MASKED, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_MovMasked_VarVarVar },
636 
637 	{ OP_MERGETO256, MATCH_MEMORY256,   MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Avx_MergeTo256_MemVarVar },
638 	{ OP_MD_SRL256,  MATCH_VARIABLE128, MATCH_MEMORY256,   MATCH_VARIABLE,    MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Srl256_VarMemVar  },
639 	{ OP_MD_SRL256,  MATCH_VARIABLE128, MATCH_MEMORY256,   MATCH_CONSTANT,    MATCH_NIL, &CCodeGen_x86::Emit_Md_Avx_Srl256_VarMemCst  },
640 
641 	{ OP_MOV, MATCH_NIL,         MATCH_NIL,         MATCH_NIL, MATCH_NIL, nullptr },
642 };
643