1 #include "Jitter_CodeGen_x86.h"
2 
3 using namespace Jitter;
4 
5 const LITERAL128 CCodeGen_x86::g_makeSzShufflePattern = { 0x00020406080A0C0E, 0x8080808080808080 };
6 
MakeRelative128SymbolElementAddress(CSymbol * symbol,unsigned int elementIdx)7 CX86Assembler::CAddress CCodeGen_x86::MakeRelative128SymbolElementAddress(CSymbol* symbol, unsigned int elementIdx)
8 {
9 	assert(symbol->m_type == SYM_RELATIVE128);
10 	assert((symbol->m_valueLow & 0xF) == 0);
11 	return CX86Assembler::MakeIndRegOffAddress(CX86Assembler::rBP, symbol->m_valueLow + (elementIdx * 4));
12 }
13 
MakeTemporary128SymbolElementAddress(CSymbol * symbol,unsigned int elementIdx)14 CX86Assembler::CAddress CCodeGen_x86::MakeTemporary128SymbolElementAddress(CSymbol* symbol, unsigned int elementIdx)
15 {
16 	assert(symbol->m_type == SYM_TEMPORARY128);
17 //	assert(((symbol->m_stackLocation + m_stackLevel) & 0xF) == 0);
18 	return CX86Assembler::MakeIndRegOffAddress(CX86Assembler::rSP, symbol->m_stackLocation + m_stackLevel + (elementIdx * 4));
19 }
20 
MakeTemporary256SymbolElementAddress(CSymbol * symbol,unsigned int elementIdx)21 CX86Assembler::CAddress CCodeGen_x86::MakeTemporary256SymbolElementAddress(CSymbol* symbol, unsigned int elementIdx)
22 {
23 	assert(symbol->m_type == SYM_TEMPORARY256);
24 	assert(((symbol->m_stackLocation + m_stackLevel) & 0x1F) == 0);
25 	return CX86Assembler::MakeIndRegOffAddress(CX86Assembler::rSP, symbol->m_stackLocation + m_stackLevel + elementIdx);
26 }
27 
MakeVariable128SymbolAddress(CSymbol * symbol)28 CX86Assembler::CAddress CCodeGen_x86::MakeVariable128SymbolAddress(CSymbol* symbol)
29 {
30 	switch(symbol->m_type)
31 	{
32 	case SYM_REGISTER128:
33 		return CX86Assembler::MakeXmmRegisterAddress(m_mdRegisters[symbol->m_valueLow]);
34 		break;
35 	case SYM_RELATIVE128:
36 		return MakeRelative128SymbolElementAddress(symbol, 0);
37 		break;
38 	case SYM_TEMPORARY128:
39 		return MakeTemporary128SymbolElementAddress(symbol, 0);
40 		break;
41 	default:
42 		throw std::exception();
43 		break;
44 	}
45 }
46 
MakeMemory128SymbolAddress(CSymbol * symbol)47 CX86Assembler::CAddress CCodeGen_x86::MakeMemory128SymbolAddress(CSymbol* symbol)
48 {
49 	switch(symbol->m_type)
50 	{
51 	case SYM_RELATIVE128:
52 		return MakeRelative128SymbolElementAddress(symbol, 0);
53 		break;
54 	case SYM_TEMPORARY128:
55 		return MakeTemporary128SymbolElementAddress(symbol, 0);
56 		break;
57 	default:
58 		throw std::exception();
59 		break;
60 	}
61 }
62 
MakeMemory128SymbolElementAddress(CSymbol * symbol,unsigned int elementIdx)63 CX86Assembler::CAddress CCodeGen_x86::MakeMemory128SymbolElementAddress(CSymbol* symbol, unsigned int elementIdx)
64 {
65 	switch(symbol->m_type)
66 	{
67 	case SYM_RELATIVE128:
68 		return MakeRelative128SymbolElementAddress(symbol, elementIdx);
69 		break;
70 	case SYM_TEMPORARY128:
71 		return MakeTemporary128SymbolElementAddress(symbol, elementIdx);
72 		break;
73 	default:
74 		throw std::exception();
75 		break;
76 	}
77 }
78 
79 template <typename MDOP>
Emit_Md_RegVar(const STATEMENT & statement)80 void CCodeGen_x86::Emit_Md_RegVar(const STATEMENT& statement)
81 {
82 	auto dst = statement.dst->GetSymbol().get();
83 	auto src1 = statement.src1->GetSymbol().get();
84 
85 	((m_assembler).*(MDOP::OpVo()))(m_mdRegisters[dst->m_valueLow], MakeVariable128SymbolAddress(src1));
86 }
87 
88 template <typename MDOP>
Emit_Md_MemVar(const STATEMENT & statement)89 void CCodeGen_x86::Emit_Md_MemVar(const STATEMENT& statement)
90 {
91 	auto dst = statement.dst->GetSymbol().get();
92 	auto src1 = statement.src1->GetSymbol().get();
93 
94 	auto dstRegister = CX86Assembler::xMM0;
95 
96 	((m_assembler).*(MDOP::OpVo()))(dstRegister, MakeVariable128SymbolAddress(src1));
97 	m_assembler.MovapsVo(MakeMemory128SymbolAddress(dst), dstRegister);
98 }
99 
100 template <typename MDOP>
Emit_Md_RegRegReg(const STATEMENT & statement)101 void CCodeGen_x86::Emit_Md_RegRegReg(const STATEMENT& statement)
102 {
103 	auto dst = statement.dst->GetSymbol().get();
104 	auto src1 = statement.src1->GetSymbol().get();
105 	auto src2 = statement.src2->GetSymbol().get();
106 
107 	if(dst->Equals(src1))
108 	{
109 		((m_assembler).*(MDOP::OpVo()))(m_mdRegisters[dst->m_valueLow],
110 			CX86Assembler::MakeXmmRegisterAddress(m_mdRegisters[src2->m_valueLow]));
111 	}
112 	else
113 	{
114 		auto src2Register = m_mdRegisters[src2->m_valueLow];
115 
116 		if(dst->Equals(src2))
117 		{
118 			m_assembler.MovapsVo(CX86Assembler::xMM0, CX86Assembler::MakeXmmRegisterAddress(m_mdRegisters[src2->m_valueLow]));
119 			src2Register = CX86Assembler::xMM0;
120 		}
121 
122 		m_assembler.MovapsVo(m_mdRegisters[dst->m_valueLow], CX86Assembler::MakeXmmRegisterAddress(m_mdRegisters[src1->m_valueLow]));
123 		((m_assembler).*(MDOP::OpVo()))(m_mdRegisters[dst->m_valueLow], CX86Assembler::MakeXmmRegisterAddress(src2Register));
124 	}
125 }
126 
127 template <typename MDOP>
Emit_Md_RegMemReg(const STATEMENT & statement)128 void CCodeGen_x86::Emit_Md_RegMemReg(const STATEMENT& statement)
129 {
130 	auto dst = statement.dst->GetSymbol().get();
131 	auto src1 = statement.src1->GetSymbol().get();
132 	auto src2 = statement.src2->GetSymbol().get();
133 
134 	auto dstRegister = m_mdRegisters[dst->m_valueLow];
135 	auto src2Register = m_mdRegisters[src2->m_valueLow];
136 
137 	if(dst->Equals(src2))
138 	{
139 		m_assembler.MovapsVo(CX86Assembler::xMM0, CX86Assembler::MakeXmmRegisterAddress(src2Register));
140 		src2Register = CX86Assembler::xMM0;
141 	}
142 
143 	m_assembler.MovapsVo(dstRegister, MakeVariable128SymbolAddress(src1));
144 	((m_assembler).*(MDOP::OpVo()))(dstRegister, CX86Assembler::MakeXmmRegisterAddress(src2Register));
145 }
146 
147 template <typename MDOP>
Emit_Md_RegVarVar(const STATEMENT & statement)148 void CCodeGen_x86::Emit_Md_RegVarVar(const STATEMENT& statement)
149 {
150 	auto dst = statement.dst->GetSymbol().get();
151 	auto src1 = statement.src1->GetSymbol().get();
152 	auto src2 = statement.src2->GetSymbol().get();
153 
154 	//If we get in here, it must absolutely mean that the second source isn't a register
155 	//Otherwise, some of the assumuptions done below will be wrong (dst mustn't be equal to src2)
156 	assert(src2->m_type != SYM_REGISTER128);
157 
158 	auto dstRegister = m_mdRegisters[dst->m_valueLow];
159 
160 	if(!dst->Equals(src1))
161 	{
162 		m_assembler.MovapsVo(dstRegister, MakeVariable128SymbolAddress(src1));
163 	}
164 
165 	((m_assembler).*(MDOP::OpVo()))(dstRegister, MakeVariable128SymbolAddress(src2));
166 }
167 
168 template <typename MDOP>
Emit_Md_MemVarVar(const STATEMENT & statement)169 void CCodeGen_x86::Emit_Md_MemVarVar(const STATEMENT& statement)
170 {
171 	auto dst = statement.dst->GetSymbol().get();
172 	auto src1 = statement.src1->GetSymbol().get();
173 	auto src2 = statement.src2->GetSymbol().get();
174 
175 	auto dstRegister = CX86Assembler::xMM0;
176 
177 	m_assembler.MovapsVo(dstRegister, MakeVariable128SymbolAddress(src1));
178 	((m_assembler).*(MDOP::OpVo()))(dstRegister, MakeVariable128SymbolAddress(src2));
179 	m_assembler.MovapsVo(MakeMemory128SymbolAddress(dst), dstRegister);
180 }
181 
182 template <typename MDOP>
Emit_Md_VarVarVarRev(const STATEMENT & statement)183 void CCodeGen_x86::Emit_Md_VarVarVarRev(const STATEMENT& statement)
184 {
185 	//TODO: This could be improved further, but we might want
186 	//to reverse the operands somewhere else as to not
187 	//copy paste the code from the "non-reversed" path
188 
189 	auto dst = statement.dst->GetSymbol().get();
190 	auto src1 = statement.src1->GetSymbol().get();
191 	auto src2 = statement.src2->GetSymbol().get();
192 
193 	auto dstRegister = CX86Assembler::xMM0;
194 
195 	m_assembler.MovapsVo(dstRegister, MakeVariable128SymbolAddress(src2));
196 	((m_assembler).*(MDOP::OpVo()))(dstRegister, MakeVariable128SymbolAddress(src1));
197 	m_assembler.MovapsVo(MakeVariable128SymbolAddress(dst), dstRegister);
198 }
199 
200 template <typename MDOPSHIFT, uint8 SAMASK>
Emit_Md_Shift_RegVarCst(const STATEMENT & statement)201 void CCodeGen_x86::Emit_Md_Shift_RegVarCst(const STATEMENT& statement)
202 {
203 	auto dst = statement.dst->GetSymbol().get();
204 	auto src1 = statement.src1->GetSymbol().get();
205 	auto src2 = statement.src2->GetSymbol().get();
206 
207 	auto dstRegister = m_mdRegisters[dst->m_valueLow];
208 
209 	if(!dst->Equals(src1))
210 	{
211 		m_assembler.MovapsVo(dstRegister, MakeVariable128SymbolAddress(src1));
212 	}
213 
214 	((m_assembler).*(MDOPSHIFT::OpVo()))(dstRegister, static_cast<uint8>(src2->m_valueLow & SAMASK));
215 }
216 
217 template <typename MDOPSHIFT, uint8 SAMASK>
Emit_Md_Shift_MemVarCst(const STATEMENT & statement)218 void CCodeGen_x86::Emit_Md_Shift_MemVarCst(const STATEMENT& statement)
219 {
220 	auto dst = statement.dst->GetSymbol().get();
221 	auto src1 = statement.src1->GetSymbol().get();
222 	auto src2 = statement.src2->GetSymbol().get();
223 
224 	auto tmpRegister = CX86Assembler::xMM0;
225 
226 	m_assembler.MovapsVo(tmpRegister, MakeVariable128SymbolAddress(src1));
227 	((m_assembler).*(MDOPSHIFT::OpVo()))(tmpRegister, static_cast<uint8>(src2->m_valueLow & SAMASK));
228 	m_assembler.MovapsVo(MakeMemory128SymbolAddress(dst), tmpRegister);
229 }
230 
231 template <typename MDOPSINGLEOP>
Emit_Md_SingleOp_RegVar(const STATEMENT & statement)232 void CCodeGen_x86::Emit_Md_SingleOp_RegVar(const STATEMENT& statement)
233 {
234 	auto dst = statement.dst->GetSymbol().get();
235 	auto src1 = statement.src1->GetSymbol().get();
236 
237 	auto resultRegister = m_mdRegisters[dst->m_valueLow];
238 
239 	if(!dst->Equals(src1))
240 	{
241 		m_assembler.MovapsVo(resultRegister, MakeVariable128SymbolAddress(src1));
242 	}
243 
244 	((*this).*(MDOPSINGLEOP::OpVr()))(resultRegister);
245 }
246 
247 template <typename MDOPSINGLEOP>
Emit_Md_SingleOp_MemVar(const STATEMENT & statement)248 void CCodeGen_x86::Emit_Md_SingleOp_MemVar(const STATEMENT& statement)
249 {
250 	auto dst = statement.dst->GetSymbol().get();
251 	auto src1 = statement.src1->GetSymbol().get();
252 
253 	auto resultRegister = CX86Assembler::xMM0;
254 
255 	m_assembler.MovapsVo(resultRegister, MakeVariable128SymbolAddress(src1));
256 	((*this).*(MDOPSINGLEOP::OpVr()))(resultRegister);
257 	m_assembler.MovapsVo(MakeMemory128SymbolAddress(dst), resultRegister);
258 }
259 
Emit_Md_AddSSW_VarVarVar(const STATEMENT & statement)260 void CCodeGen_x86::Emit_Md_AddSSW_VarVarVar(const STATEMENT& statement)
261 {
262 	auto dst = statement.dst->GetSymbol().get();
263 	auto src1 = statement.src1->GetSymbol().get();
264 	auto src2 = statement.src2->GetSymbol().get();
265 
266 	auto uxRegister = CX86Assembler::xMM0;
267 	auto uyRegister = CX86Assembler::xMM1;
268 	auto resRegister = CX86Assembler::xMM2;
269 	auto cstRegister = CX86Assembler::xMM3;
270 
271 //	This is based on code from http://locklessinc.com/articles/sat_arithmetic/ modified to work without cmovns
272 //	s32b sat_adds32b(s32b x, s32b y)
273 //	{
274 //		u32b ux = x;
275 //		u32b uy = y;
276 //		u32b res = ux + uy;
277 //
278 //		/* Calculate overflowed result. (Don't change the sign bit of ux) */
279 //		ux = (ux >> 31) + INT_MAX;
280 //
281 //		s32b sign = (s32b) ((ux ^ uy) | ~(uy ^ res))
282 //		sign >>= 31;		/* Arithmetic shift, either 0 or ~0*/
283 //		res = (res & sign) | (ux & ~sign);
284 //
285 //		return res;
286 //	}
287 
288 	//ux = src1
289 	//uy = src2
290 	m_assembler.MovapsVo(uxRegister, MakeVariable128SymbolAddress(src1));
291 	m_assembler.MovapsVo(uyRegister, MakeVariable128SymbolAddress(src2));
292 
293 	//res = ux + uy
294 	m_assembler.MovapsVo(resRegister, CX86Assembler::MakeXmmRegisterAddress(uxRegister));
295 	m_assembler.PadddVo(resRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
296 
297 	//cst = 0x7FFFFFFF
298 	m_assembler.PcmpeqdVo(cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
299 	m_assembler.PsrldVo(cstRegister, 1);
300 
301 	//ux = (ux >> 31)
302 	m_assembler.PsrldVo(uxRegister, 31);
303 
304 	//ux += 0x7FFFFFFF
305 	m_assembler.PadddVo(uxRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
306 
307 	//uy = ~(uy ^ res)
308 	//------
309 	//uy ^ res
310 	m_assembler.PxorVo(uyRegister, CX86Assembler::MakeXmmRegisterAddress(resRegister));
311 
312 	//~(uy ^ res)
313 	m_assembler.PcmpeqdVo(cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
314 	m_assembler.PxorVo(uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
315 
316 	//cst = ux ^ uy (reloading uy from src2 because we don't have any registers available)
317 	m_assembler.MovapsVo(cstRegister, CX86Assembler::MakeXmmRegisterAddress(uxRegister));
318 	m_assembler.PxorVo(cstRegister, MakeVariable128SymbolAddress(src2));
319 
320 	//uy = ((ux ^ uy) | ~(uy ^ res)) >> 31; (signed operation)
321 	m_assembler.PorVo(uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
322 	m_assembler.PsradVo(uyRegister, 31);
323 
324 	//res = (res & uy)	(uy is the sign value)
325 	m_assembler.PandVo(resRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
326 
327 	//ux = (ux & ~uy)
328 	//------
329 	//~uy
330 	m_assembler.PcmpeqdVo(cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
331 	m_assembler.PxorVo(uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
332 
333 	//ux & ~uy
334 	m_assembler.PandVo(uxRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
335 
336 	//res = (res & uy) | (ux & ~uy)
337 	m_assembler.PorVo(resRegister, CX86Assembler::MakeXmmRegisterAddress(uxRegister));
338 
339 	//Copy final result
340 	m_assembler.MovapsVo(MakeVariable128SymbolAddress(dst), resRegister);
341 }
342 
Emit_Md_SubSSW_VarVarVar(const STATEMENT & statement)343 void CCodeGen_x86::Emit_Md_SubSSW_VarVarVar(const STATEMENT& statement)
344 {
345 	auto dst = statement.dst->GetSymbol().get();
346 	auto src1 = statement.src1->GetSymbol().get();
347 	auto src2 = statement.src2->GetSymbol().get();
348 
349 	auto uxRegister = CX86Assembler::xMM0;
350 	auto uyRegister = CX86Assembler::xMM1;
351 	auto resRegister = CX86Assembler::xMM2;
352 	auto cstRegister = CX86Assembler::xMM3;
353 
354 //	This is based on code from http://locklessinc.com/articles/sat_arithmetic/ modified to work without cmovns
355 //	s32b sat_subs32b(s32b x, s32b y)
356 //	{
357 //		u32b ux = x;
358 //		u32b uy = y;
359 //		u32b res = ux - uy;
360 //
361 //		ux = (ux >> 31) + INT_MAX;
362 //
363 //		s32b sign = (s32b) ((ux ^ uy) & (ux ^ res))
364 //		sign >>= 31;		/* Arithmetic shift, either 0 or ~0*/
365 //		res = (res & ~sign) | (ux & sign);
366 //
367 //		return res;
368 //	}
369 
370 	//ux = src1
371 	//uy = src2
372 	m_assembler.MovdqaVo(uxRegister, MakeVariable128SymbolAddress(src1));
373 	m_assembler.MovdqaVo(uyRegister, MakeVariable128SymbolAddress(src2));
374 
375 	//res = ux - uy
376 	m_assembler.MovdqaVo(resRegister, CX86Assembler::MakeXmmRegisterAddress(uxRegister));
377 	m_assembler.PsubdVo(resRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
378 
379 	//cst = 0x7FFFFFFF
380 	m_assembler.PcmpeqdVo(cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
381 	m_assembler.PsrldVo(cstRegister, 1);
382 
383 	//ux = (ux >> 31)
384 	m_assembler.PsrldVo(uxRegister, 31);
385 
386 	//ux += 0x7FFFFFFF
387 	m_assembler.PadddVo(uxRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
388 
389 	//uy = (ux ^ res)
390 	//------
391 	//ux ^ res
392 	m_assembler.MovdqaVo(uyRegister, CX86Assembler::MakeXmmRegisterAddress(uxRegister));
393 	m_assembler.PxorVo(uyRegister, CX86Assembler::MakeXmmRegisterAddress(resRegister));
394 
395 	//cst = ux ^ uy (reloading uy from src2 because we don't have any registers available)
396 	m_assembler.MovdqaVo(cstRegister, CX86Assembler::MakeXmmRegisterAddress(uxRegister));
397 	m_assembler.PxorVo(cstRegister, MakeVariable128SymbolAddress(src2));
398 
399 	//uy = ((ux ^ uy) & (ux ^ res)) >> 31; (signed operation)
400 	m_assembler.PandVo(uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
401 	m_assembler.PsradVo(uyRegister, 31);
402 
403 	//ux = (ux & uy)	(uy is the sign value)
404 	m_assembler.PandVo(uxRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
405 
406 	//res = (res & ~uy)
407 	//------
408 	//~uy
409 	m_assembler.PcmpeqdVo(cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
410 	m_assembler.PxorVo(uyRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
411 
412 	//res & ~uy
413 	m_assembler.PandVo(resRegister, CX86Assembler::MakeXmmRegisterAddress(uyRegister));
414 
415 	//res = (res & ~uy) | (ux & uy)
416 	m_assembler.PorVo(resRegister, CX86Assembler::MakeXmmRegisterAddress(uxRegister));
417 
418 	//Copy final result
419 	m_assembler.MovdqaVo(MakeVariable128SymbolAddress(dst), resRegister);
420 }
421 
Emit_Md_AddUSW_VarVarVar(const STATEMENT & statement)422 void CCodeGen_x86::Emit_Md_AddUSW_VarVarVar(const STATEMENT& statement)
423 {
424 	auto dst = statement.dst->GetSymbol().get();
425 	auto src1 = statement.src1->GetSymbol().get();
426 	auto src2 = statement.src2->GetSymbol().get();
427 
428 	auto xRegister = CX86Assembler::xMM0;
429 	auto resRegister = CX86Assembler::xMM1;
430 	auto tmpRegister = CX86Assembler::xMM2;
431 	auto tmp2Register = CX86Assembler::xMM3;
432 
433 //	This is based on code from http://locklessinc.com/articles/sat_arithmetic/
434 //	u32b sat_addu32b(u32b x, u32b y)
435 //	{
436 //		u32b res = x + y;
437 //		res |= -(res < x);
438 //
439 //		return res;
440 //	}
441 
442 	m_assembler.MovdqaVo(xRegister, MakeVariable128SymbolAddress(src1));
443 	m_assembler.MovdqaVo(resRegister, CX86Assembler::MakeXmmRegisterAddress(xRegister));
444 	m_assembler.PadddVo(resRegister, MakeVariable128SymbolAddress(src2));
445 
446 	//-(res < x)
447 	//PCMPGT will compare two signed integers, but we want unsigned comparison
448 	//Thus, we add 0x80000000 to both values to "convert" them to signed
449 	m_assembler.PcmpeqdVo(tmpRegister, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
450 	m_assembler.PslldVo(tmpRegister, 31);
451 	m_assembler.PadddVo(tmpRegister, CX86Assembler::MakeXmmRegisterAddress(resRegister));
452 
453 	m_assembler.PcmpeqdVo(tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmp2Register));
454 	m_assembler.PslldVo(tmp2Register, 31);
455 	m_assembler.PadddVo(tmp2Register, CX86Assembler::MakeXmmRegisterAddress(xRegister));
456 
457 	m_assembler.PcmpgtdVo(tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
458 
459 	//res |= -(res < x)
460 	m_assembler.PorVo(resRegister, CX86Assembler::MakeXmmRegisterAddress(tmp2Register));
461 
462 	//Store result
463 	m_assembler.MovdqaVo(MakeVariable128SymbolAddress(dst), resRegister);
464 }
465 
Emit_Md_SubUSW_VarVarVar(const STATEMENT & statement)466 void CCodeGen_x86::Emit_Md_SubUSW_VarVarVar(const STATEMENT& statement)
467 {
468 	auto dst = statement.dst->GetSymbol().get();
469 	auto src1 = statement.src1->GetSymbol().get();
470 	auto src2 = statement.src2->GetSymbol().get();
471 
472 	auto xRegister = CX86Assembler::xMM0;
473 	auto resRegister = CX86Assembler::xMM1;
474 	auto tmpRegister = CX86Assembler::xMM2;
475 	auto tmp2Register = CX86Assembler::xMM3;
476 
477 //	This is based on code from http://locklessinc.com/articles/sat_arithmetic/
478 //	u32b sat_subu32b(u32b x, u32b y)
479 //	{
480 //		u32b res = x - y;
481 //		res &= -(res <= x);
482 //
483 //		return res;
484 //	}
485 
486 	m_assembler.MovdqaVo(xRegister, MakeVariable128SymbolAddress(src1));
487 	m_assembler.MovdqaVo(resRegister, CX86Assembler::MakeXmmRegisterAddress(xRegister));
488 	m_assembler.PsubdVo(resRegister, MakeVariable128SymbolAddress(src2));
489 
490 	//-(res <= x)
491 	//PCMPGT will compare two signed integers, but we want unsigned comparison
492 	//Thus, we add 0x80000000 to both values to "convert" them to signed
493 	m_assembler.PcmpeqdVo(tmpRegister, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
494 	m_assembler.PslldVo(tmpRegister, 31);
495 	m_assembler.PadddVo(tmpRegister, CX86Assembler::MakeXmmRegisterAddress(resRegister));
496 
497 	m_assembler.PcmpeqdVo(tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmp2Register));
498 	m_assembler.PslldVo(tmp2Register, 31);
499 	m_assembler.PadddVo(tmp2Register, CX86Assembler::MakeXmmRegisterAddress(xRegister));
500 
501 	m_assembler.MovdqaVo(xRegister, CX86Assembler::MakeXmmRegisterAddress(tmp2Register));
502 
503 	m_assembler.PcmpeqdVo(xRegister, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
504 	m_assembler.PcmpgtdVo(tmp2Register, CX86Assembler::MakeXmmRegisterAddress(tmpRegister));
505 	m_assembler.PorVo(tmp2Register, CX86Assembler::MakeXmmRegisterAddress(xRegister));
506 
507 	//res &= -(res <= x);
508 	m_assembler.PandVo(resRegister, CX86Assembler::MakeXmmRegisterAddress(tmp2Register));
509 
510 	//Store result
511 	m_assembler.MovdqaVo(MakeVariable128SymbolAddress(dst), resRegister);
512 }
513 
Emit_Md_MinW_VarVarVar(const STATEMENT & statement)514 void CCodeGen_x86::Emit_Md_MinW_VarVarVar(const STATEMENT& statement)
515 {
516 	auto dst = statement.dst->GetSymbol().get();
517 	auto src1 = statement.src1->GetSymbol().get();
518 	auto src2 = statement.src2->GetSymbol().get();
519 
520 	auto src1Register = CX86Assembler::xMM0;
521 	auto src2Register = CX86Assembler::xMM1;
522 	auto mask1Register = CX86Assembler::xMM2;
523 	auto mask2Register = CX86Assembler::xMM3;
524 
525 	m_assembler.MovdqaVo(src1Register, MakeVariable128SymbolAddress(src1));
526 	m_assembler.MovdqaVo(src2Register, MakeVariable128SymbolAddress(src2));
527 
528 	m_assembler.MovdqaVo(mask1Register, CX86Assembler::MakeXmmRegisterAddress(src2Register));
529 	m_assembler.PcmpgtdVo(mask1Register, CX86Assembler::MakeXmmRegisterAddress(src1Register));
530 	m_assembler.MovdqaVo(mask2Register, CX86Assembler::MakeXmmRegisterAddress(mask1Register));
531 
532 	m_assembler.PandVo(mask1Register, CX86Assembler::MakeXmmRegisterAddress(src1Register));
533 	m_assembler.PandnVo(mask2Register, CX86Assembler::MakeXmmRegisterAddress(src2Register));
534 	m_assembler.PorVo(mask1Register, CX86Assembler::MakeXmmRegisterAddress(mask2Register));
535 
536 	m_assembler.MovdqaVo(MakeVariable128SymbolAddress(dst), mask1Register);
537 }
538 
Emit_Md_MaxW_VarVarVar(const STATEMENT & statement)539 void CCodeGen_x86::Emit_Md_MaxW_VarVarVar(const STATEMENT& statement)
540 {
541 	auto dst = statement.dst->GetSymbol().get();
542 	auto src1 = statement.src1->GetSymbol().get();
543 	auto src2 = statement.src2->GetSymbol().get();
544 
545 	auto src1Register = CX86Assembler::xMM0;
546 	auto src2Register = CX86Assembler::xMM1;
547 	auto mask1Register = CX86Assembler::xMM2;
548 	auto mask2Register = CX86Assembler::xMM3;
549 
550 	m_assembler.MovdqaVo(src1Register, MakeVariable128SymbolAddress(src1));
551 	m_assembler.MovdqaVo(src2Register, MakeVariable128SymbolAddress(src2));
552 
553 	m_assembler.MovdqaVo(mask1Register, CX86Assembler::MakeXmmRegisterAddress(src1Register));
554 	m_assembler.PcmpgtdVo(mask1Register, CX86Assembler::MakeXmmRegisterAddress(src2Register));
555 	m_assembler.MovdqaVo(mask2Register, CX86Assembler::MakeXmmRegisterAddress(mask1Register));
556 
557 	m_assembler.PandVo(mask1Register, CX86Assembler::MakeXmmRegisterAddress(src1Register));
558 	m_assembler.PandnVo(mask2Register, CX86Assembler::MakeXmmRegisterAddress(src2Register));
559 	m_assembler.PorVo(mask1Register, CX86Assembler::MakeXmmRegisterAddress(mask2Register));
560 
561 	m_assembler.MovdqaVo(MakeVariable128SymbolAddress(dst), mask1Register);
562 }
563 
Emit_Md_PackHB_VarVarVar(const STATEMENT & statement)564 void CCodeGen_x86::Emit_Md_PackHB_VarVarVar(const STATEMENT& statement)
565 {
566 	auto dst = statement.dst->GetSymbol().get();
567 	auto src1 = statement.src1->GetSymbol().get();
568 	auto src2 = statement.src2->GetSymbol().get();
569 
570 	auto resultRegister = CX86Assembler::xMM0;
571 	auto tempRegister = CX86Assembler::xMM1;
572 	auto maskRegister = CX86Assembler::xMM2;
573 
574 	m_assembler.MovapsVo(resultRegister, MakeVariable128SymbolAddress(src2));
575 	m_assembler.MovapsVo(tempRegister, MakeVariable128SymbolAddress(src1));
576 
577 	//Generate mask (0x00FF x8)
578 	m_assembler.PcmpeqdVo(maskRegister, CX86Assembler::MakeXmmRegisterAddress(maskRegister));
579 	m_assembler.PsrlwVo(maskRegister, 0x08);
580 
581 	//Mask both operands
582 	m_assembler.PandVo(resultRegister, CX86Assembler::MakeXmmRegisterAddress(maskRegister));
583 	m_assembler.PandVo(tempRegister, CX86Assembler::MakeXmmRegisterAddress(maskRegister));
584 
585 	//Pack
586 	m_assembler.PackuswbVo(resultRegister, CX86Assembler::MakeXmmRegisterAddress(tempRegister));
587 
588 	m_assembler.MovapsVo(MakeVariable128SymbolAddress(dst), resultRegister);
589 }
590 
Emit_Md_PackWH_VarVarVar(const STATEMENT & statement)591 void CCodeGen_x86::Emit_Md_PackWH_VarVarVar(const STATEMENT& statement)
592 {
593 	auto dst = statement.dst->GetSymbol().get();
594 	auto src1 = statement.src1->GetSymbol().get();
595 	auto src2 = statement.src2->GetSymbol().get();
596 
597 	auto resultRegister = CX86Assembler::xMM0;
598 	auto tempRegister = CX86Assembler::xMM1;
599 
600 	m_assembler.MovapsVo(resultRegister, MakeVariable128SymbolAddress(src2));
601 	m_assembler.MovapsVo(tempRegister, MakeVariable128SymbolAddress(src1));
602 
603 	//Sign extend the lower half word of our registers
604 	m_assembler.PslldVo(resultRegister, 0x10);
605 	m_assembler.PsradVo(resultRegister, 0x10);
606 
607 	m_assembler.PslldVo(tempRegister, 0x10);
608 	m_assembler.PsradVo(tempRegister, 0x10);
609 
610 	//Pack
611 	m_assembler.PackssdwVo(resultRegister, CX86Assembler::MakeXmmRegisterAddress(tempRegister));
612 
613 	m_assembler.MovapsVo(MakeVariable128SymbolAddress(dst), resultRegister);
614 }
615 
Emit_Md_MovMasked_VarVarVar(const STATEMENT & statement)616 void CCodeGen_x86::Emit_Md_MovMasked_VarVarVar(const STATEMENT& statement)
617 {
618 	auto dst = statement.dst->GetSymbol().get();
619 	auto src1 = statement.src1->GetSymbol().get();
620 	auto src2 = statement.src2->GetSymbol().get();
621 	uint8 mask = static_cast<uint8>(statement.jmpCondition);
622 
623 	auto mask0Register = CX86Assembler::xMM0;
624 	auto mask1Register = CX86Assembler::xMM1;
625 
626 	m_assembler.MovId(CX86Assembler::rAX, ~0);
627 	m_assembler.MovdVo(mask0Register, CX86Assembler::MakeRegisterAddress(CX86Assembler::rAX));
628 
629 	//Generate shuffle selector
630 	//0x00 -> gives us 0x00000000
631 	//0x02 -> gives us 0xFFFFFFFF
632 	uint8 shuffleSelector = 0;
633 	for(unsigned int i = 0; i < 4; i++)
634 	{
635 		if(mask & (1 << i))
636 		{
637 			shuffleSelector |= (0x02) << (i * 2);
638 		}
639 	}
640 
641 	//mask0 -> proper mask
642 	m_assembler.PshufdVo(mask0Register, CX86Assembler::MakeXmmRegisterAddress(mask0Register), shuffleSelector);
643 
644 	//mask1 -> mask inverse
645 	m_assembler.PcmpeqdVo(mask1Register, CX86Assembler::MakeXmmRegisterAddress(mask1Register));
646 	m_assembler.PxorVo(mask1Register, CX86Assembler::MakeXmmRegisterAddress(mask0Register));
647 
648 	//Generate result
649 	m_assembler.PandVo(mask0Register, MakeVariable128SymbolAddress(src1));
650 	m_assembler.PandVo(mask1Register, MakeVariable128SymbolAddress(src2));
651 	m_assembler.PorVo(mask0Register, CX86Assembler::MakeXmmRegisterAddress(mask1Register));
652 
653 	m_assembler.MovdqaVo(MakeVariable128SymbolAddress(dst), mask0Register);
654 }
655 
Emit_Md_MovMasked_Sse41_VarVarVar(const STATEMENT & statement)656 void CCodeGen_x86::Emit_Md_MovMasked_Sse41_VarVarVar(const STATEMENT& statement)
657 {
658 	auto dst = statement.dst->GetSymbol().get();
659 	auto src1 = statement.src1->GetSymbol().get();
660 	auto src2 = statement.src2->GetSymbol().get();
661 	uint8 mask = static_cast<uint8>(statement.jmpCondition);
662 
663 	//This could be improved if src1 and src2 are different
664 	assert(dst->Equals(src1));
665 
666 	if(dst->IsRegister() && dst->Equals(src1))
667 	{
668 		m_assembler.BlendpsVo(m_mdRegisters[dst->m_valueLow], MakeVariable128SymbolAddress(src2), mask);
669 	}
670 	else
671 	{
672 		auto tempRegister = CX86Assembler::xMM0;
673 		m_assembler.MovapsVo(tempRegister, MakeVariable128SymbolAddress(src1));
674 		m_assembler.BlendpsVo(tempRegister, MakeVariable128SymbolAddress(src2), mask);
675 		m_assembler.MovapsVo(MakeVariable128SymbolAddress(dst), tempRegister);
676 	}
677 }
678 
Emit_Md_Mov_RegVar(const STATEMENT & statement)679 void CCodeGen_x86::Emit_Md_Mov_RegVar(const STATEMENT& statement)
680 {
681 	auto dst = statement.dst->GetSymbol().get();
682 	auto src1 = statement.src1->GetSymbol().get();
683 
684 	m_assembler.MovapsVo(m_mdRegisters[dst->m_valueLow], MakeVariable128SymbolAddress(src1));
685 }
686 
Emit_Md_Mov_MemReg(const STATEMENT & statement)687 void CCodeGen_x86::Emit_Md_Mov_MemReg(const STATEMENT& statement)
688 {
689 	auto dst = statement.dst->GetSymbol().get();
690 	auto src1 = statement.src1->GetSymbol().get();
691 
692 	m_assembler.MovapsVo(MakeMemory128SymbolAddress(dst), m_mdRegisters[src1->m_valueLow]);
693 }
694 
Emit_Md_Mov_MemMem(const STATEMENT & statement)695 void CCodeGen_x86::Emit_Md_Mov_MemMem(const STATEMENT& statement)
696 {
697 	CSymbol* dst = statement.dst->GetSymbol().get();
698 	CSymbol* src1 = statement.src1->GetSymbol().get();
699 
700 	CX86Assembler::XMMREGISTER resultRegister = CX86Assembler::xMM0;
701 
702 	m_assembler.MovapsVo(resultRegister, MakeMemory128SymbolAddress(src1));
703 	m_assembler.MovapsVo(MakeMemory128SymbolAddress(dst), resultRegister);
704 }
705 
Emit_Md_Abs(CX86Assembler::XMMREGISTER dstRegister)706 void CCodeGen_x86::Emit_Md_Abs(CX86Assembler::XMMREGISTER dstRegister)
707 {
708 	auto maskRegister = CX86Assembler::xMM1;
709 
710 	assert(dstRegister != maskRegister);
711 
712 	m_assembler.PcmpeqdVo(maskRegister, CX86Assembler::MakeXmmRegisterAddress(maskRegister));
713 	m_assembler.PsrldVo(maskRegister, 1);
714 	m_assembler.PandVo(dstRegister, CX86Assembler::MakeXmmRegisterAddress(maskRegister));
715 }
716 
Emit_Md_Not(CX86Assembler::XMMREGISTER dstRegister)717 void CCodeGen_x86::Emit_Md_Not(CX86Assembler::XMMREGISTER dstRegister)
718 {
719 	auto cstRegister = CX86Assembler::xMM1;
720 
721 	assert(dstRegister != cstRegister);
722 
723 	m_assembler.PcmpeqdVo(cstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
724 	m_assembler.PxorVo(dstRegister, CX86Assembler::MakeXmmRegisterAddress(cstRegister));
725 }
726 
Emit_Md_MakeSz(CX86Assembler::XMMREGISTER dstRegister,const CX86Assembler::CAddress & srcAddress)727 void CCodeGen_x86::Emit_Md_MakeSz(CX86Assembler::XMMREGISTER dstRegister, const CX86Assembler::CAddress& srcAddress)
728 {
729 	auto zeroRegister = CX86Assembler::xMM1;
730 	assert(dstRegister != zeroRegister);
731 
732 	m_assembler.MovdqaVo(dstRegister, srcAddress);
733 	m_assembler.PsradVo(dstRegister, 31);
734 
735 	m_assembler.PxorVo(zeroRegister, CX86Assembler::MakeXmmRegisterAddress(zeroRegister));
736 	m_assembler.CmppsVo(zeroRegister, srcAddress, CX86Assembler::SSE_CMP_EQ);
737 
738 	m_assembler.PackssdwVo(dstRegister, CX86Assembler::MakeXmmRegisterAddress(zeroRegister));
739 }
740 
Emit_Md_MakeSz_VarVar(const STATEMENT & statement)741 void CCodeGen_x86::Emit_Md_MakeSz_VarVar(const STATEMENT& statement)
742 {
743 	auto dst = statement.dst->GetSymbol().get();
744 	auto src1 = statement.src1->GetSymbol().get();
745 
746 	auto szRegister = CX86Assembler::xMM0;
747 	auto tmpFlagRegister = CX86Assembler::rAX;
748 	auto dstRegister = PrepareSymbolRegisterDef(dst, CX86Assembler::rDX);
749 
750 	Emit_Md_MakeSz(szRegister, MakeVariable128SymbolAddress(src1));
751 
752 	//Extract bits
753 	m_assembler.PmovmskbVo(tmpFlagRegister, szRegister);
754 
755 	//Generate bit field
756 	m_assembler.XorEd(dstRegister, CX86Assembler::MakeRegisterAddress(dstRegister));
757 	for(unsigned int i = 0; i < 8; i++)
758 	{
759 		m_assembler.ShrEd(CX86Assembler::MakeRegisterAddress(tmpFlagRegister), 2);
760 		m_assembler.RclEd(CX86Assembler::MakeRegisterAddress(dstRegister), 1);
761 	}
762 
763 	CommitSymbolRegister(dst, dstRegister);
764 }
765 
Emit_Md_MakeSz_Ssse3_VarVar(const STATEMENT & statement)766 void CCodeGen_x86::Emit_Md_MakeSz_Ssse3_VarVar(const STATEMENT& statement)
767 {
768 	auto dst = statement.dst->GetSymbol().get();
769 	auto src1 = statement.src1->GetSymbol().get();
770 
771 	auto szRegister = CX86Assembler::xMM0;
772 	auto dstRegister = PrepareSymbolRegisterDef(dst, CX86Assembler::rDX);
773 
774 	Emit_Md_MakeSz(szRegister, MakeVariable128SymbolAddress(src1));
775 
776 	//Extract bits
777 	m_assembler.PshufbVo(szRegister, MakeConstant128Address(g_makeSzShufflePattern));
778 	m_assembler.PmovmskbVo(dstRegister, szRegister);
779 
780 	CommitSymbolRegister(dst, dstRegister);
781 }
782 
Emit_Md_Expand_RegReg(const STATEMENT & statement)783 void CCodeGen_x86::Emit_Md_Expand_RegReg(const STATEMENT& statement)
784 {
785 	auto dst = statement.dst->GetSymbol().get();
786 	auto src1 = statement.src1->GetSymbol().get();
787 
788 	auto resultRegister = m_mdRegisters[dst->m_valueLow];
789 
790 	m_assembler.MovdVo(resultRegister, CX86Assembler::MakeRegisterAddress(m_registers[src1->m_valueLow]));
791 	m_assembler.PshufdVo(resultRegister, CX86Assembler::MakeXmmRegisterAddress(resultRegister), 0x00);
792 }
793 
Emit_Md_Expand_RegMem(const STATEMENT & statement)794 void CCodeGen_x86::Emit_Md_Expand_RegMem(const STATEMENT& statement)
795 {
796 	auto dst = statement.dst->GetSymbol().get();
797 	auto src1 = statement.src1->GetSymbol().get();
798 
799 	auto resultRegister = m_mdRegisters[dst->m_valueLow];
800 
801 	m_assembler.MovssEd(resultRegister, MakeMemorySymbolAddress(src1));
802 	m_assembler.ShufpsVo(resultRegister, CX86Assembler::MakeXmmRegisterAddress(resultRegister), 0x00);
803 }
804 
Emit_Md_Expand_RegCst(const STATEMENT & statement)805 void CCodeGen_x86::Emit_Md_Expand_RegCst(const STATEMENT& statement)
806 {
807 	auto dst = statement.dst->GetSymbol().get();
808 	auto src1 = statement.src1->GetSymbol().get();
809 
810 	auto cstRegister = CX86Assembler::rAX;
811 	auto resultRegister = m_mdRegisters[dst->m_valueLow];
812 
813 	m_assembler.MovId(cstRegister, src1->m_valueLow);
814 	m_assembler.MovdVo(resultRegister, CX86Assembler::MakeRegisterAddress(cstRegister));
815 	m_assembler.PshufdVo(resultRegister, CX86Assembler::MakeXmmRegisterAddress(resultRegister), 0x00);
816 }
817 
Emit_Md_Expand_MemReg(const STATEMENT & statement)818 void CCodeGen_x86::Emit_Md_Expand_MemReg(const STATEMENT& statement)
819 {
820 	auto dst = statement.dst->GetSymbol().get();
821 	auto src1 = statement.src1->GetSymbol().get();
822 
823 	auto resultRegister = CX86Assembler::xMM0;
824 
825 	m_assembler.MovdVo(resultRegister, CX86Assembler::MakeRegisterAddress(m_registers[src1->m_valueLow]));
826 	m_assembler.ShufpsVo(resultRegister, CX86Assembler::MakeXmmRegisterAddress(resultRegister), 0x00);
827 	m_assembler.MovapsVo(MakeMemory128SymbolAddress(dst), resultRegister);
828 }
829 
Emit_Md_Expand_MemMem(const STATEMENT & statement)830 void CCodeGen_x86::Emit_Md_Expand_MemMem(const STATEMENT& statement)
831 {
832 	auto dst = statement.dst->GetSymbol().get();
833 	auto src1 = statement.src1->GetSymbol().get();
834 
835 	auto resultRegister = CX86Assembler::xMM0;
836 
837 	m_assembler.MovssEd(resultRegister, MakeMemorySymbolAddress(src1));
838 	m_assembler.ShufpsVo(resultRegister, CX86Assembler::MakeXmmRegisterAddress(resultRegister), 0x00);
839 	m_assembler.MovapsVo(MakeMemory128SymbolAddress(dst), resultRegister);
840 }
841 
Emit_Md_Expand_MemCst(const STATEMENT & statement)842 void CCodeGen_x86::Emit_Md_Expand_MemCst(const STATEMENT& statement)
843 {
844 	auto dst = statement.dst->GetSymbol().get();
845 	auto src1 = statement.src1->GetSymbol().get();
846 
847 	auto cstRegister = CX86Assembler::rAX;
848 	auto resultRegister = CX86Assembler::xMM0;
849 
850 	m_assembler.MovId(cstRegister, src1->m_valueLow);
851 	m_assembler.MovdVo(resultRegister, CX86Assembler::MakeRegisterAddress(cstRegister));
852 	m_assembler.PshufdVo(resultRegister, CX86Assembler::MakeXmmRegisterAddress(resultRegister), 0x00);
853 	m_assembler.MovdqaVo(MakeMemory128SymbolAddress(dst), resultRegister);
854 }
855 
Emit_Md_Srl256_VarMem(CSymbol * dst,CSymbol * src1,const CX86Assembler::CAddress & offsetAddress)856 void CCodeGen_x86::Emit_Md_Srl256_VarMem(CSymbol* dst, CSymbol* src1, const CX86Assembler::CAddress& offsetAddress)
857 {
858 	auto offsetRegister = CX86Assembler::rAX;
859 	auto resultRegister = CX86Assembler::xMM0;
860 
861 	assert(src1->m_type == SYM_TEMPORARY256);
862 
863 	m_assembler.MovEd(offsetRegister, offsetAddress);
864 	m_assembler.AndId(CX86Assembler::MakeRegisterAddress(offsetRegister), 0x7F);
865 	m_assembler.ShrEd(CX86Assembler::MakeRegisterAddress(offsetRegister), 3);
866 	m_assembler.AddId(CX86Assembler::MakeRegisterAddress(offsetRegister), src1->m_stackLocation + m_stackLevel);
867 
868 	m_assembler.MovdquVo(resultRegister, CX86Assembler::MakeBaseIndexScaleAddress(CX86Assembler::rSP, offsetRegister, 1));
869 	m_assembler.MovdqaVo(MakeVariable128SymbolAddress(dst), resultRegister);
870 }
871 
Emit_Md_Srl256_VarMemVar(const STATEMENT & statement)872 void CCodeGen_x86::Emit_Md_Srl256_VarMemVar(const STATEMENT& statement)
873 {
874 	auto dst = statement.dst->GetSymbol().get();
875 	auto src1 = statement.src1->GetSymbol().get();
876 	auto src2 = statement.src2->GetSymbol().get();
877 
878 	Emit_Md_Srl256_VarMem(dst, src1, MakeVariableSymbolAddress(src2));
879 }
880 
Emit_Md_Srl256_VarMemCst(const STATEMENT & statement)881 void CCodeGen_x86::Emit_Md_Srl256_VarMemCst(const STATEMENT& statement)
882 {
883 	auto dst = statement.dst->GetSymbol().get();
884 	auto src1 = statement.src1->GetSymbol().get();
885 	auto src2 = statement.src2->GetSymbol().get();
886 
887 	auto resultRegister = CX86Assembler::xMM0;
888 
889 	assert(src1->m_type == SYM_TEMPORARY256);
890 	assert(src2->m_type == SYM_CONSTANT);
891 
892 	uint32 offset = (src2->m_valueLow & 0x7F) / 8;
893 
894 	m_assembler.MovdquVo(resultRegister, MakeTemporary256SymbolElementAddress(src1, offset));
895 	m_assembler.MovdqaVo(MakeVariable128SymbolAddress(dst), resultRegister);
896 }
897 
Emit_MergeTo256_MemVarVar(const STATEMENT & statement)898 void CCodeGen_x86::Emit_MergeTo256_MemVarVar(const STATEMENT& statement)
899 {
900 	auto dst = statement.dst->GetSymbol().get();
901 	auto src1 = statement.src1->GetSymbol().get();
902 	auto src2 = statement.src2->GetSymbol().get();
903 
904 	assert(dst->m_type == SYM_TEMPORARY256);
905 
906 	auto src1Register = CX86Assembler::xMM0;
907 	auto src2Register = CX86Assembler::xMM1;
908 
909 	//TODO: Improve this to write out registers directly to temporary's memory space
910 	//instead of passing by temporary registers
911 
912 	m_assembler.MovdqaVo(src1Register, MakeVariable128SymbolAddress(src1));
913 	m_assembler.MovdqaVo(src2Register, MakeVariable128SymbolAddress(src2));
914 
915 	m_assembler.MovdqaVo(MakeTemporary256SymbolElementAddress(dst, 0x00), src1Register);
916 	m_assembler.MovdqaVo(MakeTemporary256SymbolElementAddress(dst, 0x10), src2Register);
917 }
918 
//Helper macros expanding to groups of CONSTMATCHER initializers.
//MDOP_CST is the opcode placed in each entry and MDOP is the template
//argument handed to the emitter member function template.
//Every expansion ends with a trailing comma, so invocations are written
//without a separator inside the tables.

//Shift by constant: register and memory destination variants.
//SAMASK is forwarded as second template argument of the emitter
//(presumably the mask applied to the shift amount - confirm in the emitter).
919 #define MD_CONST_MATCHERS_SHIFT(MDOP_CST, MDOP, SAMASK) \
920 	{ MDOP_CST, MATCH_REGISTER128, MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Shift_RegVarCst<MDOP, SAMASK> }, \
921 	{ MDOP_CST, MATCH_MEMORY128,   MATCH_VARIABLE128, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Shift_MemVarCst<MDOP, SAMASK> },
922 
//One source operand: register and memory destination variants.
923 #define MD_CONST_MATCHERS_2OPS(MDOP_CST, MDOP) \
924 	{ MDOP_CST, MATCH_REGISTER128, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_RegVar<MDOP> }, \
925 	{ MDOP_CST, MATCH_MEMORY128,   MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_MemVar<MDOP> },
926 
//Two source operands: the more specific register/memory combinations are
//listed before the generic variable ones.
927 #define MD_CONST_MATCHERS_3OPS(MDOP_CST, MDOP) \
928 	{ MDOP_CST, MATCH_REGISTER128, MATCH_REGISTER128, MATCH_REGISTER128, MATCH_NIL, &CCodeGen_x86::Emit_Md_RegRegReg<MDOP> }, \
929 	{ MDOP_CST, MATCH_REGISTER128, MATCH_MEMORY128,   MATCH_REGISTER128, MATCH_NIL, &CCodeGen_x86::Emit_Md_RegMemReg<MDOP> }, \
930 	{ MDOP_CST, MATCH_REGISTER128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_RegVarVar<MDOP> }, \
931 	{ MDOP_CST, MATCH_MEMORY128,   MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_MemVarVar<MDOP> },
932 
//Two source operands handled by the single Emit_Md_VarVarVarRev emitter
//(presumably with the source operands taken in reverse order - confirm).
933 #define MD_CONST_MATCHERS_3OPS_REV(MDOP_CST, MDOP) \
934 	{ MDOP_CST, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_VarVarVarRev<MDOP> },
935 
//One source operand, dispatched to the Emit_Md_SingleOp_* emitters.
936 #define MD_CONST_MATCHERS_SINGLEOP(MDOP_CST, MDOP) \
937 	{ MDOP_CST, MATCH_REGISTER128, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_SingleOp_RegVar<MDOP> }, \
938 	{ MDOP_CST, MATCH_MEMORY128,   MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_SingleOp_MemVar<MDOP> },
939 
//Main table of matchers for the SIMD ("Md") operations.
//Each entry holds an opcode, a match type for the destination, match types
//for the source operands and the member function emitting the code.
//The fifth field is MATCH_NIL in every entry of this table.
//The table ends with an entry whose emitter is nullptr (presumably the
//end-of-table sentinel - confirm where the table is walked).
//NOTE(review): entries are kept in their original order since it may matter
//to the matching logic.
940 CCodeGen_x86::CONSTMATCHER CCodeGen_x86::g_mdConstMatchers[] =
941 {
	//Integer additions (wrapping, signed saturated, unsigned saturated)
942 	MD_CONST_MATCHERS_3OPS(OP_MD_ADD_B, MDOP_ADDB)
943 	MD_CONST_MATCHERS_3OPS(OP_MD_ADD_H, MDOP_ADDH)
944 	MD_CONST_MATCHERS_3OPS(OP_MD_ADD_W, MDOP_ADDW)
945 
946 	MD_CONST_MATCHERS_3OPS(OP_MD_ADDSS_B, MDOP_ADDSSB)
947 	MD_CONST_MATCHERS_3OPS(OP_MD_ADDSS_H, MDOP_ADDSSH)
948 	{ OP_MD_ADDSS_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_AddSSW_VarVarVar },
949 
950 	MD_CONST_MATCHERS_3OPS(OP_MD_ADDUS_B, MDOP_ADDUSB)
951 	MD_CONST_MATCHERS_3OPS(OP_MD_ADDUS_H, MDOP_ADDUSH)
952 	{ OP_MD_ADDUS_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_AddUSW_VarVarVar },
953 
	//Integer subtractions
954 	MD_CONST_MATCHERS_3OPS(OP_MD_SUB_B, MDOP_SUBB)
955 	MD_CONST_MATCHERS_3OPS(OP_MD_SUB_H, MDOP_SUBH)
956 	MD_CONST_MATCHERS_3OPS(OP_MD_SUB_W, MDOP_SUBW)
957 
958 	MD_CONST_MATCHERS_3OPS(OP_MD_SUBSS_H, MDOP_SUBSSH)
959 	{ OP_MD_SUBSS_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_SubSSW_VarVarVar },
960 
961 	MD_CONST_MATCHERS_3OPS(OP_MD_SUBUS_B, MDOP_SUBUSB)
962 	MD_CONST_MATCHERS_3OPS(OP_MD_SUBUS_H, MDOP_SUBUSH)
963 	{ OP_MD_SUBUS_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_SubUSW_VarVarVar },
964 
	//Integer comparisons
965 	MD_CONST_MATCHERS_3OPS(OP_MD_CMPEQ_B, MDOP_CMPEQB)
966 	MD_CONST_MATCHERS_3OPS(OP_MD_CMPEQ_H, MDOP_CMPEQH)
967 	MD_CONST_MATCHERS_3OPS(OP_MD_CMPEQ_W, MDOP_CMPEQW)
968 	MD_CONST_MATCHERS_3OPS(OP_MD_CMPGT_B, MDOP_CMPGTB)
969 	MD_CONST_MATCHERS_3OPS(OP_MD_CMPGT_H, MDOP_CMPGTH)
970 	MD_CONST_MATCHERS_3OPS(OP_MD_CMPGT_W, MDOP_CMPGTW)
971 
972 	MD_CONST_MATCHERS_3OPS(OP_MD_MIN_H, MDOP_MINH)
973 
974 	MD_CONST_MATCHERS_3OPS(OP_MD_MAX_H, MDOP_MAXH)
975 
	//Bitwise logic
976 	MD_CONST_MATCHERS_3OPS(OP_MD_AND, MDOP_AND)
977 	MD_CONST_MATCHERS_3OPS(OP_MD_OR,  MDOP_OR)
978 	MD_CONST_MATCHERS_3OPS(OP_MD_XOR, MDOP_XOR)
979 
	//Shifts by a constant amount (third macro argument is 0x0F for the H
	//variants and 0x1F for the W variants)
980 	MD_CONST_MATCHERS_SHIFT(OP_MD_SRLH, MDOP_SRLH, 0x0F)
981 	MD_CONST_MATCHERS_SHIFT(OP_MD_SRAH, MDOP_SRAH, 0x0F)
982 	MD_CONST_MATCHERS_SHIFT(OP_MD_SLLH, MDOP_SLLH, 0x0F)
983 
984 	MD_CONST_MATCHERS_SHIFT(OP_MD_SRLW, MDOP_SRLW, 0x1F)
985 	MD_CONST_MATCHERS_SHIFT(OP_MD_SRAW, MDOP_SRAW, 0x1F)
986 	MD_CONST_MATCHERS_SHIFT(OP_MD_SLLW, MDOP_SLLW, 0x1F)
987 
	//256-bit shift right, with variable or constant shift amount
988 	{ OP_MD_SRL256, MATCH_VARIABLE128, MATCH_MEMORY256, MATCH_VARIABLE, MATCH_NIL, &CCodeGen_x86::Emit_Md_Srl256_VarMemVar },
989 	{ OP_MD_SRL256, MATCH_VARIABLE128, MATCH_MEMORY256, MATCH_CONSTANT, MATCH_NIL, &CCodeGen_x86::Emit_Md_Srl256_VarMemCst },
990 
	//Expansion of a 32-bit value to the four lanes of a 128-bit value
991 	{ OP_MD_EXPAND, MATCH_REGISTER128, MATCH_REGISTER, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Expand_RegReg },
992 	{ OP_MD_EXPAND, MATCH_REGISTER128, MATCH_MEMORY,   MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Expand_RegMem },
993 	{ OP_MD_EXPAND, MATCH_REGISTER128, MATCH_CONSTANT, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Expand_RegCst },
994 	{ OP_MD_EXPAND, MATCH_MEMORY128,   MATCH_REGISTER, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Expand_MemReg },
995 	{ OP_MD_EXPAND, MATCH_MEMORY128,   MATCH_MEMORY,   MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Expand_MemMem },
996 	{ OP_MD_EXPAND, MATCH_MEMORY128,   MATCH_CONSTANT, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Expand_MemCst },
997 
	//Pack and unpack
998 	{ OP_MD_PACK_HB, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_PackHB_VarVarVar },
999 	{ OP_MD_PACK_WH, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_PackWH_VarVarVar },
1000 
1001 	MD_CONST_MATCHERS_3OPS_REV(OP_MD_UNPACK_LOWER_BH, MDOP_UNPACK_LOWER_BH)
1002 	MD_CONST_MATCHERS_3OPS_REV(OP_MD_UNPACK_LOWER_HW, MDOP_UNPACK_LOWER_HW)
1003 	MD_CONST_MATCHERS_3OPS_REV(OP_MD_UNPACK_LOWER_WD, MDOP_UNPACK_LOWER_WD)
1004 
1005 	MD_CONST_MATCHERS_3OPS_REV(OP_MD_UNPACK_UPPER_BH, MDOP_UNPACK_UPPER_BH)
1006 	MD_CONST_MATCHERS_3OPS_REV(OP_MD_UNPACK_UPPER_HW, MDOP_UNPACK_UPPER_HW)
1007 	MD_CONST_MATCHERS_3OPS_REV(OP_MD_UNPACK_UPPER_WD, MDOP_UNPACK_UPPER_WD)
1008 
	//Operations with the _S suffix (presumably single precision floating point - confirm)
1009 	MD_CONST_MATCHERS_3OPS(OP_MD_ADD_S, MDOP_ADDS)
1010 	MD_CONST_MATCHERS_3OPS(OP_MD_SUB_S, MDOP_SUBS)
1011 	MD_CONST_MATCHERS_3OPS(OP_MD_MUL_S, MDOP_MULS)
1012 	MD_CONST_MATCHERS_3OPS(OP_MD_DIV_S, MDOP_DIVS)
1013 	MD_CONST_MATCHERS_3OPS(OP_MD_CMPLT_S, MDOP_CMPLTS)
1014 	MD_CONST_MATCHERS_3OPS(OP_MD_CMPGT_S, MDOP_CMPGTS)
1015 
1016 	MD_CONST_MATCHERS_3OPS(OP_MD_MIN_S, MDOP_MINS)
1017 	MD_CONST_MATCHERS_3OPS(OP_MD_MAX_S, MDOP_MAXS)
1018 
1019 	MD_CONST_MATCHERS_SINGLEOP(OP_MD_ABS_S, MDOP_ABS)
1020 	MD_CONST_MATCHERS_SINGLEOP(OP_MD_NOT,   MDOP_NOT)
1021 
	//Conversions
1022 	MD_CONST_MATCHERS_2OPS(OP_MD_TOWORD_TRUNCATE, MDOP_TOWORD_TRUNCATE)
1023 	MD_CONST_MATCHERS_2OPS(OP_MD_TOSINGLE,        MDOP_TOSINGLE)
1024 
	//128-bit moves
1025 	{ OP_MOV, MATCH_REGISTER128, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Mov_RegVar },
1026 	{ OP_MOV, MATCH_MEMORY128,   MATCH_REGISTER128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Mov_MemReg },
1027 	{ OP_MOV, MATCH_MEMORY128,   MATCH_MEMORY128,   MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_Mov_MemMem },
1028 
	//Merge of two 128-bit values into a 256-bit temporary
1029 	{ OP_MERGETO256, MATCH_MEMORY256, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_MergeTo256_MemVarVar },
1030 
	//Last entry: no emitter
1031 	{ OP_MOV, MATCH_NIL, MATCH_NIL, MATCH_NIL, MATCH_NIL, nullptr },
1032 };
1033 
//Matchers for OP_MD_MIN_W and OP_MD_MAX_W using the dedicated
//Emit_Md_MinW_VarVarVar / Emit_Md_MaxW_VarVarVar emitters.
//NOTE(review): presumably the fallback used when SSE4.1 is not available,
//given the separate g_mdMinMaxWSse41ConstMatchers table - confirm where the
//tables are registered.
//The last entry has a nullptr emitter.
1034 CCodeGen_x86::CONSTMATCHER CCodeGen_x86::g_mdMinMaxWConstMatchers[] =
1035 {
1036 	{ OP_MD_MIN_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_MinW_VarVarVar },
1037 	{ OP_MD_MAX_W, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_MaxW_VarVarVar },
1038 
1039 	{ OP_MOV, MATCH_NIL, MATCH_NIL, MATCH_NIL, MATCH_NIL, nullptr },
1040 };
1041 
//Matchers for OP_MD_MIN_W and OP_MD_MAX_W built with the generic
//MD_CONST_MATCHERS_3OPS macro and the MDOP_MINW / MDOP_MAXW operations.
//NOTE(review): presumably selected when SSE4.1 is available, as the table
//name suggests - confirm where the tables are registered.
//The last entry has a nullptr emitter.
1042 CCodeGen_x86::CONSTMATCHER CCodeGen_x86::g_mdMinMaxWSse41ConstMatchers[] =
1043 {
1044 	MD_CONST_MATCHERS_3OPS(OP_MD_MIN_W, MDOP_MINW)
1045 	MD_CONST_MATCHERS_3OPS(OP_MD_MAX_W, MDOP_MAXW)
1046 
1047 	{ OP_MOV, MATCH_NIL, MATCH_NIL, MATCH_NIL, MATCH_NIL, nullptr },
1048 };
1049 
//Matcher for OP_MD_MOV_MASKED using Emit_Md_MovMasked_VarVarVar.
//NOTE(review): presumably the variant used when SSE4.1 is not available,
//given the separate g_mdMovMaskedSse41ConstMatchers table - confirm.
//The last entry has a nullptr emitter.
1050 CCodeGen_x86::CONSTMATCHER CCodeGen_x86::g_mdMovMaskedConstMatchers[] =
1051 {
1052 	{ OP_MD_MOV_MASKED, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_MovMasked_VarVarVar },
1053 
1054 	{ OP_MOV, MATCH_NIL, MATCH_NIL, MATCH_NIL, MATCH_NIL, nullptr },
1055 };
1056 
//Matcher for OP_MD_MOV_MASKED using Emit_Md_MovMasked_Sse41_VarVarVar.
//NOTE(review): presumably selected when SSE4.1 is available, as the table
//and emitter names suggest - confirm where the tables are registered.
//The last entry has a nullptr emitter.
1057 CCodeGen_x86::CONSTMATCHER CCodeGen_x86::g_mdMovMaskedSse41ConstMatchers[] =
1058 {
1059 	{ OP_MD_MOV_MASKED, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_VARIABLE128, MATCH_NIL, &CCodeGen_x86::Emit_Md_MovMasked_Sse41_VarVarVar },
1060 
1061 	{ OP_MOV, MATCH_NIL, MATCH_NIL, MATCH_NIL, MATCH_NIL, nullptr },
1062 };
1063 
//Matcher for OP_MD_MAKESZ using Emit_Md_MakeSz_VarVar, which builds the
//result with a shift/rotate loop and does not use PSHUFB.
//The destination is a regular variable and the source a 128-bit variable.
//The last entry has a nullptr emitter.
1064 CCodeGen_x86::CONSTMATCHER CCodeGen_x86::g_mdFpFlagConstMatchers[] =
1065 {
1066 	{ OP_MD_MAKESZ,     MATCH_VARIABLE, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_MakeSz_VarVar },
1067 
1068 	{ OP_MOV, MATCH_NIL, MATCH_NIL, MATCH_NIL, MATCH_NIL, nullptr },
1069 };
1070 
//Matcher for OP_MD_MAKESZ using Emit_Md_MakeSz_Ssse3_VarVar, which relies
//on PSHUFB to build the result.
//NOTE(review): presumably selected when SSSE3 is available - confirm where
//the tables are registered.
//The last entry has a nullptr emitter.
1071 CCodeGen_x86::CONSTMATCHER CCodeGen_x86::g_mdFpFlagSsse3ConstMatchers[] =
1072 {
1073 	{ OP_MD_MAKESZ,     MATCH_VARIABLE, MATCH_VARIABLE128, MATCH_NIL, MATCH_NIL, &CCodeGen_x86::Emit_Md_MakeSz_Ssse3_VarVar },
1074 
1075 	{ OP_MOV, MATCH_NIL, MATCH_NIL, MATCH_NIL, MATCH_NIL, nullptr },
1076 };
1077