#include "Jitter_CodeGen_AArch32.h"
#include <cassert>
#include <stdexcept>
2 
3 using namespace Jitter;
4 
LoadMemory128AddressInRegister(CAArch32Assembler::REGISTER dstReg,CSymbol * symbol,uint32 offset)5 void CCodeGen_AArch32::LoadMemory128AddressInRegister(CAArch32Assembler::REGISTER dstReg, CSymbol* symbol, uint32 offset)
6 {
7 	switch(symbol->m_type)
8 	{
9 	case SYM_RELATIVE128:
10 		LoadRelative128AddressInRegister(dstReg, symbol, offset);
11 		break;
12 	case SYM_TEMPORARY128:
13 		LoadTemporary128AddressInRegister(dstReg, symbol, offset);
14 		break;
15 	default:
16 		assert(0);
17 		break;
18 	}
19 }
20 
LoadRelative128AddressInRegister(CAArch32Assembler::REGISTER dstReg,CSymbol * symbol,uint32 offset)21 void CCodeGen_AArch32::LoadRelative128AddressInRegister(CAArch32Assembler::REGISTER dstReg, CSymbol* symbol, uint32 offset)
22 {
23 	assert(symbol->m_type == SYM_RELATIVE128);
24 
25 	uint8 immediate = 0;
26 	uint8 shiftAmount = 0;
27 	if(!TryGetAluImmediateParams(symbol->m_valueLow + offset, immediate, shiftAmount))
28 	{
29 		throw std::runtime_error("Failed to build immediate for symbol.");
30 	}
31 	m_assembler.Add(dstReg, g_baseRegister, CAArch32Assembler::MakeImmediateAluOperand(immediate, shiftAmount));
32 }
33 
LoadTemporary128AddressInRegister(CAArch32Assembler::REGISTER dstReg,CSymbol * symbol,uint32 offset)34 void CCodeGen_AArch32::LoadTemporary128AddressInRegister(CAArch32Assembler::REGISTER dstReg, CSymbol* symbol, uint32 offset)
35 {
36 	assert(symbol->m_type == SYM_TEMPORARY128);
37 
38 	uint8 immediate = 0;
39 	uint8 shiftAmount = 0;
40 	if(!TryGetAluImmediateParams(symbol->m_stackLocation + m_stackLevel + offset, immediate, shiftAmount))
41 	{
42 		throw std::runtime_error("Failed to build immediate for symbol.");
43 	}
44 	m_assembler.Add(dstReg, CAArch32Assembler::rSP, CAArch32Assembler::MakeImmediateAluOperand(immediate, shiftAmount));
45 }
46 
LoadTemporary256ElementAddressInRegister(CAArch32Assembler::REGISTER dstReg,CSymbol * symbol,uint32 offset)47 void CCodeGen_AArch32::LoadTemporary256ElementAddressInRegister(CAArch32Assembler::REGISTER dstReg, CSymbol* symbol, uint32 offset)
48 {
49 	assert(symbol->m_type == SYM_TEMPORARY256);
50 
51 	uint8 immediate = 0;
52 	uint8 shiftAmount = 0;
53 	if(!TryGetAluImmediateParams(symbol->m_stackLocation + m_stackLevel + offset, immediate, shiftAmount))
54 	{
55 		throw std::runtime_error("Failed to build immediate for symbol.");
56 	}
57 	m_assembler.Add(dstReg, CAArch32Assembler::rSP, CAArch32Assembler::MakeImmediateAluOperand(immediate, shiftAmount));
58 }
59 
60 template <typename MDOP>
Emit_Md_MemMem(const STATEMENT & statement)61 void CCodeGen_AArch32::Emit_Md_MemMem(const STATEMENT& statement)
62 {
63 	auto dst = statement.dst->GetSymbol().get();
64 	auto src1 = statement.src1->GetSymbol().get();
65 
66 	auto dstAddrReg = CAArch32Assembler::r0;
67 	auto src1AddrReg = CAArch32Assembler::r1;
68 	auto dstReg = CAArch32Assembler::q0;
69 	auto src1Reg = CAArch32Assembler::q1;
70 
71 	LoadMemory128AddressInRegister(dstAddrReg, dst);
72 	LoadMemory128AddressInRegister(src1AddrReg, src1);
73 
74 	m_assembler.Vld1_32x4(src1Reg, src1AddrReg);
75 	((m_assembler).*(MDOP::OpReg()))(dstReg, src1Reg);
76 	m_assembler.Vst1_32x4(dstReg, dstAddrReg);
77 }
78 
79 template <typename MDOP>
Emit_Md_MemMemMem(const STATEMENT & statement)80 void CCodeGen_AArch32::Emit_Md_MemMemMem(const STATEMENT& statement)
81 {
82 	auto dst = statement.dst->GetSymbol().get();
83 	auto src1 = statement.src1->GetSymbol().get();
84 	auto src2 = statement.src2->GetSymbol().get();
85 
86 	auto dstAddrReg = CAArch32Assembler::r0;
87 	auto src1AddrReg = CAArch32Assembler::r1;
88 	auto src2AddrReg = CAArch32Assembler::r2;
89 	auto dstReg = CAArch32Assembler::q0;
90 	auto src1Reg = CAArch32Assembler::q1;
91 	auto src2Reg = CAArch32Assembler::q2;
92 
93 	LoadMemory128AddressInRegister(dstAddrReg, dst);
94 	LoadMemory128AddressInRegister(src1AddrReg, src1);
95 	LoadMemory128AddressInRegister(src2AddrReg, src2);
96 
97 	m_assembler.Vld1_32x4(src1Reg, src1AddrReg);
98 	m_assembler.Vld1_32x4(src2Reg, src2AddrReg);
99 	((m_assembler).*(MDOP::OpReg()))(dstReg, src1Reg, src2Reg);
100 	m_assembler.Vst1_32x4(dstReg, dstAddrReg);
101 }
102 
103 template <typename MDSHIFTOP>
Emit_Md_Shift_MemMemCst(const STATEMENT & statement)104 void CCodeGen_AArch32::Emit_Md_Shift_MemMemCst(const STATEMENT& statement)
105 {
106 	auto dst = statement.dst->GetSymbol().get();
107 	auto src1 = statement.src1->GetSymbol().get();
108 	auto src2 = statement.src2->GetSymbol().get();
109 
110 	auto dstAddrReg = CAArch32Assembler::r0;
111 	auto src1AddrReg = CAArch32Assembler::r1;
112 	auto dstReg = CAArch32Assembler::q0;
113 	auto src1Reg = CAArch32Assembler::q1;
114 
115 	LoadMemory128AddressInRegister(dstAddrReg, dst);
116 	LoadMemory128AddressInRegister(src1AddrReg, src1);
117 
118 	m_assembler.Vld1_32x4(src1Reg, src1AddrReg);
119 	((m_assembler).*(MDSHIFTOP::OpReg()))(dstReg, src1Reg, src2->m_valueLow);
120 	m_assembler.Vst1_32x4(dstReg, dstAddrReg);
121 }
122 
123 template <uint32 condition>
Emit_Md_Test_VarMem(const STATEMENT & statement)124 void CCodeGen_AArch32::Emit_Md_Test_VarMem(const STATEMENT& statement)
125 {
126 	auto dst = statement.dst->GetSymbol().get();
127 	auto src1 = statement.src1->GetSymbol().get();
128 
129 	auto src1AddrReg = CAArch32Assembler::r0;
130 	auto src1Reg = CAArch32Assembler::q0;
131 
132 	LoadMemory128AddressInRegister(src1AddrReg, src1);
133 	m_assembler.Vld1_32x4(src1Reg, src1AddrReg);
134 
135 	auto dstReg = PrepareSymbolRegisterDef(dst, CAArch32Assembler::r0);
136 
137 	static CAArch32Assembler::REGISTER regs[4] =
138 	{
139 		CAArch32Assembler::r2,
140 		CAArch32Assembler::r3
141 	};
142 
143 	m_assembler.Eor(dstReg, dstReg, dstReg);
144 	for(unsigned int i = 0; i < 4; i++)
145 	{
146 		m_assembler.Vmov(regs[i & 1], static_cast<CAArch32Assembler::DOUBLE_REGISTER>(src1Reg + (i / 2)), i & 1);
147 		m_assembler.Tst(regs[i & 1], regs[i & 1]);
148 		uint8 immediate = 0, shiftAmount = 0;
149 		if(!TryGetAluImmediateParams(1 << (3 - i), immediate, shiftAmount))
150 		{
151 			assert(false);
152 		}
153 		m_assembler.Or(static_cast<CAArch32Assembler::CONDITION>(condition), dstReg, dstReg,
154 			CAArch32Assembler::MakeImmediateAluOperand(immediate, shiftAmount));
155 	}
156 
157 	CommitSymbolRegister(dst, dstReg);
158 }
159 
Emit_Md_Mov_MemMem(const STATEMENT & statement)160 void CCodeGen_AArch32::Emit_Md_Mov_MemMem(const STATEMENT& statement)
161 {
162 	auto dst = statement.dst->GetSymbol().get();
163 	auto src1 = statement.src1->GetSymbol().get();
164 
165 	auto dstAddrReg = CAArch32Assembler::r0;
166 	auto src1AddrReg = CAArch32Assembler::r1;
167 	auto tmpReg = CAArch32Assembler::q0;
168 	LoadMemory128AddressInRegister(dstAddrReg, dst);
169 	LoadMemory128AddressInRegister(src1AddrReg, src1);
170 	m_assembler.Vld1_32x4(tmpReg, src1AddrReg);
171 	m_assembler.Vst1_32x4(tmpReg, dstAddrReg);
172 }
173 
Emit_Md_Not_MemMem(const STATEMENT & statement)174 void CCodeGen_AArch32::Emit_Md_Not_MemMem(const STATEMENT& statement)
175 {
176 	auto dst = statement.dst->GetSymbol().get();
177 	auto src1 = statement.src1->GetSymbol().get();
178 
179 	auto dstAddrReg = CAArch32Assembler::r0;
180 	auto src1AddrReg = CAArch32Assembler::r1;
181 	auto zeroReg = CAArch32Assembler::q0;
182 	auto tmpReg = CAArch32Assembler::q1;
183 
184 	LoadMemory128AddressInRegister(dstAddrReg, dst);
185 	LoadMemory128AddressInRegister(src1AddrReg, src1);
186 
187 	m_assembler.Vld1_32x4(tmpReg, src1AddrReg);
188 	m_assembler.Veor(zeroReg, zeroReg, zeroReg);
189 	m_assembler.Vorn(tmpReg, zeroReg, tmpReg);
190 	m_assembler.Vst1_32x4(tmpReg, dstAddrReg);
191 }
192 
Emit_Md_DivS_MemMemMem(const STATEMENT & statement)193 void CCodeGen_AArch32::Emit_Md_DivS_MemMemMem(const STATEMENT& statement)
194 {
195 	auto dst = statement.dst->GetSymbol().get();
196 	auto src1 = statement.src1->GetSymbol().get();
197 	auto src2 = statement.src2->GetSymbol().get();
198 
199 	auto dstAddrReg = CAArch32Assembler::r0;
200 	auto src1AddrReg = CAArch32Assembler::r1;
201 	auto src2AddrReg = CAArch32Assembler::r2;
202 	auto dstReg = CAArch32Assembler::q0;
203 	auto src1Reg = CAArch32Assembler::q1;
204 	auto src2Reg = CAArch32Assembler::q2;
205 
206 	LoadMemory128AddressInRegister(dstAddrReg, dst);
207 	LoadMemory128AddressInRegister(src1AddrReg, src1);
208 	LoadMemory128AddressInRegister(src2AddrReg, src2);
209 
210 	m_assembler.Vld1_32x4(src1Reg, src1AddrReg);
211 	m_assembler.Vld1_32x4(src2Reg, src2AddrReg);
212 
213 	//No vector floating point divide on NEON, gotta do it 4x
214 	for(unsigned int i = 0; i < 4; i++)
215 	{
216 		auto subDstReg = static_cast<CAArch32Assembler::SINGLE_REGISTER>(dstReg * 2 + i);
217 		auto subSrc1Reg = static_cast<CAArch32Assembler::SINGLE_REGISTER>(src1Reg * 2 + i);
218 		auto subSrc2Reg = static_cast<CAArch32Assembler::SINGLE_REGISTER>(src2Reg * 2 + i);
219 		m_assembler.Vdiv_F32(subDstReg, subSrc1Reg, subSrc2Reg);
220 	}
221 
222 	m_assembler.Vst1_32x4(dstReg, dstAddrReg);
223 }
224 
Emit_Md_Srl256_MemMemCst(const STATEMENT & statement)225 void CCodeGen_AArch32::Emit_Md_Srl256_MemMemCst(const STATEMENT& statement)
226 {
227 	auto dst = statement.dst->GetSymbol().get();
228 	auto src1 = statement.src1->GetSymbol().get();
229 	auto src2 = statement.src2->GetSymbol().get();
230 
231 	assert(src1->m_type == SYM_TEMPORARY256);
232 	assert(src2->m_type == SYM_CONSTANT);
233 
234 	auto dstAddrReg = CAArch32Assembler::r0;
235 	auto src1AddrReg = CAArch32Assembler::r1;
236 	auto dstReg = CAArch32Assembler::q0;
237 
238 	uint32 offset = (src2->m_valueLow & 0x7F) / 8;
239 
240 	LoadMemory128AddressInRegister(dstAddrReg, dst);
241 	LoadTemporary256ElementAddressInRegister(src1AddrReg, src1, offset);
242 
243 	m_assembler.Vld1_32x4_u(dstReg, src1AddrReg);
244 	m_assembler.Vst1_32x4(dstReg, dstAddrReg);
245 }
246 
Emit_Md_Srl256_MemMemVar(const STATEMENT & statement)247 void CCodeGen_AArch32::Emit_Md_Srl256_MemMemVar(const STATEMENT& statement)
248 {
249 	auto dst = statement.dst->GetSymbol().get();
250 	auto src1 = statement.src1->GetSymbol().get();
251 	auto src2 = statement.src2->GetSymbol().get();
252 
253 	assert(src1->m_type == SYM_TEMPORARY256);
254 
255 	auto offsetRegister = CAArch32Assembler::r0;
256 	auto dstAddrReg = CAArch32Assembler::r1;
257 	auto src1AddrReg = CAArch32Assembler::r2;
258 	auto src2Register = PrepareSymbolRegisterUse(src2, CAArch32Assembler::r3);
259 
260 	auto dstReg = CAArch32Assembler::q0;
261 
262 	auto offsetShift = CAArch32Assembler::MakeConstantShift(CAArch32Assembler::SHIFT_LSR, 3);
263 
264 	LoadMemory128AddressInRegister(dstAddrReg, dst);
265 	LoadTemporary256ElementAddressInRegister(src1AddrReg, src1, 0);
266 
267 	//Compute offset and modify address
268 	m_assembler.And(offsetRegister, src2Register, CAArch32Assembler::MakeImmediateAluOperand(0x7F, 0));
269 	m_assembler.Mov(offsetRegister, CAArch32Assembler::MakeRegisterAluOperand(offsetRegister, offsetShift));
270 	m_assembler.Add(src1AddrReg, src1AddrReg, offsetRegister);
271 
272 	m_assembler.Vld1_32x4_u(dstReg, src1AddrReg);
273 	m_assembler.Vst1_32x4(dstReg, dstAddrReg);
274 }
275 
Emit_Md_LoadFromRef_MemMem(const STATEMENT & statement)276 void CCodeGen_AArch32::Emit_Md_LoadFromRef_MemMem(const STATEMENT& statement)
277 {
278 	auto dst = statement.dst->GetSymbol().get();
279 	auto src1 = statement.src1->GetSymbol().get();
280 
281 	auto src1AddrReg = CAArch32Assembler::r0;
282 	auto dstAddrReg = CAArch32Assembler::r1;
283 
284 	auto dstReg = CAArch32Assembler::q0;
285 
286 	LoadMemory128AddressInRegister(dstAddrReg, dst);
287 	LoadMemoryReferenceInRegister(src1AddrReg, src1);
288 
289 	m_assembler.Vld1_32x4(dstReg, src1AddrReg);
290 	m_assembler.Vst1_32x4(dstReg, dstAddrReg);
291 }
292 
Emit_Md_StoreAtRef_MemMem(const STATEMENT & statement)293 void CCodeGen_AArch32::Emit_Md_StoreAtRef_MemMem(const STATEMENT& statement)
294 {
295 	auto src1 = statement.src1->GetSymbol().get();
296 	auto src2 = statement.src2->GetSymbol().get();
297 
298 	auto src1AddrReg = CAArch32Assembler::r0;
299 	auto src2AddrReg = CAArch32Assembler::r1;
300 
301 	auto src2Reg = CAArch32Assembler::q0;
302 
303 	LoadMemoryReferenceInRegister(src1AddrReg, src1);
304 	LoadMemory128AddressInRegister(src2AddrReg, src2);
305 
306 	m_assembler.Vld1_32x4(src2Reg, src2AddrReg);
307 	m_assembler.Vst1_32x4(src2Reg, src1AddrReg);
308 }
309 
Emit_Md_MovMasked_MemMemMem(const STATEMENT & statement)310 void CCodeGen_AArch32::Emit_Md_MovMasked_MemMemMem(const STATEMENT& statement)
311 {
312 	auto dst = statement.dst->GetSymbol().get();
313 	auto src1 = statement.src1->GetSymbol().get();
314 	auto src2 = statement.src2->GetSymbol().get();
315 
316 	assert(dst->Equals(src1));
317 
318 	auto mask = static_cast<uint8>(statement.jmpCondition);
319 
320 	auto dstAddrReg = CAArch32Assembler::r0;
321 	auto src2AddrReg = CAArch32Assembler::r2;
322 	auto tmpReg = CAArch32Assembler::r3;
323 	auto dstReg = CAArch32Assembler::q0;
324 	auto src2Reg = CAArch32Assembler::q2;
325 	auto dstRegLo = static_cast<CAArch32Assembler::DOUBLE_REGISTER>(dstReg + 0);
326 	auto dstRegHi = static_cast<CAArch32Assembler::DOUBLE_REGISTER>(dstReg + 1);
327 	auto src2RegLo = static_cast<CAArch32Assembler::DOUBLE_REGISTER>(src2Reg + 0);
328 	auto src2RegHi = static_cast<CAArch32Assembler::DOUBLE_REGISTER>(src2Reg + 1);
329 
330 	LoadMemory128AddressInRegister(dstAddrReg, dst);
331 	LoadMemory128AddressInRegister(src2AddrReg, src2);
332 
333 	m_assembler.Vld1_32x4(dstReg, dstAddrReg);
334 	m_assembler.Vld1_32x4(src2Reg, src2AddrReg);
335 	for(unsigned int i = 0; i < 4; i++)
336 	{
337 		if(mask & (1 << i))
338 		{
339 			m_assembler.Vmov(tmpReg, (i & 2) ? src2RegHi : src2RegLo, (i & 1));
340 			m_assembler.Vmov((i & 2) ? dstRegHi : dstRegLo, tmpReg, (i & 1));
341 		}
342 	}
343 
344 	m_assembler.Vst1_32x4(dstReg, dstAddrReg);
345 }
346 
Emit_Md_Expand_MemReg(const STATEMENT & statement)347 void CCodeGen_AArch32::Emit_Md_Expand_MemReg(const STATEMENT& statement)
348 {
349 	auto dst = statement.dst->GetSymbol().get();
350 	auto src1 = statement.src1->GetSymbol().get();
351 
352 	auto dstAddrReg = CAArch32Assembler::r0;
353 	auto tmpReg = CAArch32Assembler::q0;
354 
355 	LoadMemory128AddressInRegister(dstAddrReg, dst);
356 
357 	m_assembler.Vdup(tmpReg, g_registers[src1->m_valueLow]);
358 	m_assembler.Vst1_32x4(tmpReg, dstAddrReg);
359 }
360 
Emit_Md_Expand_MemMem(const STATEMENT & statement)361 void CCodeGen_AArch32::Emit_Md_Expand_MemMem(const STATEMENT& statement)
362 {
363 	auto dst = statement.dst->GetSymbol().get();
364 	auto src1 = statement.src1->GetSymbol().get();
365 
366 	auto dstAddrReg = CAArch32Assembler::r0;
367 	auto src1Reg = CAArch32Assembler::r1;
368 	auto tmpReg = CAArch32Assembler::q0;
369 
370 	LoadMemoryInRegister(src1Reg, src1);
371 	LoadMemory128AddressInRegister(dstAddrReg, dst);
372 
373 	m_assembler.Vdup(tmpReg, src1Reg);
374 	m_assembler.Vst1_32x4(tmpReg, dstAddrReg);
375 }
376 
Emit_Md_Expand_MemCst(const STATEMENT & statement)377 void CCodeGen_AArch32::Emit_Md_Expand_MemCst(const STATEMENT& statement)
378 {
379 	auto dst = statement.dst->GetSymbol().get();
380 	auto src1 = statement.src1->GetSymbol().get();
381 
382 	auto dstAddrReg = CAArch32Assembler::r0;
383 	auto src1Reg = CAArch32Assembler::r1;
384 	auto tmpReg = CAArch32Assembler::q0;
385 
386 	LoadConstantInRegister(src1Reg, src1->m_valueLow);
387 	LoadMemory128AddressInRegister(dstAddrReg, dst);
388 
389 	m_assembler.Vdup(tmpReg, src1Reg);
390 	m_assembler.Vst1_32x4(tmpReg, dstAddrReg);
391 }
392 
Emit_Md_PackHB_MemMemMem(const STATEMENT & statement)393 void CCodeGen_AArch32::Emit_Md_PackHB_MemMemMem(const STATEMENT& statement)
394 {
395 	auto dst = statement.dst->GetSymbol().get();
396 	auto src1 = statement.src1->GetSymbol().get();
397 	auto src2 = statement.src2->GetSymbol().get();
398 
399 	auto dstAddrReg = CAArch32Assembler::r0;
400 	auto src1AddrReg = CAArch32Assembler::r1;
401 	auto src2AddrReg = CAArch32Assembler::r2;
402 	auto dstReg = CAArch32Assembler::q0;
403 	auto src1Reg = CAArch32Assembler::q1;
404 	auto src2Reg = CAArch32Assembler::q2;
405 
406 	LoadMemory128AddressInRegister(dstAddrReg, dst);
407 	LoadMemory128AddressInRegister(src1AddrReg, src1);
408 	LoadMemory128AddressInRegister(src2AddrReg, src2);
409 
410 	m_assembler.Vld1_32x4(src1Reg, src1AddrReg);
411 	m_assembler.Vld1_32x4(src2Reg, src2AddrReg);
412 	m_assembler.Vmovn_I16(static_cast<CAArch32Assembler::DOUBLE_REGISTER>(dstReg + 1), src1Reg);
413 	m_assembler.Vmovn_I16(static_cast<CAArch32Assembler::DOUBLE_REGISTER>(dstReg + 0), src2Reg);
414 	m_assembler.Vst1_32x4(dstReg, dstAddrReg);
415 }
416 
Emit_Md_PackWH_MemMemMem(const STATEMENT & statement)417 void CCodeGen_AArch32::Emit_Md_PackWH_MemMemMem(const STATEMENT& statement)
418 {
419 	auto dst = statement.dst->GetSymbol().get();
420 	auto src1 = statement.src1->GetSymbol().get();
421 	auto src2 = statement.src2->GetSymbol().get();
422 
423 	auto dstAddrReg = CAArch32Assembler::r0;
424 	auto src1AddrReg = CAArch32Assembler::r1;
425 	auto src2AddrReg = CAArch32Assembler::r2;
426 	auto dstReg = CAArch32Assembler::q0;
427 	auto src1Reg = CAArch32Assembler::q1;
428 	auto src2Reg = CAArch32Assembler::q2;
429 
430 	LoadMemory128AddressInRegister(dstAddrReg, dst);
431 	LoadMemory128AddressInRegister(src1AddrReg, src1);
432 	LoadMemory128AddressInRegister(src2AddrReg, src2);
433 
434 	m_assembler.Vld1_32x4(src1Reg, src1AddrReg);
435 	m_assembler.Vld1_32x4(src2Reg, src2AddrReg);
436 	m_assembler.Vmovn_I32(static_cast<CAArch32Assembler::DOUBLE_REGISTER>(dstReg + 1), src1Reg);
437 	m_assembler.Vmovn_I32(static_cast<CAArch32Assembler::DOUBLE_REGISTER>(dstReg + 0), src2Reg);
438 	m_assembler.Vst1_32x4(dstReg, dstAddrReg);
439 }
440 
441 template <uint32 offset>
Emit_Md_UnpackBH_MemMemMem(const STATEMENT & statement)442 void CCodeGen_AArch32::Emit_Md_UnpackBH_MemMemMem(const STATEMENT& statement)
443 {
444 	auto dst = statement.dst->GetSymbol().get();
445 	auto src1 = statement.src1->GetSymbol().get();
446 	auto src2 = statement.src2->GetSymbol().get();
447 
448 	auto dstAddrReg = CAArch32Assembler::r0;
449 	auto src1AddrReg = CAArch32Assembler::r1;
450 	auto src2AddrReg = CAArch32Assembler::r2;
451 	auto src1Reg = CAArch32Assembler::d0;
452 	auto src2Reg = CAArch32Assembler::d1;
453 
454 	LoadMemory128AddressInRegister(dstAddrReg, dst);
455 	LoadMemory128AddressInRegister(src1AddrReg, src1, offset);
456 	LoadMemory128AddressInRegister(src2AddrReg, src2, offset);
457 
458 	//Warning: VZIP modifies both registers
459 	m_assembler.Vld1_32x2(src1Reg, src2AddrReg);
460 	m_assembler.Vld1_32x2(src2Reg, src1AddrReg);
461 	m_assembler.Vzip_I8(src1Reg, src2Reg);
462 	m_assembler.Vst1_32x4(static_cast<CAArch32Assembler::QUAD_REGISTER>(src1Reg), dstAddrReg);
463 }
464 
465 template <uint32 offset>
Emit_Md_UnpackHW_MemMemMem(const STATEMENT & statement)466 void CCodeGen_AArch32::Emit_Md_UnpackHW_MemMemMem(const STATEMENT& statement)
467 {
468 	auto dst = statement.dst->GetSymbol().get();
469 	auto src1 = statement.src1->GetSymbol().get();
470 	auto src2 = statement.src2->GetSymbol().get();
471 
472 	auto dstAddrReg = CAArch32Assembler::r0;
473 	auto src1AddrReg = CAArch32Assembler::r1;
474 	auto src2AddrReg = CAArch32Assembler::r2;
475 	auto src1Reg = CAArch32Assembler::d0;
476 	auto src2Reg = CAArch32Assembler::d1;
477 
478 	LoadMemory128AddressInRegister(dstAddrReg, dst);
479 	LoadMemory128AddressInRegister(src1AddrReg, src1, offset);
480 	LoadMemory128AddressInRegister(src2AddrReg, src2, offset);
481 
482 	//Warning: VZIP modifies both registers
483 	m_assembler.Vld1_32x2(src1Reg, src2AddrReg);
484 	m_assembler.Vld1_32x2(src2Reg, src1AddrReg);
485 	m_assembler.Vzip_I16(src1Reg, src2Reg);
486 	m_assembler.Vst1_32x4(static_cast<CAArch32Assembler::QUAD_REGISTER>(src1Reg), dstAddrReg);
487 }
488 
489 template <uint32 offset>
Emit_Md_UnpackWD_MemMemMem(const STATEMENT & statement)490 void CCodeGen_AArch32::Emit_Md_UnpackWD_MemMemMem(const STATEMENT& statement)
491 {
492 	auto dst = statement.dst->GetSymbol().get();
493 	auto src1 = statement.src1->GetSymbol().get();
494 	auto src2 = statement.src2->GetSymbol().get();
495 
496 	auto dstAddrReg = CAArch32Assembler::r0;
497 	auto src1AddrReg = CAArch32Assembler::r1;
498 	auto src2AddrReg = CAArch32Assembler::r2;
499 	auto src1Reg = CAArch32Assembler::d0;
500 	auto src2Reg = CAArch32Assembler::d2;
501 
502 	LoadMemory128AddressInRegister(dstAddrReg, dst);
503 	LoadMemory128AddressInRegister(src1AddrReg, src1, offset);
504 	LoadMemory128AddressInRegister(src2AddrReg, src2, offset);
505 
506 	//Warning: VZIP modifies both registers
507 	m_assembler.Vld1_32x2(src1Reg, src2AddrReg);
508 	m_assembler.Vld1_32x2(src2Reg, src1AddrReg);
509 	m_assembler.Vzip_I32(static_cast<CAArch32Assembler::QUAD_REGISTER>(src1Reg), static_cast<CAArch32Assembler::QUAD_REGISTER>(src2Reg));
510 	m_assembler.Vst1_32x4(static_cast<CAArch32Assembler::QUAD_REGISTER>(src1Reg), dstAddrReg);
511 }
512 
Emit_MergeTo256_MemMemMem(const STATEMENT & statement)513 void CCodeGen_AArch32::Emit_MergeTo256_MemMemMem(const STATEMENT& statement)
514 {
515 	auto dst = statement.dst->GetSymbol().get();
516 	auto src1 = statement.src1->GetSymbol().get();
517 	auto src2 = statement.src2->GetSymbol().get();
518 
519 	assert(dst->m_type == SYM_TEMPORARY256);
520 
521 	auto dstLoAddrReg = CAArch32Assembler::r0;
522 	auto dstHiAddrReg = CAArch32Assembler::r1;
523 	auto src1AddrReg = CAArch32Assembler::r2;
524 	auto src2AddrReg = CAArch32Assembler::r3;
525 	auto src1Reg = CAArch32Assembler::q0;
526 	auto src2Reg = CAArch32Assembler::q1;
527 
528 	LoadTemporary256ElementAddressInRegister(dstLoAddrReg, dst, 0x00);
529 	LoadTemporary256ElementAddressInRegister(dstHiAddrReg, dst, 0x10);
530 	LoadMemory128AddressInRegister(src1AddrReg, src1);
531 	LoadMemory128AddressInRegister(src2AddrReg, src2);
532 
533 	m_assembler.Vld1_32x4(src1Reg, src1AddrReg);
534 	m_assembler.Vld1_32x4(src2Reg, src2AddrReg);
535 	m_assembler.Vst1_32x4(src1Reg, dstLoAddrReg);
536 	m_assembler.Vst1_32x4(src2Reg, dstHiAddrReg);
537 }
538 
539 CCodeGen_AArch32::CONSTMATCHER CCodeGen_AArch32::g_mdConstMatchers[] =
540 {
541 	{ OP_MD_ADD_B,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_ADDB>					},
542 	{ OP_MD_ADD_H,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_ADDH>					},
543 	{ OP_MD_ADD_W,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_ADDW>					},
544 
545 	{ OP_MD_SUB_B,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_SUBB>					},
546 	{ OP_MD_SUB_H,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_SUBH>					},
547 	{ OP_MD_SUB_W,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_SUBW>					},
548 
549 	{ OP_MD_ADDUS_B,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_ADDBUS>				},
550 	{ OP_MD_ADDUS_H,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_ADDHUS>				},
551 	{ OP_MD_ADDUS_W,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_ADDWUS>				},
552 
553 	{ OP_MD_ADDSS_H,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_ADDHSS>				},
554 	{ OP_MD_ADDSS_W,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_ADDWSS>				},
555 
556 	{ OP_MD_SUBUS_B,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_SUBBUS>				},
557 	{ OP_MD_SUBUS_H,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_SUBHUS>				},
558 
559 	{ OP_MD_SUBSS_H,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_SUBHSS>				},
560 	{ OP_MD_SUBSS_W,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_SUBWSS>				},
561 
562 	{ OP_MD_CMPEQ_W,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_CMPEQW>				},
563 
564 	{ OP_MD_CMPGT_H,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_CMPGTH>				},
565 
566 	{ OP_MD_MIN_H,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_MINH>					},
567 	{ OP_MD_MIN_W,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_MINW>					},
568 
569 	{ OP_MD_MAX_H,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_MAXH>					},
570 	{ OP_MD_MAX_W,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_MAXW>					},
571 
572 	{ OP_MD_ADD_S,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_ADDS>					},
573 	{ OP_MD_SUB_S,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_SUBS>					},
574 	{ OP_MD_MUL_S,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_MULS>					},
575 	{ OP_MD_DIV_S,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_DivS_MemMemMem						},
576 
577 	{ OP_MD_ABS_S,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_NIL,				&CCodeGen_AArch32::Emit_Md_MemMem<MDOP_ABSS>					},
578 	{ OP_MD_MIN_S,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<FPUMDOP_MIN>				},
579 	{ OP_MD_MAX_S,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<FPUMDOP_MAX>				},
580 
581 	{ OP_MD_AND,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_AND>					},
582 	{ OP_MD_OR,					MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_OR>					},
583 	{ OP_MD_XOR,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MemMemMem<MDOP_XOR>					},
584 
585 	{ OP_MD_NOT,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_NIL,				&CCodeGen_AArch32::Emit_Md_Not_MemMem							},
586 
587 	{ OP_MD_SLLH,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_CONSTANT,			&CCodeGen_AArch32::Emit_Md_Shift_MemMemCst<MDOP_SLLH>			},
588 	{ OP_MD_SLLW,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_CONSTANT,			&CCodeGen_AArch32::Emit_Md_Shift_MemMemCst<MDOP_SLLW>			},
589 
590 	{ OP_MD_SRLH,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_CONSTANT,			&CCodeGen_AArch32::Emit_Md_Shift_MemMemCst<MDOP_SRLH>			},
591 	{ OP_MD_SRLW,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_CONSTANT,			&CCodeGen_AArch32::Emit_Md_Shift_MemMemCst<MDOP_SRLW>			},
592 
593 	{ OP_MD_SRAH,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_CONSTANT,			&CCodeGen_AArch32::Emit_Md_Shift_MemMemCst<MDOP_SRAH>			},
594 	{ OP_MD_SRAW,				MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_CONSTANT,			&CCodeGen_AArch32::Emit_Md_Shift_MemMemCst<MDOP_SRAW>			},
595 
596 	{ OP_MD_SRL256,				MATCH_VARIABLE128,			MATCH_MEMORY256,			MATCH_VARIABLE,			&CCodeGen_AArch32::Emit_Md_Srl256_MemMemVar						},
597 	{ OP_MD_SRL256,				MATCH_VARIABLE128,			MATCH_MEMORY256,			MATCH_CONSTANT,			&CCodeGen_AArch32::Emit_Md_Srl256_MemMemCst						},
598 
599 	{ OP_MD_ISNEGATIVE,			MATCH_VARIABLE,				MATCH_MEMORY128,			MATCH_NIL,				&CCodeGen_AArch32::Emit_Md_Test_VarMem<CAArch32Assembler::CONDITION_MI> },
600 	{ OP_MD_ISZERO,				MATCH_VARIABLE,				MATCH_MEMORY128,			MATCH_NIL,				&CCodeGen_AArch32::Emit_Md_Test_VarMem<CAArch32Assembler::CONDITION_EQ> },
601 
602 	{ OP_MD_TOSINGLE,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_NIL,				&CCodeGen_AArch32::Emit_Md_MemMem<MDOP_TOSINGLE>				},
603 	{ OP_MD_TOWORD_TRUNCATE,	MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_NIL,				&CCodeGen_AArch32::Emit_Md_MemMem<MDOP_TOWORD>					},
604 
605 	{ OP_MOV,					MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_NIL,				&CCodeGen_AArch32::Emit_Md_Mov_MemMem							},
606 
607 	{ OP_LOADFROMREF,			MATCH_MEMORY128,			MATCH_MEM_REF,				MATCH_NIL,				&CCodeGen_AArch32::Emit_Md_LoadFromRef_MemMem					},
608 	{ OP_STOREATREF,			MATCH_NIL,					MATCH_MEM_REF,				MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_StoreAtRef_MemMem					},
609 
610 	{ OP_MD_MOV_MASKED,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_MovMasked_MemMemMem					},
611 
612 	{ OP_MD_EXPAND,				MATCH_MEMORY128,			MATCH_REGISTER,				MATCH_NIL,				&CCodeGen_AArch32::Emit_Md_Expand_MemReg						},
613 	{ OP_MD_EXPAND,				MATCH_MEMORY128,			MATCH_MEMORY,				MATCH_NIL,				&CCodeGen_AArch32::Emit_Md_Expand_MemMem						},
614 	{ OP_MD_EXPAND,				MATCH_MEMORY128,			MATCH_CONSTANT,				MATCH_NIL,				&CCodeGen_AArch32::Emit_Md_Expand_MemCst						},
615 
616 	{ OP_MD_PACK_HB,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_PackHB_MemMemMem						},
617 	{ OP_MD_PACK_WH,			MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_PackWH_MemMemMem						},
618 
619 	{ OP_MD_UNPACK_LOWER_BH,	MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_UnpackBH_MemMemMem<0>				},
620 	{ OP_MD_UNPACK_LOWER_HW,	MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_UnpackHW_MemMemMem<0>				},
621 	{ OP_MD_UNPACK_LOWER_WD,	MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_UnpackWD_MemMemMem<0>				},
622 
623 	{ OP_MD_UNPACK_UPPER_BH,	MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_UnpackBH_MemMemMem<8>				},
624 	{ OP_MD_UNPACK_UPPER_HW,	MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_UnpackHW_MemMemMem<8>				},
625 	{ OP_MD_UNPACK_UPPER_WD,	MATCH_MEMORY128,			MATCH_MEMORY128,			MATCH_MEMORY128,		&CCodeGen_AArch32::Emit_Md_UnpackWD_MemMemMem<8>				},
626 
627 	{ OP_MERGETO256,			MATCH_MEMORY256,			MATCH_VARIABLE128,			MATCH_VARIABLE128,		&CCodeGen_AArch32::Emit_MergeTo256_MemMemMem					},
628 
629 	{ OP_MOV,					MATCH_NIL,					MATCH_NIL,					MATCH_NIL,				NULL														},
630 };
631