//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties();

  // Set condition code actions
  setCondCodeAction(ISD::SETO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);

  // Expand sign extension of vectors
  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
  }

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  setOperationAction(ISD::SUB, MVT::i64, Expand);

  // These should be replaced by UDIVREM, but it does not happen automatically
  // during Type Legalization
  setOperationAction(ISD::UDIV, MVT::i64, Custom);
  setOperationAction(ISD::UREM, MVT::i64, Custom);
  setOperationAction(ISD::SDIV, MVT::i64, Custom);
  setOperationAction(ISD::SREM, MVT::i64, Custom);

  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  // to be Legal/Custom in order to avoid library calls.
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
  }

  setSchedulingPreference(Sched::Source);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
    if (TII->isLDSRetInstr(MI->getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
      // LDS_1A2D support and remove this special case.
      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
          MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
        return BB;

      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI->getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
                       MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

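    // Src* select the swizzle for each coordinate channel, and the CT* flags
    // give the per-channel coordinate type (1 appears to mean normalized
    // coordinates and 0 unnormalized, hence Rect textures clearing CTX/CTY).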
    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
        .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // The instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN);
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
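    // 84 and 40 appear to be the CF_INST_EXPORT_DONE encodings for Evergreen
    // and the R600 family respectively.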
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addOperand(MI->getOperand(2))
        .addOperand(MI->getOperand(3))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
                                SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII = static_cast<const R600InstrInfo *>(
            MF.getSubtarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

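      // INTERP_PAIR_XY computes the X/Y channels and INTERP_PAIR_ZW the Z/W
      // channels; slot % 4 picks the attribute pair and slot % 2 picks the
      // result within the pair.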
      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_interp_xy:
    case AMDGPUIntrinsic::R600_interp_zw: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      MachineSDNode *interp;
      SDValue RegisterINode = Op.getOperand(2);
      SDValue RegisterJNode = Op.getOperand(3);

      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
                         SDValue(interp, 0), SDValue(interp, 1));
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy:
    case AMDGPUIntrinsic::R600_ldptr: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      case AMDGPUIntrinsic::R600_ldptr:
        TextureOp = 10;
        break;
      default:
        llvm_unreachable("Unknown Texture Operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(3, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::AMDGPU_read_workdim:
      return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    case Intrinsic::AMDGPU_rsq:
      // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_fract:
    case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
    if (N->getValueType(0) == MVT::i1) {
      Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
      return;
    }
    // Fall-through. Since we don't care about out of bounds values
    // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
    // considers some extra cases which are not necessary here.
  case ISD::FP_TO_SINT: {
    SDValue Result;
    if (expandFP_TO_SINT(N, Result, DAG))
      Results.push_back(Result);
    return;
  }
  case ISD::UDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
                                  N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM);
    break;
  }
  case ISD::UREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
                                  N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM.getValue(1));
    break;
  }
  case ISD::SDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
                                  N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM);
    break;
  }
  case ISD::SREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
                                  N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM.getValue(1));
    break;
  }
  case ISD::SDIVREM: {
    SDValue Op = SDValue(N, 1);
    SDValue RES = LowerSDIVREM(Op, DAG);
    Results.push_back(RES);
    Results.push_back(RES.getValue(1));
    break;
  }
  case ISD::UDIVREM: {
    SDValue Op = SDValue(N, 0);
    LowerUDIVREM64(Op, DAG, Results);
    break;
  }
  }
}

SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
                                                   SDValue Vector) const {
  SDLoc DL(Vector);
  EVT VecVT = Vector.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  SmallVector<SDValue, 8> Args;

  for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
        Vector, DAG.getConstant(i, getVectorIdxTy())));
  }

  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
}

SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Index = Op.getOperand(1);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
                     Vector, Index);
}

SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
                               Vector, Value, Index);
  return vectorToVerticalVector(DAG, Insert);
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1 and 1.
  // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
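  // 0.15915494309 ~= 1 / (2 * Pi); FRACT then wraps the scaled angle into
  // [0, 1) before it is re-centered around zero below.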
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
          DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
              DAG.getConstantFP(0.15915494309, MVT::f32)),
          DAG.getConstantFP(0.5, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
          DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
      DAG.getConstantFP(3.14159265359, MVT::f32));
}

SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue One = DAG.getConstant(1, VT);

  SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
  SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the Shift == 0 special case.
  // Without it CompShift might be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps; the alternative is to
  // add a conditional to filter out the special case.
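  // For example, Shift == 0 gives CompShift == 31: SRL(Lo, 31) followed by
  // SRL(..., 1) yields 0, whereas a single SRL by 32 would be undefined.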

  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);

  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
  SDValue LoBig = Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue One = DAG.getConstant(1, VT);

  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;

  SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
  SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the Shift == 0 special case.
  // Without it CompShift might be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps; the alternative is to
  // add a conditional to filter out the special case.
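  // For example, Shift == 0 gives CompShift == 31: SHL(Hi, 31) followed by
  // SHL(..., 1) yields 0, so no Hi bits leak into the low result.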

  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);

  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
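  // Lowered as (Op != 0.0f): fp_to_uint is undefined for out-of-range inputs,
  // so a plain comparison suffices for the i1 case.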
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE));
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType *PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  if (VT == MVT::f32) {
    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
    if (MinMax)
      return MinMax;
  }

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1, 0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0, f32, f32, cc_supported
  // select_cc i32, 0, i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
                                     Cond, Zero,
                                     True, False,
                                     DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue,
                             HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
                     Cond, HWFalse,
                     True, False,
                     DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
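  // Each register row holds StackWidth 4-byte channels, so the byte address
  // is divided by 4, 8, or 16 respectively.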
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
      }
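      // Split the byte address into a dword address and a bit offset:
      // DWordAddr = Ptr >> 2, ByteIndex = Ptr & 3, Shift = ByteIndex * 8.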
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Ret.getNode()) {
    return Ret;
  }
  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
      getTargetMachine().getSubtargetImpl()->getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SmallVector<SDValue, 4> Stores(NumElemVT);

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain,
                        Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// Return 512 + (kc_bank << 12), the base dword address of constant buffer
// kc_bank.
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
  if (Ret.getNode()) {
    SDValue Ops[2] = {
      Ret,
      Chain
    };
    return DAG.getMergeValues(Ops, DL);
  }

  // Lower constant address space loads of global variables.
  if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
      isa<GlobalVariable>(
          GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {

    SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
        DAG.getConstant(2, MVT::i32));
    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
                       LoadNode->getChain(), Ptr,
                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
  }

  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
    SDValue MergedValues[2] = {
      ScalarizeVectorLoad(Op, DAG),
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the const position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (512 + (kc_bank << 12)) * 16 + chan * 4 here and
        // then divide by 4 at the ISel step.
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
                           makeArrayRef(Slots, NumElements));
    } else {
      // A non-constant ptr can't be folded; keep it as a v4i32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32));
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign extended when it is uploaded to the
  // buffer. However SEXT loads from other address spaces are not supported, so
  // we need to expand them here.
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue ShiftAmount =
        DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
                                     LoadNode->getPointerInfo(), MemVT,
                                     LoadNode->isVolatile(),
                                     LoadNode->isNonTemporal(),
                                     LoadNode->isInvariant(),
                                     LoadNode->getAlignment());
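    // Manually sign extend: shift left so the sign bit of the memory type
    // lands in the MSB of VT, then arithmetic-shift back down.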
1573 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1574 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1575
1576 SDValue MergedValues[2] = { Sra, Chain };
1577 return DAG.getMergeValues(MergedValues, DL);
1578 }
1579
1580 if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1581 return SDValue();
1582 }
1583
1584 // Lowering for indirect addressing
1585 const MachineFunction &MF = DAG.getMachineFunction();
1586 const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
1587 getTargetMachine().getSubtargetImpl()->getFrameLowering());
1588 unsigned StackWidth = TFL->getStackWidth(MF);
1589
1590 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2] = {
    LoweredLoad,
    Chain
  };

  return DAG.getMergeValues(Ops, DL);
}

SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Jump = Op.getOperand(2);

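  // AMDGPUISD::BRANCH_COND takes the jump target before the condition, so
  // the BRCOND operands are reordered here.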
  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
                     Chain, Jump, Cond);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
    SDValue Chain,
    CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins,
    SDLoc DL, SelectionDAG &DAG,
    SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

  SmallVector<ISD::InputArg, 8> LocalIns;

  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);

  AnalyzeFormalArguments(CCInfo, LocalIns);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    const ISD::InputArg &In = Ins[i];
    EVT VT = In.VT;
    EVT MemVT = VA.getLocVT();
    if (!VT.isVector() && MemVT.isVector()) {
      // Get load source type if scalarized.
      MemVT = MemVT.getVectorElementType();
    }

    if (MFI->getShaderType() != ShaderType::COMPUTE) {
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // i64 isn't a legal type, so the register type used ends up as i32, which
    // isn't expected here. It attempts to create this sextload, but it ends up
    // being invalid. Somehow this seems to work with i64 arguments, but breaks
    // for <1 x i64>.

    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.
    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
      // FIXME: This should really check the extload type, but the handling of
      // extload vector parameters seems to be broken.

      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
      Ext = ISD::SEXTLOAD;
    }

    // Compute the offset from the value.
    // XXX - I think PartOffset should give you this, but it seems to give the
    // size of the register which isn't useful.

    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
    unsigned PartOffset = VA.getLocMemOffset();
    unsigned Offset = 36 + VA.getLocMemOffset();

    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
    SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
                              DAG.getConstant(Offset, MVT::i32),
                              DAG.getUNDEF(MVT::i32),
                              PtrInfo,
                              MemVT, false, true, true, 4);

    // 4 is the preferred alignment for the CONSTANT memory space.
    InVals.push_back(Arg);
    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

static SDValue CompactSwizzlableVector(
    SelectionDAG &DAG, SDValue VectorEntry,
    DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
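  // Swizzle select values 0-3 pick lanes X-W; 4, 5 and 7 are the special
  // SEL_0, SEL_1 and SEL_MASK_WRITE sources used below.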

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      // We mask the write here to teach later passes that the ith element of
      // this vector is undef. Thus we can use it to reduce 128-bit register
      // usage, break false dependencies and additionally make assembly easier
      // to read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
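    // If the same value already occupies an earlier lane, point this lane's
    // swizzle at that lane and free this slot.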
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec);
}

static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
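  // First pass: pin elements that are already extracted from the lane they
  // occupy, so the swap below won't displace them.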
  for (unsigned i = 0; i < 4; i++) {
    RemapSwizzle[i] = i;
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (i == Idx)
        isUnmovable[Idx] = true;
    }
  }

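  // Second pass: move one mislocated extract into its source lane, swapping
  // the corresponding swizzle entries to compensate.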
  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec);
}

SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
                                            SDValue Swz[4],
                                            SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
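  // The SET*_DX10 instructions write -1 (all bits set) on true and 0 on
  // false, which is exactly the integer select_cc built below.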
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32), // False
                       SelectCC.getOperand(4)); // CC

    break;
  }

  // insert_vector_elt (build_vector elt0, ..., eltN), NewEltIdx, idx
  // => build_vector elt0, ..., NewEltIdx, ..., eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.getOpcode() == ISD::UNDEF)
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
    // vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.getOpcode() == ISD::UNDEF) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
            DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
            DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
  }

  // An EXTRACT_VECTOR_ELT of a BUILD_VECTOR generated by our custom lowering
  // also needs to be combined here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
  }

  case ISD::SELECT_CC: {
    // Try common optimizations
    SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    if (Ret.getNode())
      return Ret;

    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //   selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //   selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(SDLoc(N),
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18),
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
                       NewArgs);
  }
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

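/// Try to fold the producer of operand \p SrcIdx of \p ParentNode into the
/// instruction as a source modifier (neg/abs), a constant-buffer select or an
/// inline/literal immediate, updating \p Src and the matching modifier
/// operand in place. Returns true if anything was folded.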
static bool
FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
  if (!Src.isMachineOpcode())
    return false;
  switch (Src.getMachineOpcode()) {
  case AMDGPU::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;
    // Gather the constant values already in use by the other source operands.
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (int OtherSrcIdx : SrcIndices) {
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
              dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == AMDGPU::ALU_CONST) {
          ConstantSDNode *Cst
              = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
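    // An R600 ALU clause can only read a limited set of distinct constants;
    // give up if folding this one would exceed the limit.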
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    return true;
  }
  case AMDGPU::MOV_IMM_I32:
  case AMDGPU::MOV_IMM_F32: {
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

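    // 0.0, 0.5 and 1.0 (and integer 0 and 1) have dedicated inline-constant
    // registers, so they don't need to occupy the literal slot.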
    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (Value == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
      assert(C);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}

/// \brief Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
  if (!Node->isMachineOpcode())
    return Node;
  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;
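  // FakeOp is a null SDValue standing in for modifier slots an instruction
  // doesn't have; FoldOperand treats a null node as "not available".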

  std::vector<SDValue> Ops;
  for (const SDUse &I : Node->ops())
    Ops.push_back(I);

  if (Opcode == AMDGPU::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
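      // TII operand indices count the explicit dst operand, while the SDNode
      // operand list does not, hence the "- 1" adjustments above and below.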
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::CLAMP_R600) {
    SDValue Src = Node->getOperand(0);
    if (!Src.isMachineOpcode() ||
        !TII->hasInstrModifiers(Src.getMachineOpcode()))
      return Node;
    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
                                      AMDGPU::OpName::clamp);
    if (ClampIdx < 0)
      return Node;
    std::vector<SDValue> Ops;
    unsigned NumOp = Src.getNumOperands();
    for (unsigned i = 0; i < NumOp; ++i)
      Ops.push_back(Src.getOperand(i));
    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
                              Node->getVTList(), Ops);
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}