1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Custom DAG lowering for R600
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "R600ISelLowering.h"
16 #include "AMDGPUFrameLowering.h"
17 #include "AMDGPUIntrinsicInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "R600Defines.h"
20 #include "R600InstrInfo.h"
21 #include "R600MachineFunctionInfo.h"
22 #include "llvm/Analysis/ValueTracking.h"
23 #include "llvm/CodeGen/CallingConvLower.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineRegisterInfo.h"
27 #include "llvm/CodeGen/SelectionDAG.h"
28 #include "llvm/IR/Argument.h"
29 #include "llvm/IR/Function.h"
30 
31 using namespace llvm;
32 
33 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
34     AMDGPUTargetLowering(TM),
35     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
42 
43   computeRegisterProperties();
44 
45   // Set condition code actions
46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
58 
59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
63 
64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
66 
67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
69 
70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
73 
74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
75 
76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
79 
80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
82 
83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
88 
89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
93 
94   // Expand sign extension of vectors
95   if (!Subtarget->hasBFE())
96     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
97 
98   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
99   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
100 
101   if (!Subtarget->hasBFE())
102     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
103   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
104   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
105 
106   if (!Subtarget->hasBFE())
107     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
109   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
110 
111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
113   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
114 
115   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
116 
117 
118   // Legalize loads and stores to the private address space.
119   setOperationAction(ISD::LOAD, MVT::i32, Custom);
120   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
121   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
122 
123   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
124   // spaces, so it is custom lowered to handle those where it isn't.
125   for (MVT VT : MVT::integer_valuetypes()) {
126     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
127     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
128     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
129 
130     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
131     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
132     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
133 
134     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
135     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
136     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
137   }
138 
139   setOperationAction(ISD::STORE, MVT::i8, Custom);
140   setOperationAction(ISD::STORE, MVT::i32, Custom);
141   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
142   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
143   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
144   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
145 
146   setOperationAction(ISD::LOAD, MVT::i32, Custom);
147   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
148   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
149 
150   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
151   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
152   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
153   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
154 
155   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
156   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
157   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
158   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
159 
160   setTargetDAGCombine(ISD::FP_ROUND);
161   setTargetDAGCombine(ISD::FP_TO_SINT);
162   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
163   setTargetDAGCombine(ISD::SELECT_CC);
164   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
165 
166   setOperationAction(ISD::SUB, MVT::i64, Expand);
167 
168   // These should be replaced by UDIVREM, but that does not happen automatically
169   // during type legalization.
170   setOperationAction(ISD::UDIV, MVT::i64, Custom);
171   setOperationAction(ISD::UREM, MVT::i64, Custom);
172   setOperationAction(ISD::SDIV, MVT::i64, Custom);
173   setOperationAction(ISD::SREM, MVT::i64, Custom);
174 
175   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
176   //  to be Legal/Custom in order to avoid library calls.
177   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
178   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
179   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
180 
181   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
182 
183   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
184   for (MVT VT : ScalarIntVTs) {
185     setOperationAction(ISD::ADDC, VT, Expand);
186     setOperationAction(ISD::SUBC, VT, Expand);
187     setOperationAction(ISD::ADDE, VT, Expand);
188     setOperationAction(ISD::SUBE, VT, Expand);
189   }
190 
191   setSchedulingPreference(Sched::Source);
192 }
193 
194 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
195     MachineInstr * MI, MachineBasicBlock * BB) const {
196   MachineFunction * MF = BB->getParent();
197   MachineRegisterInfo &MRI = MF->getRegInfo();
198   MachineBasicBlock::iterator I = *MI;
199   const R600InstrInfo *TII =
200       static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo());
201 
202   switch (MI->getOpcode()) {
203   default:
204     // Replace LDS_*_RET instructions that don't have any uses with the
205     // equivalent LDS_*_NORET instruction.
206     if (TII->isLDSRetInstr(MI->getOpcode())) {
207       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
208       assert(DstIdx != -1);
209       MachineInstrBuilder NewMI;
210       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
211       //        LDS_1A2D support and remove this special case.
212       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
213            MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
214         return BB;
215 
216       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
217                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
218       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
219         NewMI.addOperand(MI->getOperand(i));
220       }
221     } else {
222       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
223     }
224     break;
225   case AMDGPU::CLAMP_R600: {
226     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
227                                                    AMDGPU::MOV,
228                                                    MI->getOperand(0).getReg(),
229                                                    MI->getOperand(1).getReg());
230     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
231     break;
232   }
233 
234   case AMDGPU::FABS_R600: {
235     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
236                                                     AMDGPU::MOV,
237                                                     MI->getOperand(0).getReg(),
238                                                     MI->getOperand(1).getReg());
239     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
240     break;
241   }
242 
243   case AMDGPU::FNEG_R600: {
244     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
245                                                     AMDGPU::MOV,
246                                                     MI->getOperand(0).getReg(),
247                                                     MI->getOperand(1).getReg());
248     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
249     break;
250   }
251 
252   case AMDGPU::MASK_WRITE: {
253     unsigned maskedRegister = MI->getOperand(0).getReg();
254     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
255     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
256     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
257     break;
258   }
259 
260   case AMDGPU::MOV_IMM_F32:
261     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
262                      MI->getOperand(1).getFPImm()->getValueAPF()
263                          .bitcastToAPInt().getZExtValue());
264     break;
265   case AMDGPU::MOV_IMM_I32:
266     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
267                      MI->getOperand(1).getImm());
268     break;
269   case AMDGPU::CONST_COPY: {
270     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
271         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
272     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
273         MI->getOperand(1).getImm());
274     break;
275   }
276 
277   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
278   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
279   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
280     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
281 
282     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
283             .addOperand(MI->getOperand(0))
284             .addOperand(MI->getOperand(1))
285             .addImm(EOP); // Set End of program bit
286     break;
287   }
288 
289   case AMDGPU::TXD: {
290     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
291     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
292     MachineOperand &RID = MI->getOperand(4);
293     MachineOperand &SID = MI->getOperand(5);
294     unsigned TextureId = MI->getOperand(6).getImm();
295     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
296     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
297 
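    // Adjust the source swizzle and coordinate-type flags for the texture
    // target: Rect/ShadowRect targets clear CTX/CTY, the plain shadow targets
    // route the W source from Z, and the array targets route Z from Y and/or
    // clear CTZ.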
298     switch (TextureId) {
299     case 5: // Rect
300       CTX = CTY = 0;
301       break;
302     case 6: // Shadow1D
303       SrcW = SrcZ;
304       break;
305     case 7: // Shadow2D
306       SrcW = SrcZ;
307       break;
308     case 8: // ShadowRect
309       CTX = CTY = 0;
310       SrcW = SrcZ;
311       break;
312     case 9: // 1DArray
313       SrcZ = SrcY;
314       CTZ = 0;
315       break;
316     case 10: // 2DArray
317       CTZ = 0;
318       break;
319     case 11: // Shadow1DArray
320       SrcZ = SrcY;
321       CTZ = 0;
322       break;
323     case 12: // Shadow2DArray
324       CTZ = 0;
325       break;
326     }
327     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
328             .addOperand(MI->getOperand(3))
329             .addImm(SrcX)
330             .addImm(SrcY)
331             .addImm(SrcZ)
332             .addImm(SrcW)
333             .addImm(0)
334             .addImm(0)
335             .addImm(0)
336             .addImm(0)
337             .addImm(1)
338             .addImm(2)
339             .addImm(3)
340             .addOperand(RID)
341             .addOperand(SID)
342             .addImm(CTX)
343             .addImm(CTY)
344             .addImm(CTZ)
345             .addImm(CTW);
346     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
347             .addOperand(MI->getOperand(2))
348             .addImm(SrcX)
349             .addImm(SrcY)
350             .addImm(SrcZ)
351             .addImm(SrcW)
352             .addImm(0)
353             .addImm(0)
354             .addImm(0)
355             .addImm(0)
356             .addImm(1)
357             .addImm(2)
358             .addImm(3)
359             .addOperand(RID)
360             .addOperand(SID)
361             .addImm(CTX)
362             .addImm(CTY)
363             .addImm(CTZ)
364             .addImm(CTW);
365     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
366             .addOperand(MI->getOperand(0))
367             .addOperand(MI->getOperand(1))
368             .addImm(SrcX)
369             .addImm(SrcY)
370             .addImm(SrcZ)
371             .addImm(SrcW)
372             .addImm(0)
373             .addImm(0)
374             .addImm(0)
375             .addImm(0)
376             .addImm(1)
377             .addImm(2)
378             .addImm(3)
379             .addOperand(RID)
380             .addOperand(SID)
381             .addImm(CTX)
382             .addImm(CTY)
383             .addImm(CTZ)
384             .addImm(CTW)
385             .addReg(T0, RegState::Implicit)
386             .addReg(T1, RegState::Implicit);
387     break;
388   }
389 
390   case AMDGPU::TXD_SHADOW: {
391     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
392     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
393     MachineOperand &RID = MI->getOperand(4);
394     MachineOperand &SID = MI->getOperand(5);
395     unsigned TextureId = MI->getOperand(6).getImm();
396     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
397     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
398 
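    // Same texture-target swizzle / coordinate-type adjustments as in the TXD
    // case above.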
399     switch (TextureId) {
400     case 5: // Rect
401       CTX = CTY = 0;
402       break;
403     case 6: // Shadow1D
404       SrcW = SrcZ;
405       break;
406     case 7: // Shadow2D
407       SrcW = SrcZ;
408       break;
409     case 8: // ShadowRect
410       CTX = CTY = 0;
411       SrcW = SrcZ;
412       break;
413     case 9: // 1DArray
414       SrcZ = SrcY;
415       CTZ = 0;
416       break;
417     case 10: // 2DArray
418       CTZ = 0;
419       break;
420     case 11: // Shadow1DArray
421       SrcZ = SrcY;
422       CTZ = 0;
423       break;
424     case 12: // Shadow2DArray
425       CTZ = 0;
426       break;
427     }
428 
429     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
430             .addOperand(MI->getOperand(3))
431             .addImm(SrcX)
432             .addImm(SrcY)
433             .addImm(SrcZ)
434             .addImm(SrcW)
435             .addImm(0)
436             .addImm(0)
437             .addImm(0)
438             .addImm(0)
439             .addImm(1)
440             .addImm(2)
441             .addImm(3)
442             .addOperand(RID)
443             .addOperand(SID)
444             .addImm(CTX)
445             .addImm(CTY)
446             .addImm(CTZ)
447             .addImm(CTW);
448     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
449             .addOperand(MI->getOperand(2))
450             .addImm(SrcX)
451             .addImm(SrcY)
452             .addImm(SrcZ)
453             .addImm(SrcW)
454             .addImm(0)
455             .addImm(0)
456             .addImm(0)
457             .addImm(0)
458             .addImm(1)
459             .addImm(2)
460             .addImm(3)
461             .addOperand(RID)
462             .addOperand(SID)
463             .addImm(CTX)
464             .addImm(CTY)
465             .addImm(CTZ)
466             .addImm(CTW);
467     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
468             .addOperand(MI->getOperand(0))
469             .addOperand(MI->getOperand(1))
470             .addImm(SrcX)
471             .addImm(SrcY)
472             .addImm(SrcZ)
473             .addImm(SrcW)
474             .addImm(0)
475             .addImm(0)
476             .addImm(0)
477             .addImm(0)
478             .addImm(1)
479             .addImm(2)
480             .addImm(3)
481             .addOperand(RID)
482             .addOperand(SID)
483             .addImm(CTX)
484             .addImm(CTY)
485             .addImm(CTZ)
486             .addImm(CTW)
487             .addReg(T0, RegState::Implicit)
488             .addReg(T1, RegState::Implicit);
489     break;
490   }
491 
492   case AMDGPU::BRANCH:
493       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
494               .addOperand(MI->getOperand(0));
495       break;
496 
497   case AMDGPU::BRANCH_COND_f32: {
498     MachineInstr *NewMI =
499       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
500               AMDGPU::PREDICATE_BIT)
501               .addOperand(MI->getOperand(1))
502               .addImm(OPCODE_IS_NOT_ZERO)
503               .addImm(0); // Flags
504     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
505     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
506             .addOperand(MI->getOperand(0))
507             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
508     break;
509   }
510 
511   case AMDGPU::BRANCH_COND_i32: {
512     MachineInstr *NewMI =
513       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
514             AMDGPU::PREDICATE_BIT)
515             .addOperand(MI->getOperand(1))
516             .addImm(OPCODE_IS_NOT_ZERO_INT)
517             .addImm(0); // Flags
518     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
519     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
520             .addOperand(MI->getOperand(0))
521             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
522     break;
523   }
524 
525   case AMDGPU::EG_ExportSwz:
526   case AMDGPU::R600_ExportSwz: {
527     // The instruction is left unmodified if it is not the last one of its type.
528     bool isLastInstructionOfItsType = true;
529     unsigned InstExportType = MI->getOperand(1).getImm();
530     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
531          EndBlock = BB->end(); NextExportInst != EndBlock;
532          NextExportInst = std::next(NextExportInst)) {
533       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
534           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
535         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
536             .getImm();
537         if (CurrentInstExportType == InstExportType) {
538           isLastInstructionOfItsType = false;
539           break;
540         }
541       }
542     }
543     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
544     if (!EOP && !isLastInstructionOfItsType)
545       return BB;
546     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
547     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
548             .addOperand(MI->getOperand(0))
549             .addOperand(MI->getOperand(1))
550             .addOperand(MI->getOperand(2))
551             .addOperand(MI->getOperand(3))
552             .addOperand(MI->getOperand(4))
553             .addOperand(MI->getOperand(5))
554             .addOperand(MI->getOperand(6))
555             .addImm(CfInst)
556             .addImm(EOP);
557     break;
558   }
559   case AMDGPU::RETURN: {
560     // RETURN instructions must have the live-out registers as implicit uses,
561     // otherwise they appear dead.
562     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
563     MachineInstrBuilder MIB(*MF, MI);
564     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
565       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
566     return BB;
567   }
568   }
569 
570   MI->eraseFromParent();
571   return BB;
572 }
573 
574 //===----------------------------------------------------------------------===//
575 // Custom DAG Lowering Operations
576 //===----------------------------------------------------------------------===//
577 
578 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
579   MachineFunction &MF = DAG.getMachineFunction();
580   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
581   switch (Op.getOpcode()) {
582   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
583   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
584   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
585   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
586   case ISD::SRA_PARTS:
587   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
588   case ISD::FCOS:
589   case ISD::FSIN: return LowerTrig(Op, DAG);
590   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
591   case ISD::STORE: return LowerSTORE(Op, DAG);
592   case ISD::LOAD: {
593     SDValue Result = LowerLOAD(Op, DAG);
594     assert((!Result.getNode() ||
595             Result.getNode()->getNumValues() == 2) &&
596            "Load should return a value and a chain");
597     return Result;
598   }
599 
600   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
601   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
602   case ISD::INTRINSIC_VOID: {
603     SDValue Chain = Op.getOperand(0);
604     unsigned IntrinsicID =
605                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
606     switch (IntrinsicID) {
607     case AMDGPUIntrinsic::AMDGPU_store_output: {
608       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
609       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
610       MFI->LiveOuts.push_back(Reg);
611       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
612     }
613     case AMDGPUIntrinsic::R600_store_swizzle: {
614       const SDValue Args[8] = {
615         Chain,
616         Op.getOperand(2), // Export Value
617         Op.getOperand(3), // ArrayBase
618         Op.getOperand(4), // Type
619         DAG.getConstant(0, MVT::i32), // SWZ_X
620         DAG.getConstant(1, MVT::i32), // SWZ_Y
621         DAG.getConstant(2, MVT::i32), // SWZ_Z
622         DAG.getConstant(3, MVT::i32) // SWZ_W
623       };
624       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
625     }
626 
627     // default for switch(IntrinsicID)
628     default: break;
629     }
630     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
631     break;
632   }
633   case ISD::INTRINSIC_WO_CHAIN: {
634     unsigned IntrinsicID =
635                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
636     EVT VT = Op.getValueType();
637     SDLoc DL(Op);
638     switch(IntrinsicID) {
639     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
640     case AMDGPUIntrinsic::R600_load_input: {
641       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
642       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
643       MachineFunction &MF = DAG.getMachineFunction();
644       MachineRegisterInfo &MRI = MF.getRegInfo();
645       MRI.addLiveIn(Reg);
646       return DAG.getCopyFromReg(DAG.getEntryNode(),
647           SDLoc(DAG.getEntryNode()), Reg, VT);
648     }
649 
650     case AMDGPUIntrinsic::R600_interp_input: {
651       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
652       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
653       MachineSDNode *interp;
654       if (ijb < 0) {
655         const MachineFunction &MF = DAG.getMachineFunction();
656         const R600InstrInfo *TII = static_cast<const R600InstrInfo *>(
657             MF.getSubtarget().getInstrInfo());
658         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
659             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
660         return DAG.getTargetExtractSubreg(
661             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
662             DL, MVT::f32, SDValue(interp, 0));
663       }
664       MachineFunction &MF = DAG.getMachineFunction();
665       MachineRegisterInfo &MRI = MF.getRegInfo();
666       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
667       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
668       MRI.addLiveIn(RegisterI);
669       MRI.addLiveIn(RegisterJ);
670       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
671           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
672       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
673           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
674 
675       if (slot % 4 < 2)
676         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
677             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
678             RegisterJNode, RegisterINode);
679       else
680         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
681             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
682             RegisterJNode, RegisterINode);
683       return SDValue(interp, slot % 2);
684     }
685     case AMDGPUIntrinsic::R600_interp_xy:
686     case AMDGPUIntrinsic::R600_interp_zw: {
687       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
688       MachineSDNode *interp;
689       SDValue RegisterINode = Op.getOperand(2);
690       SDValue RegisterJNode = Op.getOperand(3);
691 
692       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
693         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
694             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
695             RegisterJNode, RegisterINode);
696       else
697         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
698             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
699             RegisterJNode, RegisterINode);
700       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
701           SDValue(interp, 0), SDValue(interp, 1));
702     }
703     case AMDGPUIntrinsic::R600_tex:
704     case AMDGPUIntrinsic::R600_texc:
705     case AMDGPUIntrinsic::R600_txl:
706     case AMDGPUIntrinsic::R600_txlc:
707     case AMDGPUIntrinsic::R600_txb:
708     case AMDGPUIntrinsic::R600_txbc:
709     case AMDGPUIntrinsic::R600_txf:
710     case AMDGPUIntrinsic::R600_txq:
711     case AMDGPUIntrinsic::R600_ddx:
712     case AMDGPUIntrinsic::R600_ddy:
713     case AMDGPUIntrinsic::R600_ldptr: {
714       unsigned TextureOp;
715       switch (IntrinsicID) {
716       case AMDGPUIntrinsic::R600_tex:
717         TextureOp = 0;
718         break;
719       case AMDGPUIntrinsic::R600_texc:
720         TextureOp = 1;
721         break;
722       case AMDGPUIntrinsic::R600_txl:
723         TextureOp = 2;
724         break;
725       case AMDGPUIntrinsic::R600_txlc:
726         TextureOp = 3;
727         break;
728       case AMDGPUIntrinsic::R600_txb:
729         TextureOp = 4;
730         break;
731       case AMDGPUIntrinsic::R600_txbc:
732         TextureOp = 5;
733         break;
734       case AMDGPUIntrinsic::R600_txf:
735         TextureOp = 6;
736         break;
737       case AMDGPUIntrinsic::R600_txq:
738         TextureOp = 7;
739         break;
740       case AMDGPUIntrinsic::R600_ddx:
741         TextureOp = 8;
742         break;
743       case AMDGPUIntrinsic::R600_ddy:
744         TextureOp = 9;
745         break;
746       case AMDGPUIntrinsic::R600_ldptr:
747         TextureOp = 10;
748         break;
749       default:
750         llvm_unreachable("Unknow Texture Operation");
751       }
752 
753       SDValue TexArgs[19] = {
754         DAG.getConstant(TextureOp, MVT::i32),
755         Op.getOperand(1),
756         DAG.getConstant(0, MVT::i32),
757         DAG.getConstant(1, MVT::i32),
758         DAG.getConstant(2, MVT::i32),
759         DAG.getConstant(3, MVT::i32),
760         Op.getOperand(2),
761         Op.getOperand(3),
762         Op.getOperand(4),
763         DAG.getConstant(0, MVT::i32),
764         DAG.getConstant(1, MVT::i32),
765         DAG.getConstant(2, MVT::i32),
766         DAG.getConstant(3, MVT::i32),
767         Op.getOperand(5),
768         Op.getOperand(6),
769         Op.getOperand(7),
770         Op.getOperand(8),
771         Op.getOperand(9),
772         Op.getOperand(10)
773       };
774       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
775     }
776     case AMDGPUIntrinsic::AMDGPU_dp4: {
777       SDValue Args[8] = {
778       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
779           DAG.getConstant(0, MVT::i32)),
780       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
781           DAG.getConstant(0, MVT::i32)),
782       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
783           DAG.getConstant(1, MVT::i32)),
784       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
785           DAG.getConstant(1, MVT::i32)),
786       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
787           DAG.getConstant(2, MVT::i32)),
788       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
789           DAG.getConstant(2, MVT::i32)),
790       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
791           DAG.getConstant(3, MVT::i32)),
792       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
793           DAG.getConstant(3, MVT::i32))
794       };
795       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
796     }
797 
798     case Intrinsic::r600_read_ngroups_x:
799       return LowerImplicitParameter(DAG, VT, DL, 0);
800     case Intrinsic::r600_read_ngroups_y:
801       return LowerImplicitParameter(DAG, VT, DL, 1);
802     case Intrinsic::r600_read_ngroups_z:
803       return LowerImplicitParameter(DAG, VT, DL, 2);
804     case Intrinsic::r600_read_global_size_x:
805       return LowerImplicitParameter(DAG, VT, DL, 3);
806     case Intrinsic::r600_read_global_size_y:
807       return LowerImplicitParameter(DAG, VT, DL, 4);
808     case Intrinsic::r600_read_global_size_z:
809       return LowerImplicitParameter(DAG, VT, DL, 5);
810     case Intrinsic::r600_read_local_size_x:
811       return LowerImplicitParameter(DAG, VT, DL, 6);
812     case Intrinsic::r600_read_local_size_y:
813       return LowerImplicitParameter(DAG, VT, DL, 7);
814     case Intrinsic::r600_read_local_size_z:
815       return LowerImplicitParameter(DAG, VT, DL, 8);
816 
817     case Intrinsic::AMDGPU_read_workdim:
818       return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
819 
820     case Intrinsic::r600_read_tgid_x:
821       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
822                                   AMDGPU::T1_X, VT);
823     case Intrinsic::r600_read_tgid_y:
824       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
825                                   AMDGPU::T1_Y, VT);
826     case Intrinsic::r600_read_tgid_z:
827       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
828                                   AMDGPU::T1_Z, VT);
829     case Intrinsic::r600_read_tidig_x:
830       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
831                                   AMDGPU::T0_X, VT);
832     case Intrinsic::r600_read_tidig_y:
833       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
834                                   AMDGPU::T0_Y, VT);
835     case Intrinsic::r600_read_tidig_z:
836       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
837                                   AMDGPU::T0_Z, VT);
838     case Intrinsic::AMDGPU_rsq:
839       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
840       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
841 
842     case AMDGPUIntrinsic::AMDGPU_fract:
843     case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
844       return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
845     }
846     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
847     break;
848   }
849   } // end switch(Op.getOpcode())
850   return SDValue();
851 }
852 
853 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
854                                             SmallVectorImpl<SDValue> &Results,
855                                             SelectionDAG &DAG) const {
856   switch (N->getOpcode()) {
857   default:
858     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
859     return;
860   case ISD::FP_TO_UINT:
861     if (N->getValueType(0) == MVT::i1) {
862       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
863       return;
864     }
865     // Fall-through. Since we don't care about out of bounds values
866     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
867     // considers some extra cases which are not necessary here.
868   case ISD::FP_TO_SINT: {
869     SDValue Result;
870     if (expandFP_TO_SINT(N, Result, DAG))
871       Results.push_back(Result);
872     return;
873   }
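  // The 64-bit division cases below are all funneled through the combined
  // UDIVREM/SDIVREM nodes: result 0 is the quotient and result 1 the remainder.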
874   case ISD::UDIV: {
875     SDValue Op = SDValue(N, 0);
876     SDLoc DL(Op);
877     EVT VT = Op.getValueType();
878     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
879       N->getOperand(0), N->getOperand(1));
880     Results.push_back(UDIVREM);
881     break;
882   }
883   case ISD::UREM: {
884     SDValue Op = SDValue(N, 0);
885     SDLoc DL(Op);
886     EVT VT = Op.getValueType();
887     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
888       N->getOperand(0), N->getOperand(1));
889     Results.push_back(UDIVREM.getValue(1));
890     break;
891   }
892   case ISD::SDIV: {
893     SDValue Op = SDValue(N, 0);
894     SDLoc DL(Op);
895     EVT VT = Op.getValueType();
896     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
897       N->getOperand(0), N->getOperand(1));
898     Results.push_back(SDIVREM);
899     break;
900   }
901   case ISD::SREM: {
902     SDValue Op = SDValue(N, 0);
903     SDLoc DL(Op);
904     EVT VT = Op.getValueType();
905     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
906       N->getOperand(0), N->getOperand(1));
907     Results.push_back(SDIVREM.getValue(1));
908     break;
909   }
910   case ISD::SDIVREM: {
911     SDValue Op = SDValue(N, 1);
912     SDValue RES = LowerSDIVREM(Op, DAG);
913     Results.push_back(RES);
914     Results.push_back(RES.getValue(1));
915     break;
916   }
917   case ISD::UDIVREM: {
918     SDValue Op = SDValue(N, 0);
919     LowerUDIVREM64(Op, DAG, Results);
920     break;
921   }
922   }
923 }
924 
925 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
926                                                    SDValue Vector) const {
927 
928   SDLoc DL(Vector);
929   EVT VecVT = Vector.getValueType();
930   EVT EltVT = VecVT.getVectorElementType();
931   SmallVector<SDValue, 8> Args;
932 
933   for (unsigned i = 0, e = VecVT.getVectorNumElements();
934                                                            i != e; ++i) {
935     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
936                                Vector, DAG.getConstant(i, getVectorIdxTy())));
937   }
938 
939   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
940 }
941 
942 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
943                                                     SelectionDAG &DAG) const {
944 
945   SDLoc DL(Op);
946   SDValue Vector = Op.getOperand(0);
947   SDValue Index = Op.getOperand(1);
948 
949   if (isa<ConstantSDNode>(Index) ||
950       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
951     return Op;
952 
953   Vector = vectorToVerticalVector(DAG, Vector);
954   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
955                      Vector, Index);
956 }
957 
958 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
959                                                    SelectionDAG &DAG) const {
960   SDLoc DL(Op);
961   SDValue Vector = Op.getOperand(0);
962   SDValue Value = Op.getOperand(1);
963   SDValue Index = Op.getOperand(2);
964 
965   if (isa<ConstantSDNode>(Index) ||
966       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
967     return Op;
968 
969   Vector = vectorToVerticalVector(DAG, Vector);
970   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
971                                Vector, Value, Index);
972   return vectorToVerticalVector(DAG, Insert);
973 }
974 
975 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
976   // On hw >= R700, COS/SIN input must be between -1. and 1.
977   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
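  // The constant 0.15915494309 below is 1/(2*Pi); it scales the argument so
  // FRACT operates on one full period.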
978   EVT VT = Op.getValueType();
979   SDValue Arg = Op.getOperand(0);
980   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
981       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
982         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
983           DAG.getConstantFP(0.15915494309, MVT::f32)),
984         DAG.getConstantFP(0.5, MVT::f32)));
985   unsigned TrigNode;
986   switch (Op.getOpcode()) {
987   case ISD::FCOS:
988     TrigNode = AMDGPUISD::COS_HW;
989     break;
990   case ISD::FSIN:
991     TrigNode = AMDGPUISD::SIN_HW;
992     break;
993   default:
994     llvm_unreachable("Wrong trig opcode");
995   }
996   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
997       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
998         DAG.getConstantFP(-0.5, MVT::f32)));
999   if (Gen >= AMDGPUSubtarget::R700)
1000     return TrigVal;
1001   // On R600 hw, COS/SIN input must be between -Pi and Pi.
1002   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
1003       DAG.getConstantFP(3.14159265359, MVT::f32));
1004 }
1005 
1006 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1007   SDLoc DL(Op);
1008   EVT VT = Op.getValueType();
1009 
1010   SDValue Lo = Op.getOperand(0);
1011   SDValue Hi = Op.getOperand(1);
1012   SDValue Shift = Op.getOperand(2);
1013   SDValue Zero = DAG.getConstant(0, VT);
1014   SDValue One  = DAG.getConstant(1, VT);
1015 
1016   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1017   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1018   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1019   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1020 
1021   // The dance around Width1 is necessary for the Shift == 0 special case.
1022   // Without it, CompShift could be 32, producing an incorrect result in
1023   // Overflow. So we do the shift in two steps; the alternative would be to
1024   // add a conditional to filter out the special case.
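  // For example, with Shift == 0 we have CompShift == 31, so
  // Overflow == (Lo >> 31) >> 1 == 0, whereas a single shift by
  // (Width - Shift) == 32 would be undefined.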
1025 
1026   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1027   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1028 
1029   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1030   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1031   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1032 
1033   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1034   SDValue LoBig = Zero;
1035 
1036   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1037   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1038 
1039   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1040 }
1041 
1042 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1043   SDLoc DL(Op);
1044   EVT VT = Op.getValueType();
1045 
1046   SDValue Lo = Op.getOperand(0);
1047   SDValue Hi = Op.getOperand(1);
1048   SDValue Shift = Op.getOperand(2);
1049   SDValue Zero = DAG.getConstant(0, VT);
1050   SDValue One  = DAG.getConstant(1, VT);
1051 
1052   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1053 
1054   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1055   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1056   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1057   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1058 
1059   // The dance around Width1 is necessary for the Shift == 0 special case.
1060   // Without it, CompShift could be 32, producing an incorrect result in
1061   // Overflow. So we do the shift in two steps; the alternative would be to
1062   // add a conditional to filter out the special case.
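  // For example, with Shift == 0 we have CompShift == 31, so
  // Overflow == (Hi << 31) << 1 == 0 instead of an undefined shift by 32.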
1063 
1064   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1065   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1066 
1067   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1068   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1069   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1070 
1071   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1072   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1073 
1074   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1075   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1076 
1077   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1078 }
1079 
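// The only FP_TO_UINT result type lowered here is i1: any non-zero float maps
// to true, so the conversion is emitted as a SETNE comparison against 0.0.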
1080 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1081   return DAG.getNode(
1082       ISD::SETCC,
1083       SDLoc(Op),
1084       MVT::i1,
1085       Op, DAG.getConstantFP(0.0f, MVT::f32),
1086       DAG.getCondCode(ISD::SETNE)
1087       );
1088 }
1089 
1090 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1091                                                    SDLoc DL,
1092                                                    unsigned DwordOffset) const {
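  // Implicit kernel parameters live at fixed dword offsets in constant buffer
  // 0, so they are loaded from CONSTANT_BUFFER_0 with the byte offset used
  // directly as the address.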
1093   unsigned ByteOffset = DwordOffset * 4;
1094   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1095                                       AMDGPUAS::CONSTANT_BUFFER_0);
1096 
1097   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1098   assert(isInt<16>(ByteOffset));
1099 
1100   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1101                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
1102                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1103                      false, false, false, 0);
1104 }
1105 
1106 bool R600TargetLowering::isZero(SDValue Op) const {
1107   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1108     return Cst->isNullValue();
1109   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1110     return CstFP->isZero();
1111   } else {
1112     return false;
1113   }
1114 }
1115 
1116 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1117   SDLoc DL(Op);
1118   EVT VT = Op.getValueType();
1119 
1120   SDValue LHS = Op.getOperand(0);
1121   SDValue RHS = Op.getOperand(1);
1122   SDValue True = Op.getOperand(2);
1123   SDValue False = Op.getOperand(3);
1124   SDValue CC = Op.getOperand(4);
1125   SDValue Temp;
1126 
1127   if (VT == MVT::f32) {
1128     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1129     SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1130     if (MinMax)
1131       return MinMax;
1132   }
1133 
1134   // LHS and RHS are guaranteed to be the same value type
1135   EVT CompareVT = LHS.getValueType();
1136 
1137   // Check if we can lower this to a native operation.
1138 
1139   // Try to lower to a SET* instruction:
1140   //
1141   // SET* can match the following patterns:
1142   //
1143   // select_cc f32, f32, -1,  0, cc_supported
1144   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1145   // select_cc i32, i32, -1,  0, cc_supported
1146   //
1147 
1148   // Move hardware True/False values to the correct operand.
1149   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1150   ISD::CondCode InverseCC =
1151      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1152   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1153     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1154       std::swap(False, True);
1155       CC = DAG.getCondCode(InverseCC);
1156     } else {
1157       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1158       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1159         std::swap(False, True);
1160         std::swap(LHS, RHS);
1161         CC = DAG.getCondCode(SwapInvCC);
1162       }
1163     }
1164   }
1165 
1166   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1167       (CompareVT == VT || VT == MVT::i32)) {
1168     // This can be matched by a SET* instruction.
1169     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1170   }
1171 
1172   // Try to lower to a CND* instruction:
1173   //
1174   // CND* can match the following patterns:
1175   //
1176   // select_cc f32, 0.0, f32, f32, cc_supported
1177   // select_cc f32, 0.0, i32, i32, cc_supported
1178   // select_cc i32, 0,   f32, f32, cc_supported
1179   // select_cc i32, 0,   i32, i32, cc_supported
1180   //
1181 
1182   // Try to move the zero value to the RHS
1183   if (isZero(LHS)) {
1184     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1185     // Try swapping the operands
1186     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1187     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1188       std::swap(LHS, RHS);
1189       CC = DAG.getCondCode(CCSwapped);
1190     } else {
1191       // Try inverting the condition and then swapping the operands
1192       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1193       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1194       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1195         std::swap(True, False);
1196         std::swap(LHS, RHS);
1197         CC = DAG.getCondCode(CCSwapped);
1198       }
1199     }
1200   }
1201   if (isZero(RHS)) {
1202     SDValue Cond = LHS;
1203     SDValue Zero = RHS;
1204     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1205     if (CompareVT != VT) {
1206       // Bitcast True / False to the correct types.  This will end up being
1207       // a nop, but it allows us to define only a single pattern in the
1208       // .TD files for each CND* instruction rather than having to have
1209       // one pattern for integer True/False and one for fp True/False
1210       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1211       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1212     }
1213 
1214     switch (CCOpcode) {
1215     case ISD::SETONE:
1216     case ISD::SETUNE:
1217     case ISD::SETNE:
1218       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1219       Temp = True;
1220       True = False;
1221       False = Temp;
1222       break;
1223     default:
1224       break;
1225     }
1226     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1227         Cond, Zero,
1228         True, False,
1229         DAG.getCondCode(CCOpcode));
1230     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1231   }
1232 
1233   // If we make it this far, it means we have no native instruction to handle
1234   // this SELECT_CC, so we must lower it.
1235   SDValue HWTrue, HWFalse;
1236 
1237   if (CompareVT == MVT::f32) {
1238     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1239     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1240   } else if (CompareVT == MVT::i32) {
1241     HWTrue = DAG.getConstant(-1, CompareVT);
1242     HWFalse = DAG.getConstant(0, CompareVT);
1243   }
1244   else {
1245     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1246   }
1247 
1248   // Lower this unsupported SELECT_CC into a combination of two supported
1249   // SELECT_CC operations.
1250   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1251 
1252   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1253       Cond, HWFalse,
1254       True, False,
1255       DAG.getCondCode(ISD::SETNE));
1256 }
1257 
1258 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1259 /// convert these pointers to a register index.  Each register holds
1260 /// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
1261 /// \p StackWidth, which tells us how many of the 4 sub-registers will be used
1262 /// for indirect addressing.
1263 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1264                                                unsigned StackWidth,
1265                                                SelectionDAG &DAG) const {
1266   unsigned SRLPad;
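  // Each indirect-addressing register covers StackWidth * 4 bytes, so a byte
  // address becomes a register index by shifting right by
  // log2(StackWidth * 4).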
1267   switch(StackWidth) {
1268   case 1:
1269     SRLPad = 2;
1270     break;
1271   case 2:
1272     SRLPad = 3;
1273     break;
1274   case 4:
1275     SRLPad = 4;
1276     break;
1277   default: llvm_unreachable("Invalid stack width");
1278   }
1279 
1280   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1281                      DAG.getConstant(SRLPad, MVT::i32));
1282 }
1283 
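// Map a vector element index onto a register channel and a pointer increment,
// based on how many channels of each stack register are in use.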
1284 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1285                                          unsigned ElemIdx,
1286                                          unsigned &Channel,
1287                                          unsigned &PtrIncr) const {
1288   switch (StackWidth) {
1289   default:
1290   case 1:
1291     Channel = 0;
1292     if (ElemIdx > 0) {
1293       PtrIncr = 1;
1294     } else {
1295       PtrIncr = 0;
1296     }
1297     break;
1298   case 2:
1299     Channel = ElemIdx % 2;
1300     if (ElemIdx == 2) {
1301       PtrIncr = 1;
1302     } else {
1303       PtrIncr = 0;
1304     }
1305     break;
1306   case 4:
1307     Channel = ElemIdx;
1308     PtrIncr = 0;
1309     break;
1310   }
1311 }
1312 
1313 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1314   SDLoc DL(Op);
1315   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1316   SDValue Chain = Op.getOperand(0);
1317   SDValue Value = Op.getOperand(1);
1318   SDValue Ptr = Op.getOperand(2);
1319 
1320   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1321   if (Result.getNode()) {
1322     return Result;
1323   }
1324 
1325   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1326     if (StoreNode->isTruncatingStore()) {
1327       EVT VT = Value.getValueType();
1328       assert(VT.bitsLE(MVT::i32));
1329       EVT MemVT = StoreNode->getMemoryVT();
1330       SDValue MaskConstant;
1331       if (MemVT == MVT::i8) {
1332         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1333       } else {
1334         assert(MemVT == MVT::i16);
1335         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1336       }
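      // Emit a masked-OR (MSKOR) store: compute the containing dword address
      // and the byte offset within it, then shift both the truncated value and
      // its mask into position so they can be merged into the existing dword.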
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Ret.getNode()) {
    return Ret;
  }
  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
      getTargetMachine().getSubtargetImpl()->getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SmallVector<SDValue, 4> Stores(NumElemVT);

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in store");

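    // Store one element per iteration: getStackAddress maps the element index
    // to a (channel, pointer increment) pair for the configured stack width,
    // and each element becomes its own REGISTER_STORE chained via TokenFactor.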
    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// Returns 512 + (kc_bank << 12).
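// e.g. CONSTANT_BUFFER_2 maps to 512 + 4096 * 2 = 8704.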
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
  if (Ret.getNode()) {
    SDValue Ops[2] = {
      Ret,
      Chain
    };
    return DAG.getMergeValues(Ops, DL);
  }

  // Lower constant address space loads of global variables.
  if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
      isa<GlobalVariable>(
          GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {

    SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
        DAG.getConstant(2, MVT::i32));
    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
                       LoadNode->getChain(), Ptr,
                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
  }

  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
    SDValue MergedValues[2] = {
      ScalarizeVectorLoad(Op, DAG),
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the constant position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr / 16, since LLVM lays the constants out with an
        // alignment of 16. Thus we add ConstantBlock * 16 + chan * 4 to Ptr
        // here and then divide by 4 at the ISel step.
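        // For example, with kc_bank = 0 (ConstantBlock = 512), Ptr = 16 and
        // i = 2 this computes 16 + 8 + 8192 = 8216, which after the divide by
        // 4 at ISel becomes 2054 = ((512 + 1) << 2) + 2.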
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
                           makeArrayRef(Slots, NumElements));
    } else {
      // A non-constant pointer can't be folded; keep this as a v4i32
      // CONST_ADDRESS load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
          DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  // For most operations, returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign extended when it is uploaded to the
  // buffer. However, SEXT loads from other address spaces are not supported, so
  // we need to expand them here.
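  // For an i32 result, an i8 SEXT load therefore becomes
  // (sra (shl (extload i8), 24), 24) and an i16 SEXT load becomes
  // (sra (shl (extload i16), 16), 16).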
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue ShiftAmount =
        DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
                                     LoadNode->getPointerInfo(), MemVT,
                                     LoadNode->isVolatile(),
                                     LoadNode->isNonTemporal(),
                                     LoadNode->isInvariant(),
                                     LoadNode->getAlignment());
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);

    SDValue MergedValues[2] = { Sra, Chain };
    return DAG.getMergeValues(MergedValues, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
      getTargetMachine().getSubtargetImpl()->getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

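    // Load one element per channel: getStackAddress maps the element index to
    // a (channel, pointer increment) pair, and any remaining lanes up to four
    // are filled with undef before rebuilding the vector.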
    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2] = {
    LoweredLoad,
    Chain
  };

  return DAG.getMergeValues(Ops, DL);
}

SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond  = Op.getOperand(1);
  SDValue Jump  = Op.getOperand(2);

  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
                     Chain, Jump, Cond);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

  SmallVector<ISD::InputArg, 8> LocalIns;

  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);

  AnalyzeFormalArguments(CCInfo, LocalIns);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    const ISD::InputArg &In = Ins[i];
    EVT VT = In.VT;
    EVT MemVT = VA.getLocVT();
    if (!VT.isVector() && MemVT.isVector()) {
      // Get load source type if scalarized.
      MemVT = MemVT.getVectorElementType();
    }

    if (MFI->getShaderType() != ShaderType::COMPUTE) {
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // i64 isn't a legal type, so the register type used ends up as i32, which
    // isn't expected here. It attempts to create this sextload, but it ends up
    // being invalid. Somehow this seems to work with i64 arguments, but breaks
    // for <1 x i64>.

    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.
    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
      // FIXME: This should really check the extload type, but the handling of
      // extload vector parameters seems to be broken.

      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
      Ext = ISD::SEXTLOAD;
    }

    // Compute the offset from the value.
    // XXX - I think PartOffset should give you this, but it seems to give the
    // size of the register which isn't useful.

    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
    unsigned PartOffset = VA.getLocMemOffset();
    unsigned Offset = 36 + VA.getLocMemOffset();

    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
    SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
                              DAG.getConstant(Offset, MVT::i32),
                              DAG.getUNDEF(MVT::i32),
                              PtrInfo,
                              MemVT, false, true, true, 4);

    // 4 is the preferred alignment for the CONSTANT memory space.
    InVals.push_back(Arg);
    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

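/// Replace build_vector operands that can be encoded directly in a swizzle:
/// undef lanes become SEL_MASK_WRITE, constant 0.0/1.0 lanes become
/// SEL_0/SEL_1, and lanes that duplicate an earlier lane are remapped to it.
/// The chosen selectors are recorded in \p RemapSwizzle.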
static SDValue CompactSwizzlableVector(
  SelectionDAG &DAG, SDValue VectorEntry,
  DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      // Mask the write here to teach later passes that the ith element of this
      // vector is undef. Thus we can use it to reduce 128-bit register usage,
      // break false dependencies and additionally make the assembly easier to
      // read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec);
}

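/// Try to place an extract_vector_elt operand into the lane matching its
/// source element index (performing at most one swap), leaving operands that
/// are already in their matching lane untouched. The swap is recorded in
/// \p RemapSwizzle.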
static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
      VectorEntry.getOperand(0),
      VectorEntry.getOperand(1),
      VectorEntry.getOperand(2),
      VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++) {
    RemapSwizzle[i] = i;
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (i == Idx)
        isUnmovable[Idx] = true;
    }
  }

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec);
}

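/// Rewrite \p BuildVector with CompactSwizzlableVector and ReorganizeVector,
/// updating the four swizzle selectors in \p Swz to match the remapped lanes.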
SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
                                            SDValue Swz[4],
                                            SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
      SDValue Arg = N->getOperand(0);
      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                           Arg.getOperand(0));
      }
      break;
    }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                           SelectCC.getOperand(0), // LHS
                           SelectCC.getOperand(1), // RHS
                           DAG.getConstant(-1, MVT::i32), // True
                           DAG.getConstant(0, MVT::i32),  // False
                           SelectCC.getOperand(4)); // CC

    break;
  }

  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
  // => build_vector elt0, ... , NewEltIdx, ... , eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.getOpcode() == ISD::UNDEF)
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
    // vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.getOpcode() == ISD::UNDEF) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be custom combined.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
            Arg->getOperand(0).getOperand(Element));
      }
    }
  }

  case ISD::SELECT_CC: {
    // Try common optimizations
    SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    if (Ret.getNode())
      return Ret;

    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //      selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                  LHS.getOperand(0).getValueType().isInteger());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(SDLoc(N),
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7) // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18),
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
        NewArgs);
  }
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

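/// Try to fold the operation that produces \p Src into an operand modifier of
/// \p ParentNode: FNEG/FABS become the neg/abs bits, CONST_COPY becomes an
/// ALU_CONST read with its selector in \p Sel, and MOV_IMM_* becomes either an
/// inline constant register or an ALU_LITERAL_X value placed in \p Imm.
/// Returns true and updates the operands if the fold succeeded.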
static bool
FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
  if (!Src.isMachineOpcode())
    return false;
  switch (Src.getMachineOpcode()) {
  case AMDGPU::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather constant values
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (int OtherSrcIdx : SrcIndices) {
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == AMDGPU::ALU_CONST) {
          ConstantSDNode *Cst
            = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    return true;
  }
  case AMDGPU::MOV_IMM_I32:
  case AMDGPU::MOV_IMM_F32: {
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

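    // Prefer the hardware inline constant registers (ZERO, HALF, ONE,
    // ONE_INT) when the immediate matches one of them; otherwise fall back to
    // the ALU_LITERAL_X slot and record the raw bits in ImmValue.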
    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (Value == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
      assert(C);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}

/// \brief Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
  if (!Node->isMachineOpcode())
    return Node;
  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops;
  for (const SDUse &I : Node->ops())
    Ops.push_back(I);

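  // DOT_4 reads two four-component sources; each component has its own
  // src/neg/abs/sel operands, so try to fold modifiers into every component
  // and re-emit the node as soon as one fold succeeds.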
  if (Opcode == AMDGPU::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::CLAMP_R600) {
    SDValue Src = Node->getOperand(0);
    if (!Src.isMachineOpcode() ||
        !TII->hasInstrModifiers(Src.getMachineOpcode()))
      return Node;
    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
        AMDGPU::OpName::clamp);
    if (ClampIdx < 0)
      return Node;
    std::vector<SDValue> Ops;
    unsigned NumOp = Src.getNumOperands();
    for (unsigned i = 0; i < NumOp; ++i)
      Ops.push_back(Src.getOperand(i));
    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
        Node->getVTList(), Ops);
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}