1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include <string>
10 #include <iostream>
11 #include <sstream>
12 #include <fstream>
13 #include <list>
14
15 #include "visa_igc_common_header.h"
16 #include "Common_ISA_util.h"
17 #include "Common_ISA_framework.h"
18 #include "JitterDataStruct.h"
19 #include "BuildIR.h"
20 #include "common.h"
21 #include "Timer.h"
22
23 using namespace vISA;
24
~DeclarePool()25 DeclarePool::~DeclarePool()
26 {
27 for (unsigned i = 0, size = (unsigned)dcllist.size(); i < size; i++) {
28 G4_Declare* dcl = dcllist[i];
29 dcl->~G4_Declare();
30 }
31 dcllist.clear();
32 }
33
createDeclare(const char * name,G4_RegFileKind regFile,unsigned short nElems,unsigned short nRows,G4_Type ty,DeclareType kind,G4_RegVar * base,G4_Operand * repRegion,G4_ExecSize execSize)34 G4_Declare* DeclarePool::createDeclare(
35 const char* name,
36 G4_RegFileKind regFile,
37 unsigned short nElems,
38 unsigned short nRows,
39 G4_Type ty,
40 DeclareType kind,
41 G4_RegVar * base,
42 G4_Operand * repRegion,
43 G4_ExecSize execSize)
44 {
45 G4_Declare* dcl = new (mem) G4_Declare(name, regFile, nElems * nRows, ty, dcllist);
46 G4_RegVar * regVar;
47 if (kind == DeclareType::Regular)
48 regVar = new (mem) G4_RegVar(dcl, G4_RegVar::RegVarType::Default);
49 else if (kind == DeclareType::AddrSpill)
50 regVar = new (mem) G4_RegVarAddrSpillLoc(dcl, addrSpillLocCount);
51 else if (kind == DeclareType::Tmp)
52 regVar = new (mem) G4_RegVarTmp(dcl, base);
53 else if (kind == DeclareType::Spill)
54 regVar = new (mem) G4_RegVarTransient(dcl, base, repRegion->asDstRegRegion(), execSize, G4_RegVarTransient::TransientType::Spill);
55 else if (kind == DeclareType::Fill)
56 regVar = new (mem)G4_RegVarTransient(dcl, base, repRegion->asSrcRegRegion(), execSize, G4_RegVarTransient::TransientType::Fill);
57 else if (kind == DeclareType::CoalescedFill || kind == DeclareType::CoalescedSpill)
58 regVar = new (mem)G4_RegVarCoalesced(dcl, kind == DeclareType::CoalescedFill);
59 else
60 {
61 MUST_BE_TRUE(false, ERROR_INTERNAL_ARGUMENT);
62 regVar = NULL;
63 }
64 dcl->setRegVar(regVar);
65
66 if (regFile == G4_ADDRESS || regFile == G4_SCALAR)
67 {
68 dcl->setSubRegAlign(Any);
69 }
70 else if (regFile != G4_FLAG)
71 {
72 if ((unsigned int)nElems * nRows * TypeSize(ty) >= numEltPerGRF<Type_UB>())
73 {
74 dcl->setSubRegAlign(GRFALIGN);
75 }
76 else
77 {
78 // at a minimum subRegAlign has to be at least the type size
79 dcl->setSubRegAlign(Get_G4_SubRegAlign_From_Type(ty));
80 }
81 }
82 else
83 {
84 if (dcl->getNumberFlagElements() == 32)
85 {
86 dcl->setSubRegAlign(Even_Word);
87 }
88 }
89
90 return dcl;
91 }
92
93
addImmVal(G4_Imm * imm,int numElt)94 G4_Declare * IR_Builder::GlobalImmPool::addImmVal(G4_Imm* imm, int numElt)
95 {
96 ImmVal val = { imm, numElt };
97 for (int i = 0; i < curSize; ++i)
98 {
99 if (val == immArray[i])
100 {
101 return dclArray[i];
102 }
103 }
104 if (curSize == MAX_POOL_SIZE)
105 {
106 return nullptr;
107 }
108 immArray[curSize] = val;
109 dclArray[curSize] = builder.createTempVar(numElt, imm->getType(), Any);
110 return dclArray[curSize++];
111 }
112
113
114 ///////////////////////////////////////////////////////////////////////////////
115 // IR_Builder functions (except translateXXXX, which should be in VisaToG4)
116 //
117
dump(std::ostream & os)118 void IR_Builder::dump(std::ostream &os)
119 {
120 os << "DECLARES:\n";
121 for (const G4_Declare *dcl : kernel.Declares) {
122 dcl->emit(os);
123 os << "\n";
124 }
125 os << "\n";
126 os << "INSTS:\n";
127 for (G4_INST *i : instList) {
128 i->emit(os, false, false);
129 os << "\n";
130 }
131 }
132
133
134 // bind a vISA input variable <dcl> to the GRF byte offset <offset>
bindInputDecl(G4_Declare * dcl,int offset)135 void IR_Builder::bindInputDecl(G4_Declare* dcl, int offset)
136 { // decide the physical register number and sub register number
137 unsigned int regNum = offset / getGRFSize();
138 unsigned int subRegNum = (offset % getGRFSize()) / dcl->getElemSize();
139 dcl->getRegVar()->setPhyReg(phyregpool.getGreg(regNum), subRegNum);
140 dcl->setRegFile(G4_INPUT);
141 unsigned int reservedGRFNum = m_options->getuInt32Option(vISA_ReservedGRFNum);
142 if (regNum + dcl->getNumRows() > kernel.getNumRegTotal() - reservedGRFNum) {
143 MUST_BE_TRUE(false, "INPUT payload execeeds the regsiter number");
144 }
145 }
146
147 // check if an operand is aligned to <align_byte>
isOpndAligned(G4_Operand * opnd,unsigned short & offset,int align_byte) const148 bool IR_Builder::isOpndAligned(
149 G4_Operand *opnd, unsigned short &offset, int align_byte) const
150 {
151 offset = 0;
152 bool isAligned = true;
153
154 switch (opnd->getKind())
155 {
156 case G4_Operand::immediate:
157 case G4_Operand::addrExp:
158 case G4_Operand::label:
159 case G4_Operand::condMod:
160 case G4_Operand::predicate:
161 {
162 isAligned = true;
163 break;
164 }
165 case G4_Operand::srcRegRegion:
166 case G4_Operand::dstRegRegion:
167 {
168 int type_size = opnd->getTypeSize();
169 G4_Declare *dcl = NULL;
170 if (opnd->getBase()->isRegVar())
171 {
172 dcl = opnd->getBase()->asRegVar()->getDeclare();
173 while (dcl && dcl->getAliasDeclare())
174 {
175 if (dcl->getSubRegAlign() != Any &&
176 (((dcl->getSubRegAlign() * 2) >= align_byte && (dcl->getSubRegAlign() * 2) % align_byte != 0) ||
177 ((dcl->getSubRegAlign() * 2) < align_byte && align_byte % (dcl->getSubRegAlign() * 2) != 0)))
178 {
179 isAligned = false;
180 break;
181 }
182 offset += (unsigned short) dcl->getAliasOffset();
183 dcl = dcl->getAliasDeclare();
184 }
185
186 if (dcl && dcl->getRegVar() && dcl->getRegVar()->isPhyRegAssigned())
187 {
188 offset += static_cast<unsigned short>(dcl->getRegVar()->getByteAddr());
189 }
190 }
191 if (!isAligned)
192 {
193 return isAligned;
194 }
195
196 if (opnd->isDstRegRegion())
197 {
198 if (opnd->asDstRegRegion()->getRegAccess() != Direct)
199 {
200 isAligned = false;
201 }
202 offset += opnd->asDstRegRegion()->getRegOff() * numEltPerGRF<Type_UB>() + opnd->asDstRegRegion()->getSubRegOff() * type_size;
203 }
204 else if (opnd->isSrcRegRegion())
205 {
206 if (opnd->asSrcRegRegion()->getRegAccess() != Direct)
207 {
208 isAligned = false;
209 }
210 offset += opnd->asSrcRegRegion()->getRegOff() * numEltPerGRF<Type_UB>() + opnd->asSrcRegRegion()->getSubRegOff() * type_size;
211 }
212 if (offset % align_byte != 0)
213 {
214 return false;
215 }
216 // Only alignment of the top dcl can be changed.
217 if (dcl && dcl->getRegFile() == G4_GRF)
218 {
219 if (dcl->getSubRegAlign() == Any ||
220 ((dcl->getSubRegAlign() * 2) < align_byte && align_byte % (dcl->getSubRegAlign() * 2) == 0))
221 {
222 dcl->setSubRegAlign(G4_SubReg_Align(align_byte / 2));
223 }
224 else if ((dcl->getSubRegAlign() * 2) < align_byte || (dcl->getSubRegAlign() * 2) % align_byte != 0)
225 {
226 isAligned = false;
227 }
228 }
229 else if (opnd->getKind() == G4_Operand::dstRegRegion &&
230 // Only care about GRF or half-GRF alignment.
231 (align_byte == numEltPerGRF<Type_UB>() || align_byte == numEltPerGRF<Type_UB>() / 2) &&
232 dcl && (dcl->getRegFile() == G4_ADDRESS))
233 {
234
235 // Get the single definition of the specified operand from the use
236 // inst.
237 auto getSingleDefInst = [](G4_INST *UI, Gen4_Operand_Number OpndNum)
238 -> G4_INST * {
239 G4_INST *Def = nullptr;
240 for (DEF_EDGE_LIST_ITER I = UI->defInstList.begin(),
241 E = UI->defInstList.end();
242 I != E; ++I) {
243 if (I->second != OpndNum)
244 continue;
245 if (Def) {
246 // Not single defined, bail out
247 Def = nullptr;
248 break;
249 }
250 Def = I->first;
251 }
252 return Def;
253 };
254
255 G4_INST *inst = opnd->getInst();
256 if (inst) {
257 // Check address calculation like:
258 //
259 // shl (1) V1 V0 imm
260 // add (1) a0 $V2 + off V1
261 // ...
262 // (use)... r[a0, disp] ...
263 //
264 // need to check both disp, off, and V1 are aligned.
265 //
266 // Check acc_use_op's def-list.
267 G4_INST *LEA = getSingleDefInst(inst, Opnd_dst);
268 if (LEA && LEA->opcode() == G4_add && LEA->getExecSize() == g4::SIMD1) {
269 isAligned = true;
270 G4_Operand *Op0 = LEA->getSrc(0);
271 G4_Operand *Op1 = LEA->getSrc(1);
272 if (Op0->isSrcRegRegion()) {
273 // TODO: Consider MUL as well.
274 G4_INST *Def = getSingleDefInst(LEA, Opnd_src0);
275 if (Def && Def->opcode() == G4_shl &&
276 Def->getSrc(1)->isImm()) {
277 G4_Imm *Imm = Def->getSrc(1)->asImm();
278 unsigned Factor = (1U << Imm->getInt());
279 // TODO: We only perform alignment checking on
280 // component wise and may need to consider checking
281 // the accumulated result.
282 if (Factor % align_byte != 0)
283 isAligned = false;
284 } else if (Def && Def->opcode() == G4_and &&
285 Def->getSrc(1)->isImm()) {
286 G4_Imm *Imm = Def->getSrc(1)->asImm();
287 uint64_t Mask = uint64_t(Imm->getInt());
288 // align_byte could be 32 or 16 guarded previsouly.
289 uint64_t AlignMask = align_byte - 1;
290 if ((Mask & AlignMask) != 0)
291 isAligned = false;
292 } else
293 isAligned = false;
294 }
295 if (isAligned && Op1->isAddrExp()) {
296 G4_AddrExp *AE = Op1->asAddrExp();
297 G4_Declare *Dcl = AE->getRegVar()->getDeclare();
298 unsigned AliasOffset = 0;
299 while (Dcl && Dcl->getAliasDeclare()) {
300 AliasOffset += Dcl->getAliasOffset();
301 Dcl = Dcl->getAliasDeclare();
302 }
303 // TODO: We only perform alignment checking on
304 // component wise and may need to consider checking
305 // the accumulated result.
306 if ((AliasOffset % align_byte) != 0 ||
307 (Dcl && Dcl->getSubRegAlign() != GRFALIGN &&
308 Dcl->getSubRegAlign() != Sixteen_Word &&
309 Dcl->getSubRegAlign() != Eight_Word) ||
310 AE->getOffset() % align_byte != 0) {
311 isAligned = false;
312 }
313 } else
314 isAligned = false;
315 if (isAligned) {
316 // TODO: We only perform alignment checking on
317 // component wise and may need to consider checking
318 // the accumulated result.
319 if (opnd->asDstRegRegion()->getAddrImm() % align_byte != 0)
320 isAligned = false;
321 }
322 }
323 }
324 }
325 else if (dcl && dcl->getRegFile() == G4_FLAG)
326 {
327 // need to make flag even-word aligned if it's used in a setp with dword source
328 // ToDo: should we fix input to use 16-bit value instead
329 if (align_byte == 4)
330 {
331 dcl->setSubRegAlign(Even_Word);
332 }
333 }
334 break;
335 }
336 default:
337 break;
338 }
339 return isAligned;
340 }
341
342
isOpndAligned(G4_Operand * opnd,int alignByte) const343 bool IR_Builder::isOpndAligned(G4_Operand* opnd, int alignByte) const
344 {
345 uint16_t offset = 0; // ignored
346 return isOpndAligned(opnd, offset, alignByte);
347 }
348
349
predefinedVarRegAssignment(uint8_t inputSize)350 void IR_Builder::predefinedVarRegAssignment(uint8_t inputSize)
351 {
352 uint32_t preDefinedStart = ((inputSize + G4_DSIZE - 1) / G4_DSIZE) * G4_DSIZE;
353 if (preDefinedStart == 0)
354 {
355 preDefinedStart = numEltPerGRF<Type_UB>();
356 }
357 for (PreDefinedVarsInternal i : allPreDefVars)
358 {
359 if (!predefinedVarNeedGRF(i))
360 {
361 continue;
362 }
363
364 G4_Type ty = GetGenTypeFromVISAType(getPredefinedVarType(i));
365 G4_Declare *dcl = preDefVars.getPreDefinedVar((PreDefinedVarsInternal)i);
366 if (!isPredefinedVarInR0((PreDefinedVarsInternal)i))
367 {
368 unsigned short new_offset = preDefinedStart + getPredefinedVarByteOffset(i);
369 unsigned int regNum = new_offset / numEltPerGRF<Type_UB>();
370 unsigned int subRegNum = (new_offset % numEltPerGRF<Type_UB>()) / TypeSize(ty);
371 dcl->getRegVar()->setPhyReg(phyregpool.getGreg(regNum), subRegNum);
372 }
373 else
374 {
375 unsigned int regNum = 0;
376 unsigned int subRegNum = getPredefinedVarByteOffset(i) / TypeSize(ty);
377 dcl->getRegVar()->setPhyReg(phyregpool.getGreg(regNum), subRegNum);
378 }
379 }
380 }
381
382 // Expand some of the pre-defined variables at kernel entry
383 // -- replace pre-defined V17 (hw_tid)
384 // -- replace pre-defined V22 (color)
385 // -- replace pre-defined V1 (thread_x)
386 // -- replace pre-defined V2 (thread_y)
expandPredefinedVars()387 void IR_Builder::expandPredefinedVars()
388 {
389
390 // Use FFTID from msg header
391 // and (1) hw_tid, r0.5, 0x3ff
392 //
393 // 9:0 FFTID. This ID is assigned by TS and is a unique identifier for the thread in
394 // comparison to other concurrent root threads. It is used to free up resources used
395 // by the thread upon thread completion.
396 //
397 // [Pre-DevBDW]: Format = U8. Bits 9:8 are Reserved, MBZ.
398 //
399 // [0:8] For Pre-Gen9
400 // [0:9] For Gen10+
401 //
402
403 // first non-label instruction
404 auto iter = std::find_if(instList.begin(), instList.end(), [](G4_INST* inst) { return !inst->isLabel(); });
405
406 if (preDefVars.isHasPredefined(PreDefinedVarsInternal::HW_TID))
407 {
408 const unsigned fftid_mask = getPlatform() >= GENX_CNL ? 0x3FF : 0x1FF;
409 G4_SrcRegRegion* src = createSrc(realR0->getRegVar(), 0, 5, getRegionScalar(), Type_UD);
410 G4_Imm* mask1 = createImm(fftid_mask, Type_UD);
411 G4_DstRegRegion* dst = createDstRegRegion(builtinHWTID, 1);
412 G4_INST* inst = createBinOp(G4_and, g4::SIMD1, dst, src, mask1, InstOpt_WriteEnable, false);
413 instList.insert(iter, inst);
414 }
415
416 if (preDefVars.isHasPredefined(PreDefinedVarsInternal::X))
417 {
418 if (useNewR0Format())
419 {
420 // x -> and (1) thread_x<1>:uw r0.1:ud 0xFFF
421 G4_SrcRegRegion* r0Dot1UD = createSrc(
422 realR0->getRegVar(), 0, 1, getRegionScalar(), Type_UD);
423 G4_DstRegRegion* dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::X), 1);
424 G4_INST* inst = createBinOp(G4_and, g4::SIMD1, dst, r0Dot1UD,
425 createImm(0xFFF, Type_UW), InstOpt_WriteEnable, false);
426 instList.insert(iter, inst);
427 }
428 else
429 {
430 // We insert the new instruction
431 // and (1) thread_x<1>:uw, r0.2:uw, 0x01FF
432 G4_SrcRegRegion* r0Dot2UW = createSrc(
433 realR0->getRegVar(), 0, 2, getRegionScalar(), Type_UW);
434 int64_t mask = getThreadIDMask();
435 G4_Imm* src1 = createImm(mask, Type_UW);
436 G4_DstRegRegion* dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::X), 1);
437 G4_INST* inst = createBinOp(G4_and, g4::SIMD1, dst, r0Dot2UW, src1, InstOpt_WriteEnable, false);
438 instList.insert(iter, inst);
439 }
440 }
441
442 if (preDefVars.isHasPredefined(PreDefinedVarsInternal::Y))
443 {
444 if (useNewR0Format())
445 {
446 // y -> shr (1) thread_y<1>:uw r0.1:ud 12
447 // and (1) thread_y<1>:uw thread_y:uw 0xFFF
448 G4_SrcRegRegion* r0Dot1UD = createSrc(
449 realR0->getRegVar(), 0, 1, getRegionScalar(), Type_UD);
450
451 G4_DstRegRegion* dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::Y), 1);
452 G4_INST* inst1 = createBinOp(G4_shr, g4::SIMD1, dst, r0Dot1UD,
453 createImm(12, Type_UW), InstOpt_WriteEnable, false);
454 instList.insert(iter, inst1);
455 dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::Y), 1);
456 G4_INST* inst2 = createBinOp(G4_and, g4::SIMD1, dst,
457 createSrcRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::Y), getRegionScalar()),
458 createImm(0xFFF, Type_UW), InstOpt_WriteEnable, false);
459 instList.insert(iter, inst2);
460 }
461 else
462 {
463 // We insert the new instruction
464 // and (1) thread_y<1>:uw, r0.3:uw, 0x01FF
465 G4_SrcRegRegion* r0Dot3UW = createSrc(
466 realR0->getRegVar(), 0, 3, getRegionScalar(), Type_UW);
467 int64_t mask = getThreadIDMask();
468 G4_Imm* src1 = createImmWithLowerType(mask, Type_UW);
469 G4_DstRegRegion* dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::Y), 1);
470 G4_INST* inst = createBinOp(G4_and, g4::SIMD1, dst, r0Dot3UW, src1, InstOpt_WriteEnable, false);
471 instList.insert(iter, inst);
472 }
473 }
474
475 // color bit
476 if (preDefVars.isHasPredefined(PreDefinedVarsInternal::COLOR))
477 {
478 if (useNewR0Format())
479 {
480 // r0.1[31:24]
481 // shr (1) color<2>:uw r0.1<0;1,0>:ud 24
482 G4_SrcRegRegion* src = createSrc(realR0->getRegVar(),
483 0, 1, getRegionScalar(), Type_UD);
484 G4_Imm* shift = createImm(24, Type_UW);
485 G4_DstRegRegion* dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::COLOR), 2);
486 G4_INST* inst = createBinOp(G4_shr, g4::SIMD1, dst, src, shift,
487 InstOpt_WriteEnable, false);
488 instList.insert(iter, inst);
489 }
490 else
491 {
492 // else: r0.2[3:0]
493 // and (1) color<2>:uw r0.2<0;1,0>:ud 0xF
494 G4_SrcRegRegion* src = createSrc(realR0->getRegVar(),
495 0, 2, getRegionScalar(), Type_UD);
496 G4_Imm* mask = createImm(0xF, Type_UW);
497 G4_DstRegRegion* dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::COLOR), 2);
498 G4_INST* inst = createBinOp(G4_and, g4::SIMD1, dst, src, mask,
499 InstOpt_WriteEnable, false);
500 instList.insert(iter, inst);
501 }
502 }
503 }
504
getFCPatchInfo()505 FCPatchingInfo* IR_Builder::getFCPatchInfo()
506 {
507 // Create new instance of FC patching class if one is not
508 // yet created.
509 if (fcPatchInfo == NULL)
510 {
511 FCPatchingInfo* instance;
512 instance = (FCPatchingInfo*)mem.alloc(sizeof(FCPatchingInfo));
513 fcPatchInfo = new (instance) FCPatchingInfo();
514 }
515
516 return fcPatchInfo;
517 }
518
getNameString(Mem_Manager & mem,size_t size,const char * format,...)519 const char* IR_Builder::getNameString(
520 Mem_Manager& mem, size_t size, const char* format, ...)
521 {
522 #ifdef _DEBUG
523 char* name = (char*) mem.alloc(size);
524 va_list args;
525 va_start(args, format);
526 std::vsnprintf(name, size, format, args);
527 va_end(args);
528 return name;
529 #else
530 return "";
531 #endif
532 }
533
getFcallInfo(const G4_INST * inst) const534 G4_FCALL* IR_Builder::getFcallInfo(const G4_INST* inst) const {
535 auto it = m_fcallInfo.find(inst);
536 if (m_fcallInfo.end() == it) {
537 return nullptr;
538 } else {
539 return it->second;
540 }
541 }
542
createPreDefinedVars()543 void IR_Builder::createPreDefinedVars()
544 {
545 for (PreDefinedVarsInternal i : allPreDefVars)
546 {
547 G4_Declare* dcl = nullptr;
548
549 if (predefinedVarNeedGRF(i))
550 {
551 // work item id variables are handled uniformly
552 G4_Type ty = GetGenTypeFromVISAType(getPredefinedVarType(i));
553 dcl = createPreVar(getPredefinedVarID(i), 1, ty);
554 }
555 else
556 {
557 const char* name = getPredefinedVarString(i);
558 switch (i)
559 {
560 case PreDefinedVarsInternal::VAR_NULL:
561 dcl = createDeclareNoLookup(name, G4_GRF, 1, 1, Type_UD);
562 dcl->getRegVar()->setPhyReg(phyregpool.getNullReg(), 0);
563 break;
564 case PreDefinedVarsInternal::TSC:
565 {
566 G4_Declare* tscDcl = createPreVar(i, 5, Type_UD);
567 tscDcl->getRegVar()->setPhyReg(phyregpool.getTm0Reg(), 0);
568 dcl = tscDcl;
569 break;
570 }
571 case PreDefinedVarsInternal::R0:
572 {
573 dcl = getBuiltinR0();
574 break;
575 }
576 case PreDefinedVarsInternal::SR0:
577 {
578 G4_Declare* sr0Dcl = createPreVar(i, 4, Type_UD);
579 sr0Dcl->getRegVar()->setPhyReg(phyregpool.getSr0Reg(), 0);
580 dcl = sr0Dcl;
581 break;
582 }
583 case PreDefinedVarsInternal::CR0:
584 {
585 G4_Declare* cr0Dcl = createPreVar(i, 3, Type_UD);
586 cr0Dcl->getRegVar()->setPhyReg(phyregpool.getCr0Reg(), 0);
587 dcl = cr0Dcl;
588 break;
589 }
590 case PreDefinedVarsInternal::CE0:
591 {
592 G4_Declare* ce0Dcl = createPreVar(i, 1, Type_UD);
593 ce0Dcl->getRegVar()->setPhyReg(phyregpool.getMask0Reg(), 0);
594 dcl = ce0Dcl;
595 break;
596 }
597 case PreDefinedVarsInternal::DBG:
598 {
599 G4_Declare* dbgDcl = createPreVar(i, 2, Type_UD);
600 dbgDcl->getRegVar()->setPhyReg(phyregpool.getDbgReg(), 0);
601 dcl = dbgDcl;
602 break;
603 }
604 case PreDefinedVarsInternal::ARG:
605 {
606 dcl = createDeclareNoLookup(name, G4_INPUT, numEltPerGRF<Type_UD>(), 32, Type_UD);
607 dcl->getRegVar()->setPhyReg(phyregpool.getGreg(ArgRet_Stackcall::Arg), 0);
608 break;
609 }
610 case PreDefinedVarsInternal::RET:
611 {
612 dcl = createDeclareNoLookup(name, G4_GRF, numEltPerGRF<Type_UD>(), 12, Type_UD);
613 dcl->getRegVar()->setPhyReg(phyregpool.getGreg(ArgRet_Stackcall::Ret), 0);
614 dcl->setLiveOut();
615 break;
616 }
617 case PreDefinedVarsInternal::FE_SP:
618 {
619 unsigned int startReg = kernel.getFPSPGRF();
620 dcl = createDeclareNoLookup(name, G4_GRF, 1, 1, Type_UQ);
621 dcl->getRegVar()->setPhyReg(phyregpool.getGreg(startReg), SubRegs_Stackcall::FE_SP);
622 break;
623 }
624 case PreDefinedVarsInternal::FE_FP:
625 {
626 // PREDEFINED_FE_FP
627 unsigned int startReg = kernel.getFPSPGRF();
628 dcl = createDeclareNoLookup(name, G4_GRF, 1, 1, Type_UQ);
629 dcl->getRegVar()->setPhyReg(phyregpool.getGreg(startReg), SubRegs_Stackcall::FE_FP);
630 break;
631 }
632 case PreDefinedVarsInternal::HW_TID:
633 {
634 // PREDEFINED_HW_TID
635 dcl = getBuiltinHWTID();
636 break;
637 }
638 case PreDefinedVarsInternal::X:
639 case PreDefinedVarsInternal::Y:
640 case PreDefinedVarsInternal::COLOR:
641 {
642 // these three are size 1 UW
643 dcl = createDeclareNoLookup(name, G4_GRF, 1, 1,
644 GetGenTypeFromVISAType(getPredefinedVarType(i)));
645 break;
646 }
647 case PreDefinedVarsInternal::IMPL_ARG_BUF_PTR:
648 {
649 dcl = createDeclareNoLookup("implBufPtr", G4_GRF, 1, 1, Type_UQ);
650 auto phyReg = phyregpool.getGreg(kernel.getSpillHeaderGRF());
651 dcl->getRegVar()->setPhyReg(phyReg, SubRegs_ImplPtrs::ImplBufPtr);
652 break;
653 }
654
655 case PreDefinedVarsInternal::LOCAL_ID_BUF_PTR:
656 {
657 dcl = createDeclareNoLookup("localIdBufPtr", G4_GRF, 1, 1, Type_UQ);
658 auto phyReg = phyregpool.getGreg(kernel.getSpillHeaderGRF());
659 dcl->getRegVar()->setPhyReg(phyReg, SubRegs_ImplPtrs::LocalIdBufPtr);
660 break;
661 }
662
663 default:
664 {
665 break;
666 }
667 }
668 }
669 preDefVars.setPredefinedVar(i, dcl);
670 dcl->setPreDefinedVar(true);
671 }
672 }
673
createBuiltinDecls()674 void IR_Builder::createBuiltinDecls()
675 {
676 // realR0 is always tied to physical r0
677 auto numR0DW = numEltPerGRF<Type_UD>();
678 realR0 = createDeclareNoLookup(
679 "BuiltInR0",
680 G4_INPUT,
681 numR0DW,
682 1,
683 Type_UD);
684 realR0->getRegVar()->setPhyReg(phyregpool.getGreg(0), 0);
685
686 // builtinR0 either gets allocated to r0 or to a different
687 // register depending on conditions in RA.
688 builtinR0 = createTempVar(numR0DW, Type_UD, GRFALIGN, "R0_Copy");
689 builtinR0->setDoNotSpill();
690
691 builtinA0 = createDeclareNoLookup(
692 "BuiltinA0",
693 G4_ADDRESS,
694 1,
695 1,
696 Type_UD);
697 builtinA0->getRegVar()->setPhyReg(phyregpool.getAddrReg(), 0);
698
699 builtinA0Dot2 = createDeclareNoLookup(
700 "BuiltinA0Dot2", //a0.2
701 G4_ADDRESS,
702 1,
703 1,
704 Type_UD);
705 builtinA0Dot2->getRegVar()->setPhyReg(phyregpool.getAddrReg(), 2);
706
707 builtinHWTID = createDeclareNoLookup("hw_tid", G4_GRF, 1, 1, Type_UD);
708
709 builtinT252 = createDeclareNoLookup(vISAPreDefSurf[PREDEFINED_SURFACE_T252].name, G4_GRF, 1, 1, Type_UD);
710 builtinBindlessSampler = createDeclareNoLookup("B_S", G4_GRF, 1, 1, Type_UD);
711
712 builtinSamplerHeader = createDeclareNoLookup("samplerHeader", G4_GRF, numEltPerGRF<Type_UD>(), 1, Type_UD);
713
714 builtinScratchSurface = createDeclareNoLookup(vISAPreDefSurf[PREDEFINED_SURFACE_SCRATCH].name, G4_GRF, 1, 1, Type_UD);
715 }
716
717
getSpillFillHeader()718 G4_Declare* IR_Builder::getSpillFillHeader()
719 {
720 if (!spillFillHeader)
721 {
722 spillFillHeader = createTempVar(1, Type_UD, GRFALIGN, "spillHeader");
723 spillFillHeader->setLiveOut();
724 spillFillHeader->setLiveIn();
725 spillFillHeader->setDoNotSpill();
726 }
727 return spillFillHeader;
728 }
729
getEUFusionWATmpVar()730 G4_Declare* IR_Builder::getEUFusionWATmpVar()
731 {
732 if (!euFusionWATmpVar)
733 {
734 euFusionWATmpVar = createTempVar(2, Type_UD, Even_Word, "euFusionWATmp");
735 euFusionWATmpVar->setLiveOut();
736 euFusionWATmpVar->setLiveIn();
737 euFusionWATmpVar->setDoNotSpill();
738 }
739 return euFusionWATmpVar;
740 }
741
getOldA0Dot2Temp()742 G4_Declare* IR_Builder::getOldA0Dot2Temp()
743 {
744 if (!oldA0Dot2Temp)
745 {
746 oldA0Dot2Temp = createTempVar(1, Type_UD, Any, "OldA0Dot2");
747 oldA0Dot2Temp->setLiveOut();
748 oldA0Dot2Temp->setLiveIn();
749 oldA0Dot2Temp->setDoNotSpill();
750 }
751 return oldA0Dot2Temp;
752 }
753
IR_Builder(TARGET_PLATFORM genPlatform,INST_LIST_NODE_ALLOCATOR & alloc,G4_Kernel & k,Mem_Manager & m,Options * options,CISA_IR_Builder * parent,FINALIZER_INFO * jitInfo,const WA_TABLE * pWaTable)754 IR_Builder::IR_Builder(
755 TARGET_PLATFORM genPlatform,
756 INST_LIST_NODE_ALLOCATOR& alloc,
757 G4_Kernel& k,
758 Mem_Manager& m,
759 Options* options,
760 CISA_IR_Builder* parent,
761 FINALIZER_INFO* jitInfo,
762 const WA_TABLE* pWaTable)
763 : platform(genPlatform), curFile(NULL), curLine(0), curCISAOffset(-1), immPool(*this), metaData(jitInfo),
764 type(VISA_BUILD_TYPE::KERNEL), parentBuilder(parent),
765 builtinSamplerHeaderInitialized(false), m_pWaTable(pWaTable), m_options(options), CanonicalRegionStride0(0, 1, 0),
766 CanonicalRegionStride1(1, 1, 0), CanonicalRegionStride2(2, 1, 0), CanonicalRegionStride4(4, 1, 0),
767 mem(m), phyregpool(m, k.getNumRegTotal()), hashtable(m), rgnpool(m), dclpool(m),
768 instList(alloc), kernel(k), metadataMem(4096)
769 {
770 num_temp_dcl = 0;
771 kernel.setBuilder(this); // kernel needs pointer to the builder
772 createBuiltinDecls();
773
774 sampler8x8_group_id = 0;
775
776 be_sp = be_fp = tmpFCRet = nullptr;
777
778 arg_size = 0;
779 return_var_size = 0;
780
781 if (metaData != NULL)
782 {
783 memset(metaData, 0, sizeof(FINALIZER_INFO));
784 }
785
786 fcPatchInfo = NULL;
787
788 createPreDefinedVars();
789 }
790
791
~IR_Builder()792 IR_Builder::~IR_Builder()
793 {
794 // We need to invoke the destructor of every instruction ever allocated
795 // so that its members will be freed.
796 // Note that we don't delete the instruction itself as it's allocated from
797 // the memory manager's pool
798 for (unsigned i = 0, size = (unsigned)instAllocList.size(); i != size; i++)
799 {
800 G4_INST* inst = instAllocList[i];
801 inst->~G4_INST();
802 }
803 instAllocList.clear();
804
805 for (auto MD : allMDs)
806 {
807 MD->~Metadata();
808 }
809
810 for (auto node : allMDNodes)
811 {
812 node->~MDNode();
813 }
814
815 if (fcPatchInfo)
816 {
817 fcPatchInfo->~FCPatchingInfo();
818 }
819 }
820
createDeclareNoLookup(const char * name,G4_RegFileKind regFile,unsigned short n_elems,unsigned short n_rows,G4_Type ty,DeclareType kind,G4_RegVar * base,G4_Operand * repRegion,G4_ExecSize execSize)821 G4_Declare* IR_Builder::createDeclareNoLookup(
822 const char* name,
823 G4_RegFileKind regFile,
824 unsigned short n_elems,
825 unsigned short n_rows,
826 G4_Type ty,
827 DeclareType kind,
828 G4_RegVar * base,
829 G4_Operand * repRegion,
830 G4_ExecSize execSize)
831 {
832 if (regFile == G4_FLAG)
833 {
834 MUST_BE_TRUE(ty == Type_UW, "flag decl must have type UW");
835 }
836
837 G4_Declare* dcl = dclpool.createDeclare(name, regFile, n_elems,
838 n_rows, ty, kind, base, repRegion, execSize);
839
840 kernel.Declares.push_back(dcl);
841
842 return dcl;
843 }
844
845
getSplitEMask(unsigned execSize,uint32_t eMask,bool isLo)846 uint32_t IR_Builder::getSplitEMask(unsigned execSize, uint32_t eMask, bool isLo)
847 {
848 const uint32_t qhMasks = InstOpt_M0 | InstOpt_M8 |
849 InstOpt_M16 | InstOpt_M24;
850 uint32_t other = eMask & ~qhMasks;
851 uint32_t qh = eMask & qhMasks;
852
853 switch (execSize) {
854 case 16: // Split SIMD16 into SIMD8
855 switch (qh) {
856 case 0: // instOpt not specified, treat as 1H
857 case InstOpt_M0:
858 return (isLo ? InstOpt_M0 : InstOpt_M8) | other;
859 case InstOpt_M16:
860 return (isLo ? InstOpt_M16 : InstOpt_M24) | other;
861 }
862 break;
863 case 32: // Split SIMD32 into SIMD16.
864 switch (qh) {
865 case 0:
866 return (isLo ? InstOpt_M0 : InstOpt_M16) | other;
867 }
868 break;
869 }
870
871 ASSERT_USER(false, "Unhandled cases for EMask splitting!");
872 return ~0U;
873 }
874
initScratchSurfaceOffset()875 void IR_Builder::initScratchSurfaceOffset()
876 {
877 // (W) and (1) sso r0.5 0xFFFFC00, placed at kernel entry
878 if (!scratchSurfaceOffset)
879 {
880 G4_SrcRegRegion* R0_5 = createSrc(builtinR0->getRegVar(), 0, 5,
881 getRegionScalar(), Type_UD);
882 scratchSurfaceOffset = createTempVar(1, Type_UD, Any, "SSO");
883 scratchSurfaceOffset->setLiveOut();
884 scratchSurfaceOffset->setDoNotSpill();
885 if (kernel.getBoolKernelAttr(Attributes::ATTR_SepSpillPvtSS))
886 {
887 G4_Declare* slot0SSO = createTempVar(1, Type_UD, Any, "Slot0SSO");
888 G4_DstRegRegion* andDst = createDstRegRegion(slot0SSO, 1);
889 auto andInst = createBinOp(G4_and, g4::SIMD1, andDst, R0_5, createImm(0xFFFFFC00, Type_UD), InstOpt_WriteEnable, true);
890 instList.pop_back();
891 auto iter = std::find_if(instList.begin(), instList.end(), [](G4_INST* inst) { return !inst->isLabel(); });
892 instList.insert(iter, andInst);
893
894 //scratchSurfaceOffset is reserved for spillfill, pvtmem should use r0.5+1
895 G4_DstRegRegion* dst = createDstRegRegion(scratchSurfaceOffset, 1);
896 createBinOp(G4_add, g4::SIMD1, dst, createSrcRegRegion(slot0SSO, getRegionScalar()),
897 createImm(0x400, Type_UD), InstOpt_WriteEnable, true);
898 }
899 else
900 {
901 G4_DstRegRegion* andDst = createDstRegRegion(scratchSurfaceOffset, 1);
902 auto andInst = createBinOp(G4_and, g4::SIMD1, andDst, R0_5, createImm(0xFFFFFC00, Type_UD), InstOpt_WriteEnable, true);
903 instList.pop_back();
904 auto iter = std::find_if(instList.begin(), instList.end(), [](G4_INST* inst) { return !inst->isLabel(); });
905 instList.insert(iter, andInst);
906 }
907 }
908 }
909
910
createTempVar(unsigned int numElements,G4_Type type,G4_SubReg_Align subAlign,const char * prefix,bool appendIdToName)911 G4_Declare* IR_Builder::createTempVar(
912 unsigned int numElements, G4_Type type, G4_SubReg_Align subAlign,
913 const char* prefix, bool appendIdToName)
914 {
915 const char* name = appendIdToName ?
916 getNameString(mem, 20, "%s%d", prefix, num_temp_dcl++) :
917 getNameString(mem, 20, "%s", prefix);
918
919 unsigned short dcl_width = 0, dcl_height = 1;
920 const uint16_t typeSize = TypeSize(type);
921 int totalByteSize = numElements * typeSize;
922 if (totalByteSize <= (int)numEltPerGRF<Type_UB>())
923 {
924 dcl_width = totalByteSize / typeSize;
925 }
926 else
927 {
928 // here we assume that the start point of the var is the beginning of a GRF?
929 // so subregister must be 0?
930 dcl_width = numEltPerGRF<Type_UB>() / typeSize;
931 dcl_height = totalByteSize / numEltPerGRF<Type_UB>();
932 if (totalByteSize % numEltPerGRF<Type_UB>() != 0)
933 {
934 dcl_height++;
935 }
936 }
937
938 G4_Declare* dcl = createDeclareNoLookup(name, G4_GRF, dcl_width, dcl_height, type);
939 dcl->setSubRegAlign(subAlign);
940 return dcl;
941 }
942
createAddrFlagSpillLoc(G4_Declare * dcl)943 G4_Declare* IR_Builder::createAddrFlagSpillLoc(G4_Declare* dcl)
944 {
945 const char* name = getNameString(mem, 16, "SP_LOC_%d", numAddrFlagSpillLoc++);
946 G4_Declare* spillLoc = createDeclareNoLookup(name,
947 G4_GRF,
948 dcl->getNumElems(),
949 1,
950 dcl->getElemType(),
951 DeclareType::AddrSpill);
952 dcl->setSpilledDeclare(spillLoc);
953 spillLoc->setSubRegAlign(dcl->getSubRegAlign()); // for simd32 flag the spill loc has to be 2-word aligned since it's accessed as dw
954 return spillLoc;
955 }
956
createHardwiredDeclare(uint32_t numElements,G4_Type type,uint32_t regNum,uint32_t regOff)957 G4_Declare* IR_Builder::createHardwiredDeclare(
958 uint32_t numElements, G4_Type type, uint32_t regNum, uint32_t regOff)
959 {
960 G4_Declare* dcl = createTempVar(numElements, type, Any);
961 unsigned int linearizedStart = (regNum * numEltPerGRF<Type_UB>()) + (regOff * TypeSize(type));
962 // since it's called post RA (specifically post computePReg) we have to manually set the GRF's byte offset
963 dcl->setGRFBaseOffset(linearizedStart);
964 dcl->getRegVar()->setPhyReg(phyregpool.getGreg(regNum), regOff);
965 return dcl;
966 }
967
createPseudoKills(std::initializer_list<G4_Declare * > dcls,PseudoKillType ty)968 G4_INST* IR_Builder::createPseudoKills(
969 std::initializer_list<G4_Declare*> dcls, PseudoKillType ty)
970 {
971 G4_INST* inst = nullptr;
972 for (auto dcl : dcls)
973 {
974 inst = createPseudoKill(dcl, ty);
975 }
976
977 return inst;
978 }
979
createPseudoKill(G4_Declare * dcl,PseudoKillType ty)980 G4_INST* IR_Builder::createPseudoKill(G4_Declare* dcl, PseudoKillType ty)
981 {
982 auto dstRgn = createDst(dcl->getRegVar(), 0, 0, 1, Type_UD);
983 G4_INST* inst = createIntrinsicInst(nullptr, Intrinsic::PseudoKill, g4::SIMD1,
984 dstRgn, createImm((unsigned int)ty, Type_UD), nullptr, nullptr, InstOpt_WriteEnable, true);
985
986 return inst;
987 }
988
// Number of bytes in one HWORD unit. The spill/fill intrinsic builders in this
// file multiply their `offset` argument by this value before dividing by the
// GRF byte size.
static const unsigned int HWORD_BYTE_SIZE = 32;
990
991
createEUWASpill(bool addToInstList)992 G4_INST* IR_Builder::createEUWASpill(bool addToInstList)
993 {
994 const RegionDesc* rd = getRegionScalar();
995
996 G4_Declare* dcl = getEUFusionWATmpVar();
997 G4_SrcRegRegion* pseudoUseSrc =
998 createSrc(dcl->getRegVar(), 0, 0, rd, Type_UD);
999
1000 G4_INST* pseudoUseInst = createIntrinsicInst(
1001 nullptr, Intrinsic::FlagSpill, g4::SIMD2,
1002 nullptr, pseudoUseSrc, nullptr, nullptr, InstOpt_NoOpt, addToInstList);
1003
1004 return pseudoUseInst;
1005 }
1006
createSpill(G4_DstRegRegion * dst,G4_SrcRegRegion * header,G4_SrcRegRegion * payload,G4_ExecSize execSize,uint16_t numRows,uint32_t offset,G4_Declare * fp,G4_InstOption option,bool addToInstList)1007 G4_INST* IR_Builder::createSpill(
1008 G4_DstRegRegion* dst, G4_SrcRegRegion* header, G4_SrcRegRegion* payload,
1009 G4_ExecSize execSize,
1010 uint16_t numRows, uint32_t offset, G4_Declare* fp, G4_InstOption option,
1011 bool addToInstList)
1012 {
1013 G4_INST* spill = createIntrinsicInst(nullptr, Intrinsic::Spill, execSize, dst,
1014 header, payload, nullptr, option, addToInstList);
1015 spill->asSpillIntrinsic()->setFP(fp);
1016 spill->asSpillIntrinsic()->setOffset((uint32_t)
1017 (((uint64_t)offset * HWORD_BYTE_SIZE) / numEltPerGRF<Type_UB>()));
1018 spill->asSpillIntrinsic()->setNumRows(numRows);
1019
1020 return spill;
1021 }
1022
createSpill(G4_DstRegRegion * dst,G4_SrcRegRegion * payload,G4_ExecSize execSize,uint16_t numRows,uint32_t offset,G4_Declare * fp,G4_InstOption option,bool addToInstList)1023 G4_INST* IR_Builder::createSpill(
1024 G4_DstRegRegion* dst, G4_SrcRegRegion* payload,
1025 G4_ExecSize execSize, uint16_t numRows, uint32_t offset,
1026 G4_Declare* fp, G4_InstOption option, bool addToInstList)
1027 {
1028 auto builtInR0 = getBuiltinR0();
1029 auto rd = getRegionStride1();
1030 auto srcRgnr0 = createSrc(builtInR0->getRegVar(), 0, 0, rd, Type_UD);
1031 G4_INST* spill = createIntrinsicInst(nullptr, Intrinsic::Spill, execSize, dst,
1032 srcRgnr0, payload, nullptr, option, addToInstList);
1033 spill->asSpillIntrinsic()->setFP(fp);
1034 spill->asSpillIntrinsic()->setOffset((uint32_t)
1035 (((uint64_t)offset * HWORD_BYTE_SIZE) / numEltPerGRF<Type_UB>()));
1036 spill->asSpillIntrinsic()->setNumRows(numRows);
1037 return spill;
1038 }
1039
createFill(G4_SrcRegRegion * header,G4_DstRegRegion * dstData,G4_ExecSize execSize,uint16_t numRows,uint32_t offset,G4_Declare * fp,G4_InstOption option,bool addToInstList)1040 G4_INST* IR_Builder::createFill(
1041 G4_SrcRegRegion* header, G4_DstRegRegion* dstData,
1042 G4_ExecSize execSize,
1043 uint16_t numRows, uint32_t offset, G4_Declare* fp, G4_InstOption option,
1044 bool addToInstList)
1045 {
1046 G4_INST* fill = createIntrinsicInst(nullptr, Intrinsic::Fill, execSize, dstData,
1047 header, nullptr, nullptr, option, addToInstList);
1048 fill->asFillIntrinsic()->setFP(fp);
1049 fill->asFillIntrinsic()->setOffset((uint32_t)
1050 (((uint64_t)offset * HWORD_BYTE_SIZE) / numEltPerGRF<Type_UB>()));
1051 fill->asFillIntrinsic()->setNumRows(numRows);
1052 return fill;
1053 }
1054
createFill(G4_DstRegRegion * dstData,G4_ExecSize execSize,uint16_t numRows,uint32_t offset,G4_Declare * fp,G4_InstOption option,bool addToInstList)1055 G4_INST* IR_Builder::createFill(
1056 G4_DstRegRegion* dstData,
1057 G4_ExecSize execSize,
1058 uint16_t numRows, uint32_t offset, G4_Declare* fp , G4_InstOption option,
1059 bool addToInstList)
1060 {
1061 auto builtInR0 = getBuiltinR0();
1062 auto rd = getRegionStride1();
1063 auto srcRgnr0 = createSrc(builtInR0->getRegVar(), 0, 0, rd, Type_UD);
1064 G4_INST* fill = createIntrinsicInst(nullptr, Intrinsic::Fill, execSize, dstData,
1065 srcRgnr0, nullptr, nullptr, option, addToInstList);
1066
1067 fill->asFillIntrinsic()->setFP(fp);
1068 fill->asFillIntrinsic()->setOffset((uint32_t)
1069 (((uint64_t)offset * HWORD_BYTE_SIZE) / numEltPerGRF<Type_UB>()));
1070 fill->asFillIntrinsic()->setNumRows(numRows);
1071 return fill;
1072 }
1073
1074
createTempFlag(unsigned short numberOfFlags,const char * prefix)1075 G4_Declare* IR_Builder::createTempFlag(unsigned short numberOfFlags, const char* prefix)
1076 {
1077 const char* name = getNameString(mem, 20, "%s%d", prefix, num_temp_dcl++);
1078
1079 G4_Declare* dcl = createDeclareNoLookup(name, G4_FLAG, numberOfFlags, 1, Type_UW);
1080
1081 return dcl;
1082 }
1083
createFlag(uint16_t numFlagElements,const char * name)1084 G4_Declare* IR_Builder::createFlag(uint16_t numFlagElements, const char* name)
1085 {
1086 uint32_t numWords = (numFlagElements + 15) / 16;
1087 G4_Declare* dcl = createDeclareNoLookup(name, G4_FLAG, numWords, 1, Type_UW);
1088 dcl->setNumberFlagElements((uint8_t)numFlagElements);
1089 return dcl;
1090 }
1091
createTempScalar(uint16_t numFlagElements,const char * prefix)1092 G4_Declare* IR_Builder::createTempScalar(uint16_t numFlagElements, const char* prefix)
1093 {
1094 const char* name = getNameString(mem, 20, "%s%d", prefix, num_temp_dcl++);
1095 G4_Declare* dcl = createDeclareNoLookup(name, G4_SCALAR, numFlagElements, 1, Type_UB);
1096 return dcl;
1097 }
1098
createScalar(uint16_t numFlagElements,const char * name)1099 G4_Declare* IR_Builder::createScalar(uint16_t numFlagElements, const char* name)
1100 {
1101 G4_Declare* dcl = createDeclareNoLookup(name, G4_SCALAR, numFlagElements, 1, Type_UB);
1102 return dcl;
1103 }
1104
createPreVar(PreDefinedVarsInternal preDefVar_index,unsigned short numElements,G4_Type type)1105 G4_Declare* IR_Builder::createPreVar(
1106 PreDefinedVarsInternal preDefVar_index, unsigned short numElements, G4_Type type)
1107 {
1108 MUST_BE_TRUE(preDefVar_index < PreDefinedVarsInternal::VAR_LAST,
1109 "illegal predefined var index");
1110 unsigned short dcl_width = 0, dcl_height = 1;
1111 auto typeSize = TypeSize(type);
1112 int totalByteSize = numElements * typeSize;
1113 if (totalByteSize <= (int)numEltPerGRF<Type_UB>())
1114 {
1115 dcl_width = totalByteSize / typeSize;
1116 }
1117 else
1118 {
1119 // here we assume that the start point of the var is the beginning of a GRF?
1120 // so subregister must be 0?
1121 dcl_width = numEltPerGRF<Type_UB>() / typeSize;
1122 dcl_height = totalByteSize / numEltPerGRF<Type_UB>();
1123 if (totalByteSize % numEltPerGRF<Type_UB>() != 0)
1124 {
1125 dcl_height++;
1126 }
1127 }
1128
1129 G4_Declare* dcl = createPreVarDeclareNoLookup(
1130 preDefVar_index, dcl_width, dcl_height, type);
1131 // subAlign has to be type size at the minimum
1132 dcl->setSubRegAlign(Get_G4_SubRegAlign_From_Type(type));
1133 return dcl;
1134 }
1135
1136
createSrcWithNewRegOff(G4_SrcRegRegion * old,short newRegOff)1137 G4_SrcRegRegion* IR_Builder::createSrcWithNewRegOff(G4_SrcRegRegion* old, short newRegOff)
1138 {
1139 if (old->getRegAccess() == Direct)
1140 {
1141 return createSrcRegRegion(old->getModifier(), Direct, old->getBase(), newRegOff,
1142 old->getSubRegOff(), old->getRegion(), old->getType(), old->getAccRegSel());
1143 }
1144 else
1145 {
1146 return createIndirectSrc(old->getModifier(), old->getBase(), newRegOff, old->getSubRegOff(),
1147 old->getRegion(), old->getType(), old->getAddrImm());
1148 }
1149 }
1150
1151
createSrcWithNewSubRegOff(G4_SrcRegRegion * old,short newSubRegOff)1152 G4_SrcRegRegion* IR_Builder::createSrcWithNewSubRegOff(G4_SrcRegRegion* old, short newSubRegOff)
1153 {
1154 if (old->getRegAccess() == Direct)
1155 {
1156 return createSrcRegRegion(old->getModifier(), old->getRegAccess(), old->getBase(), old->getRegOff(),
1157 newSubRegOff, old->getRegion(), old->getType(), old->getAccRegSel());
1158 }
1159 else
1160 {
1161 return createIndirectSrc(old->getModifier(), old->getBase(), old->getRegOff(), newSubRegOff,
1162 old->getRegion(), old->getType(), old->getAddrImm());
1163 }
1164 }
1165
1166
createSrcWithNewBase(G4_SrcRegRegion * old,G4_VarBase * newBase)1167 G4_SrcRegRegion* IR_Builder::createSrcWithNewBase(G4_SrcRegRegion* old, G4_VarBase* newBase)
1168 {
1169 if (old->getRegAccess() == Direct)
1170 {
1171 return createSrcRegRegion(old->getModifier(), Direct, newBase, old->getRegOff(),
1172 old->getSubRegOff(), old->getRegion(), old->getType(), old->getAccRegSel());
1173 }
1174 else
1175 {
1176 return createIndirectSrc(old->getModifier(), newBase, old->getRegOff(), old->getSubRegOff(),
1177 old->getRegion(), old->getType(), old->getAddrImm());
1178 }
1179 }
1180
createDstWithNewSubRegOff(G4_DstRegRegion * old,short newSubRegOff)1181 G4_DstRegRegion* IR_Builder::createDstWithNewSubRegOff(G4_DstRegRegion* old, short newSubRegOff)
1182 {
1183 if (old->getRegAccess() == Direct)
1184 {
1185 return createDst(old->getBase(), old->getRegOff(), newSubRegOff, old->getHorzStride(), old->getType(), old->getAccRegSel());
1186 }
1187 else
1188 {
1189 return createIndirectDst(old->getBase(), newSubRegOff, old->getHorzStride(), old->getType(), old->getAddrImm());
1190 }
1191 }
1192
1193
createImm(float fp)1194 G4_Imm* IR_Builder::createImm(float fp)
1195 {
1196 uint32_t imm = *((uint32_t*) &fp);
1197 G4_Type immType = Type_F;
1198 if (getPlatform() >= GENX_CHV && m_options->getOption(vISA_FImmToHFImm) &&
1199 !VISA_WA_CHECK(getPWaTable(), WaSrc1ImmHfNotAllowed))
1200 {
1201 // we may be able to lower it to HF
1202 // ieee32 format: 23-8-1
1203 // ieee16 format: 10-5-1
1204 // bit0-22 are fractions
1205 uint32_t fraction = imm & 0x7FFFFF;
1206 // bit23-30 are exponents
1207 uint32_t exponent = (imm >> 23) & 0xFF;
1208 uint32_t sign = (imm >> 31) & 0x1;
1209 int expVal = ((int) exponent) - 127;
1210
1211 if (exponent == 0 && fraction == 0)
1212 {
1213 // 0 and -0
1214 immType = Type_HF;
1215 imm = sign << 15;
1216 }
1217 else if ((fraction & 0x1FFF) == 0 && (expVal <= 15 && expVal >= -16))
1218 {
1219 // immediate can be exactly represented in HF.
1220 // we exclude denormal, infinity, and NaN.
1221 immType = Type_HF;
1222 uint32_t newExp = (expVal + 15) & 0x1F;
1223 imm = (sign << 15) | (newExp << 10) | (fraction >> 13);
1224 }
1225 }
1226 G4_Imm* i = hashtable.lookupImm(imm, immType);
1227 return (i != NULL)? i : hashtable.createImm(imm, immType);
1228 }
1229
createDFImm(double fp)1230 G4_Imm* IR_Builder::createDFImm(double fp)
1231 {
1232 int64_t val = (int64_t)(*(uint64_t*)&fp);
1233 G4_Imm* i = hashtable.lookupImm(val, Type_DF);
1234 return (i != NULL)? i : hashtable.createImm(val, Type_DF);
1235 }
1236
getNewType(int64_t imm,G4_Type ty)1237 G4_Type IR_Builder::getNewType(int64_t imm, G4_Type ty)
1238 {
1239 switch (ty)
1240 {
1241 case Type_Q:
1242 case Type_D:
1243 // It is legal to change a positive imm's type from signed to unsigned if it fits
1244 // in the unsigned type. We do prefer signed type however for readability.
1245 if (imm >= MIN_WORD_VALUE && imm <= MAX_WORD_VALUE)
1246 {
1247 return Type_W;
1248 }
1249 else if (imm >= MIN_UWORD_VALUE && imm <= MAX_UWORD_VALUE)
1250 {
1251 return Type_UW;
1252 }
1253 else if (imm >= int(MIN_DWORD_VALUE) && imm <= int(MAX_DWORD_VALUE))
1254 {
1255 return Type_D;
1256 }
1257 else if (imm >= unsigned(MIN_UDWORD_VALUE) && imm <= unsigned(MAX_UDWORD_VALUE))
1258 {
1259 return Type_UD;
1260 }
1261 break;
1262 case Type_UQ:
1263 case Type_UD:
1264 {
1265 // unsigned imm must stay as unsigned
1266 uint64_t immU = static_cast<uint64_t>(imm);
1267 if (immU <= MAX_UWORD_VALUE)
1268 {
1269 return Type_UW;
1270 }
1271 else if (immU <= unsigned(MAX_UDWORD_VALUE))
1272 {
1273 return Type_UD;
1274 }
1275 break;
1276 }
1277 case Type_UB:
1278 return Type_UW;
1279 case Type_B:
1280 return Type_W;
1281 default:
1282 return ty;
1283 }
1284 return ty;
1285 }
1286
1287 //
1288 // look up an imm operand
1289 //
lookupImm(int64_t imm,G4_Type ty)1290 G4_Imm* OperandHashTable::lookupImm(int64_t imm, G4_Type ty)
1291 {
1292 ImmKey key(imm, ty);
1293 auto iter = immTable.find(key);
1294 return iter != immTable.end() ? iter->second : nullptr;
1295 }
1296
1297 //
1298 // create a dst reg region
1299 //
createImm(int64_t imm,G4_Type ty)1300 G4_Imm* OperandHashTable::createImm(int64_t imm, G4_Type ty)
1301 {
1302 G4_Imm* i = new (mem)G4_Imm(imm, ty);
1303 ImmKey key(imm, ty);
1304 immTable[key] = i;
1305 return i;
1306 }
1307
1308
1309 //
1310 // create the region <vstride; width, hstride> if not yet created
1311 //
createRegion(uint16_t vstride,uint16_t width,uint16_t hstride)1312 const RegionDesc* RegionPool::createRegion(
1313 uint16_t vstride, uint16_t width, uint16_t hstride)
1314 {
1315
1316 for (unsigned i = 0, size = (unsigned)rgnlist.size(); i < size; i++)
1317 {
1318 RegionDesc* region = rgnlist[i];
1319 if (region->vertStride == vstride &&
1320 region->width == width &&
1321 region->horzStride == hstride)
1322 {
1323 return region; // exist
1324 }
1325 }
1326 //
1327 // create one
1328 //
1329 RegionDesc* rd = new (mem) RegionDesc(vstride, width, hstride);
1330 rgnlist.push_back(rd);
1331 return rd;
1332 }
1333
1334 /*
1335 Used in IR_Builder::translateVISARawSendInst. All the bits in des and extDesc are already set.
1336 */
createGeneralMsgDesc(uint32_t desc,uint32_t extDesc,SendAccess access,G4_Operand * bti,G4_Operand * sti)1337 G4_SendDescRaw * IR_Builder::createGeneralMsgDesc(
1338 uint32_t desc,
1339 uint32_t extDesc,
1340 SendAccess access,
1341 G4_Operand* bti,
1342 G4_Operand* sti)
1343 {
1344 return new (mem) G4_SendDescRaw(desc, extDesc, access, bti, sti);
1345 }
1346
createSendMsgDesc(SFID sfid,uint32_t desc,uint32_t extDesc,int src1Len,SendAccess access,G4_Operand * bti,G4_ExecSize execSize,bool isValidFuncCtrl)1347 G4_SendDescRaw * IR_Builder::createSendMsgDesc(
1348 SFID sfid,
1349 uint32_t desc,
1350 uint32_t extDesc,
1351 int src1Len,
1352 SendAccess access,
1353 G4_Operand *bti,
1354 G4_ExecSize execSize,
1355 bool isValidFuncCtrl)
1356 {
1357 return new (mem) G4_SendDescRaw(sfid, desc, extDesc, src1Len, access, bti, execSize, isValidFuncCtrl);
1358 }
1359
createSendMsgDesc(SFID sfid,uint32_t desc,uint32_t extDesc,int src1Len,SendAccess access,G4_Operand * bti,bool isValidFuncCtrl)1360 G4_SendDescRaw* IR_Builder::createSendMsgDesc(
1361 SFID sfid,
1362 uint32_t desc,
1363 uint32_t extDesc,
1364 int src1Len,
1365 SendAccess access,
1366 G4_Operand* bti,
1367 bool isValidFuncCtrl)
1368 {
1369 return new (mem) G4_SendDescRaw(sfid, desc, extDesc, src1Len, access, bti, isValidFuncCtrl);
1370 }
1371
createSendMsgDesc(unsigned funcCtrl,unsigned regs2rcv,unsigned regs2snd,SFID funcID,unsigned extMsgLength,uint16_t extFuncCtrl,SendAccess access,G4_Operand * bti,G4_Operand * sti)1372 G4_SendDescRaw * IR_Builder::createSendMsgDesc(
1373 unsigned funcCtrl,
1374 unsigned regs2rcv,
1375 unsigned regs2snd,
1376 SFID funcID,
1377 unsigned extMsgLength,
1378 uint16_t extFuncCtrl,
1379 SendAccess access,
1380 G4_Operand *bti,
1381 G4_Operand *sti)
1382 {
1383 G4_SendDescRaw* msgDesc = new (mem) G4_SendDescRaw(
1384 funcCtrl, regs2rcv, regs2snd, funcID, (uint16_t)extMsgLength,
1385 extFuncCtrl, access, bti, sti, *this);
1386 return msgDesc;
1387 }
1388
1389 // shorthand for read msg desc. Note that extDesc still needs to be explicitly created,
1390 // SendMsgDesc ctor does not program all the bits
createReadMsgDesc(SFID sfid,uint32_t desc,G4_Operand * bti)1391 G4_SendDescRaw* IR_Builder::createReadMsgDesc(
1392 SFID sfid,
1393 uint32_t desc,
1394 G4_Operand* bti)
1395 {
1396 //ToDo: move extDesc into SendMsgDesc ctor
1397 uint32_t extDesc = G4_SendDescRaw::createExtDesc(sfid);
1398 return new (mem) G4_SendDescRaw(sfid, desc, extDesc, 0, SendAccess::READ_ONLY, bti, true);
1399 }
1400
createWriteMsgDesc(SFID sfid,uint32_t desc,int src1Len,G4_Operand * bti)1401 G4_SendDescRaw* IR_Builder::createWriteMsgDesc(
1402 SFID sfid,
1403 uint32_t desc,
1404 int src1Len,
1405 G4_Operand* bti)
1406 {
1407 //ToDo: move extDesc into SendMsgDesc ctor
1408 uint32_t extDesc = G4_SendDescRaw::createExtDesc(sfid, false, src1Len);
1409 return new (mem) G4_SendDescRaw(sfid, desc, extDesc, src1Len, SendAccess::WRITE_ONLY, bti, true);
1410 }
1411
createSyncMsgDesc(SFID sfid,uint32_t desc)1412 G4_SendDescRaw* IR_Builder::createSyncMsgDesc(SFID sfid, uint32_t desc)
1413 {
1414 //ToDo: move extDesc into SendMsgDesc ctor
1415 uint32_t extDesc = G4_SendDescRaw::createExtDesc(sfid);
1416 return new (mem) G4_SendDescRaw(sfid, desc, extDesc, 0, SendAccess::READ_WRITE, nullptr, true);
1417 }
1418
createSampleMsgDesc(uint32_t desc,bool cps,int src1Len,G4_Operand * bti,G4_Operand * sti)1419 G4_SendDescRaw* IR_Builder::createSampleMsgDesc(
1420 uint32_t desc,
1421 bool cps,
1422 int src1Len,
1423 G4_Operand* bti,
1424 G4_Operand* sti)
1425 {
1426 #define CPS_LOD_COMPENSATION_ENABLE 11
1427
1428 uint32_t extDesc = G4_SendDescRaw::createExtDesc(SFID::SAMPLER, false, src1Len);
1429 if (cps)
1430 {
1431 extDesc |= 1 << CPS_LOD_COMPENSATION_ENABLE;
1432 }
1433 return new (mem) G4_SendDescRaw(desc, extDesc, SendAccess::READ_ONLY, bti, sti);
1434 }
1435
// Emit the instruction sequence that splits a sampler index into a state base
// adjustment and a low 4-bit index. All instructions are SIMD1 with
// WriteEnable and are appended to the instruction list:
//   t0        = sampler >> 4
//   baseAdj   = t0 << 8
//   idxLow    = sampler & 0xf
//   header.3  = r0.3 + baseAdj      (header = headerDecl, UD sub-register 3)
// Returns a scalar source region over idxLow.
G4_Operand* IR_Builder::emitSampleIndexGE16(
    G4_Operand* sampler,
    G4_Declare* headerDecl)
{
    G4_Operand* samplerIdx;

    // three single-element UD temporaries, each with a dst and a scalar src view
    G4_Declare* t0
        = createTempVar(1, Type_UD, Any);
    G4_DstRegRegion* t0Dst
        = createDstRegRegion(t0, 1);
    G4_SrcRegRegion* t0Src
        = createSrcRegRegion(t0, getRegionScalar());

    G4_Declare* baseAdj
        = createTempVar(1, Type_UD, Any);
    G4_DstRegRegion* baseAdjDst
        = createDstRegRegion(baseAdj, 1);
    G4_SrcRegRegion* baseAdjSrc
        = createSrcRegRegion(baseAdj, getRegionScalar());

    G4_Declare* idxLow
        = createTempVar(1, Type_UD, Any);
    G4_DstRegRegion* idxLowDst
        = createDstRegRegion(idxLow, 1);
    G4_SrcRegRegion* idxLowSrc
        = createSrcRegRegion(idxLow, getRegionScalar());

    // calculate the sampler state base pointer offset based on
    // sample index, for putting to msg header M0.3
    createBinOp(G4_shr, g4::SIMD1,
        t0Dst, sampler, createImm(4, Type_UD),
        InstOpt_WriteEnable, true);
    createBinOp(G4_shl, g4::SIMD1,
        baseAdjDst, t0Src, createImm(8, Type_UD),
        InstOpt_WriteEnable, true);

    // get low 4 bits of sample index for putting into msg descriptor
    // (a second source region is built from the sampler's top declare, since
    // the `sampler` operand itself was already used by the shr above)
    G4_SrcRegRegion* sampler2Src
        = createSrc(
            sampler->getTopDcl()->getRegVar(), 0, 0, getRegionScalar(), Type_UD);
    createBinOp(G4_and, g4::SIMD1,
        idxLowDst, sampler2Src, createImm(0xf, Type_UD),
        InstOpt_WriteEnable, true);
    samplerIdx = idxLowSrc;

    // add the base pointer offset with r0.3 and put to M0.3
    G4_DstRegRegion* stateBaseRgn
        = createDst(headerDecl->getRegVar(),
            0, 3, 1, Type_UD);
    G4_SrcRegRegion* src0
        = createSrc(
            builtinR0->getRegVar(), 0, 3, getRegionScalar(), Type_UD);
    createBinOp(G4_add, g4::SIMD1, stateBaseRgn,
        src0, baseAdjSrc, InstOpt_WriteEnable, true);

    return samplerIdx;
}
1493
createInst(G4_Predicate * prd,G4_opcode op,G4_CondMod * mod,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_InstOpts options,bool addToInstList)1494 G4_INST* IR_Builder::createInst(
1495 G4_Predicate* prd,
1496 G4_opcode op,
1497 G4_CondMod* mod,
1498 G4_Sat sat,
1499 G4_ExecSize execSize,
1500 G4_DstRegRegion* dst,
1501 G4_Operand* src0,
1502 G4_Operand* src1,
1503 G4_InstOpts options,
1504 bool addToInstList)
1505 {
1506 MUST_BE_TRUE(op != G4_math, "IR_Builder::createInst should not be used to create math instructions");
1507 G4_INST* i = NULL;
1508
1509 // ToDo: have separate functions to create call/jmp/ret
1510 if (G4_Inst_Table[op].instType == InstTypeFlow)
1511 {
1512 // TODO: remove this path
1513 MUST_BE_TRUE(!sat, "saturation not defined on branching ops");
1514 i = new (mem)G4_InstCF(*this, prd, op, mod, execSize, dst, src0, options);
1515 }
1516 else
1517 {
1518 i = new (mem)G4_INST(*this, prd, op, mod, sat, execSize, dst, src0, src1, options);
1519 }
1520
1521 if (addToInstList)
1522 {
1523 i->setCISAOff(curCISAOffset);
1524
1525 if (m_options->getOption(vISA_EmitLocation))
1526 {
1527 i->setLocation(allocateMDLocation(curLine, curFile));
1528 }
1529
1530 instList.push_back(i);
1531 }
1532
1533 instAllocList.push_back(i);
1534
1535 return i;
1536 }
1537
1538 // same as above, except we don't add it to the Builder's instList
createInternalInst(G4_Predicate * prd,G4_opcode op,G4_CondMod * mod,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_InstOpts options)1539 G4_INST* IR_Builder::createInternalInst(
1540 G4_Predicate* prd,
1541 G4_opcode op,
1542 G4_CondMod* mod,
1543 G4_Sat sat,
1544 G4_ExecSize execSize,
1545 G4_DstRegRegion* dst,
1546 G4_Operand* src0,
1547 G4_Operand* src1,
1548 G4_InstOpts options)
1549 {
1550 MUST_BE_TRUE(op != G4_math, "IR_Builder::createInternalInst should not be used to create math instructions");
1551
1552 auto ii = createInst(prd, op, mod, sat, execSize, dst, src0, src1, options, false);
1553
1554 return ii;
1555 }
1556
createNop(G4_InstOpts instOpt)1557 G4_INST* IR_Builder::createNop(G4_InstOpts instOpt)
1558 {
1559 return createInternalInst(
1560 nullptr, G4_nop, nullptr, g4::NOSAT, g4::SIMD1,
1561 nullptr, nullptr, nullptr, instOpt);
1562 }
1563
1564 // sync inst are always internal, so no option to append it to instList.
1565 // Also currently don't take any InstOpt
createSync(G4_opcode syncOp,G4_Operand * src)1566 G4_INST* IR_Builder::createSync(G4_opcode syncOp, G4_Operand* src)
1567 {
1568 assert(G4_INST::isSyncOpcode(syncOp) && "expect a sync op");
1569 return createInternalInst(
1570 nullptr, syncOp, nullptr, g4::NOSAT, g4::SIMD1,
1571 nullptr, src, nullptr, InstOpt_NoOpt);
1572 }
1573
createMov(G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_InstOpts options,bool appendToInstList)1574 G4_INST* IR_Builder::createMov(
1575 G4_ExecSize execSize,
1576 G4_DstRegRegion* dst, G4_Operand* src0, G4_InstOpts options,
1577 bool appendToInstList)
1578 {
1579 G4_INST* newInst = nullptr;
1580 if (appendToInstList)
1581 {
1582 newInst = createInst(
1583 nullptr, G4_mov, nullptr, g4::NOSAT, execSize,
1584 dst, src0, nullptr, options, true);
1585 }
1586 else
1587 {
1588 newInst = createInternalInst(
1589 nullptr, G4_mov, nullptr, g4::NOSAT, execSize,
1590 dst, src0, nullptr, options);
1591 }
1592 return newInst;
1593 }
1594
createBinOp(G4_Predicate * pred,G4_opcode op,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_InstOpts options,bool appendToInstList)1595 G4_INST* IR_Builder::createBinOp(
1596 G4_Predicate *pred, G4_opcode op, G4_ExecSize execSize,
1597 G4_DstRegRegion* dst, G4_Operand* src0, G4_Operand* src1,
1598 G4_InstOpts options,
1599 bool appendToInstList)
1600 {
1601 if (appendToInstList)
1602 {
1603 return createInst(
1604 pred, op, nullptr, g4::NOSAT, execSize,
1605 dst, src0, src1, options, true);
1606 }
1607 else
1608 {
1609 return createInternalInst(
1610 pred, op, nullptr, g4::NOSAT, execSize,
1611 dst, src0, src1, options);
1612 }
1613 }
1614
1615 // mach creates both implicit acc and src using the supplied accType. AccWrCtrl is turned on.
1616 // acc0.0 is always used
createMach(G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_InstOpts options,G4_Type accType)1617 G4_INST* IR_Builder::createMach(
1618 G4_ExecSize execSize,
1619 G4_DstRegRegion* dst, G4_Operand* src0, G4_Operand* src1,
1620 G4_InstOpts options, G4_Type accType)
1621 {
1622 auto machInst = createInternalInst(
1623 nullptr, G4_mach, nullptr, g4::NOSAT, execSize,
1624 dst, src0, src1, options);
1625 const RegionDesc* rd = execSize > g4::SIMD1 ? getRegionStride1() : getRegionScalar();
1626 auto accSrc = createSrc(phyregpool.getAcc0Reg(), 0, 0, rd, accType);
1627 machInst->setImplAccSrc(accSrc);
1628 auto accDSt = createDst(phyregpool.getAcc0Reg(), 0, 0, 1, accType);
1629 machInst->setImplAccDst(accDSt);
1630 machInst->setOptionOn(InstOpt_AccWrCtrl);
1631 return machInst;
1632 }
1633
1634 // macl creates an implicit src using the supplied the accType. AccWrCtrl is not set.
1635 // acc0.0 is always used
createMacl(G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_InstOpts options,G4_Type accType)1636 G4_INST* IR_Builder::createMacl(
1637 G4_ExecSize execSize,
1638 G4_DstRegRegion* dst, G4_Operand* src0, G4_Operand* src1,
1639 G4_InstOpts options, G4_Type accType)
1640 {
1641 auto maclInst = createInternalInst(
1642 nullptr, G4_mach, nullptr, g4::NOSAT, execSize, dst, src0, src1, options);
1643 const RegionDesc* rd = execSize > g4::SIMD1 ? getRegionStride1() : getRegionScalar();
1644 auto accSrc = createSrc(phyregpool.getAcc0Reg(), 0, 0, rd, accType);
1645 maclInst->setImplAccSrc(accSrc);
1646 return maclInst;
1647 }
1648
createMadm(G4_Predicate * pred,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_SrcRegRegion * src0,G4_SrcRegRegion * src1,G4_SrcRegRegion * src2,G4_InstOpts options)1649 G4_INST* IR_Builder::createMadm(
1650 G4_Predicate* pred,
1651 G4_ExecSize execSize,
1652 G4_DstRegRegion* dst,
1653 G4_SrcRegRegion* src0, G4_SrcRegRegion* src1, G4_SrcRegRegion* src2,
1654 G4_InstOpts options)
1655 {
1656 // madm is currently only created in vISA->Gen IR translation
1657 return createInst(
1658 pred, G4_madm, nullptr, g4::NOSAT, execSize,
1659 dst, src0, src1, src2, options, true);
1660 }
1661
createIf(G4_Predicate * prd,G4_ExecSize execSize,G4_InstOpts options)1662 G4_INST* IR_Builder::createIf(G4_Predicate* prd, G4_ExecSize execSize, G4_InstOpts options)
1663 {
1664 auto inst = createCFInst(prd, G4_if, execSize, nullptr, nullptr, options, true);
1665 return inst;
1666 }
1667
createElse(G4_ExecSize execSize,G4_InstOpts options)1668 G4_INST* IR_Builder::createElse(G4_ExecSize execSize, G4_InstOpts options)
1669 {
1670 auto inst = createCFInst(nullptr, G4_else, execSize, nullptr, nullptr, options, true);
1671 return inst;
1672 }
1673
createEndif(G4_ExecSize execSize,G4_InstOpts options)1674 G4_INST* IR_Builder::createEndif(G4_ExecSize execSize, G4_InstOpts options)
1675 {
1676 auto inst = createCFInst(nullptr, G4_endif, execSize, nullptr, nullptr, options, true);
1677 return inst;
1678 }
1679
// Create a label pseudo-instruction whose src0 is `label`. The instruction has
// no predicate, cond modifier or destination and uses an undefined exec size.
// appendToInstList selects whether it is appended to instList.
G4_INST* IR_Builder::createLabelInst(G4_Label* label, bool appendToInstList)
{
    if (appendToInstList)
    {
        return createInst(nullptr, G4_label, nullptr, g4::NOSAT, g4::SIMD_UNDEFINED,
            nullptr, label, nullptr, InstOpt_NoOpt, true);
    }
    else
    {
        // NOTE(review): this call passes one more argument than the
        // two-source createInternalInst defined in this file takes, so it
        // presumably resolves to a three-source overload with the two trailing
        // literal zeros acting as a null src2 and empty options - confirm
        // against the header before changing either literal.
        return createInternalInst(
            nullptr, G4_label, nullptr, g4::NOSAT, g4::SIMD_UNDEFINED,
            nullptr, label, nullptr, 0,
            0);
    }
}
1695
1696 // jmpTarget may be either a label (direct jmp) or scalar operand (indirect jmp)
createJmp(G4_Predicate * pred,G4_Operand * jmpTarget,G4_InstOpts options,bool appendToInstList)1697 G4_INST* IR_Builder::createJmp(
1698 G4_Predicate* pred,
1699 G4_Operand* jmpTarget, G4_InstOpts options,
1700 bool appendToInstList)
1701 {
1702 if (appendToInstList)
1703 {
1704 return createInst(pred, G4_jmpi, nullptr, g4::NOSAT, g4::SIMD1,
1705 nullptr, jmpTarget, nullptr, options, true);
1706 }
1707 else
1708 {
1709 return createInternalInst(pred, G4_jmpi, nullptr, g4::NOSAT, g4::SIMD1,
1710 nullptr, jmpTarget, nullptr, options);
1711 }
1712 }
1713
createInternalCFInst(G4_Predicate * prd,G4_opcode op,G4_ExecSize execSize,G4_Label * jip,G4_Label * uip,G4_InstOpts options)1714 G4_INST* IR_Builder::createInternalCFInst(
1715 G4_Predicate* prd, G4_opcode op, G4_ExecSize execSize,
1716 G4_Label* jip, G4_Label* uip,
1717 G4_InstOpts options)
1718 {
1719 MUST_BE_TRUE(G4_Inst_Table[op].instType == InstTypeFlow,
1720 "IR_Builder::createInternalCFInst must be used with InstTypeFlow instruction class");
1721
1722 auto ii = createCFInst(prd, op, execSize, jip, uip, options, false);
1723 return ii;
1724 }
1725
createCFInst(G4_Predicate * prd,G4_opcode op,G4_ExecSize execSize,G4_Label * jip,G4_Label * uip,G4_InstOpts options,bool addToInstList)1726 G4_INST* IR_Builder::createCFInst(
1727 G4_Predicate* prd, G4_opcode op, G4_ExecSize execSize,
1728 G4_Label* jip, G4_Label* uip,
1729 G4_InstOpts options,
1730 bool addToInstList)
1731 {
1732 MUST_BE_TRUE(G4_Inst_Table[op].instType == InstTypeFlow,
1733 "IR_Builder::createCFInst must be used with InstTypeFlow instruction class");
1734
1735 G4_InstCF* ii = new (mem)G4_InstCF(*this, prd, op, execSize, jip, uip, options);
1736
1737 if (addToInstList)
1738 {
1739 ii->setCISAOff(curCISAOffset);
1740
1741 if (m_options->getOption(vISA_EmitLocation))
1742 {
1743 ii->setLocation(allocateMDLocation(curLine, curFile));
1744 }
1745 instList.push_back(ii);
1746 }
1747
1748 instAllocList.push_back(ii);
1749
1750 return ii;
1751 }
1752
createDpasInst(G4_opcode opc,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_Operand * src3,G4_InstOpts options,GenPrecision A,GenPrecision W,uint8_t D,uint8_t C,bool addToInstList)1753 G4_INST* IR_Builder::createDpasInst(
1754 G4_opcode opc, G4_ExecSize execSize,
1755 G4_DstRegRegion* dst, G4_Operand* src0, G4_Operand* src1, G4_Operand* src2,
1756 G4_Operand* src3,
1757 G4_InstOpts options,
1758 GenPrecision A,
1759 GenPrecision W,
1760 uint8_t D,
1761 uint8_t C,
1762 bool addToInstList)
1763 {
1764 G4_INST* i = new (mem)G4_InstDpas(*this,
1765 opc, execSize, dst, src0, src1, src2, src3, options, A, W, D, C);
1766
1767 if (addToInstList)
1768 {
1769 i->setCISAOff(curCISAOffset);
1770 if (m_options->getOption(vISA_EmitLocation))
1771 {
1772 i->setLocation(allocateMDLocation(curLine, curFile));
1773 }
1774 instList.push_back(i);
1775 }
1776
1777 instAllocList.push_back(i);
1778
1779
1780 return i;
1781 }
1782
createInternalDpasInst(G4_opcode opc,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_Operand * src3,G4_InstOpts options,GenPrecision A,GenPrecision W,uint8_t D,uint8_t C)1783 G4_INST* IR_Builder::createInternalDpasInst(
1784 G4_opcode opc,
1785 G4_ExecSize execSize,
1786 G4_DstRegRegion* dst,
1787 G4_Operand* src0,
1788 G4_Operand* src1,
1789 G4_Operand* src2,
1790 G4_Operand* src3,
1791 G4_InstOpts options,
1792 GenPrecision A,
1793 GenPrecision W,
1794 uint8_t D,
1795 uint8_t C)
1796 {
1797 auto ii = createDpasInst(opc, execSize, dst, src0, src1, src2,
1798 nullptr, options, A, W, D, C, false);
1799
1800 return ii;
1801 }
1802
createBfnInst(uint8_t booleanFuncCtrl,G4_Predicate * prd,G4_CondMod * mod,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_InstOpts options,bool addToInstList)1803 G4_INST* IR_Builder::createBfnInst(
1804 uint8_t booleanFuncCtrl,
1805 G4_Predicate* prd,
1806 G4_CondMod* mod,
1807 G4_Sat sat,
1808 G4_ExecSize execSize,
1809 G4_DstRegRegion* dst,
1810 G4_Operand* src0,
1811 G4_Operand* src1,
1812 G4_Operand* src2,
1813 G4_InstOpts options,
1814 bool addToInstList)
1815 {
1816 G4_INST* i = new (mem)G4_InstBfn(*this,
1817 prd, mod, sat, execSize, dst, src0, src1, src2, options, booleanFuncCtrl);
1818
1819 if (addToInstList)
1820 {
1821 i->setCISAOff(curCISAOffset);
1822
1823 if (m_options->getOption(vISA_EmitLocation))
1824 {
1825 i->setLocation(allocateMDLocation(curLine, curFile));
1826 }
1827 instList.push_back(i);
1828 }
1829
1830 instAllocList.push_back(i);
1831
1832 return i;
1833 }
1834
createInternalBfnInst(uint8_t booleanFuncCtrl,G4_Predicate * prd,G4_CondMod * mod,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_InstOpts options)1835 G4_INST* IR_Builder::createInternalBfnInst(
1836 uint8_t booleanFuncCtrl,
1837 G4_Predicate* prd,
1838 G4_CondMod* mod,
1839 G4_Sat sat,
1840 G4_ExecSize execSize,
1841 G4_DstRegRegion* dst,
1842 G4_Operand* src0,
1843 G4_Operand* src1,
1844 G4_Operand* src2,
1845 G4_InstOpts options)
1846 {
1847 auto ii = createBfnInst(
1848 booleanFuncCtrl, prd, mod, sat, execSize, dst, src0, src1, src2, options, false);
1849
1850 return ii;
1851 }
1852
1853 //scratch surfaces, write r0.5 to message descriptor
1854 //exdesc holds the value of the extended message descriptor for bit [0:11]
1855 // kernel entry:
1856 // and (1) tmp<1>:ud r0.5<0;1,0>:ud 0xFFFFFC00:ud {NoMask}
1857 // before send message:
1858 // shl (1) a0.0<1>:ud tmp<1>:ud 0x2 {NoMask}
1859 // (for old exDesc format) add (1) a0.0<1>:ud tmp<1>:ud exDesc:ud {NoMask}
1860 // returns a0.0<0;1,0>:ud
createScratchExDesc(uint32_t exdesc)1861 G4_SrcRegRegion* IR_Builder::createScratchExDesc(uint32_t exdesc)
1862 {
1863 const char* buf = getNameString(mem, 20, "ExDesc%d", num_temp_dcl++);
1864 G4_Declare* exDescDecl = createDeclareNoLookup(buf, G4_ADDRESS, 1, 1, Type_UD);
1865 exDescDecl->setSubRegAlign(Four_Word);
1866
1867 // copy r0.5[10:31] to a0[12:31] or a0[6:31] for the new format
1868 initScratchSurfaceOffset();
1869
1870 if (!useNewExtDescFormat())
1871 {
1872
1873 // (W) shl (1) a0.0 sso 0x2
1874 auto shlSrc0 = createSrcRegRegion(scratchSurfaceOffset, getRegionScalar());
1875 auto shlDst = createDstRegRegion(exDescDecl, 1);
1876 createBinOp(G4_shl, g4::SIMD1, shlDst, shlSrc0, createImm(0x2, Type_UW), InstOpt_WriteEnable, true);
1877
1878 G4_DstRegRegion* dst = createDstRegRegion(exDescDecl, 1);
1879 createBinOp(G4_add, g4::SIMD1, dst, createSrcRegRegion(exDescDecl, getRegionScalar()),
1880 createImm(exdesc, Type_UD), InstOpt_WriteEnable, true);
1881 }
1882 else
1883 {
1884 // (W) shr (1) a0.0 ss0 0x4
1885 auto shrSrc0 = createSrcRegRegion(scratchSurfaceOffset, getRegionScalar());
1886 auto shrDst = createDstRegRegion(exDescDecl, 1);
1887 createBinOp(G4_shr, g4::SIMD1, shrDst, shrSrc0, createImm(0x4, Type_UW), InstOpt_WriteEnable, true);
1888 }
1889 return createSrcRegRegion(exDescDecl, getRegionScalar());
1890 }
1891
createInst(G4_Predicate * prd,G4_opcode op,G4_CondMod * mod,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_InstOpts options,bool addToInstList)1892 G4_INST* IR_Builder::createInst(
1893 G4_Predicate* prd,
1894 G4_opcode op,
1895 G4_CondMod* mod,
1896 G4_Sat sat,
1897 G4_ExecSize execSize,
1898 G4_DstRegRegion* dst,
1899 G4_Operand* src0,
1900 G4_Operand* src1,
1901 G4_Operand* src2,
1902 G4_InstOpts options,
1903 bool addToInstList)
1904 {
1905 MUST_BE_TRUE(op != G4_math && G4_Inst_Table[op].instType != InstTypeFlow,
1906 "IR_Builder::createInst should not be used to create math/CF instructions");
1907
1908 if (op == G4_madw)
1909 {
1910 MUST_BE_TRUE(getPlatform() >= GENX_PVC || execSize != g4::SIMD32, "SIMD32 is not supported on this platform for madw");
1911 }
1912
1913 G4_INST* i = NULL;
1914
1915 i = new (mem)G4_INST(*this, prd, op, mod, sat, execSize, dst, src0, src1, src2, options);
1916
1917 if (addToInstList)
1918 {
1919 i->setCISAOff(curCISAOffset);
1920
1921 if (m_options->getOption(vISA_EmitLocation))
1922 {
1923 i->setLocation(allocateMDLocation(curLine, curFile));
1924 }
1925
1926 instList.push_back(i);
1927 }
1928
1929 instAllocList.push_back(i);
1930
1931 return i;
1932 }
1933
1934 // same as above, except we don't add it to the Builder's instList
createInternalInst(G4_Predicate * prd,G4_opcode op,G4_CondMod * mod,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_InstOpts options)1935 G4_INST* IR_Builder::createInternalInst(
1936 G4_Predicate* prd,
1937 G4_opcode op,
1938 G4_CondMod* mod,
1939 G4_Sat sat,
1940 G4_ExecSize execSize,
1941 G4_DstRegRegion* dst,
1942 G4_Operand* src0,
1943 G4_Operand* src1,
1944 G4_Operand* src2,
1945 G4_InstOpts options)
1946 {
1947 auto ii = createInst(
1948 prd, op, mod, sat, execSize,
1949 dst, src0, src1, src2, options, false);
1950
1951 return ii;
1952
1953 }
1954
createSendInst(G4_Predicate * prd,G4_opcode op,G4_ExecSize execSize,G4_DstRegRegion * postDst,G4_SrcRegRegion * currSrc,G4_Operand * msg,G4_InstOpts options,G4_SendDesc * msgDesc,bool addToInstList)1955 G4_InstSend* IR_Builder::createSendInst(
1956 G4_Predicate* prd,
1957 G4_opcode op,
1958 G4_ExecSize execSize,
1959 G4_DstRegRegion* postDst,
1960 G4_SrcRegRegion* currSrc,
1961 G4_Operand* msg,
1962 G4_InstOpts options,
1963 G4_SendDesc *msgDesc,
1964 bool addToInstList)
1965 {
1966
1967 assert (msgDesc && "msgDesc must not be null");
1968 G4_InstSend* m = new (mem)G4_InstSend(
1969 *this, prd, op, execSize, postDst, currSrc, msg, options, msgDesc);
1970
1971 if (addToInstList)
1972 {
1973 m->setCISAOff(curCISAOffset);
1974
1975 if (m_options->getOption(vISA_EmitLocation))
1976 {
1977 m->setLocation(allocateMDLocation(curLine, curFile));
1978 }
1979
1980 instList.push_back(m);
1981 }
1982
1983 instAllocList.push_back(m);
1984
1985 return m;
1986 }
1987
createInternalSendInst(G4_Predicate * prd,G4_opcode op,G4_ExecSize execSize,G4_DstRegRegion * postDst,G4_SrcRegRegion * currSrc,G4_Operand * msg,G4_InstOpts options,G4_SendDesc * msgDesc)1988 G4_InstSend* IR_Builder::createInternalSendInst(
1989 G4_Predicate* prd,
1990 G4_opcode op,
1991 G4_ExecSize execSize,
1992 G4_DstRegRegion* postDst,
1993 G4_SrcRegRegion* currSrc,
1994 G4_Operand* msg,
1995 G4_InstOpts options,
1996 G4_SendDesc *msgDesc)
1997 {
1998 auto ii = createSendInst(prd, op, execSize,
1999 postDst, currSrc,
2000 msg, options, msgDesc, false);
2001
2002 return ii;
2003 }
2004
2005 //
2006 // Create a split send (sends) instruction
2007 // sends (size) dst src0 src1 exDesc msgDesc
2008 //
2009
createSplitSendInst(G4_Predicate * prd,G4_opcode op,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_SrcRegRegion * src0,G4_SrcRegRegion * src1,G4_Operand * msg,G4_InstOpts options,G4_SendDesc * msgDesc,G4_Operand * src3,bool addToInstList)2010 G4_InstSend* IR_Builder::createSplitSendInst(
2011 G4_Predicate* prd,
2012 G4_opcode op,
2013 G4_ExecSize execSize,
2014 G4_DstRegRegion* dst,
2015 G4_SrcRegRegion* src0, // can be header
2016 G4_SrcRegRegion* src1,
2017 G4_Operand* msg, // msg descriptor: imm or vec
2018 G4_InstOpts options,
2019 G4_SendDesc* msgDesc,
2020 G4_Operand* src3, // ext msg desciptor: imm or vec
2021 bool addToInstList)
2022 {
2023
2024 if (!src1)
2025 {
2026 // src1 may be null if we need to force generate split send (e.g., for bindless surfaces)
2027 MUST_BE_TRUE(msgDesc->getSrc1LenRegs() == 0, "src1 length must be 0 if it is null");
2028 src1 = createNullSrc(Type_UD);
2029 }
2030 if (!src3 && msgDesc->isRaw())
2031 {
2032 src3 = createImm(((G4_SendDescRaw *)msgDesc)->getExtendedDesc(), Type_UD);
2033 }
2034 G4_InstSend* m = new (mem) G4_InstSend(
2035 *this, prd, op, execSize, dst, src0, src1, msg, src3, options, msgDesc);
2036
2037 if (addToInstList)
2038 {
2039 m->setCISAOff(curCISAOffset);
2040
2041 if (m_options->getOption(vISA_EmitLocation))
2042 {
2043 m->setLocation(allocateMDLocation(curLine, curFile));
2044 }
2045 instList.push_back(m);
2046 }
2047
2048 instAllocList.push_back(m);
2049
2050 return m;
2051 }
2052
createInternalSplitSendInst(G4_ExecSize execSize,G4_DstRegRegion * dst,G4_SrcRegRegion * src0,G4_SrcRegRegion * src1,G4_Operand * msg,G4_InstOpts options,G4_SendDesc * msgDesc,G4_Operand * src3)2053 G4_InstSend* IR_Builder::createInternalSplitSendInst(
2054 G4_ExecSize execSize,
2055 G4_DstRegRegion* dst,
2056 G4_SrcRegRegion* src0, // can be header
2057 G4_SrcRegRegion* src1,
2058 G4_Operand* msg, // msg descriptor: imm or vec
2059 G4_InstOpts options,
2060 G4_SendDesc* msgDesc,
2061 G4_Operand* src3) // ext msg desciptor: imm or vec)
2062 {
2063 auto ii = createSplitSendInst(nullptr, G4_sends, execSize, dst, src0, src1, msg, options,
2064 msgDesc, src3, false);
2065
2066 return ii;
2067 }
2068
2069 //
2070 // Math instruction is like a generic one except:
2071 // -- it takes a G4_MathOp to specify the function control
2072 // -- conditional modifier is not allowed
2073 // -- there are additional restrictions on dst/src regions that will be checked in HW conformity
2074 //
createMathInst(G4_Predicate * prd,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_MathOp mathOp,G4_InstOpts options,bool addToInstList)2075 G4_INST* IR_Builder::createMathInst(
2076 G4_Predicate* prd,
2077 G4_Sat sat,
2078 G4_ExecSize execSize,
2079 G4_DstRegRegion* dst,
2080 G4_Operand* src0,
2081 G4_Operand* src1,
2082 G4_MathOp mathOp,
2083 G4_InstOpts options,
2084 bool addToInstList)
2085 {
2086 G4_INST* i = new (mem)G4_InstMath(
2087 *this, prd, G4_math, NULL, sat, execSize, dst, src0, src1, options, mathOp);
2088
2089 if (addToInstList)
2090 {
2091 i->setCISAOff(curCISAOffset);
2092
2093 if (m_options->getOption(vISA_EmitLocation))
2094 {
2095 i->setLocation(allocateMDLocation(curLine, curFile));
2096 }
2097 instList.push_back(i);
2098 }
2099
2100 instAllocList.push_back(i);
2101
2102 return i;
2103 }
2104
createInternalMathInst(G4_Predicate * prd,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_MathOp mathOp,G4_InstOpts options)2105 G4_INST* IR_Builder::createInternalMathInst(
2106 G4_Predicate* prd,
2107 G4_Sat sat,
2108 G4_ExecSize execSize,
2109 G4_DstRegRegion* dst,
2110 G4_Operand* src0,
2111 G4_Operand* src1,
2112 G4_MathOp mathOp,
2113 G4_InstOpts options)
2114 {
2115 auto ii = createMathInst(prd, sat, execSize, dst, src0, src1, mathOp, options, false);
2116 return ii;
2117 }
2118
createIntrinsicInst(G4_Predicate * prd,Intrinsic intrinId,G4_ExecSize size,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_InstOpts options,bool addToInstList)2119 G4_INST* IR_Builder::createIntrinsicInst(
2120 G4_Predicate* prd, Intrinsic intrinId,
2121 G4_ExecSize size,
2122 G4_DstRegRegion* dst, G4_Operand* src0, G4_Operand* src1, G4_Operand* src2,
2123 G4_InstOpts options, bool addToInstList)
2124 {
2125 G4_INST* i = nullptr;
2126
2127 if (intrinId == Intrinsic::Spill)
2128 i = new (mem) G4_SpillIntrinsic(*this, prd, intrinId, size, dst, src0, src1, src2, options);
2129 else if (intrinId == Intrinsic::Fill)
2130 i = new (mem) G4_FillIntrinsic(*this, prd, intrinId, size, dst, src0, src1, src2, options);
2131 else
2132 i = new (mem) G4_InstIntrinsic(*this, prd, intrinId, size, dst, src0, src1, src2, options);
2133
2134 if (addToInstList)
2135 {
2136 i->setCISAOff(curCISAOffset);
2137
2138 if (m_options->getOption(vISA_EmitLocation))
2139 {
2140 i->setLocation(allocateMDLocation(curLine, curFile));
2141 }
2142
2143 instList.push_back(i);
2144 }
2145
2146 instAllocList.push_back(i);
2147
2148 return i;
2149 }
2150
createInternalIntrinsicInst(G4_Predicate * prd,Intrinsic intrinId,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_InstOpts options)2151 G4_INST* IR_Builder::createInternalIntrinsicInst(
2152 G4_Predicate* prd, Intrinsic intrinId, G4_ExecSize execSize,
2153 G4_DstRegRegion* dst, G4_Operand* src0, G4_Operand* src1, G4_Operand* src2,
2154 G4_InstOpts options)
2155 {
2156 auto ii = createIntrinsicInst(prd, intrinId, execSize, dst, src0, src1, src2, options, false);
2157
2158 return ii;
2159 }
2160
createIntrinsicAddrMovInst(Intrinsic intrinId,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_Operand * src3,G4_Operand * src4,G4_Operand * src5,G4_Operand * src6,G4_Operand * src7,bool addToInstList)2161 G4_INST* IR_Builder::createIntrinsicAddrMovInst(
2162 Intrinsic intrinId,
2163 G4_DstRegRegion* dst,
2164 G4_Operand* src0, G4_Operand* src1, G4_Operand* src2, G4_Operand* src3,
2165 G4_Operand* src4, G4_Operand* src5, G4_Operand* src6, G4_Operand* src7,
2166 bool addToInstList)
2167 {
2168 G4_INST* i = nullptr;
2169 assert(intrinId == Intrinsic::PseudoAddrMov && "expect pseudo_mov op");
2170
2171 i = new (mem) G4_PseudoAddrMovIntrinsic(*this, intrinId, dst, src0, src1, src2, src3, src4, src5, src6, src7);
2172
2173 if (addToInstList)
2174 {
2175 i->setCISAOff(curCISAOffset);
2176
2177 if (m_options->getOption(vISA_EmitLocation))
2178 {
2179 i->setLocation(allocateMDLocation(curLine, curFile));
2180 }
2181
2182 instList.push_back(i);
2183 }
2184
2185 instAllocList.push_back(i);
2186
2187 return i;
2188 }
2189
Get_MathFuncCtrl(ISA_Opcode op,G4_Type type)2190 G4_MathOp IR_Builder::Get_MathFuncCtrl(ISA_Opcode op, G4_Type type)
2191 {
2192 switch (op)
2193 {
2194 case ISA_LOG:
2195 return MATH_LOG;
2196 case ISA_MOD: // remainder of IDIV
2197 return MATH_INT_DIV_REM;
2198 case ISA_POW:
2199 return MATH_POW;
2200 case ISA_SIN:
2201 return MATH_SIN;
2202 case ISA_COS:
2203 return MATH_COS;
2204 case ISA_SQRT:
2205 return MATH_SQRT;
2206 case ISA_RSQRT:
2207 return MATH_RSQ;
2208 case ISA_INV:
2209 return MATH_INV;
2210 case ISA_DIV:
2211 return IS_FTYPE(type) || IS_HFTYPE(type) ? MATH_FDIV : MATH_INT_DIV_QUOT;
2212 case ISA_EXP:
2213 return MATH_EXP;
2214 default:
2215 ASSERT_USER(0, "Illegal math opcode.");
2216 return MATH_RESERVED;
2217 }
2218 }
2219
2220 // After building IR total number number of rows required
2221 // for arg and retvar become known, so resize the pre-defined
2222 // vars here to the max required in current compilation unit.
resizePredefinedStackVars()2223 void IR_Builder::resizePredefinedStackVars()
2224 {
2225 getStackCallArg()->resizeNumRows(this->getArgSize());
2226 getStackCallRet()->resizeNumRows(this->getRetVarSize());
2227 }
2228
duplicateOpndImpl(G4_Operand * opnd)2229 G4_Operand* IR_Builder::duplicateOpndImpl(G4_Operand* opnd)
2230 {
2231 if (!opnd || opnd->isImm())
2232 return opnd;
2233 if (opnd->isSrcRegRegion()) {
2234 return createSrcRegRegion(*(opnd->asSrcRegRegion()));
2235 }
2236 else if (opnd->isDstRegRegion()) {
2237 return createDstRegRegion(*(opnd->asDstRegRegion()));
2238 }
2239 else if (opnd->isPredicate()) {
2240 return createPredicate(*(opnd->asPredicate()));
2241 }
2242 else if (opnd->isCondMod()) {
2243 return createCondMod(*(opnd->asCondMod()));
2244 }
2245 else {
2246 return opnd;
2247 }
2248 }
2249
2250 /*
2251 * Create send instruction for specified GenX architecture.
2252 * bti: surface id
2253 * sti: sampler id
2254 */
createSendInst(G4_Predicate * pred,G4_DstRegRegion * postDst,G4_SrcRegRegion * payload,unsigned regs2snd,unsigned regs2rcv,G4_ExecSize execSize,unsigned fc,SFID tf_id,bool header_present,SendAccess access,G4_Operand * bti,G4_Operand * sti,G4_InstOpts options,bool is_sendc)2255 G4_InstSend* IR_Builder::createSendInst(
2256 G4_Predicate* pred,
2257 G4_DstRegRegion *postDst,
2258 G4_SrcRegRegion *payload,
2259 unsigned regs2snd,
2260 unsigned regs2rcv,
2261 G4_ExecSize execSize,
2262 unsigned fc,
2263 SFID tf_id,
2264 bool header_present,
2265 SendAccess access,
2266 G4_Operand* bti,
2267 G4_Operand* sti,
2268 G4_InstOpts options,
2269 bool is_sendc)
2270 {
2271 G4_SendDescRaw* msgDesc =
2272 createSendMsgDesc(fc, regs2rcv, regs2snd, tf_id, 0, 0, access,
2273 bti, sti);
2274
2275 msgDesc->setHeaderPresent(header_present);
2276
2277 return createSendInst(
2278 pred, postDst, payload, execSize, msgDesc, options, is_sendc);
2279 }
2280
2281 //bindless surfaces, write the content of T252 to extended message descriptor
2282 //exdesc holds the value of the extended message descriptor for bit [0:11]
2283 //add (1) a0.2<1>:ud T252<1>:ud exDesc:ud {NoMask}
2284 // returns a0.2<0;1,0>:ud
createBindlessExDesc(uint32_t exdesc)2285 G4_SrcRegRegion* IR_Builder::createBindlessExDesc(uint32_t exdesc)
2286 {
2287 G4_InstOpts dbgOpt = m_options->getOption(vISA_markSamplerMoves) ? InstOpt_BreakPoint : InstOpt_NoOpt;
2288 // virtual var for each exdesc
2289 G4_SrcRegRegion* T252 = createSrcRegRegion(builtinT252, getRegionScalar());
2290 const char* buf = getNameString(mem, 20, "ExDesc%d", num_temp_dcl++);
2291 G4_Declare* exDescDecl = createDeclareNoLookup(buf, G4_ADDRESS, 1, 1, Type_UD);
2292 exDescDecl->setSubRegAlign(Four_Word);
2293 G4_DstRegRegion* dst = createDstRegRegion(exDescDecl, 1);
2294 if (useNewExtDescFormat())
2295 {
2296 createMov(g4::SIMD1, dst, T252, InstOpt_WriteEnable | dbgOpt, true);
2297 }
2298 else
2299 {
2300 createBinOp(G4_add, g4::SIMD1, dst, T252, createImm(exdesc, Type_UD), InstOpt_WriteEnable, true);
2301 }
2302 return createSrcRegRegion(exDescDecl, getRegionScalar());
2303 }
2304
2305
2306 /*
2307 *
2308 * this does two things:
2309 * -- If send has exec size 16, its destination must have Type W.
2310 * -- avoid using Q/UQ type on CHV/BXT
2311 */
fixSendDstType(G4_DstRegRegion * dst,G4_ExecSize execSize)2312 static void fixSendDstType(G4_DstRegRegion* dst, G4_ExecSize execSize)
2313 {
2314 MUST_BE_TRUE(dst->getRegAccess() == Direct, "Send dst must be a direct operand");
2315
2316 MUST_BE_TRUE(dst->getSubRegOff() == 0, "dst may not have a non-zero subreg offset");
2317
2318 // normally we should create a new alias for dst's declare, but since it's a send
2319 // type mismatch between operand and decl should not matter
2320 if (execSize == g4::SIMD16 && dst->getType() != Type_W && dst->getType() != Type_UW)
2321 {
2322 dst->setType(Type_W);
2323 }
2324
2325 if (dst->getType() == Type_HF)
2326 {
2327 dst->setType(Type_W);
2328 }
2329 }
2330
2331
createSendInst(G4_Predicate * pred,G4_DstRegRegion * postDst,G4_SrcRegRegion * payload,G4_ExecSize execsize,G4_SendDescRaw * msgDesc,G4_InstOpts option,bool is_sendc)2332 G4_InstSend *IR_Builder::createSendInst(
2333 G4_Predicate *pred,
2334 G4_DstRegRegion *postDst,
2335 G4_SrcRegRegion *payload,
2336 G4_ExecSize execsize,
2337 G4_SendDescRaw *msgDesc,
2338 G4_InstOpts option,
2339 bool is_sendc)
2340 {
2341 G4_opcode send_opcode= is_sendc ? G4_sendc : G4_send;
2342
2343 fixSendDstType(postDst, execsize);
2344
2345 uint32_t desc = msgDesc->getDesc();
2346 G4_Operand *bti = msgDesc->getSurface();
2347 G4_Operand *sti = msgDesc->getSti();
2348 G4_Operand *descOpnd = NULL;
2349
2350 bool needSamplerMove = sti && !sti->isImm() && !isBindlessSampler(sti);
2351
2352 if ((bti && !bti->isImm()) || needSamplerMove)
2353 {
2354 // use a0.0 directly
2355 G4_DstRegRegion* addr_dst_opnd = createDstRegRegion(builtinA0, 1);
2356
2357 if (bti && !bti->isImm())
2358 {
2359 //add (1) a0.0:ud bti:ud desc:ud
2360 // create source for bti
2361 createBinOp(
2362 G4_add,
2363 g4::SIMD1,
2364 addr_dst_opnd,
2365 bti,
2366 createImm(desc, Type_UD),
2367 InstOpt_WriteEnable,
2368 true);
2369 }
2370
2371 if (needSamplerMove)
2372 {
2373 G4_Declare *dcl1 = createTempVar(1, Type_UD, Any);
2374 G4_DstRegRegion* tmp_dst_opnd = createDstRegRegion(dcl1, 1);
2375
2376 createBinOp(
2377 G4_shl,
2378 g4::SIMD1,
2379 tmp_dst_opnd,
2380 sti,
2381 createImm(8, Type_UD),
2382 InstOpt_WriteEnable,
2383 true);
2384
2385 G4_SrcRegRegion* tmp_src_opnd = createSrcRegRegion(dcl1, getRegionScalar());
2386
2387 if (!bti || bti->isImm())
2388 {
2389 createBinOp(
2390 G4_add,
2391 g4::SIMD1,
2392 addr_dst_opnd,
2393 tmp_src_opnd,
2394 createImm(desc, Type_UD),
2395 InstOpt_WriteEnable,
2396 true);
2397 }
2398 else
2399 {
2400 G4_SrcRegRegion* addr_src_opnd = createSrcRegRegion(builtinA0, getRegionScalar());
2401
2402 createBinOp(
2403 G4_add,
2404 g4::SIMD1,
2405 duplicateOperand(addr_dst_opnd),
2406 addr_src_opnd,
2407 tmp_src_opnd,
2408 InstOpt_WriteEnable,
2409 true);
2410 }
2411 }
2412
2413 descOpnd = createSrcRegRegion(builtinA0, getRegionScalar());
2414 }
2415 else
2416 {
2417 descOpnd = createImm(desc, Type_UD);
2418 }
2419
2420 return createSendInst(
2421 pred,
2422 send_opcode,
2423 execsize,
2424 postDst,
2425 payload,
2426 descOpnd,
2427 option,
2428 msgDesc,
2429 true);
2430 }
2431
2432 /*
2433 * Create split send instruction for specified GenX architecture.
2434 * bti: surface id
2435 * sti: sampler id
2436 * Gen9: sends (execsize) dst, src1, src2, ex_desc, desc
2437 */
createSplitSendInst(G4_Predicate * pred,G4_DstRegRegion * dst,G4_SrcRegRegion * src1,unsigned regs2snd1,G4_SrcRegRegion * src2,unsigned regs2snd2,unsigned regs2rcv,G4_ExecSize execSize,unsigned fc,SFID tf_id,bool header_present,SendAccess access,G4_Operand * bti,G4_Operand * sti,G4_InstOpts options,bool is_sendc)2438 G4_InstSend* IR_Builder::createSplitSendInst(
2439 G4_Predicate* pred,
2440 G4_DstRegRegion *dst,
2441 G4_SrcRegRegion *src1,
2442 unsigned regs2snd1,
2443 G4_SrcRegRegion *src2,
2444 unsigned regs2snd2,
2445 unsigned regs2rcv,
2446 G4_ExecSize execSize,
2447 unsigned fc,
2448 SFID tf_id,
2449 bool header_present,
2450 SendAccess access,
2451 G4_Operand* bti,
2452 G4_Operand* sti,
2453 G4_InstOpts options,
2454 bool is_sendc)
2455 {
2456 G4_SendDescRaw *msgDesc =
2457 createSendMsgDesc(fc, regs2rcv, regs2snd1, tf_id, regs2snd2,
2458 0, access, bti, sti);
2459
2460 msgDesc->setHeaderPresent(header_present);
2461
2462 return createSplitSendInst(pred, dst, src1, src2, execSize,
2463 msgDesc, options, is_sendc);
2464 }
2465
2466 // desc, if indirect, is constructed from the BTI/STI values in msgDesc and is always a0.0
createSplitSendInst(G4_Predicate * pred,G4_DstRegRegion * dst,G4_SrcRegRegion * src1,G4_SrcRegRegion * src2,G4_ExecSize execsize,G4_SendDescRaw * msgDesc,G4_InstOpts option,bool is_sendc)2467 G4_InstSend *IR_Builder::createSplitSendInst(
2468 G4_Predicate *pred,
2469 G4_DstRegRegion *dst,
2470 G4_SrcRegRegion *src1,
2471 G4_SrcRegRegion *src2,
2472 G4_ExecSize execsize,
2473 G4_SendDescRaw *msgDesc,
2474 G4_InstOpts option,
2475 bool is_sendc)
2476 {
2477 G4_opcode send_opcode = is_sendc ? G4_sendsc : G4_sends;
2478
2479 fixSendDstType(dst, execsize);
2480
2481 uint32_t desc = msgDesc->getDesc();
2482 uint32_t exdesc = msgDesc->getExtendedDesc();
2483 G4_Operand *bti = msgDesc->getSurface();
2484 G4_Operand *sti = msgDesc->getSti();
2485
2486 G4_Operand* descOpnd = NULL;
2487 G4_SrcRegRegion* extDescOpnd = nullptr;
2488
2489 bool doAlignBindlessSampler = alignBindlessSampler() && sti && isBindlessSampler(sti);
2490 bool needsSamplerMove = (sti && !sti->isImm() && !isBindlessSampler(sti)) || doAlignBindlessSampler;
2491
2492 bool needsSurfaceMove = false;
2493 bool needsA0ExDesc = false;
2494
2495 if (bti && bti->isSrcRegRegion())
2496 {
2497 if (isBindlessSurface(bti))
2498 {
2499 needsA0ExDesc = true;
2500 // set T252 as BTI
2501 if ((desc & 0xFF) != PREDEF_SURF_252)
2502 {
2503 desc = (desc & ~0xFF) | PREDEF_SURF_252;
2504 }
2505 }
2506 else if (isScratchSpace(bti))
2507 {
2508 // use BTI 251
2509 needsA0ExDesc = true;
2510 desc = (desc & ~0xFF) | 251;
2511 }
2512 else
2513 {
2514 needsSurfaceMove = true;
2515 }
2516 }
2517
2518 if (needsSurfaceMove)
2519 {
2520 //add (1) a0.0:ud bti:ud desc:ud
2521 G4_DstRegRegion* addrDstOpnd = createDstRegRegion(builtinA0, 1);
2522
2523 createBinOp(G4_add, g4::SIMD1, addrDstOpnd, bti,
2524 createImm(desc, Type_UD), InstOpt_WriteEnable, true);
2525 }
2526
2527 if (needsSamplerMove)
2528 {
2529 G4_Declare *dcl1 = createTempVar(1, Type_UD, Any);
2530
2531 if (doAlignBindlessSampler)
2532 {
2533 // check if address is 32-byte aligned
2534 // use STI = 0 for 32-byte aligned address, STI = 1 otherwise
2535 // (W) and (1) (nz)f0.0 null S31 0x10:uw
2536 G4_Declare* tmpFlag = createTempFlag(1);
2537 G4_CondMod* condMod = createCondMod(Mod_nz, tmpFlag->getRegVar(), 0);
2538 createInst(nullptr, G4_and, condMod, g4::NOSAT, g4::SIMD1, createNullDst(Type_UD),
2539 createSrcRegRegion(*(sti->asSrcRegRegion())), createImm(0x10, Type_UW), InstOpt_WriteEnable, true);
2540 // (W) (f0.0) sel (1) tmp:ud 0x100 0x0
2541 G4_Predicate* pred = createPredicate(PredState_Plus, tmpFlag->getRegVar(), 0);
2542 createInst(pred, G4_sel, nullptr, g4::NOSAT, g4::SIMD1, createDstRegRegion(dcl1, 1),
2543 createImm(0x100, Type_UW), createImm(0x0, Type_UW), InstOpt_WriteEnable, true);
2544 }
2545 else
2546 {
2547 // shl (1) tmp:ud sti:ud 0x8:uw
2548 G4_DstRegRegion* tmpDstOpnd = createDstRegRegion(dcl1, 1);
2549 createBinOp(G4_shl, g4::SIMD1, tmpDstOpnd, sti,
2550 createImm(8, Type_UD), InstOpt_WriteEnable, true);
2551 }
2552
2553 G4_SrcRegRegion* tmpSrcOpnd = createSrcRegRegion(dcl1, getRegionScalar());
2554 G4_DstRegRegion* addrDstOpnd = createDstRegRegion(builtinA0, 1);
2555 if (!needsSurfaceMove)
2556 {
2557 // add (1) a0.0 tmp:ud desc:ud
2558 createBinOp(G4_add, g4::SIMD1, addrDstOpnd, tmpSrcOpnd,
2559 createImm(desc, Type_UD),
2560 InstOpt_WriteEnable,
2561 true);
2562 }
2563 else
2564 {
2565 // add (1) a0.0 a0.0:ud tmp:ud
2566 G4_SrcRegRegion* addrSrcOpnd = createSrcRegRegion(builtinA0, getRegionScalar());
2567 createBinOp(G4_add, g4::SIMD1, addrDstOpnd, addrSrcOpnd,
2568 tmpSrcOpnd, InstOpt_WriteEnable, true);
2569 }
2570 }
2571
2572 if (needsSurfaceMove || needsSamplerMove)
2573 {
2574 descOpnd = createSrcRegRegion(builtinA0, getRegionScalar());
2575 }
2576 else
2577 {
2578 descOpnd = createImm(desc, Type_UD);
2579 }
2580
2581 if (needsA0ExDesc)
2582 {
2583 extDescOpnd = isBindlessSurface(bti) ? createBindlessExDesc(exdesc) : createScratchExDesc(exdesc);
2584 }
2585 else
2586 {
2587 // do nothing as the extended msg desc will just be a null operand
2588 }
2589
2590 return createSplitSendInst(pred, send_opcode, execsize,
2591 dst, src1, src2,
2592 descOpnd,
2593 option, msgDesc, extDescOpnd, true);
2594 }
2595
createLscMsgDesc(LSC_OP op,LSC_SFID lscSfid,VISA_Exec_Size execSizeEnum,LSC_CACHE_OPTS cacheOpts,LSC_ADDR addr,LSC_DATA_SHAPE shape,G4_Operand * surface,uint32_t dstLen,uint32_t addrRegs)2596 G4_SendDescRaw* IR_Builder::createLscMsgDesc(
2597 LSC_OP op,
2598 LSC_SFID lscSfid,
2599 VISA_Exec_Size execSizeEnum,
2600 LSC_CACHE_OPTS cacheOpts,
2601 LSC_ADDR addr,
2602 LSC_DATA_SHAPE shape,
2603 G4_Operand *surface,
2604 uint32_t dstLen,
2605 uint32_t addrRegs)
2606 {
2607 // Desc[5:0] = OPCODE {LOAD{_BLOCK,_QUAD},STORE{_BLOCK,_QUAD},ATOMIC*}
2608 // Desc[8:7] = addr size
2609 // Desc[11:9] = data size
2610 // Desc[15:12] = data vector size (or cmask if *_QUAD)
2611 // Desc[19:17] = caching controls (see the table for allowable combinations)
2612 // Desc[30:29] = addr model (BTI = 3, SS = 2, BSS = 1, FLAT = 0)
2613 int status = VISA_SUCCESS;
2614 uint32_t desc = 0;
2615 uint32_t exDesc = 0;
2616 const auto opInfo = LscOpInfoGet(op);
2617 MUST_BE_TRUE(!opInfo.isBlock2D(), "block2d has a different layout");
2618 desc |= opInfo.encoding; // Desc[5:0]
2619
2620 lscEncodeAddrSize(addr.size, desc, status); // Desc[8:7]
2621
2622 int dataSizeBits = lscEncodeDataSize(shape.size, desc, status); // Desc[11:9]
2623
2624 // Desc[15:12]
2625 int vecSize; // definitely assigned
2626 if (!opInfo.hasChMask())
2627 {
2628 vecSize = lscEncodeDataElems(shape.elems, desc, status);
2629 lscEncodeDataOrder(shape.order, desc, status);
2630 }
2631 else
2632 {
2633 MUST_BE_TRUE(shape.chmask, "channel mask must not be empty");
2634 vecSize = 0;
2635 if (shape.chmask & LSC_DATA_CHMASK_X)
2636 {
2637 desc |= 1 << 12;
2638 vecSize++;
2639 }
2640 if (shape.chmask & LSC_DATA_CHMASK_Y)
2641 {
2642 desc |= 1 << 13;
2643 vecSize++;
2644 }
2645 if (shape.chmask & LSC_DATA_CHMASK_Z)
2646 {
2647 desc |= 1 << 14;
2648 vecSize++;
2649 }
2650 if (shape.chmask & LSC_DATA_CHMASK_W)
2651 {
2652 desc |= 1 << 15;
2653 vecSize++;
2654 }
2655 }
2656
2657 lscEncodeCachingOpts(opInfo, cacheOpts, desc, status); // Desc[19:17]
2658 lscEncodeAddrType(addr.type, desc, status); // Desc[30:29]
2659
2660 desc |= dstLen << 20; // Desc[24:20] dst len
2661 desc |= addrRegs << 25; // Desc[29:25] src0 len
2662
2663 // promote any immediate surface to the extended descriptor
2664 // ExDesc[31:12]
2665 if (surface && surface->isImm()) {
2666 auto surfaceImm = (uint32_t)surface->asImm()->getImm();
2667 if (addr.type == LSC_ADDR_TYPE_BTI) {
2668 // promote the immediate BTI to the descriptor
2669 exDesc |= surfaceImm << 24;
2670 surface = nullptr;
2671 }
2672 else if (
2673 addr.type == LSC_ADDR_TYPE_BSS ||
2674 addr.type == LSC_ADDR_TYPE_SS)
2675 {
2676 if ((surfaceImm & 0x3FF) == 0) {
2677 exDesc |= surfaceImm;
2678 surface = nullptr;
2679 }
2680 }
2681 else {
2682 // flat address type
2683 MUST_BE_TRUE(surface->isNullReg() ||
2684 surfaceImm == PREDEFINED_SURFACE_SLM ||
2685 surfaceImm == PREDEFINED_SURFACE_T255, // not sure what's up here
2686 "flat address type must have null reg (or 0)");
2687 surface = nullptr;
2688 }
2689 }
2690
2691 MUST_BE_TRUE(addr.immOffset == 0,
2692 "invalid address immediate offset");
2693
2694 SFID sfid = LSC_SFID_To_SFID(lscSfid);
2695
2696 const unsigned execSize = Get_VISA_Exec_Size(execSizeEnum);
2697 int src1Len = 0;
2698 uint32_t dataRegs = 1;
2699 bool isBlock2D =
2700 op == LSC_OP::LSC_LOAD_BLOCK2D ||
2701 op == LSC_OP::LSC_STORE_BLOCK2D;
2702 MUST_BE_TRUE(!isBlock2D, "block2d not implemented yet");
2703
2704 if (shape.order == LSC_DATA_ORDER_NONTRANSPOSE) {
2705 // Non-transpose case is the typical case.
2706 //
2707 // ceil[ SIMT32*dataSize(b)/512(b/REG) ] * vecSize
2708 // units = (b/b*REG) = REG
2709 dataRegs = std::max<uint32_t>(1,
2710 execSize*dataSizeBits / 8 / COMMON_ISA_GRF_REG_SIZE)*vecSize;
2711 }
2712 else
2713 { // if (shape.transpose == LSC_DATA_TRANSPOSE) {
2714 // The transpose case is a little odder
2715 // So the data size is the SIMD size (ExecSize) times the number of
2716 // registers consumed by each vector sequence (always a full
2717 // register number per seq).
2718 uint32_t regsPerVec = vecSize * dataSizeBits / 8 / COMMON_ISA_GRF_REG_SIZE;
2719 if (vecSize*dataSizeBits / 8 % COMMON_ISA_GRF_REG_SIZE)
2720 regsPerVec++; // pad out to full reg
2721 dataRegs = regsPerVec * execSize;
2722 }
2723
2724 // override sizes for special cases
2725 if (op == LSC_OP::LSC_LOAD_STATUS)
2726 {
2727 dataRegs = 1; // just returns a bitset
2728 }
2729
2730 if (opInfo.isLoad())
2731 {
2732 src1Len = 0;
2733 }
2734 else if (opInfo.isStore())
2735 {
2736 src1Len = (int)dataRegs;
2737 }
2738
2739 SendAccess access = opInfo.isLoad() && opInfo.isStore() ?
2740 SendAccess::READ_WRITE : (opInfo.isLoad() ? SendAccess::READ_ONLY : SendAccess::WRITE_ONLY);
2741
2742 G4_SendDescRaw *g4desc = createSendMsgDesc(
2743 sfid,
2744 desc,
2745 exDesc,
2746 src1Len,
2747 access,
2748 surface);
2749 return g4desc;
2750 }
2751
createLscDesc(SFID sfid,uint32_t desc,uint32_t extDesc,int src1Len,SendAccess access,G4_Operand * bti)2752 G4_SendDescRaw * IR_Builder::createLscDesc(
2753 SFID sfid,
2754 uint32_t desc,
2755 uint32_t extDesc,
2756 int src1Len,
2757 SendAccess access,
2758 G4_Operand* bti)
2759 {
2760 auto msgDesc = new (mem) G4_SendDescRaw(sfid, desc, extDesc, src1Len, access, bti, true);
2761 return msgDesc;
2762 }
2763
// Creates the split send (G4_sends) instruction for an LSC message.
//
// The extended-descriptor operand is chosen from msgDesc's surface operand:
//   - surface is a register region: the extended descriptor is read from
//     a0.2 (builtinA0Dot2). When emitA0RegDef is true the instructions that
//     define a0.2 are emitted here first:
//       BTI:    shl tmp, surface, 24 ; add a0.2, tmp, exDesc
//       SS/BSS: mov a0.2, surface
//   - surface is an immediate (BSS/SS only): mov a0.2, imm, then use a0.2.
//   - otherwise: the extended descriptor is passed as an immediate.
// In the two a0.2 cases msgDesc's surface is replaced by the a0.2 source
// operand.
//
//   pred, dst, src0, src1, execSize, option: forwarded to createSplitSendInst
//   msgDesc:      descriptor of the message; its surface may be updated
//   addrType:     LSC address type; selects how a0.2 is computed
//   emitA0RegDef: when false, the caller has already defined a0.2
// Returns the instruction produced by createSplitSendInst.
//
// NOTE(review): in the immediate-surface branch with an address type other
// than BSS/SS only an assert fires; with asserts compiled out exDescOpnd
// stays nullptr and is handed to createSplitSendInst -- confirm this is
// unreachable in release builds.
G4_InstSend *IR_Builder::createLscSendInst(
    G4_Predicate *pred,
    G4_DstRegRegion *dst,
    G4_SrcRegRegion *src0,
    G4_SrcRegRegion *src1,
    G4_ExecSize execSize,
    G4_SendDescRaw *msgDesc,
    G4_InstOpts option,
    LSC_ADDR_TYPE addrType,
    bool emitA0RegDef)
{
    uint32_t exDesc = msgDesc->getExtendedDesc();
    G4_Operand *surface = msgDesc->getSurface(); // BTI or SS/BSS
    G4_Operand *exDescOpnd = nullptr;

    if (surface && surface->isSrcRegRegion()) {
        if (emitA0RegDef)
        {
            // This path is taken when the caller hasn't defined the a0.2 register
            // for use as the ext msg descriptor of lsc. Currently, the spill module
            // defines a0.2 once per BB and reuses it in all spill msgs for that BB.
            // Without this check, each spill/fill msg would get its own computation
            // of a0.2, which is wasteful.
            if (addrType == LSC_ADDR_TYPE_BTI) {
                // .declare shifted_bti v_type=T num_elts=1
                // ...
                // (surface is the BTI)
                // shl tmp surface 24
                G4_Declare* tmpDecl = createTempVar(1, Type_UD, Any);
                G4_DstRegRegion* tmpDst = createDstRegRegion(tmpDecl, 1);
                createBinOp(G4_shl, g4::SIMD1, tmpDst, surface,
                    createImm(24, Type_UD), InstOpt_WriteEnable, true);
                auto tmpSrc = createSrcRegRegion(tmpDecl, getRegionScalar());
                // set src1.length into exDesc. BTI message is required to be on ExBSO=0
                // mode, so the src.length is part of exDesc.
                // The low 11 bits of exDesc are cleared and the extended message
                // length is placed starting at bit 6.
                exDesc = (exDesc & (~0x7FF)) | (msgDesc->extMessageLength() << 6);
                G4_DstRegRegion* addrDstOpnd = createDstRegRegion(builtinA0Dot2, 1);
                // add a0.2 tmpSrc exdesc
                createBinOp(G4_add, g4::SIMD1, addrDstOpnd, tmpSrc,
                    createImm(exDesc, Type_UD), InstOpt_WriteEnable, true);
            }
            else {
                // SS or BSS
                G4_DstRegRegion* addrDstOpnd = createDstRegRegion(builtinA0Dot2, 1);
                if ((addrType == LSC_ADDR_TYPE_BSS) || (addrType == LSC_ADDR_TYPE_SS))
                {
                    // mov a0.2 surface
                    createMov(g4::SIMD1, addrDstOpnd, surface, InstOpt_WriteEnable, true);
                }
                else
                {
                    // A register surface with any other address type is a caller error.
                    assert(false && "FLAT have surface == nullptr here");
                }
            }
        }

        // Whether a0.2 was defined above or by the caller, the send reads its
        // extended descriptor from a0.2.
        exDescOpnd = createSrcRegRegion(builtinA0Dot2, getRegionScalar());
        msgDesc->setSurface(exDescOpnd); // link a0.2 to the send descriptor
    } else if (surface && surface->isImm()) {
        // If by some chance the surface is an immediate value that didn't fold
        // to ExDesc (c.f. lscTryPromoteSurfaceImmToExDesc),
        // we can still possibly move it to a0.2 and use it that way.
        // This enables us to access the full ExDesc[31:5] rather than
        // ExDesc[31:12] (the send instruction lacks room to encode [11:6]).
        // This can happen for BSS/SS, for example, with a small
        // surface state offset.
        //
        // Callers that fold the ExDesc value into an immediate descriptor
        // should pass nullptr as the surface.
        G4_DstRegRegion* addrDstOpnd = createDstRegRegion(builtinA0Dot2, 1);
        if (addrType == LSC_ADDR_TYPE_BSS || addrType == LSC_ADDR_TYPE_SS) {
            // mov a0.2 SurfaceAddrImm
            auto imm = surface->asImm()->getImm();
            // The immediate must have its low 5 bits and its upper 32 bits clear.
            assert(
                (imm & 0x1F) == 0 &&
                (imm & 0xFFFFFFFF00000000LL) == 0 && "ExDesc can only access [31:5]");
            createMov(g4::SIMD1, addrDstOpnd,
                createImm(imm, Type_UD), InstOpt_WriteEnable, true);

            exDescOpnd = createSrcRegRegion(builtinA0Dot2, getRegionScalar());
            msgDesc->setSurface(exDescOpnd); // link a0.2 to the send descriptor
        }
        else
        {
            // BTI is in ExDesc[31:24] and that is always available.
            assert(false && "BTI/FLAT should not reach this. "
                "FLAT should have surface == nullptr and"
                "BTI should either use a register for a variable BTI or have "
                "folded the immediate vlaue into ExDesc"
                " (and thus surface==nullptr here)");
        }
    } else {
        // No surface operand: the extended descriptor is a plain immediate.
        exDescOpnd = createImm(exDesc, Type_UD);
    }

    return createSplitSendInst(
        pred, G4_sends, execSize, dst, src0, src1,
        createImm(msgDesc->getDesc(), Type_UD),
        option, msgDesc, exDescOpnd, true);
}
2864
2865 //Using r0.8:ud to save and restore a0.2
getScratchSurfaceStatusIndex()2866 G4_SrcRegRegion* IR_Builder::getScratchSurfaceStatusIndex()
2867 {
2868 auto dst = createDst(builtinR0->getRegVar(), 0, 8, 1, Type_UD);
2869 auto src0 = createSrcRegRegion(builtinA0Dot2, getRegionScalar());
2870 createMov(g4::SIMD1, dst, src0, InstOpt_WriteEnable, true);
2871
2872 G4_SrcRegRegion* R0_5 = createSrc(builtinR0->getRegVar(), 0, 5, getRegionScalar(), Type_UD);
2873 G4_DstRegRegion* A02Dst = createDstRegRegion(builtinA0Dot2, 1);
2874 createMov(g4::SIMD1, A02Dst, R0_5, InstOpt_WriteEnable, true);
2875 return createSrcRegRegion(builtinA0Dot2, getRegionScalar());
2876 }
2877
RestoreA0()2878 void IR_Builder::RestoreA0()
2879 {
2880 auto dst = createDstRegRegion(builtinA0Dot2, 1);
2881 auto src0 = createSrc(builtinR0->getRegVar(), 0, 8, getRegionStride1(), Type_UD);
2882 createMov(g4::SIMD1, dst, src0, InstOpt_WriteEnable, true);
2883 }
2884
createLscSendInstToScratch(G4_Predicate * pred,G4_DstRegRegion * dst,G4_SrcRegRegion * src0,G4_SrcRegRegion * src1,G4_ExecSize execSize,G4_SendDescRaw * msgDesc,G4_InstOpts options,bool usesBti)2885 G4_InstSend *IR_Builder::createLscSendInstToScratch(
2886 G4_Predicate *pred,
2887 G4_DstRegRegion *dst,
2888 G4_SrcRegRegion *src0,
2889 G4_SrcRegRegion *src1,
2890 G4_ExecSize execSize,
2891 G4_SendDescRaw *msgDesc,
2892 G4_InstOpts options,
2893 bool usesBti)
2894 {
2895 uint32_t desc = msgDesc->getDesc();
2896 G4_Operand *surface = msgDesc->getSurface(); // BTI or SS/BSS
2897 G4_Operand *exDescOpnd = nullptr;
2898
2899 if (isScratchSpace(surface))
2900 {
2901 desc = (desc & ~0xFF) | 251;
2902 }
2903 exDescOpnd = getScratchSurfaceStatusIndex();
2904
2905 G4_InstSend* inst = createSplitSendInst(
2906 pred, G4_sends, execSize, dst, src0, src1,
2907 createImm(desc, Type_UD),
2908 options, msgDesc, exDescOpnd, true);
2909 RestoreA0();
2910
2911 return inst;
2912 }
2913
2914 // for reder target messages,
2915 // desc has a constant BTI value (i.e., no bindless) and no STI
2916 // extDesc may be indirect (MRT and other bits) and is passed in
createSplitSendToRenderTarget(G4_Predicate * pred,G4_DstRegRegion * dst,G4_SrcRegRegion * src1,G4_SrcRegRegion * src2,G4_SrcRegRegion * extDescOpnd,G4_ExecSize execSize,G4_SendDescRaw * msgDesc,G4_InstOpts option)2917 G4_InstSend *IR_Builder::createSplitSendToRenderTarget(
2918 G4_Predicate *pred,
2919 G4_DstRegRegion *dst,
2920 G4_SrcRegRegion *src1,
2921 G4_SrcRegRegion *src2,
2922 G4_SrcRegRegion *extDescOpnd,
2923 G4_ExecSize execSize,
2924 G4_SendDescRaw *msgDesc,
2925 G4_InstOpts option)
2926 {
2927 G4_opcode send_opcode = G4_sendsc;
2928
2929 fixSendDstType(dst, execSize);
2930
2931 uint32_t desc = msgDesc->getDesc();
2932 G4_Operand* descOpnd = nullptr;
2933 G4_Operand *bti = msgDesc->getSurface();
2934
2935 if (bti && bti->isSrcRegRegion())
2936 {
2937 //add (1) a0.0:ud bti:ud desc:ud
2938 G4_DstRegRegion* addrDstOpnd = createDstRegRegion(builtinA0, 1);
2939 createBinOp(G4_add, g4::SIMD1, addrDstOpnd, bti,
2940 createImm(desc, Type_UD), InstOpt_WriteEnable, true);
2941 descOpnd = createSrcRegRegion(builtinA0, getRegionScalar());
2942 }
2943 else
2944 {
2945 descOpnd = createImm(desc, Type_UD);
2946 }
2947
2948 return createSplitSendInst(pred, send_opcode, execSize,
2949 dst, src1, src2, descOpnd,
2950 option, msgDesc, extDescOpnd, true);
2951 }
2952
2953 // create a declare for send payload
createSendPayloadDcl(unsigned num_elt,G4_Type type)2954 G4_Declare* IR_Builder::createSendPayloadDcl(unsigned num_elt, G4_Type type)
2955 {
2956 const char* name = getNameString(mem, 16, "M%u", ++num_temp_dcl);
2957 const uint16_t sizeOfType = TypeSize(type);
2958 unsigned short numRow = (num_elt * sizeOfType - 1) / numEltPerGRF<Type_UB>() + 1;
2959 unsigned short numElt = (numRow == 1) ? num_elt : (numEltPerGRF<Type_UB>()/sizeOfType);
2960 G4_Declare *dcl = createDeclareNoLookup(
2961 name,
2962 G4_GRF,
2963 numElt,
2964 numRow,
2965 type);
2966 return dcl;
2967 }
2968
createMovR0Inst(G4_Declare * dcl,short regOff,short subregOff,bool use_nomask,G4_InstOpts options)2969 void IR_Builder::createMovR0Inst(G4_Declare* dcl, short regOff, short subregOff, bool use_nomask, G4_InstOpts options)
2970 {
2971 G4_DstRegRegion* dst1_opnd = createDst(
2972 dcl->getRegVar(),
2973 regOff,
2974 subregOff,
2975 1,
2976 dcl->getElemType());
2977
2978 // create r0 src
2979 G4_SrcRegRegion* r0_src_opnd = createSrcRegRegion(builtinR0, getRegionStride1());
2980 // create inst
2981 createMov(
2982 G4_ExecSize(GENX_DATAPORT_IO_SZ),
2983 dst1_opnd,
2984 r0_src_opnd,
2985 (use_nomask ? InstOpt_WriteEnable | options : options),
2986 true);
2987 }
2988
createAddInst(G4_Declare * dcl,short regOff,short subregOff,G4_ExecSize execsize,G4_Predicate * pred,G4_CondMod * condMod,G4_Operand * src0_opnd,G4_Operand * src1_opnd,G4_InstOption options)2989 void IR_Builder::createAddInst(
2990 G4_Declare* dcl, short regOff, short subregOff, G4_ExecSize execsize,
2991 G4_Predicate* pred, G4_CondMod* condMod,
2992 G4_Operand* src0_opnd, G4_Operand* src1_opnd, G4_InstOption options)
2993 {
2994 auto dst = createDst(dcl->getRegVar(), regOff, subregOff, 1, dcl->getElemType());
2995
2996 if (src0_opnd->isImm() && src0_opnd->asImm()->isZero())
2997 {
2998 createInst(pred, G4_mov, condMod, g4::NOSAT, execsize, dst, src1_opnd, NULL, options, true);
2999 }
3000 else if (src1_opnd->isImm() && src1_opnd->asImm()->isZero())
3001 {
3002 createInst(pred, G4_mov, condMod, g4::NOSAT, execsize, dst, src0_opnd, NULL, options, true);
3003 }
3004 else if (src0_opnd->isImm() && !src1_opnd->isImm())
3005 {
3006 createInst(pred, G4_add, condMod, g4::NOSAT, execsize, dst, src1_opnd, src0_opnd, options, true);
3007 }
3008 else
3009 {
3010 createInst(pred, G4_add, condMod, g4::NOSAT, execsize, dst, src0_opnd, src1_opnd, options, true);
3011 }
3012 }
3013
3014 // Currently this function is mostly used in dataport intrinsic translation functions.
3015 // If it is used in some other places, Qtrctrl should be added in options if needed.
createMovInst(G4_Declare * dcl,short regOff,short subregOff,G4_ExecSize execSize,G4_Predicate * pred,G4_CondMod * condMod,G4_Operand * src_opnd,bool use_nomask,G4_InstOpts options)3016 void IR_Builder::createMovInst(
3017 G4_Declare* dcl,
3018 short regOff,
3019 short subregOff,
3020 G4_ExecSize execSize,
3021 G4_Predicate* pred,
3022 G4_CondMod* condMod,
3023 G4_Operand* src_opnd,
3024 bool use_nomask,
3025 G4_InstOpts options)
3026 {
3027 G4_DstRegRegion* dst2_opnd = createDst(
3028 dcl->getRegVar(),
3029 regOff,
3030 subregOff,
3031 1,
3032 dcl->getElemType());
3033
3034 createInst(
3035 pred,
3036 G4_mov,
3037 condMod,
3038 g4::NOSAT,
3039 execSize,
3040 dst2_opnd,
3041 src_opnd,
3042 NULL,
3043 use_nomask ? (InstOpt_WriteEnable | options) : options,
3044 true);
3045 }
3046
3047 // send payload preparation.
3048 // dcl: decl for send payload
3049 // num_dword: number of DW to send
3050 // src_opnd: send src, its size may be several GRFs
createMovSendSrcInst(G4_Declare * dcl,short regoff,short subregoff,unsigned num_dword,G4_Operand * src_opnd,G4_InstOpts options)3051 void IR_Builder::createMovSendSrcInst(
3052 G4_Declare* dcl,
3053 short regoff,
3054 short subregoff,
3055 unsigned num_dword,
3056 G4_Operand* src_opnd,
3057 G4_InstOpts options)
3058 {
3059 // since src_opnd is raw source in CISA, it is aligned to GRF, so there is no subRegOff.
3060 unsigned remained_dword = num_dword;
3061 // if data type of src_opnd is not UD, change it to UD
3062 // assumption: size of src_opnd is multiple of UD
3063 short dst_regoff = regoff, dst_subregoff = subregoff;
3064 G4_ExecSize execsize = g4::SIMD1;
3065 G4_DstRegRegion* dst = NULL;
3066 //G4_SrcRegRegion* src = NULL;
3067 G4_Operand* src = NULL;
3068 const RegionDesc *rd = NULL;
3069 G4_Declare *dst_dcl = dcl;
3070 short src_regoff = 0, src_subregoff = 0;
3071 bool non_ud_scalar = false;
3072 bool scalar_src = (src_opnd->isImm() || num_dword == 1);
3073
3074 if (scalar_src && src_opnd->getType() != Type_UD) {
3075 // change the type of dst dcl to src type
3076 remained_dword = num_dword * (TypeSize(Type_UD)/src_opnd->getTypeSize());
3077 dst_dcl = createSendPayloadDcl(remained_dword, src_opnd->getType());
3078 dst_dcl->setAliasDeclare(dcl, regoff * numEltPerGRF<Type_UB>() + subregoff * TypeSize(Type_UD));
3079 dst_regoff = 0;
3080 dst_subregoff = 0;
3081 non_ud_scalar = true;
3082 }
3083
3084 src_regoff = src_opnd->asSrcRegRegion()->getRegOff();
3085 src_subregoff = src_opnd->asSrcRegRegion()->getSubRegOff();
3086 src_subregoff = src_subregoff * src_opnd->getTypeSize() / dst_dcl->getElemSize();
3087
3088 auto getMaxEsize = [](uint32_t opt)
3089 {
3090 unsigned maskOption = (opt & InstOpt_QuarterMasks);
3091 switch (maskOption)
3092 {
3093 case InstOpt_M4:
3094 case InstOpt_M12:
3095 case InstOpt_M20:
3096 case InstOpt_M28:
3097 return 4;
3098 case InstOpt_M8:
3099 case InstOpt_M24:
3100 return 8;
3101 case InstOpt_M16:
3102 return 16;
3103 default:
3104 return 32;
3105 }
3106 };
3107 G4_ExecSize maxEsize(getMaxEsize(options));
3108
3109 // here remained_dword is not the number of DW, but the number of dst data type.
3110 while (remained_dword)
3111 {
3112 if (non_ud_scalar && src_opnd->getTypeSize() != TypeSize(Type_UD))
3113 {
3114 if (remained_dword >= 32)
3115 {
3116 execsize = g4::SIMD32;
3117 }
3118 else if (remained_dword >= 16)
3119 {
3120 execsize = g4::SIMD16;
3121 }
3122 else
3123 {
3124 execsize = G4_ExecSize((uint8_t)Round_Down_Pow2(remained_dword));
3125 }
3126
3127 execsize = (execsize > maxEsize) ? maxEsize : execsize;
3128 if (execsize == g4::SIMD1)
3129 {
3130 rd = getRegionScalar();
3131 }
3132 else
3133 {
3134 rd = getRegionStride1();
3135 }
3136 }
3137 else
3138 {
3139 if (remained_dword >= 16)
3140 {
3141 execsize = g4::SIMD16;
3142 }
3143 else if (remained_dword >= 8)
3144 {
3145 execsize = g4::SIMD8;
3146 }
3147 else
3148 {
3149 execsize = G4_ExecSize(Round_Down_Pow2(remained_dword));
3150 }
3151 execsize = (execsize > maxEsize) ? maxEsize : execsize;
3152 if (execsize == g4::SIMD1)
3153 {
3154 rd = getRegionScalar();
3155 }
3156 else
3157 {
3158 rd = getRegionStride1();
3159 }
3160 }
3161
3162 dst = createDst(
3163 dst_dcl->getRegVar(),
3164 dst_regoff,
3165 dst_subregoff,
3166 1,
3167 dst_dcl->getElemType());
3168
3169 if (scalar_src && src_opnd->isImm())
3170 {
3171 src = src_opnd->asImm();
3172 }
3173 else
3174 {
3175 src = createSrc(
3176 src_opnd->asSrcRegRegion()->getBase(),
3177 src_regoff,
3178 src_subregoff,
3179 rd,
3180 dst_dcl->getElemType());
3181 }
3182
3183 createMov(
3184 execsize,
3185 dst,
3186 src,
3187 options,
3188 true);
3189
3190 // update offset in decl
3191 if (remained_dword >= execsize) {
3192 remained_dword -= execsize;
3193 if (execsize * dst_dcl->getElemSize() == 2 * numEltPerGRF<Type_UB>()) {
3194 dst_regoff += 2;
3195 if (!scalar_src) {
3196 src_regoff += 2;
3197 }
3198 }
3199 else if (execsize * dst_dcl->getElemSize() == numEltPerGRF<Type_UB>()) {
3200 dst_regoff += 1;
3201 if (!scalar_src) {
3202 src_regoff += 1;
3203 }
3204 }
3205 else {
3206 dst_subregoff += execsize;
3207 if (dst_subregoff > ((int)numEltPerGRF<Type_UB>() / dst_dcl->getElemSize())) {
3208 dst_regoff++;
3209 dst_subregoff -= numEltPerGRF<Type_UB>() / dst_dcl->getElemSize();
3210 }
3211 if (!scalar_src) {
3212 src_subregoff += execsize;
3213 if (src_subregoff > (short)(numEltPerGRF<Type_UB>() / TypeSize(Type_UD))) {
3214 src_regoff++;
3215 src_subregoff -= numEltPerGRF<Type_UB>() / TypeSize(Type_UD);
3216 }
3217 }
3218 }
3219 }
3220 }
3221 }
3222 // create an opnd without regpoff and subregoff
createDstRegRegion(G4_Declare * dcl,unsigned short hstride)3223 G4_DstRegRegion* IR_Builder::createDstRegRegion(
3224 G4_Declare* dcl, unsigned short hstride)
3225 {
3226 return createDst(
3227 dcl->getRegVar(),
3228 0,
3229 0,
3230 hstride,
3231 dcl->getElemType());
3232 }
3233 // create an opnd without regpoff and subregoff
createSrcRegRegion(G4_Declare * dcl,const RegionDesc * rd)3234 G4_SrcRegRegion* IR_Builder::createSrcRegRegion(
3235 G4_Declare* dcl, const RegionDesc* rd)
3236 {
3237 return createSrcRegRegion(
3238 Mod_src_undef,
3239 Direct,
3240 dcl->getRegVar(),
3241 0,
3242 0,
3243 rd,
3244 dcl->getElemType());
3245 }
3246
createNullDst(G4_Type dstType)3247 G4_DstRegRegion* IR_Builder::createNullDst(G4_Type dstType)
3248 {
3249 return createDst(
3250 phyregpool.getNullReg(),
3251 0,
3252 0,
3253 1,
3254 dstType);
3255 }
3256
createNullSrc(G4_Type srcType)3257 G4_SrcRegRegion* IR_Builder::createNullSrc(G4_Type srcType)
3258 {
3259 return createSrcRegRegion(Mod_src_undef,
3260 Direct,
3261 phyregpool.getNullReg(),
3262 0,
3263 0,
3264 getRegionScalar(),
3265 srcType);
3266 }
3267
3268 // check if the dst opnd align to GRF.
3269 // if it is not aligned to GRF
3270 // 1. change align of var dcl to GRF if the dst size is smaller than GRF size,
3271 // no alias or alias offset is 0.
3272 // 2. otherwise, create a temp operand and return it.
checkSendDst(G4_DstRegRegion * dst_opnd)3273 G4_DstRegRegion* IR_Builder::checkSendDst(G4_DstRegRegion *dst_opnd)
3274 {
3275 //FIXME: This function seems to be bogus
3276 G4_DstRegRegion* d;
3277 // check if dst is align to GRF
3278
3279 const unsigned short SIZEOF_DW = 4;
3280 if (dst_opnd->getTypeSize() > 1)
3281 {
3282 d = dst_opnd;
3283 }
3284 else
3285 {
3286 // change type of dcl and offset in it
3287 short new_SubRegOff = dst_opnd->getSubRegOff();
3288 if (dst_opnd->getRegAccess() == Direct)
3289 {
3290 new_SubRegOff = dst_opnd->getSubRegOff() / SIZEOF_DW;
3291 }
3292 G4_DstRegRegion new_dst(
3293 dst_opnd->getRegAccess(),
3294 dst_opnd->getBase(),
3295 dst_opnd->getRegOff(),
3296 new_SubRegOff,
3297 1,
3298 Type_UD);
3299 d = createDstRegRegion(new_dst);
3300 }
3301
3302 return d;
3303 }
3304
addInputArg(input_info_t * inpt)3305 void IR_Builder::addInputArg(input_info_t * inpt)
3306 {
3307 m_inputVect.push_back(inpt);
3308 }
3309
getInputArg(unsigned int index) const3310 input_info_t * IR_Builder::getInputArg(unsigned int index) const
3311 {
3312 return m_inputVect[index];
3313 }
3314
getInputCount() const3315 unsigned int IR_Builder::getInputCount() const
3316 {
3317 return (uint32_t)m_inputVect.size();
3318 }
3319
getRetIPArg() const3320 input_info_t *IR_Builder::getRetIPArg() const {
3321 // TODO: So far, we assume the last argument of caller of callable kernel
3322 // or callable kernel is the RetIP argument. If required, extra attribute
3323 // will be added to specify which QWORD argument is used as RetIP argument
3324 // and the code will traverse all argument to find that one.
3325 input_info_t *RetIP = getInputArg(getInputCount() - 1);
3326 // More sanity check on the argument.
3327 ASSERT_USER(IS_QTYPE(RetIP->dcl->getElemType()), "RetIP needs to be QWORD!");
3328 ASSERT_USER(RetIP->dcl->getNumElems() == 1, "RetIP needs to be QWORD!");
3329 return RetIP;
3330 }
3331
vISAPredicateToG4Predicate(VISA_PREDICATE_CONTROL control,G4_ExecSize execSize)3332 G4_Predicate_Control IR_Builder::vISAPredicateToG4Predicate(
3333 VISA_PREDICATE_CONTROL control, G4_ExecSize execSize)
3334 {
3335 switch (control)
3336 {
3337 case PRED_CTRL_NON:
3338 return PRED_DEFAULT;
3339 case PRED_CTRL_ANY:
3340 {
3341 if (!predCtrlHasWidth())
3342 {
3343 return PRED_ANY_WHOLE;
3344 }
3345 switch (execSize)
3346 {
3347 case 1: return PRED_DEFAULT;
3348 case 2: return PRED_ANY2H;
3349 case 4: return PRED_ANY4H;
3350 case 8: return PRED_ANY8H;
3351 case 16: return PRED_ANY16H;
3352 case 32: return PRED_ANY32H;
3353 default:
3354 MUST_BE_TRUE(0, "Invalid predicate control group size.");
3355 return PRED_DEFAULT;
3356 }
3357 }
3358 case PRED_CTRL_ALL:
3359 {
3360 if (!predCtrlHasWidth())
3361 {
3362 return PRED_ALL_WHOLE;
3363 }
3364 switch (execSize)
3365 {
3366 case 1: return PRED_DEFAULT;
3367 case 2: return PRED_ALL2H;
3368 case 4: return PRED_ALL4H;
3369 case 8: return PRED_ALL8H;
3370 case 16: return PRED_ALL16H;
3371 case 32: return PRED_ALL32H;
3372 default:
3373 MUST_BE_TRUE(0, "Invalid predicate control group size.");
3374 return PRED_DEFAULT;
3375 }
3376 }
3377 default:
3378 MUST_BE_TRUE(0, "Invalid predicate control.");
3379 return PRED_DEFAULT;
3380 }
3381 }
3382
3383
3384 // helper function to fold BinOp with two immediate operands
3385 // supported opcodes are given below in doConsFolding
3386 // returns nullptr if the two constants may not be folded
foldConstVal(G4_Imm * const1,G4_Imm * const2,G4_opcode op)3387 G4_Imm* IR_Builder::foldConstVal(G4_Imm* const1, G4_Imm* const2, G4_opcode op)
3388 {
3389 bool isNonQInt = IS_TYPE_INT(const1->getType()) && IS_TYPE_INT(const2->getType()) &&
3390 !IS_QTYPE(const1->getType()) && !IS_QTYPE(const2->getType());
3391
3392 if (!isNonQInt)
3393 {
3394 return nullptr;
3395 }
3396
3397 G4_Type src0T = const1->getType(), src1T = const2->getType(), resultType = src0T;
3398
3399 if (op == G4_mul || op == G4_add || op == G4_and || op == G4_xor || op == G4_or)
3400 {
3401 resultType = findConstFoldCommonType(src0T, src1T);
3402 if (resultType == Type_UNDEF)
3403 {
3404 return nullptr;
3405 }
3406
3407 int64_t res;
3408 switch (op)
3409 {
3410 case G4_and:
3411 res = (int64_t)(const1->getInt()) & (int64_t)(const2->getInt());
3412 break;
3413
3414 case G4_xor:
3415 res = (int64_t)(const1->getInt()) ^ (int64_t)(const2->getInt());
3416 break;
3417
3418 case G4_or:
3419 res = (int64_t)(const1->getInt()) | (int64_t)(const2->getInt());
3420 break;
3421
3422 case G4_add:
3423 res = (int64_t)(const1->getInt()) + (int64_t)(const2->getInt());
3424 break;
3425
3426 case G4_mul:
3427 res = (int64_t)(const1->getInt()) * (int64_t)(const2->getInt());
3428 break;
3429
3430 default:
3431 return nullptr;
3432 }
3433
3434 // result type is either D or UD
3435 // don't fold if the value overflows D/UD
3436 if (!G4_Imm::isInTypeRange(res, resultType))
3437 {
3438 return nullptr;
3439 }
3440 return createImmWithLowerType(res, resultType);
3441 }
3442 else
3443 {
3444 uint32_t shift = const2->getInt() % 32;
3445
3446 if (op == G4_shl || op == G4_shr)
3447 {
3448 uint32_t value = (uint32_t)const1->getInt();
3449 // set result type to D/UD as it may overflow W. If the value fits the type will be lowered later
3450 // source type matters here since it affects sign extension
3451 resultType = IS_SIGNED_INT(resultType) ? Type_D : Type_UD;
3452 int64_t res = op == G4_shl ?
3453 ((int64_t)value) << shift :
3454 value >> shift;
3455 if (!G4_Imm::isInTypeRange(res, resultType))
3456 {
3457 return nullptr;
3458 }
3459
3460 return createImmWithLowerType(res, resultType);
3461 }
3462 else if (op == G4_asr)
3463 {
3464 if (IS_SIGNED_INT(resultType))
3465 {
3466 int64_t value = const1->getInt();
3467 int64_t res = value >> shift;
3468 return createImmWithLowerType(res, resultType);
3469 }
3470 else
3471 {
3472 uint64_t value = const1->getInt();
3473 uint64_t res = value >> shift;
3474 return createImmWithLowerType(res, resultType);
3475 }
3476 }
3477 }
3478 return nullptr;
3479 }
3480
3481
3482 // Currently constant folding is done for the following code patterns:
3483 //
3484 // - op v, imm, imm
3485 // where op is shl, shr, asr, or, xor, and, add, mul
3486 // Restrictions:
3487 // - operand type cannot be float or Q/UQ
3488 // - saturation is not allowed
doConsFolding(G4_INST * inst)3489 void IR_Builder::doConsFolding(G4_INST *inst)
3490 {
3491 if (inst->getSaturate())
3492 return; // TODO: we could do this if we wanted to bad enough
3493
3494 auto srcIsFoldableImm = [](const G4_Operand *op) {
3495 return op && op->isImm() && !op->isRelocImm();
3496 };
3497
3498 if (inst->getNumSrc() == 2) {
3499 G4_Operand *src0 = inst->getSrc(0);
3500 G4_Operand *src1 = inst->getSrc(1);
3501 if (srcIsFoldableImm(src0) && srcIsFoldableImm(src1)) {
3502 G4_Imm *foldedImm =
3503 foldConstVal(src0->asImm(), src1->asImm(), inst->opcode());
3504 if (foldedImm)
3505 {
3506 // change instruction into a MOV
3507 inst->setOpcode(G4_mov);
3508 inst->setSrc(foldedImm, 0);
3509 inst->setSrc(nullptr, 1);
3510 }
3511 }
3512 } else if (inst->getNumSrc() == 3) {
3513 G4_Operand *src0 = inst->getSrc(0);
3514 G4_Operand *src1 = inst->getSrc(1);
3515 G4_Operand *src2 = inst->getSrc(2);
3516 if (inst->opcode() == G4_add3) {
3517 // always fold the variable into src0
3518 G4_Imm *foldedImm = nullptr;
3519 G4_Operand *otherSrc = nullptr;
3520 if (srcIsFoldableImm(src0) && srcIsFoldableImm(src1)) {
3521 foldedImm = foldConstVal(src0->asImm(), src1->asImm(), G4_add);
3522 otherSrc = src2;
3523 } else if (srcIsFoldableImm(src0) && srcIsFoldableImm(src2)) {
3524 foldedImm = foldConstVal(src0->asImm(), src2->asImm(), G4_add);
3525 otherSrc = src1;
3526 } else if (srcIsFoldableImm(src1) && srcIsFoldableImm(src2)) {
3527 foldedImm = foldConstVal(src1->asImm(), src2->asImm(), G4_add);
3528 otherSrc = src0;
3529 }
3530 if (foldedImm) {
3531 // always put the possible register in src0
3532 inst->setOpcode(G4_add);
3533 if (otherSrc != src0) {
3534 inst->setSrc(otherSrc, 0);
3535 inst->swapDefUse(
3536 Opnd_src0,
3537 otherSrc == src1 ? Opnd_src1 : Opnd_src2);
3538 }
3539 inst->setSrc(foldedImm, 1);
3540 inst->setSrc(nullptr, 2);
3541 // recurse for possible fold again
3542 doConsFolding(inst);
3543 }
3544 } // TODO: integer mad, bfn, bfi, bfe
3545 }
3546 }
// Simplifies inst in place. Only mul, and, add, shl, shr, asr and mov are
// considered; any other opcode is left untouched.
//
// Do the following algebraic simplification (the instruction becomes a mov):
//  - mul v, src0, 0 ==> 0, commutative
//  - and v, src0, 0 ==> 0, commutative
//  - mul v, src0, 1 ==> src0, commutative
//  - shl v, src0, 0 ==> src0
//  - shr v, src0, 0 ==> src0
//  - asr v, src0, 0 ==> src0
//  - add v, src0, 0 ==> src0, commutative
// The constants above must be non-relocatable integer immediates.
//
// In addition, a qualifying indirect SIMD8 'mov' is converted to 'movi'
// (see the conditions inside the function).
void IR_Builder::doSimplification(G4_INST *inst)
{
    // Just handle following commonly used ops for now.
    if (inst->opcode() != G4_mul && inst->opcode() != G4_and &&
        inst->opcode() != G4_add && inst->opcode() != G4_shl &&
        inst->opcode() != G4_shr && inst->opcode() != G4_asr &&
        inst->opcode() != G4_mov)
    {
        return;
    }


    // Perform 'mov' to 'movi' transform when it's a 'mov' of
    // - simd8
    // - it's a raw mov
    // - dst is within a single GRF.
    // - src uses VxH indirect access.
    // - src is within one GRF.
    // - indices to src are all within src.
    // - destination stride in bytes must be equal to the source element size in bytes.
    bool canConvertMovToMovi = inst->opcode() == G4_mov && inst->getExecSize() == g4::SIMD8 &&
        inst->isRawMov() && inst->getDst() &&
        !inst->getDst()->asDstRegRegion()->isCrossGRFDst() &&
        inst->getSrc(0) && inst->getSrc(0)->isSrcRegRegion() &&
        inst->getSrc(0)->asSrcRegRegion()->isIndirect() &&
        inst->getSrc(0)->asSrcRegRegion()->getRegion()->isRegionWH() &&
        inst->getSrc(0)->asSrcRegRegion()->getRegion()->width == 1 &&
        inst->getSrc(0)->getTypeSize() == inst->getDst()->getTypeSize() * inst->getDst()->asDstRegRegion()->getHorzStride();
    if (canConvertMovToMovi)
    {
        // Convert 'mov' to 'movi' if the following conditions are met.

        // Returns the unique instruction that defines operand OpndNum of UI,
        // or nullptr when there is no definition or more than one.
        auto getSingleDefInst = [](G4_INST *UI,
            Gen4_Operand_Number OpndNum)
            -> G4_INST * {
            G4_INST *Def = nullptr;
            for (auto I = UI->def_begin(), E = UI->def_end(); I != E; ++I) {
                if (I->second != OpndNum)
                    continue;
                if (Def) {
                    // Not single defined, bail out
                    Def = nullptr;
                    break;
                }
                Def = I->first;
            }
            return Def;
        };

        // The source must cover exactly half a GRF or one full GRF.
        unsigned SrcSizeInBytes =
            inst->getExecSize() * inst->getSrc(0)->getTypeSize();
        if (SrcSizeInBytes == numEltPerGRF<Type_UB>()/2 ||
            SrcSizeInBytes == numEltPerGRF<Type_UB>())
        {
            // The address must come from a single 'add' with the same execution
            // size whose src0 is an address expression and whose src1 is a UV
            // immediate.
            G4_INST *LEA = getSingleDefInst(inst, Opnd_src0);
            if (LEA && LEA->opcode() == G4_add &&
                LEA->getExecSize() == inst->getExecSize()) {
                G4_Operand *Op0 = LEA->getSrc(0);
                G4_Operand *Op1 = LEA->getSrc(1);
                G4_Declare *Dcl = nullptr;
                int Offset = 0;
                if (Op0->isAddrExp()) {
                    G4_AddrExp *AE = Op0->asAddrExp();
                    Dcl = AE->getRegVar()->getDeclare();
                    Offset = AE->getOffset();
                }
                if (Dcl && (Offset % SrcSizeInBytes) == 0 &&
                    Op1->isImm() && Op1->getType() == Type_UV) {
                    // Immediates in 'uv' ensure each element is a
                    // byte-offset within half-GRF.
                    G4_SubReg_Align SubAlign = GRFALIGN;
                    if (SrcSizeInBytes <= numEltPerGRF<Type_UB>()/2u)
                        SubAlign = (G4_SubReg_Align)(numEltPerGRF<Type_UW>()/2);
                    inst->setOpcode(G4_movi);
                    // Raise the alignment of the addressed variable unless it is
                    // already even- or GRF-aligned.
                    if (!Dcl->isEvenAlign() && Dcl->getSubRegAlign() != GRFALIGN)
                    {
                        Dcl->setSubRegAlign(SubAlign);
                    }
                    const RegionDesc *rd = getRegionStride1();
                    inst->getSrc(0)->asSrcRegRegion()->setRegion(rd);
                    // Set subreg alignment for the address variable.
                    Dcl =
                        LEA->getDst()->getBase()->asRegVar()->getDeclare();
                    assert(Dcl->getRegFile() == G4_ADDRESS &&
                        "Address variable is required.");
                    Dcl->setSubRegAlign(Eight_Word);
                }
            }
        }
    }

    // True when opnd is a non-relocatable integer immediate equal to val.
    auto isInteger = [](G4_Operand *opnd, int64_t val)
    {
        if (opnd && IS_TYPE_INT(opnd->getType()) && !opnd->isRelocImm())
        {
            return opnd->isImm() && opnd->asImm()->getInt() == val;
        }
        return false;
    };

    G4_Operand *src0 = inst->getSrc(0);
    G4_Operand *src1 = inst->getSrc(1);
    // The operand the instruction reduces to, if any simplification applies.
    G4_Operand *newSrc = nullptr;
    if (inst->opcode() == G4_mul || inst->opcode() == G4_and)
    {
        if (isInteger(src1, 0))
        {
            // x * 0 / x & 0 ==> 0; src0 is no longer used
            inst->removeDefUse(Opnd_src0);
            newSrc = createImm(0, Type_W);
        }
        else if (isInteger(src0, 0))
        {
            // 0 * x / 0 & x ==> 0; src1 is no longer used
            inst->removeDefUse(Opnd_src1);
            newSrc = createImm(0, Type_W);
        }
        else if (inst->opcode() == G4_mul)
        {
            if (isInteger(src1, 1))
            {
                newSrc = src0;
            }
            else if (isInteger(src0, 1))
            {
                // src1 becomes src0 of the resulting mov
                inst->swapDefUse();
                newSrc = src1;
            }
        }
    }
    else if (inst->opcode() == G4_shl || inst->opcode() == G4_shr ||
        inst->opcode() == G4_asr || inst->opcode() == G4_add)
    {
        if (isInteger(src1, 0))
        {
            newSrc = src0;
        }
        else if (inst->opcode() == G4_add && isInteger(src0, 0))
        {
            // src1 becomes src0 of the resulting mov
            inst->swapDefUse();
            newSrc = src1;
        }
    }

    if (newSrc != nullptr)
    {
        // Rewrite the instruction as "mov dst, newSrc".
        inst->setOpcode(G4_mov);
        if (newSrc != src0)
        {
            inst->setSrc(newSrc, 0);
        }
        inst->setSrc(nullptr, 1);
    }
}
3707
3708 // find a common (integer) type for constant folding. The rules are:
3709 // -- both types must be int
3710 // -- Q and UQ are not folded
3711 // -- UD if one of the type is UD
3712 // -- D otherwise
3713 //
3714 // returns Type_UNDEF if no appropriate type can be found
3715 //
findConstFoldCommonType(G4_Type type1,G4_Type type2)3716 G4_Type IR_Builder::findConstFoldCommonType(G4_Type type1, G4_Type type2)
3717 {
3718 if (IS_TYPE_INT(type1) && IS_TYPE_INT(type2))
3719 {
3720 if (TypeSize(type1) == 8 || TypeSize(type2) == 8)
3721 {
3722 return Type_UNDEF;
3723 }
3724 if (type1 == Type_UD || type2 == Type_UD)
3725 {
3726 return Type_UD;
3727 }
3728 else
3729 {
3730 return Type_D;
3731 }
3732 }
3733 return Type_UNDEF;
3734 }
3735