1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include <string>
10 #include <iostream>
11 #include <sstream>
12 #include <fstream>
13 #include <list>
14 
15 #include "visa_igc_common_header.h"
16 #include "Common_ISA_util.h"
17 #include "Common_ISA_framework.h"
18 #include "JitterDataStruct.h"
19 #include "BuildIR.h"
20 #include "common.h"
21 #include "Timer.h"
22 
23 using namespace vISA;
24 
~DeclarePool()25 DeclarePool::~DeclarePool()
26 {
27     for (unsigned i = 0, size = (unsigned)dcllist.size(); i < size; i++) {
28         G4_Declare* dcl = dcllist[i];
29         dcl->~G4_Declare();
30     }
31     dcllist.clear();
32 }
33 
// Create a G4_Declare of <nElems> x <nRows> elements of type <ty> from this
// pool's memory manager, together with the G4_RegVar variant selected by
// <kind>:
//   Regular                       -> G4_RegVar (default)
//   AddrSpill                     -> G4_RegVarAddrSpillLoc
//   Tmp                           -> G4_RegVarTmp tied to <base>
//   Spill / Fill                  -> G4_RegVarTransient tied to <base> and
//                                    the dst/src region <repRegion>
//   CoalescedFill/CoalescedSpill  -> G4_RegVarCoalesced
// <execSize> is only consumed by the Spill/Fill transient variants.
// The declare's default sub-register alignment is derived from its register
// file and total byte size before it is returned.
G4_Declare* DeclarePool::createDeclare(
    const char*    name,
    G4_RegFileKind regFile,
    unsigned short nElems,
    unsigned short nRows,
    G4_Type        ty,
    DeclareType    kind,
    G4_RegVar *    base,
    G4_Operand *   repRegion,
    G4_ExecSize    execSize)
{
    G4_Declare* dcl = new (mem) G4_Declare(name, regFile, nElems * nRows, ty, dcllist);
    G4_RegVar * regVar;
    if (kind == DeclareType::Regular)
        regVar = new (mem) G4_RegVar(dcl, G4_RegVar::RegVarType::Default);
    else if (kind == DeclareType::AddrSpill)
        regVar = new (mem) G4_RegVarAddrSpillLoc(dcl, addrSpillLocCount);
    else if (kind == DeclareType::Tmp)
        regVar = new (mem) G4_RegVarTmp(dcl, base);
    else if (kind == DeclareType::Spill)
        regVar = new (mem) G4_RegVarTransient(dcl, base, repRegion->asDstRegRegion(), execSize, G4_RegVarTransient::TransientType::Spill);
    else if (kind == DeclareType::Fill)
        regVar = new (mem)G4_RegVarTransient(dcl, base, repRegion->asSrcRegRegion(), execSize, G4_RegVarTransient::TransientType::Fill);
    else if (kind == DeclareType::CoalescedFill || kind == DeclareType::CoalescedSpill)
        regVar = new (mem)G4_RegVarCoalesced(dcl, kind == DeclareType::CoalescedFill);
    else
    {
        // Unknown DeclareType: assert in debug; regVar left null.
        MUST_BE_TRUE(false, ERROR_INTERNAL_ARGUMENT);
        regVar = NULL;
    }
    dcl->setRegVar(regVar);

    if (regFile == G4_ADDRESS || regFile == G4_SCALAR)
    {
        // Address/scalar declares carry no sub-register alignment constraint.
        dcl->setSubRegAlign(Any);
    }
    else if (regFile != G4_FLAG)
    {
        // GRF-sized (or larger) declares start on a GRF boundary.
        if ((unsigned int)nElems * nRows * TypeSize(ty) >= numEltPerGRF<Type_UB>())
        {
            dcl->setSubRegAlign(GRFALIGN);
        }
        else
        {
            // at a minimum subRegAlign has to be at least the type size
            dcl->setSubRegAlign(Get_G4_SubRegAlign_From_Type(ty));
        }
    }
    else
    {
        // 32-bit flag declares must start on an even word.
        if (dcl->getNumberFlagElements() == 32)
        {
            dcl->setSubRegAlign(Even_Word);
        }
    }

    return dcl;
}
92 
93 
addImmVal(G4_Imm * imm,int numElt)94 G4_Declare * IR_Builder::GlobalImmPool::addImmVal(G4_Imm* imm, int numElt)
95 {
96     ImmVal val = { imm, numElt };
97     for (int i = 0; i < curSize; ++i)
98     {
99         if (val == immArray[i])
100         {
101             return dclArray[i];
102         }
103     }
104     if (curSize == MAX_POOL_SIZE)
105     {
106         return nullptr;
107     }
108     immArray[curSize] = val;
109     dclArray[curSize] = builder.createTempVar(numElt, imm->getType(), Any);
110     return dclArray[curSize++];
111 }
112 
113 
114 ///////////////////////////////////////////////////////////////////////////////
115 // IR_Builder functions (except translateXXXX, which should be in VisaToG4)
116 //
117 
dump(std::ostream & os)118 void IR_Builder::dump(std::ostream &os)
119 {
120     os << "DECLARES:\n";
121     for (const G4_Declare *dcl : kernel.Declares) {
122         dcl->emit(os);
123         os  << "\n";
124     }
125     os << "\n";
126     os << "INSTS:\n";
127     for (G4_INST *i : instList) {
128         i->emit(os, false, false);
129         os << "\n";
130     }
131 }
132 
133 
134 // bind a vISA input variable <dcl> to the GRF byte offset <offset>
bindInputDecl(G4_Declare * dcl,int offset)135 void IR_Builder::bindInputDecl(G4_Declare* dcl, int offset)
136 {    // decide the physical register number and sub register number
137     unsigned int regNum = offset / getGRFSize();
138     unsigned int subRegNum = (offset % getGRFSize()) / dcl->getElemSize();
139     dcl->getRegVar()->setPhyReg(phyregpool.getGreg(regNum), subRegNum);
140     dcl->setRegFile(G4_INPUT);
141     unsigned int reservedGRFNum = m_options->getuInt32Option(vISA_ReservedGRFNum);
142     if (regNum + dcl->getNumRows() > kernel.getNumRegTotal() - reservedGRFNum) {
143         MUST_BE_TRUE(false, "INPUT payload execeeds the regsiter number");
144     }
145 }
146 
147 // check if an operand is aligned to <align_byte>
// Check if an operand is aligned to <align_byte> bytes.
// On return, <offset> holds the operand's accumulated byte offset (alias
// offsets + assigned physical register byte address + region offsets); it is
// only meaningful when the operand's base resolves through reg vars.
// NOTE(review): despite being a const member, this may tighten the top
// declare's subRegAlign (for GRF declares) or force Even_Word alignment (for
// flag declares) so that RA can honor the requested alignment -- it is a
// query with a deliberate side effect.  The subRegAlign value appears to be
// in word (2-byte) units, hence the recurring "* 2" to convert to bytes --
// confirm against G4_SubReg_Align's definition.
bool IR_Builder::isOpndAligned(
    G4_Operand *opnd, unsigned short &offset, int align_byte) const
{
    offset = 0;
    bool isAligned = true;

    switch (opnd->getKind())
    {
    // Operand kinds with no register placement are trivially aligned.
    case G4_Operand::immediate:
    case G4_Operand::addrExp:
    case G4_Operand::label:
    case G4_Operand::condMod:
    case G4_Operand::predicate:
    {
        isAligned = true;
        break;
    }
    case G4_Operand::srcRegRegion:
    case G4_Operand::dstRegRegion:
    {
        int type_size = opnd->getTypeSize();
        G4_Declare *dcl = NULL;
        if (opnd->getBase()->isRegVar())
        {
            // Walk the alias chain down to the root declare, accumulating
            // alias byte offsets.  Any intermediate declare whose alignment
            // is incompatible with align_byte (neither a multiple nor a
            // divisor relationship holds) makes the operand unaligned.
            dcl = opnd->getBase()->asRegVar()->getDeclare();
            while (dcl && dcl->getAliasDeclare())
            {
                if (dcl->getSubRegAlign() != Any &&
                    (((dcl->getSubRegAlign() * 2) >= align_byte && (dcl->getSubRegAlign() * 2) % align_byte != 0) ||
                    ((dcl->getSubRegAlign() * 2) < align_byte && align_byte % (dcl->getSubRegAlign() * 2) != 0)))
                {
                    isAligned = false;
                    break;
                }
                offset += (unsigned short) dcl->getAliasOffset();
                dcl = dcl->getAliasDeclare();
            }

            // If the root declare already has a physical register, fold its
            // byte address into the offset.
            if (dcl && dcl->getRegVar() && dcl->getRegVar()->isPhyRegAssigned())
            {
                offset += static_cast<unsigned short>(dcl->getRegVar()->getByteAddr());
            }
        }
        if (!isAligned)
        {
            return isAligned;
        }

        // Add the region's own reg/subreg byte offset.  Indirect access
        // kills alignment here, but the offset is still accumulated.
        if (opnd->isDstRegRegion())
        {
            if (opnd->asDstRegRegion()->getRegAccess() != Direct)
            {
                isAligned = false;
            }
            offset += opnd->asDstRegRegion()->getRegOff() * numEltPerGRF<Type_UB>() + opnd->asDstRegRegion()->getSubRegOff() * type_size;
        }
        else if (opnd->isSrcRegRegion())
        {
            if (opnd->asSrcRegRegion()->getRegAccess() != Direct)
            {
                isAligned = false;
            }
            offset += opnd->asSrcRegRegion()->getRegOff() * numEltPerGRF<Type_UB>() + opnd->asSrcRegRegion()->getSubRegOff() * type_size;
        }
        if (offset % align_byte != 0)
        {
            return false;
        }
        // Only alignment of the top dcl can be changed.
        if (dcl && dcl->getRegFile() == G4_GRF)
        {
            // Tighten the declare's alignment when that makes the operand
            // aligned; otherwise report misalignment if incompatible.
            if (dcl->getSubRegAlign() == Any ||
                ((dcl->getSubRegAlign() * 2) < align_byte && align_byte % (dcl->getSubRegAlign() * 2) == 0))
            {
                dcl->setSubRegAlign(G4_SubReg_Align(align_byte / 2));
            }
            else if ((dcl->getSubRegAlign() * 2) < align_byte || (dcl->getSubRegAlign() * 2) % align_byte != 0)
            {
                isAligned = false;
            }
        }
        else if (opnd->getKind() == G4_Operand::dstRegRegion &&
            // Only care about GRF or half-GRF alignment.
            (align_byte == numEltPerGRF<Type_UB>() || align_byte == numEltPerGRF<Type_UB>() / 2) &&
            dcl && (dcl->getRegFile() == G4_ADDRESS))
        {

            // Get the single definition of the specified operand from the use
            // inst.
            auto getSingleDefInst = [](G4_INST *UI, Gen4_Operand_Number OpndNum)
                -> G4_INST * {
                G4_INST *Def = nullptr;
                for (DEF_EDGE_LIST_ITER I = UI->defInstList.begin(),
                    E = UI->defInstList.end();
                    I != E; ++I) {
                    if (I->second != OpndNum)
                        continue;
                    if (Def) {
                        // Not single defined, bail out
                        Def = nullptr;
                        break;
                    }
                    Def = I->first;
                }
                return Def;
            };

            G4_INST *inst = opnd->getInst();
            if (inst) {
                // Check address calculation like:
                //
                //    shl (1) V1  V0          imm
                //    add (1) a0  $V2 + off   V1
                //    ...
                //    (use)... r[a0, disp] ...
                //
                // need to check both disp, off, and V1 are aligned.
                //
                // Check acc_use_op's def-list.
                G4_INST *LEA = getSingleDefInst(inst, Opnd_dst);
                if (LEA && LEA->opcode() == G4_add && LEA->getExecSize() == g4::SIMD1) {
                    isAligned = true;
                    G4_Operand *Op0 = LEA->getSrc(0);
                    G4_Operand *Op1 = LEA->getSrc(1);
                    if (Op0->isSrcRegRegion()) {
                        // TODO: Consider MUL as well.
                        G4_INST *Def = getSingleDefInst(LEA, Opnd_src0);
                        if (Def && Def->opcode() == G4_shl &&
                            Def->getSrc(1)->isImm()) {
                            // V1 = V0 << imm: aligned iff (1 << imm) is a
                            // multiple of align_byte.
                            G4_Imm *Imm = Def->getSrc(1)->asImm();
                            unsigned Factor = (1U << Imm->getInt());
                            // TODO: We only perform alignment checking on
                            // component wise and may need to consider checking
                            // the accumulated result.
                            if (Factor % align_byte != 0)
                                isAligned = false;
                        } else if (Def && Def->opcode() == G4_and &&
                            Def->getSrc(1)->isImm()) {
                            // V1 = V0 & mask: aligned iff the mask clears all
                            // bits below align_byte.
                            G4_Imm *Imm = Def->getSrc(1)->asImm();
                            uint64_t Mask = uint64_t(Imm->getInt());
                            // align_byte could be 32 or 16 guarded previously.
                            uint64_t AlignMask = align_byte - 1;
                            if ((Mask & AlignMask) != 0)
                                isAligned = false;
                        } else
                            isAligned = false;
                    }
                    if (isAligned && Op1->isAddrExp()) {
                        // Check the $V2 + off part: the root declare must be
                        // at least half-GRF aligned and both the alias and
                        // address-expression offsets must be multiples of
                        // align_byte.
                        G4_AddrExp *AE = Op1->asAddrExp();
                        G4_Declare *Dcl = AE->getRegVar()->getDeclare();
                        unsigned AliasOffset = 0;
                        while (Dcl && Dcl->getAliasDeclare()) {
                            AliasOffset += Dcl->getAliasOffset();
                            Dcl = Dcl->getAliasDeclare();
                        }
                        // TODO: We only perform alignment checking on
                        // component wise and may need to consider checking
                        // the accumulated result.
                        if ((AliasOffset % align_byte) != 0 ||
                            (Dcl && Dcl->getSubRegAlign() != GRFALIGN &&
                                Dcl->getSubRegAlign() != Sixteen_Word &&
                                Dcl->getSubRegAlign() != Eight_Word) ||
                            AE->getOffset() % align_byte != 0) {
                            isAligned = false;
                        }
                    } else
                        isAligned = false;
                    if (isAligned) {
                        // TODO: We only perform alignment checking on
                        // component wise and may need to consider checking
                        // the accumulated result.
                        if (opnd->asDstRegRegion()->getAddrImm() % align_byte != 0)
                            isAligned = false;
                    }
                }
            }
        }
        else if (dcl && dcl->getRegFile() == G4_FLAG)
        {
            // need to make flag even-word aligned if it's used in a setp with dword source
            // ToDo: should we fix input to use 16-bit value instead
            if (align_byte == 4)
            {
                dcl->setSubRegAlign(Even_Word);
            }
        }
        break;
    }
    default:
        break;
    }
    return isAligned;
}
341 
342 
isOpndAligned(G4_Operand * opnd,int alignByte) const343 bool IR_Builder::isOpndAligned(G4_Operand* opnd, int alignByte) const
344 {
345     uint16_t offset = 0; // ignored
346     return isOpndAligned(opnd, offset, alignByte);
347 }
348 
349 
// Assign physical registers to the pre-defined variables that need GRF
// storage.  Variables not resident in r0 are laid out starting right after
// the input payload: <inputSize> bytes rounded up to a dword multiple, or
// one full GRF when there is no input.  r0-resident variables are bound to
// their fixed sub-register of r0.
void IR_Builder::predefinedVarRegAssignment(uint8_t inputSize)
{
    // Round the payload size up to a dword boundary to find the first free
    // byte for pre-defined variables.
    uint32_t preDefinedStart = ((inputSize + G4_DSIZE - 1) / G4_DSIZE) * G4_DSIZE;
    if (preDefinedStart == 0)
    {
        // No input payload: skip r0 and start at the next GRF.
        preDefinedStart = numEltPerGRF<Type_UB>();
    }
    for (PreDefinedVarsInternal i : allPreDefVars)
    {
        if (!predefinedVarNeedGRF(i))
        {
            continue;
        }

        G4_Type ty = GetGenTypeFromVISAType(getPredefinedVarType(i));
        G4_Declare *dcl = preDefVars.getPreDefinedVar((PreDefinedVarsInternal)i);
        if (!isPredefinedVarInR0((PreDefinedVarsInternal)i))
        {
            // Place the variable at its fixed byte offset past the payload.
            unsigned short new_offset = preDefinedStart + getPredefinedVarByteOffset(i);
            unsigned int regNum = new_offset / numEltPerGRF<Type_UB>();
            unsigned int subRegNum = (new_offset % numEltPerGRF<Type_UB>()) / TypeSize(ty);
            dcl->getRegVar()->setPhyReg(phyregpool.getGreg(regNum), subRegNum);
        }
        else
        {
            // Variable is delivered by hardware inside r0 itself.
            unsigned int regNum = 0;
            unsigned int subRegNum = getPredefinedVarByteOffset(i) / TypeSize(ty);
            dcl->getRegVar()->setPhyReg(phyregpool.getGreg(regNum), subRegNum);
        }
    }
}
381 
382 // Expand some of the pre-defined variables at kernel entry
383 // -- replace pre-defined V17 (hw_tid)
384 // -- replace pre-defined V22 (color)
385 // -- replace pre-defined V1 (thread_x)
386 // -- replace pre-defined V2 (thread_y)
// Expand some of the pre-defined variables at kernel entry by inserting the
// instructions that materialize them from r0, right before the first
// non-label instruction:
// -- replace pre-defined V17 (hw_tid)
// -- replace pre-defined V22 (color)
// -- replace pre-defined V1 (thread_x)
// -- replace pre-defined V2 (thread_y)
// The bit layout read from r0 depends on useNewR0Format() and the platform.
void IR_Builder::expandPredefinedVars()
{

    // Use FFTID from msg header
    // and (1) hw_tid, r0.5, 0x3ff
    //
    // 9:0     FFTID. This ID is assigned by TS and is a unique identifier for the thread in
    // comparison to other concurrent root threads. It is used to free up resources used
    // by the thread upon thread completion.
    //
    // [Pre-DevBDW]: Format = U8. Bits 9:8 are Reserved, MBZ.
    //
    // [0:8] For Pre-Gen9
    // [0:9] For Gen10+
    //

    // first non-label instruction
    auto iter = std::find_if(instList.begin(), instList.end(), [](G4_INST* inst) { return !inst->isLabel(); });

    if (preDefVars.isHasPredefined(PreDefinedVarsInternal::HW_TID))
    {
        // FFTID occupies 10 bits from CNL on, 9 bits before.
        const unsigned fftid_mask = getPlatform() >= GENX_CNL ? 0x3FF : 0x1FF;
        G4_SrcRegRegion* src = createSrc(realR0->getRegVar(), 0, 5, getRegionScalar(), Type_UD);
        G4_Imm* mask1 = createImm(fftid_mask, Type_UD);
        G4_DstRegRegion* dst = createDstRegRegion(builtinHWTID, 1);
        G4_INST* inst = createBinOp(G4_and, g4::SIMD1, dst, src, mask1, InstOpt_WriteEnable, false);
        instList.insert(iter, inst);
    }

    if (preDefVars.isHasPredefined(PreDefinedVarsInternal::X))
    {
        if (useNewR0Format())
        {
            // x -> and (1) thread_x<1>:uw r0.1:ud 0xFFF
            G4_SrcRegRegion* r0Dot1UD = createSrc(
                realR0->getRegVar(), 0, 1, getRegionScalar(), Type_UD);
            G4_DstRegRegion* dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::X), 1);
            G4_INST* inst = createBinOp(G4_and, g4::SIMD1, dst, r0Dot1UD,
                createImm(0xFFF, Type_UW), InstOpt_WriteEnable, false);
            instList.insert(iter, inst);
        }
        else
        {
            //  We insert the new instruction
            //  and (1) thread_x<1>:uw, r0.2:uw, 0x01FF
            G4_SrcRegRegion* r0Dot2UW = createSrc(
                realR0->getRegVar(), 0, 2, getRegionScalar(), Type_UW);
            int64_t mask = getThreadIDMask();
            G4_Imm* src1 = createImm(mask, Type_UW);
            G4_DstRegRegion* dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::X), 1);
            G4_INST* inst = createBinOp(G4_and, g4::SIMD1, dst, r0Dot2UW, src1, InstOpt_WriteEnable, false);
            instList.insert(iter, inst);
        }
    }

    if (preDefVars.isHasPredefined(PreDefinedVarsInternal::Y))
    {
        if (useNewR0Format())
        {
            // y -> shr (1) thread_y<1>:uw r0.1:ud 12
            //      and (1) thread_y<1>:uw thread_y:uw 0xFFF
            G4_SrcRegRegion* r0Dot1UD = createSrc(
                realR0->getRegVar(), 0, 1, getRegionScalar(), Type_UD);

            G4_DstRegRegion* dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::Y), 1);
            G4_INST* inst1 = createBinOp(G4_shr, g4::SIMD1, dst, r0Dot1UD,
                createImm(12, Type_UW), InstOpt_WriteEnable, false);
            instList.insert(iter, inst1);
            // Second instruction reads thread_y back, so a fresh dst region
            // is required.
            dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::Y), 1);
            G4_INST* inst2 = createBinOp(G4_and, g4::SIMD1, dst,
                createSrcRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::Y), getRegionScalar()),
                createImm(0xFFF, Type_UW), InstOpt_WriteEnable, false);
            instList.insert(iter, inst2);
        }
        else
        {
            //  We insert the new instruction
            //  and (1) thread_y<1>:uw, r0.3:uw, 0x01FF
            G4_SrcRegRegion* r0Dot3UW = createSrc(
                realR0->getRegVar(), 0, 3, getRegionScalar(), Type_UW);
            int64_t mask = getThreadIDMask();
            G4_Imm* src1 = createImmWithLowerType(mask, Type_UW);
            G4_DstRegRegion* dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::Y), 1);
            G4_INST* inst = createBinOp(G4_and, g4::SIMD1, dst, r0Dot3UW, src1, InstOpt_WriteEnable, false);
            instList.insert(iter, inst);
        }
    }

    // color bit
    if (preDefVars.isHasPredefined(PreDefinedVarsInternal::COLOR))
    {
        if (useNewR0Format())
        {
            // r0.1[31:24]
            // shr (1) color<2>:uw r0.1<0;1,0>:ud 24
            G4_SrcRegRegion* src = createSrc(realR0->getRegVar(),
                0, 1, getRegionScalar(), Type_UD);
            G4_Imm* shift = createImm(24, Type_UW);
            G4_DstRegRegion* dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::COLOR), 2);
            G4_INST* inst = createBinOp(G4_shr, g4::SIMD1, dst, src, shift,
                InstOpt_WriteEnable, false);
            instList.insert(iter, inst);
        }
        else
        {
            // else: r0.2[3:0]
            // and (1) color<2>:uw r0.2<0;1,0>:ud 0xF
            G4_SrcRegRegion* src = createSrc(realR0->getRegVar(),
                0, 2, getRegionScalar(), Type_UD);
            G4_Imm* mask = createImm(0xF, Type_UW);
            G4_DstRegRegion* dst = createDstRegRegion(preDefVars.getPreDefinedVar(PreDefinedVarsInternal::COLOR), 2);
            G4_INST* inst = createBinOp(G4_and, g4::SIMD1, dst, src, mask,
                InstOpt_WriteEnable, false);
            instList.insert(iter, inst);
        }
    }
}
504 
getFCPatchInfo()505 FCPatchingInfo* IR_Builder::getFCPatchInfo()
506 {
507     // Create new instance of FC patching class if one is not
508     // yet created.
509     if (fcPatchInfo == NULL)
510     {
511         FCPatchingInfo* instance;
512         instance = (FCPatchingInfo*)mem.alloc(sizeof(FCPatchingInfo));
513         fcPatchInfo = new (instance) FCPatchingInfo();
514     }
515 
516     return fcPatchInfo;
517 }
518 
// Build a printf-style name string in storage owned by <mem>.
// Debug builds format up to <size>-1 characters (vsnprintf truncates and
// NUL-terminates); release builds skip the allocation entirely and return
// "" since names are only used for dumps and debugging.
const char* IR_Builder::getNameString(
    Mem_Manager& mem, size_t size, const char* format, ...)
{
#ifdef _DEBUG
    char* name = (char*) mem.alloc(size);
    va_list args;
    va_start(args, format);
    std::vsnprintf(name, size, format, args);
    va_end(args);
    return name;
#else
    return "";
#endif
}
533 
getFcallInfo(const G4_INST * inst) const534 G4_FCALL* IR_Builder::getFcallInfo(const G4_INST* inst) const {
535     auto it = m_fcallInfo.find(inst);
536     if (m_fcallInfo.end() == it) {
537         return nullptr;
538     } else {
539         return it->second;
540     }
541 }
542 
// Create a G4_Declare for every pre-defined vISA variable and register it
// with preDefVars.  GRF-resident work-item variables are created uniformly
// via createPreVar; the remaining variables are bound to their dedicated
// architectural registers (null, tm0, r0, sr0, cr0, ce0/mask0, dbg, stack
// arg/ret GRFs, FE_SP/FE_FP sub-registers, spill-header-based buffer
// pointers, ...).
void IR_Builder::createPreDefinedVars()
{
    for (PreDefinedVarsInternal i : allPreDefVars)
    {
        G4_Declare* dcl = nullptr;

        if (predefinedVarNeedGRF(i))
        {
            // work item id variables are handled uniformly
            G4_Type ty = GetGenTypeFromVISAType(getPredefinedVarType(i));
            dcl = createPreVar(getPredefinedVarID(i), 1, ty);
        }
        else
        {
            const char* name = getPredefinedVarString(i);
            switch (i)
            {
            case PreDefinedVarsInternal::VAR_NULL:
                // %null maps to the architectural null register.
                dcl = createDeclareNoLookup(name, G4_GRF, 1, 1, Type_UD);
                dcl->getRegVar()->setPhyReg(phyregpool.getNullReg(), 0);
                break;
            case PreDefinedVarsInternal::TSC:
            {
                // Timestamp counter: 5 dwords bound to tm0.
                G4_Declare* tscDcl = createPreVar(i, 5, Type_UD);
                tscDcl->getRegVar()->setPhyReg(phyregpool.getTm0Reg(), 0);
                dcl = tscDcl;
                break;
            }
            case PreDefinedVarsInternal::R0:
            {
                dcl = getBuiltinR0();
                break;
            }
            case PreDefinedVarsInternal::SR0:
            {
                // State register: 4 dwords bound to sr0.
                G4_Declare* sr0Dcl = createPreVar(i, 4, Type_UD);
                sr0Dcl->getRegVar()->setPhyReg(phyregpool.getSr0Reg(), 0);
                dcl = sr0Dcl;
                break;
            }
            case PreDefinedVarsInternal::CR0:
            {
                // Control register: 3 dwords bound to cr0.
                G4_Declare* cr0Dcl = createPreVar(i, 3, Type_UD);
                cr0Dcl->getRegVar()->setPhyReg(phyregpool.getCr0Reg(), 0);
                dcl = cr0Dcl;
                break;
            }
            case PreDefinedVarsInternal::CE0:
            {
                // Channel-enable register bound to mask0.
                G4_Declare* ce0Dcl = createPreVar(i, 1, Type_UD);
                ce0Dcl->getRegVar()->setPhyReg(phyregpool.getMask0Reg(), 0);
                dcl = ce0Dcl;
                break;
            }
            case PreDefinedVarsInternal::DBG:
            {
                // Debug register: 2 dwords.
                G4_Declare* dbgDcl = createPreVar(i, 2, Type_UD);
                dbgDcl->getRegVar()->setPhyReg(phyregpool.getDbgReg(), 0);
                dcl = dbgDcl;
                break;
            }
            case PreDefinedVarsInternal::ARG:
            {
                // Stack-call argument block: 32 GRFs starting at the ABI's
                // Arg register, declared as input.
                dcl = createDeclareNoLookup(name, G4_INPUT, numEltPerGRF<Type_UD>(), 32, Type_UD);
                dcl->getRegVar()->setPhyReg(phyregpool.getGreg(ArgRet_Stackcall::Arg), 0);
                break;
            }
            case PreDefinedVarsInternal::RET:
            {
                // Stack-call return block: 12 GRFs, live-out so RA keeps it.
                dcl = createDeclareNoLookup(name, G4_GRF, numEltPerGRF<Type_UD>(), 12, Type_UD);
                dcl->getRegVar()->setPhyReg(phyregpool.getGreg(ArgRet_Stackcall::Ret), 0);
                dcl->setLiveOut();
                break;
            }
            case PreDefinedVarsInternal::FE_SP:
            {
                // Front-end stack pointer: UQ sub-register of the FP/SP GRF.
                unsigned int startReg = kernel.getFPSPGRF();
                dcl = createDeclareNoLookup(name, G4_GRF, 1, 1, Type_UQ);
                dcl->getRegVar()->setPhyReg(phyregpool.getGreg(startReg), SubRegs_Stackcall::FE_SP);
                break;
            }
            case PreDefinedVarsInternal::FE_FP:
            {
                // PREDEFINED_FE_FP
                unsigned int startReg = kernel.getFPSPGRF();
                dcl = createDeclareNoLookup(name, G4_GRF, 1, 1, Type_UQ);
                dcl->getRegVar()->setPhyReg(phyregpool.getGreg(startReg), SubRegs_Stackcall::FE_FP);
                break;
            }
            case PreDefinedVarsInternal::HW_TID:
            {
                // PREDEFINED_HW_TID
                dcl = getBuiltinHWTID();
                break;
            }
            case PreDefinedVarsInternal::X:
            case PreDefinedVarsInternal::Y:
            case PreDefinedVarsInternal::COLOR:
            {
                // these three are size 1 UW
                dcl = createDeclareNoLookup(name, G4_GRF, 1, 1,
                    GetGenTypeFromVISAType(getPredefinedVarType(i)));
                break;
            }
            case PreDefinedVarsInternal::IMPL_ARG_BUF_PTR:
            {
                dcl = createDeclareNoLookup("implBufPtr", G4_GRF, 1, 1, Type_UQ);
                auto phyReg = phyregpool.getGreg(kernel.getSpillHeaderGRF());
                dcl->getRegVar()->setPhyReg(phyReg, SubRegs_ImplPtrs::ImplBufPtr);
                break;
            }

            case PreDefinedVarsInternal::LOCAL_ID_BUF_PTR:
            {
                dcl = createDeclareNoLookup("localIdBufPtr", G4_GRF, 1, 1, Type_UQ);
                auto phyReg = phyregpool.getGreg(kernel.getSpillHeaderGRF());
                dcl->getRegVar()->setPhyReg(phyReg, SubRegs_ImplPtrs::LocalIdBufPtr);
                break;
            }

            default:
            {
                // NOTE(review): if this default is ever taken, dcl stays
                // null and setPreDefinedVar below would dereference null --
                // presumably every enumerator is covered above; verify.
                break;
            }
            }
        }
        preDefVars.setPredefinedVar(i, dcl);
        dcl->setPreDefinedVar(true);
    }
}
673 
// Create the builder's built-in declares: the physical-r0 input copy, the
// RA-allocated r0 copy, the two address sub-registers (a0.0 / a0.2), and
// the hw_tid / T252 / bindless-sampler / sampler-header / scratch-surface
// helpers.  Called once from the constructor.
void IR_Builder::createBuiltinDecls()
{
    // realR0 is always tied to physical r0
    auto numR0DW = numEltPerGRF<Type_UD>();
    realR0 = createDeclareNoLookup(
        "BuiltInR0",
        G4_INPUT,
        numR0DW,
        1,
        Type_UD);
    realR0->getRegVar()->setPhyReg(phyregpool.getGreg(0), 0);

    // builtinR0 either gets allocated to r0 or to a different
    // register depending on conditions in RA.
    builtinR0 = createTempVar(numR0DW, Type_UD, GRFALIGN, "R0_Copy");
    builtinR0->setDoNotSpill();

    // Address register a0.0.
    builtinA0 = createDeclareNoLookup(
        "BuiltinA0",
        G4_ADDRESS,
        1,
        1,
        Type_UD);
    builtinA0->getRegVar()->setPhyReg(phyregpool.getAddrReg(), 0);

    builtinA0Dot2 = createDeclareNoLookup(
        "BuiltinA0Dot2",  //a0.2
        G4_ADDRESS,
        1,
        1,
        Type_UD);
    builtinA0Dot2->getRegVar()->setPhyReg(phyregpool.getAddrReg(), 2);

    // Hardware thread id; populated at kernel entry by expandPredefinedVars.
    builtinHWTID = createDeclareNoLookup("hw_tid", G4_GRF, 1, 1, Type_UD);

    // Pre-defined surface T252 and the bindless sampler handle.
    builtinT252 = createDeclareNoLookup(vISAPreDefSurf[PREDEFINED_SURFACE_T252].name, G4_GRF, 1, 1, Type_UD);
    builtinBindlessSampler = createDeclareNoLookup("B_S", G4_GRF, 1, 1, Type_UD);

    // One-GRF header used for sampler messages.
    builtinSamplerHeader = createDeclareNoLookup("samplerHeader", G4_GRF, numEltPerGRF<Type_UD>(), 1, Type_UD);

    builtinScratchSurface = createDeclareNoLookup(vISAPreDefSurf[PREDEFINED_SURFACE_SCRATCH].name, G4_GRF, 1, 1, Type_UD);
}
716 
717 
getSpillFillHeader()718 G4_Declare* IR_Builder::getSpillFillHeader()
719 {
720     if (!spillFillHeader)
721     {
722         spillFillHeader = createTempVar(1, Type_UD, GRFALIGN, "spillHeader");
723         spillFillHeader->setLiveOut();
724         spillFillHeader->setLiveIn();
725         spillFillHeader->setDoNotSpill();
726     }
727     return spillFillHeader;
728 }
729 
getEUFusionWATmpVar()730 G4_Declare* IR_Builder::getEUFusionWATmpVar()
731 {
732     if (!euFusionWATmpVar)
733     {
734         euFusionWATmpVar = createTempVar(2, Type_UD, Even_Word, "euFusionWATmp");
735         euFusionWATmpVar->setLiveOut();
736         euFusionWATmpVar->setLiveIn();
737         euFusionWATmpVar->setDoNotSpill();
738     }
739     return euFusionWATmpVar;
740 }
741 
getOldA0Dot2Temp()742 G4_Declare* IR_Builder::getOldA0Dot2Temp()
743 {
744     if (!oldA0Dot2Temp)
745     {
746         oldA0Dot2Temp = createTempVar(1, Type_UD, Any, "OldA0Dot2");
747         oldA0Dot2Temp->setLiveOut();
748         oldA0Dot2Temp->setLiveIn();
749         oldA0Dot2Temp->setDoNotSpill();
750     }
751     return oldA0Dot2Temp;
752 }
753 
// Construct an IR_Builder for kernel <k> on platform <genPlatform>.
// Wires the builder into the kernel, zero-initializes the jit info record,
// and creates the built-in and pre-defined variable declares up front.
// Note the member-initializer list order must follow declaration order in
// the class; phyregpool is sized from the kernel's total GRF count.
IR_Builder::IR_Builder(
    TARGET_PLATFORM genPlatform,
    INST_LIST_NODE_ALLOCATOR& alloc,
    G4_Kernel& k,
    Mem_Manager& m,
    Options* options,
    CISA_IR_Builder* parent,
    FINALIZER_INFO* jitInfo,
    const WA_TABLE* pWaTable)
    : platform(genPlatform), curFile(NULL), curLine(0), curCISAOffset(-1), immPool(*this), metaData(jitInfo),
    type(VISA_BUILD_TYPE::KERNEL), parentBuilder(parent),
    builtinSamplerHeaderInitialized(false), m_pWaTable(pWaTable), m_options(options), CanonicalRegionStride0(0, 1, 0),
    CanonicalRegionStride1(1, 1, 0), CanonicalRegionStride2(2, 1, 0), CanonicalRegionStride4(4, 1, 0),
    mem(m), phyregpool(m, k.getNumRegTotal()), hashtable(m), rgnpool(m), dclpool(m),
    instList(alloc), kernel(k), metadataMem(4096)
{
    num_temp_dcl = 0;
    kernel.setBuilder(this); // kernel needs pointer to the builder
    createBuiltinDecls();

    sampler8x8_group_id = 0;

    be_sp = be_fp = tmpFCRet = nullptr;

    arg_size = 0;
    return_var_size = 0;

    // Start the finalizer info from a clean slate; fields are filled in
    // during compilation.
    if (metaData != NULL)
    {
        memset(metaData, 0, sizeof(FINALIZER_INFO));
    }

    fcPatchInfo = NULL;

    // Pre-defined vars depend on the built-in declares created above.
    createPreDefinedVars();
}
790 
791 
~IR_Builder()792 IR_Builder::~IR_Builder()
793 {
794     // We need to invoke the destructor of every instruction ever allocated
795     // so that its members will be freed.
796     // Note that we don't delete the instruction itself as it's allocated from
797     // the memory manager's pool
798     for (unsigned i = 0, size = (unsigned)instAllocList.size(); i != size; i++)
799     {
800         G4_INST* inst = instAllocList[i];
801         inst->~G4_INST();
802     }
803     instAllocList.clear();
804 
805     for (auto MD : allMDs)
806     {
807         MD->~Metadata();
808     }
809 
810     for (auto node : allMDNodes)
811     {
812         node->~MDNode();
813     }
814 
815     if (fcPatchInfo)
816     {
817         fcPatchInfo->~FCPatchingInfo();
818     }
819 }
820 
createDeclareNoLookup(const char * name,G4_RegFileKind regFile,unsigned short n_elems,unsigned short n_rows,G4_Type ty,DeclareType kind,G4_RegVar * base,G4_Operand * repRegion,G4_ExecSize execSize)821 G4_Declare* IR_Builder::createDeclareNoLookup(
822     const char*     name,
823     G4_RegFileKind  regFile,
824     unsigned short  n_elems,
825     unsigned short  n_rows,
826     G4_Type         ty,
827     DeclareType     kind,
828     G4_RegVar *     base,
829     G4_Operand *    repRegion,
830     G4_ExecSize     execSize)
831 {
832     if (regFile == G4_FLAG)
833     {
834         MUST_BE_TRUE(ty == Type_UW, "flag decl must have type UW");
835     }
836 
837     G4_Declare* dcl = dclpool.createDeclare(name, regFile, n_elems,
838         n_rows, ty, kind, base, repRegion, execSize);
839 
840     kernel.Declares.push_back(dcl);
841 
842     return dcl;
843 }
844 
845 
getSplitEMask(unsigned execSize,uint32_t eMask,bool isLo)846 uint32_t IR_Builder::getSplitEMask(unsigned execSize, uint32_t eMask, bool isLo)
847 {
848     const uint32_t qhMasks = InstOpt_M0 | InstOpt_M8 |
849         InstOpt_M16 | InstOpt_M24;
850     uint32_t other = eMask & ~qhMasks;
851     uint32_t qh = eMask & qhMasks;
852 
853     switch (execSize) {
854     case 16: // Split SIMD16 into SIMD8
855         switch (qh) {
856         case 0: // instOpt not specified, treat as 1H
857         case InstOpt_M0:
858             return (isLo ? InstOpt_M0 : InstOpt_M8) | other;
859         case InstOpt_M16:
860             return (isLo ? InstOpt_M16 : InstOpt_M24) | other;
861         }
862         break;
863     case 32: // Split SIMD32 into SIMD16.
864         switch (qh) {
865         case 0:
866             return (isLo ? InstOpt_M0 : InstOpt_M16) | other;
867         }
868         break;
869     }
870 
871     ASSERT_USER(false, "Unhandled cases for EMask splitting!");
872     return ~0U;
873 }
874 
// Materialize the scratch-surface-offset temp (SSO) on first use:
//   (W) and (1) sso r0.5 0xFFFFFC00
// hoisted to kernel entry (right after any leading labels).
// With ATTR_SepSpillPvtSS, spill/fill and private memory use separate
// slots: the masked r0.5 goes into a separate temp and SSO receives that
// value + 0x400.
void IR_Builder::initScratchSurfaceOffset()
{
    // (W) and (1) sso r0.5 0xFFFFC00, placed at kernel entry
    if (!scratchSurfaceOffset)
    {
        G4_SrcRegRegion* R0_5 = createSrc(builtinR0->getRegVar(), 0, 5,
            getRegionScalar(), Type_UD);
        scratchSurfaceOffset = createTempVar(1, Type_UD, Any, "SSO");
        scratchSurfaceOffset->setLiveOut();
        scratchSurfaceOffset->setDoNotSpill();
        if (kernel.getBoolKernelAttr(Attributes::ATTR_SepSpillPvtSS))
        {
            G4_Declare* slot0SSO = createTempVar(1, Type_UD, Any, "Slot0SSO");
            G4_DstRegRegion* andDst = createDstRegRegion(slot0SSO, 1);
            auto andInst = createBinOp(G4_and, g4::SIMD1, andDst, R0_5, createImm(0xFFFFFC00, Type_UD), InstOpt_WriteEnable, true);
            // createBinOp appended the and to the tail; move it to kernel
            // entry by re-inserting it before the first non-label inst.
            instList.pop_back();
            auto iter = std::find_if(instList.begin(), instList.end(), [](G4_INST* inst) { return !inst->isLabel(); });
            instList.insert(iter, andInst);

            //scratchSurfaceOffset is reserved for spillfill, pvtmem should use r0.5+1
            G4_DstRegRegion* dst = createDstRegRegion(scratchSurfaceOffset, 1);
            createBinOp(G4_add, g4::SIMD1, dst, createSrcRegRegion(slot0SSO, getRegionScalar()),
                createImm(0x400, Type_UD), InstOpt_WriteEnable, true);
        }
        else
        {
            G4_DstRegRegion* andDst = createDstRegRegion(scratchSurfaceOffset, 1);
            auto andInst = createBinOp(G4_and, g4::SIMD1, andDst, R0_5, createImm(0xFFFFFC00, Type_UD), InstOpt_WriteEnable, true);
            // Same hoist-to-entry placement as the branch above.
            instList.pop_back();
            auto iter = std::find_if(instList.begin(), instList.end(), [](G4_INST* inst) { return !inst->isLabel(); });
            instList.insert(iter, andInst);
        }
    }
}
909 
910 
createTempVar(unsigned int numElements,G4_Type type,G4_SubReg_Align subAlign,const char * prefix,bool appendIdToName)911 G4_Declare* IR_Builder::createTempVar(
912     unsigned int numElements, G4_Type type, G4_SubReg_Align subAlign,
913     const char* prefix, bool appendIdToName)
914 {
915     const char* name = appendIdToName ?
916         getNameString(mem, 20, "%s%d", prefix, num_temp_dcl++) :
917         getNameString(mem, 20, "%s", prefix);
918 
919     unsigned short dcl_width = 0, dcl_height = 1;
920     const uint16_t typeSize = TypeSize(type);
921     int totalByteSize = numElements * typeSize;
922     if (totalByteSize <= (int)numEltPerGRF<Type_UB>())
923     {
924         dcl_width = totalByteSize / typeSize;
925     }
926     else
927     {
928         // here we assume that the start point of the var is the beginning of a GRF?
929         // so subregister must be 0?
930         dcl_width = numEltPerGRF<Type_UB>() / typeSize;
931         dcl_height = totalByteSize / numEltPerGRF<Type_UB>();
932         if (totalByteSize % numEltPerGRF<Type_UB>() != 0)
933         {
934             dcl_height++;
935         }
936     }
937 
938     G4_Declare* dcl = createDeclareNoLookup(name, G4_GRF, dcl_width, dcl_height, type);
939     dcl->setSubRegAlign(subAlign);
940     return dcl;
941 }
942 
createAddrFlagSpillLoc(G4_Declare * dcl)943 G4_Declare* IR_Builder::createAddrFlagSpillLoc(G4_Declare* dcl)
944 {
945     const char* name = getNameString(mem, 16, "SP_LOC_%d", numAddrFlagSpillLoc++);
946     G4_Declare* spillLoc = createDeclareNoLookup(name,
947         G4_GRF,
948         dcl->getNumElems(),
949         1,
950         dcl->getElemType(),
951         DeclareType::AddrSpill);
952     dcl->setSpilledDeclare(spillLoc);
953     spillLoc->setSubRegAlign(dcl->getSubRegAlign()); // for simd32 flag the spill loc has to be 2-word aligned since it's accessed as dw
954     return spillLoc;
955 }
956 
createHardwiredDeclare(uint32_t numElements,G4_Type type,uint32_t regNum,uint32_t regOff)957 G4_Declare* IR_Builder::createHardwiredDeclare(
958     uint32_t numElements, G4_Type type, uint32_t regNum, uint32_t regOff)
959 {
960     G4_Declare* dcl = createTempVar(numElements, type, Any);
961     unsigned int linearizedStart = (regNum * numEltPerGRF<Type_UB>()) + (regOff * TypeSize(type));
962     // since it's called post RA (specifically post computePReg) we have to manually set the GRF's byte offset
963     dcl->setGRFBaseOffset(linearizedStart);
964     dcl->getRegVar()->setPhyReg(phyregpool.getGreg(regNum), regOff);
965     return dcl;
966 }
967 
createPseudoKills(std::initializer_list<G4_Declare * > dcls,PseudoKillType ty)968 G4_INST* IR_Builder::createPseudoKills(
969     std::initializer_list<G4_Declare*> dcls, PseudoKillType ty)
970 {
971     G4_INST* inst = nullptr;
972     for (auto dcl : dcls)
973     {
974         inst = createPseudoKill(dcl, ty);
975     }
976 
977     return inst;
978 }
979 
createPseudoKill(G4_Declare * dcl,PseudoKillType ty)980 G4_INST* IR_Builder::createPseudoKill(G4_Declare* dcl, PseudoKillType ty)
981 {
982     auto dstRgn = createDst(dcl->getRegVar(), 0, 0, 1, Type_UD);
983     G4_INST* inst = createIntrinsicInst(nullptr, Intrinsic::PseudoKill, g4::SIMD1,
984         dstRgn, createImm((unsigned int)ty, Type_UD), nullptr, nullptr, InstOpt_WriteEnable, true);
985 
986     return inst;
987 }
988 
// Size of a hexword (HWORD) in bytes; spill/fill offsets below are given
// in HWORD units and converted to GRF rows.
static const unsigned int HWORD_BYTE_SIZE = 32;
990 
991 
createEUWASpill(bool addToInstList)992 G4_INST* IR_Builder::createEUWASpill(bool addToInstList)
993 {
994     const RegionDesc* rd = getRegionScalar();
995 
996     G4_Declare* dcl = getEUFusionWATmpVar();
997     G4_SrcRegRegion* pseudoUseSrc =
998         createSrc(dcl->getRegVar(), 0, 0, rd, Type_UD);
999 
1000     G4_INST* pseudoUseInst = createIntrinsicInst(
1001         nullptr, Intrinsic::FlagSpill, g4::SIMD2,
1002         nullptr, pseudoUseSrc, nullptr, nullptr, InstOpt_NoOpt, addToInstList);
1003 
1004     return pseudoUseInst;
1005 }
1006 
createSpill(G4_DstRegRegion * dst,G4_SrcRegRegion * header,G4_SrcRegRegion * payload,G4_ExecSize execSize,uint16_t numRows,uint32_t offset,G4_Declare * fp,G4_InstOption option,bool addToInstList)1007 G4_INST* IR_Builder::createSpill(
1008     G4_DstRegRegion* dst, G4_SrcRegRegion* header, G4_SrcRegRegion* payload,
1009     G4_ExecSize execSize,
1010     uint16_t numRows, uint32_t offset, G4_Declare* fp, G4_InstOption option,
1011     bool addToInstList)
1012 {
1013     G4_INST* spill = createIntrinsicInst(nullptr, Intrinsic::Spill, execSize, dst,
1014         header, payload, nullptr, option, addToInstList);
1015     spill->asSpillIntrinsic()->setFP(fp);
1016     spill->asSpillIntrinsic()->setOffset((uint32_t)
1017         (((uint64_t)offset * HWORD_BYTE_SIZE) / numEltPerGRF<Type_UB>()));
1018     spill->asSpillIntrinsic()->setNumRows(numRows);
1019 
1020     return spill;
1021 }
1022 
createSpill(G4_DstRegRegion * dst,G4_SrcRegRegion * payload,G4_ExecSize execSize,uint16_t numRows,uint32_t offset,G4_Declare * fp,G4_InstOption option,bool addToInstList)1023 G4_INST* IR_Builder::createSpill(
1024     G4_DstRegRegion* dst, G4_SrcRegRegion* payload,
1025     G4_ExecSize execSize, uint16_t numRows, uint32_t offset,
1026     G4_Declare* fp, G4_InstOption option, bool addToInstList)
1027 {
1028     auto builtInR0 = getBuiltinR0();
1029     auto rd = getRegionStride1();
1030     auto srcRgnr0 = createSrc(builtInR0->getRegVar(), 0, 0, rd, Type_UD);
1031     G4_INST* spill = createIntrinsicInst(nullptr, Intrinsic::Spill, execSize, dst,
1032         srcRgnr0, payload, nullptr, option, addToInstList);
1033     spill->asSpillIntrinsic()->setFP(fp);
1034     spill->asSpillIntrinsic()->setOffset((uint32_t)
1035         (((uint64_t)offset * HWORD_BYTE_SIZE) / numEltPerGRF<Type_UB>()));
1036     spill->asSpillIntrinsic()->setNumRows(numRows);
1037     return spill;
1038 }
1039 
createFill(G4_SrcRegRegion * header,G4_DstRegRegion * dstData,G4_ExecSize execSize,uint16_t numRows,uint32_t offset,G4_Declare * fp,G4_InstOption option,bool addToInstList)1040 G4_INST* IR_Builder::createFill(
1041     G4_SrcRegRegion* header, G4_DstRegRegion* dstData,
1042     G4_ExecSize execSize,
1043     uint16_t numRows, uint32_t offset, G4_Declare* fp, G4_InstOption option,
1044     bool addToInstList)
1045 {
1046     G4_INST* fill = createIntrinsicInst(nullptr, Intrinsic::Fill, execSize, dstData,
1047         header, nullptr, nullptr, option, addToInstList);
1048     fill->asFillIntrinsic()->setFP(fp);
1049     fill->asFillIntrinsic()->setOffset((uint32_t)
1050         (((uint64_t)offset * HWORD_BYTE_SIZE) / numEltPerGRF<Type_UB>()));
1051     fill->asFillIntrinsic()->setNumRows(numRows);
1052     return fill;
1053 }
1054 
createFill(G4_DstRegRegion * dstData,G4_ExecSize execSize,uint16_t numRows,uint32_t offset,G4_Declare * fp,G4_InstOption option,bool addToInstList)1055 G4_INST* IR_Builder::createFill(
1056     G4_DstRegRegion* dstData,
1057     G4_ExecSize execSize,
1058     uint16_t numRows, uint32_t offset, G4_Declare* fp , G4_InstOption option,
1059     bool addToInstList)
1060 {
1061     auto builtInR0 = getBuiltinR0();
1062     auto rd = getRegionStride1();
1063     auto srcRgnr0 = createSrc(builtInR0->getRegVar(), 0, 0, rd, Type_UD);
1064     G4_INST* fill = createIntrinsicInst(nullptr, Intrinsic::Fill, execSize, dstData,
1065         srcRgnr0, nullptr, nullptr, option, addToInstList);
1066 
1067     fill->asFillIntrinsic()->setFP(fp);
1068     fill->asFillIntrinsic()->setOffset((uint32_t)
1069         (((uint64_t)offset * HWORD_BYTE_SIZE) / numEltPerGRF<Type_UB>()));
1070     fill->asFillIntrinsic()->setNumRows(numRows);
1071     return fill;
1072 }
1073 
1074 
createTempFlag(unsigned short numberOfFlags,const char * prefix)1075 G4_Declare* IR_Builder::createTempFlag(unsigned short numberOfFlags, const char* prefix)
1076 {
1077     const char* name = getNameString(mem, 20, "%s%d", prefix, num_temp_dcl++);
1078 
1079     G4_Declare* dcl = createDeclareNoLookup(name, G4_FLAG, numberOfFlags, 1, Type_UW);
1080 
1081     return dcl;
1082 }
1083 
createFlag(uint16_t numFlagElements,const char * name)1084 G4_Declare* IR_Builder::createFlag(uint16_t numFlagElements, const char* name)
1085 {
1086     uint32_t numWords = (numFlagElements + 15) / 16;
1087     G4_Declare* dcl = createDeclareNoLookup(name, G4_FLAG, numWords, 1, Type_UW);
1088     dcl->setNumberFlagElements((uint8_t)numFlagElements);
1089     return dcl;
1090 }
1091 
createTempScalar(uint16_t numFlagElements,const char * prefix)1092 G4_Declare* IR_Builder::createTempScalar(uint16_t numFlagElements, const char* prefix)
1093 {
1094     const char* name = getNameString(mem, 20, "%s%d", prefix, num_temp_dcl++);
1095     G4_Declare* dcl = createDeclareNoLookup(name, G4_SCALAR, numFlagElements, 1, Type_UB);
1096     return dcl;
1097 }
1098 
createScalar(uint16_t numFlagElements,const char * name)1099 G4_Declare* IR_Builder::createScalar(uint16_t numFlagElements, const char* name)
1100 {
1101     G4_Declare* dcl = createDeclareNoLookup(name, G4_SCALAR, numFlagElements, 1, Type_UB);
1102     return dcl;
1103 }
1104 
createPreVar(PreDefinedVarsInternal preDefVar_index,unsigned short numElements,G4_Type type)1105 G4_Declare* IR_Builder::createPreVar(
1106     PreDefinedVarsInternal preDefVar_index, unsigned short numElements, G4_Type type)
1107 {
1108     MUST_BE_TRUE(preDefVar_index < PreDefinedVarsInternal::VAR_LAST,
1109         "illegal predefined var index");
1110     unsigned short dcl_width = 0, dcl_height = 1;
1111     auto typeSize = TypeSize(type);
1112     int totalByteSize = numElements * typeSize;
1113     if (totalByteSize <= (int)numEltPerGRF<Type_UB>())
1114     {
1115         dcl_width = totalByteSize / typeSize;
1116     }
1117     else
1118     {
1119         // here we assume that the start point of the var is the beginning of a GRF?
1120         // so subregister must be 0?
1121         dcl_width = numEltPerGRF<Type_UB>() / typeSize;
1122         dcl_height = totalByteSize / numEltPerGRF<Type_UB>();
1123         if (totalByteSize % numEltPerGRF<Type_UB>() != 0)
1124         {
1125             dcl_height++;
1126         }
1127     }
1128 
1129     G4_Declare* dcl = createPreVarDeclareNoLookup(
1130         preDefVar_index, dcl_width, dcl_height, type);
1131     // subAlign has to be type size at the minimum
1132     dcl->setSubRegAlign(Get_G4_SubRegAlign_From_Type(type));
1133     return dcl;
1134 }
1135 
1136 
createSrcWithNewRegOff(G4_SrcRegRegion * old,short newRegOff)1137 G4_SrcRegRegion* IR_Builder::createSrcWithNewRegOff(G4_SrcRegRegion* old, short newRegOff)
1138 {
1139     if (old->getRegAccess() == Direct)
1140     {
1141         return createSrcRegRegion(old->getModifier(), Direct, old->getBase(), newRegOff,
1142             old->getSubRegOff(), old->getRegion(), old->getType(), old->getAccRegSel());
1143     }
1144     else
1145     {
1146         return createIndirectSrc(old->getModifier(), old->getBase(), newRegOff, old->getSubRegOff(),
1147             old->getRegion(), old->getType(), old->getAddrImm());
1148     }
1149 }
1150 
1151 
createSrcWithNewSubRegOff(G4_SrcRegRegion * old,short newSubRegOff)1152 G4_SrcRegRegion* IR_Builder::createSrcWithNewSubRegOff(G4_SrcRegRegion* old, short newSubRegOff)
1153 {
1154     if (old->getRegAccess() == Direct)
1155     {
1156         return createSrcRegRegion(old->getModifier(), old->getRegAccess(), old->getBase(), old->getRegOff(),
1157             newSubRegOff, old->getRegion(), old->getType(), old->getAccRegSel());
1158     }
1159     else
1160     {
1161         return createIndirectSrc(old->getModifier(), old->getBase(), old->getRegOff(), newSubRegOff,
1162             old->getRegion(), old->getType(), old->getAddrImm());
1163     }
1164 }
1165 
1166 
createSrcWithNewBase(G4_SrcRegRegion * old,G4_VarBase * newBase)1167 G4_SrcRegRegion* IR_Builder::createSrcWithNewBase(G4_SrcRegRegion* old, G4_VarBase* newBase)
1168 {
1169     if (old->getRegAccess() == Direct)
1170     {
1171         return createSrcRegRegion(old->getModifier(), Direct, newBase, old->getRegOff(),
1172             old->getSubRegOff(), old->getRegion(), old->getType(), old->getAccRegSel());
1173     }
1174     else
1175     {
1176         return createIndirectSrc(old->getModifier(), newBase, old->getRegOff(), old->getSubRegOff(),
1177             old->getRegion(), old->getType(), old->getAddrImm());
1178     }
1179 }
1180 
createDstWithNewSubRegOff(G4_DstRegRegion * old,short newSubRegOff)1181 G4_DstRegRegion* IR_Builder::createDstWithNewSubRegOff(G4_DstRegRegion* old, short newSubRegOff)
1182 {
1183     if (old->getRegAccess() == Direct)
1184     {
1185         return createDst(old->getBase(), old->getRegOff(), newSubRegOff, old->getHorzStride(), old->getType(), old->getAccRegSel());
1186     }
1187     else
1188     {
1189         return createIndirectDst(old->getBase(), newSubRegOff, old->getHorzStride(), old->getType(), old->getAddrImm());
1190     }
1191 }
1192 
1193 
// Create (or look up) an immediate from a 32-bit float. When the platform
// allows it (>= CHV, vISA_FImmToHFImm enabled, and WaSrc1ImmHfNotAllowed
// not set), the value is narrowed to an HF immediate if it is exactly
// representable in ieee16 (denormals, infinities, and NaNs excluded).
G4_Imm* IR_Builder::createImm(float fp)
{
    // NOTE(review): type-punning through a pointer cast; memcpy/bit_cast
    // would be the strictly-portable form -- confirm compilers in use
    // tolerate this.
    uint32_t imm = *((uint32_t*) &fp);
    G4_Type immType = Type_F;
    if (getPlatform() >= GENX_CHV && m_options->getOption(vISA_FImmToHFImm) &&
        !VISA_WA_CHECK(getPWaTable(), WaSrc1ImmHfNotAllowed))
    {
        // we may be able to lower it to HF
        // ieee32 format: 23-8-1
        // ieee16 format: 10-5-1
        // bit0-22 are fractions
        uint32_t fraction = imm & 0x7FFFFF;
        // bit23-30 are exponents
        uint32_t exponent = (imm >> 23) & 0xFF;
        uint32_t sign = (imm >> 31) & 0x1;
        int expVal = ((int) exponent) - 127;

        if (exponent == 0 && fraction == 0)
        {
            // 0 and -0
            immType = Type_HF;
            imm = sign << 15;
        }
        else if ((fraction & 0x1FFF) == 0 && (expVal <= 15 && expVal >= -16))
        {
            // immediate can be exactly represented in HF.
            // we exclude denormal, infinity, and NaN.
            immType = Type_HF;
            uint32_t newExp = (expVal + 15) & 0x1F;
            imm = (sign << 15) | (newExp << 10) | (fraction >> 13);
        }
    }
    // Immediates are uniqued through the operand hash table.
    G4_Imm* i = hashtable.lookupImm(imm, immType);
    return (i != NULL)? i : hashtable.createImm(imm, immType);
}
1229 
createDFImm(double fp)1230 G4_Imm* IR_Builder::createDFImm(double fp)
1231 {
1232     int64_t val = (int64_t)(*(uint64_t*)&fp);
1233     G4_Imm* i = hashtable.lookupImm(val, Type_DF);
1234     return (i != NULL)? i : hashtable.createImm(val, Type_DF);
1235 }
1236 
getNewType(int64_t imm,G4_Type ty)1237 G4_Type IR_Builder::getNewType(int64_t imm, G4_Type ty)
1238 {
1239     switch (ty)
1240     {
1241     case Type_Q:
1242     case Type_D:
1243         // It is legal to change a positive imm's type from signed to unsigned if it fits
1244         // in the unsigned type. We do prefer signed type however for readability.
1245         if (imm >= MIN_WORD_VALUE && imm <= MAX_WORD_VALUE)
1246         {
1247             return Type_W;
1248         }
1249         else if (imm >= MIN_UWORD_VALUE && imm <= MAX_UWORD_VALUE)
1250         {
1251             return Type_UW;
1252         }
1253         else if (imm >= int(MIN_DWORD_VALUE) && imm <= int(MAX_DWORD_VALUE))
1254         {
1255             return Type_D;
1256         }
1257         else if (imm >= unsigned(MIN_UDWORD_VALUE) && imm <= unsigned(MAX_UDWORD_VALUE))
1258         {
1259             return Type_UD;
1260         }
1261         break;
1262     case Type_UQ:
1263     case Type_UD:
1264     {
1265         // unsigned imm must stay as unsigned
1266         uint64_t immU = static_cast<uint64_t>(imm);
1267         if (immU <= MAX_UWORD_VALUE)
1268         {
1269             return Type_UW;
1270         }
1271         else if (immU <= unsigned(MAX_UDWORD_VALUE))
1272         {
1273             return Type_UD;
1274         }
1275         break;
1276     }
1277     case Type_UB:
1278         return Type_UW;
1279     case Type_B:
1280         return Type_W;
1281     default:
1282         return ty;
1283     }
1284     return ty;
1285 }
1286 
1287 //
1288 // look up an imm operand
1289 //
lookupImm(int64_t imm,G4_Type ty)1290 G4_Imm* OperandHashTable::lookupImm(int64_t imm, G4_Type ty)
1291 {
1292     ImmKey key(imm, ty);
1293     auto iter = immTable.find(key);
1294     return iter != immTable.end() ? iter->second : nullptr;
1295 }
1296 
//
// create a new imm operand and register it in the table
//
createImm(int64_t imm,G4_Type ty)1300 G4_Imm* OperandHashTable::createImm(int64_t imm, G4_Type ty)
1301 {
1302     G4_Imm* i = new (mem)G4_Imm(imm, ty);
1303     ImmKey key(imm, ty);
1304     immTable[key] = i;
1305     return i;
1306 }
1307 
1308 
1309 //
1310 // create the region <vstride; width, hstride> if not yet created
1311 //
createRegion(uint16_t vstride,uint16_t width,uint16_t hstride)1312 const RegionDesc* RegionPool::createRegion(
1313     uint16_t vstride, uint16_t width, uint16_t hstride)
1314 {
1315 
1316     for (unsigned i = 0, size = (unsigned)rgnlist.size(); i < size; i++)
1317     {
1318         RegionDesc* region = rgnlist[i];
1319         if (region->vertStride == vstride &&
1320             region->width == width &&
1321             region->horzStride == hstride)
1322         {
1323             return region; // exist
1324         }
1325     }
1326     //
1327     // create one
1328     //
1329     RegionDesc* rd = new (mem) RegionDesc(vstride, width, hstride);
1330     rgnlist.push_back(rd);
1331     return rd;
1332 }
1333 
/*
    Used in IR_Builder::translateVISARawSendInst. All the bits in desc and extDesc are already set.
*/
// Create a raw send descriptor directly from fully-formed desc/extDesc
// bits. |bti|/|sti| are the (possibly indirect) binding-table and
// sampler-state index operands.
G4_SendDescRaw * IR_Builder::createGeneralMsgDesc(
    uint32_t desc,
    uint32_t extDesc,
    SendAccess access,
    G4_Operand* bti,
    G4_Operand* sti)
{
    return new (mem) G4_SendDescRaw(desc, extDesc, access, bti, sti);
}
1346 
// Create a raw send descriptor for shared function |sfid| with an explicit
// execution size. |src1Len| is the second payload length in GRFs;
// |isValidFuncCtrl| indicates the function-control bits in |desc| are
// meaningful.
G4_SendDescRaw * IR_Builder::createSendMsgDesc(
    SFID sfid,
    uint32_t desc,
    uint32_t extDesc,
    int src1Len,
    SendAccess access,
    G4_Operand *bti,
    G4_ExecSize execSize,
    bool isValidFuncCtrl)
{
    return new (mem) G4_SendDescRaw(sfid, desc, extDesc, src1Len, access, bti, execSize, isValidFuncCtrl);
}
1359 
// Same as the overload above but without an explicit execution size.
G4_SendDescRaw* IR_Builder::createSendMsgDesc(
    SFID sfid,
    uint32_t desc,
    uint32_t extDesc,
    int src1Len,
    SendAccess access,
    G4_Operand* bti,
    bool isValidFuncCtrl)
{
    return new (mem) G4_SendDescRaw(sfid, desc, extDesc, src1Len, access, bti, isValidFuncCtrl);
}
1371 
// Create a raw send descriptor from individual fields: function control
// bits, response/message lengths (in GRFs), target shared function, and
// extended message length/control.
G4_SendDescRaw * IR_Builder::createSendMsgDesc(
    unsigned funcCtrl,
    unsigned regs2rcv,
    unsigned regs2snd,
    SFID funcID,
    unsigned extMsgLength,
    uint16_t extFuncCtrl,
    SendAccess access,
    G4_Operand *bti,
    G4_Operand *sti)
{
    G4_SendDescRaw* msgDesc = new (mem) G4_SendDescRaw(
        funcCtrl, regs2rcv, regs2snd, funcID, (uint16_t)extMsgLength,
        extFuncCtrl, access, bti, sti, *this);
    return msgDesc;
}
1388 
1389 // shorthand for read msg desc. Note that extDesc still needs to be explicitly created,
1390 // SendMsgDesc ctor does not program all the bits
// Shorthand for a read-only message descriptor (src1 length 0); the
// extended descriptor is derived from |sfid| here.
G4_SendDescRaw* IR_Builder::createReadMsgDesc(
    SFID sfid,
    uint32_t desc,
    G4_Operand* bti)
{
    //ToDo: move extDesc into SendMsgDesc ctor
    uint32_t extDesc = G4_SendDescRaw::createExtDesc(sfid);
    return new (mem) G4_SendDescRaw(sfid, desc, extDesc, 0, SendAccess::READ_ONLY, bti, true);
}
1400 
// Shorthand for a write-only message descriptor; |src1Len| is the write
// payload length in GRFs and is folded into the extended descriptor.
G4_SendDescRaw* IR_Builder::createWriteMsgDesc(
    SFID sfid,
    uint32_t desc,
    int src1Len,
    G4_Operand* bti)
{
    //ToDo: move extDesc into SendMsgDesc ctor
    uint32_t extDesc = G4_SendDescRaw::createExtDesc(sfid, false, src1Len);
    return new (mem) G4_SendDescRaw(sfid, desc, extDesc, src1Len, SendAccess::WRITE_ONLY, bti, true);
}
1411 
// Shorthand for a read-write (synchronizing) message descriptor with no
// binding table operand and src1 length 0.
G4_SendDescRaw* IR_Builder::createSyncMsgDesc(SFID sfid, uint32_t desc)
{
    //ToDo: move extDesc into SendMsgDesc ctor
    uint32_t extDesc = G4_SendDescRaw::createExtDesc(sfid);
    return new (mem) G4_SendDescRaw(sfid, desc, extDesc, 0, SendAccess::READ_WRITE, nullptr, true);
}
1418 
createSampleMsgDesc(uint32_t desc,bool cps,int src1Len,G4_Operand * bti,G4_Operand * sti)1419 G4_SendDescRaw* IR_Builder::createSampleMsgDesc(
1420     uint32_t desc,
1421     bool cps,
1422     int src1Len,
1423     G4_Operand* bti,
1424     G4_Operand* sti)
1425 {
1426 #define CPS_LOD_COMPENSATION_ENABLE 11
1427 
1428     uint32_t extDesc = G4_SendDescRaw::createExtDesc(SFID::SAMPLER, false, src1Len);
1429     if (cps)
1430     {
1431         extDesc |= 1 << CPS_LOD_COMPENSATION_ENABLE;
1432     }
1433     return new (mem) G4_SendDescRaw(desc, extDesc, SendAccess::READ_ONLY, bti, sti);
1434 }
1435 
// Emit the fix-up sequence for a sampler index >= 16:
//  - compute (sampler >> 4) << 8 as a sampler-state base-pointer
//    adjustment, add it to r0.3, and write the sum to dword 3 of
//    |headerDecl| (M0.3 of the message header);
//  - return the low 4 bits of the sampler index, to be placed in the
//    message descriptor by the caller.
G4_Operand* IR_Builder::emitSampleIndexGE16(
    G4_Operand* sampler,
    G4_Declare* headerDecl)
{
    G4_Operand* samplerIdx;

    G4_Declare* t0
        = createTempVar(1, Type_UD, Any);
    G4_DstRegRegion* t0Dst
        = createDstRegRegion(t0, 1);
    G4_SrcRegRegion* t0Src
        = createSrcRegRegion(t0, getRegionScalar());

    G4_Declare* baseAdj
        = createTempVar(1, Type_UD, Any);
    G4_DstRegRegion* baseAdjDst
        = createDstRegRegion(baseAdj, 1);
    G4_SrcRegRegion* baseAdjSrc
        = createSrcRegRegion(baseAdj, getRegionScalar());

    G4_Declare* idxLow
        = createTempVar(1, Type_UD, Any);
    G4_DstRegRegion* idxLowDst
        = createDstRegRegion(idxLow, 1);
    G4_SrcRegRegion* idxLowSrc
        = createSrcRegRegion(idxLow, getRegionScalar());

    // calculate the sampler state base pointer offset based on
    // sample index, for putting to msg header M0.3
    createBinOp(G4_shr, g4::SIMD1,
        t0Dst, sampler, createImm(4, Type_UD),
        InstOpt_WriteEnable, true);
    createBinOp(G4_shl, g4::SIMD1,
        baseAdjDst, t0Src, createImm(8, Type_UD),
        InstOpt_WriteEnable, true);

    // get low 4 bits of sample index for putting into msg descriptor
    G4_SrcRegRegion* sampler2Src
        = createSrc(
        sampler->getTopDcl()->getRegVar(), 0, 0, getRegionScalar(), Type_UD);
    createBinOp(G4_and, g4::SIMD1,
        idxLowDst, sampler2Src, createImm(0xf, Type_UD),
        InstOpt_WriteEnable, true);
    samplerIdx = idxLowSrc;

    // add the base pointer offset with r0.3 and put to M0.3
    G4_DstRegRegion* stateBaseRgn
        = createDst(headerDecl->getRegVar(),
            0, 3, 1, Type_UD);
    G4_SrcRegRegion* src0
        = createSrc(
            builtinR0->getRegVar(), 0, 3, getRegionScalar(), Type_UD);
    createBinOp(G4_add, g4::SIMD1, stateBaseRgn,
        src0, baseAdjSrc, InstOpt_WriteEnable, true);

    return samplerIdx;
}
1493 
// Central raw-instruction factory. Branch opcodes become a G4_InstCF;
// everything else a plain G4_INST. When |addToInstList| is set the
// instruction is appended to the builder's instList and stamped with the
// current CISA offset (and source location when vISA_EmitLocation is on).
// Every instruction is also recorded in instAllocList so ~IR_Builder can
// run its destructor. Not for G4_math -- that has its own creation path.
G4_INST* IR_Builder::createInst(
    G4_Predicate* prd,
    G4_opcode op,
    G4_CondMod* mod,
    G4_Sat sat,
    G4_ExecSize execSize,
    G4_DstRegRegion* dst,
    G4_Operand* src0,
    G4_Operand* src1,
    G4_InstOpts options,
    bool addToInstList)
{
    MUST_BE_TRUE(op != G4_math, "IR_Builder::createInst should not be used to create math instructions");
    G4_INST* i = NULL;

    // ToDo: have separate functions to create call/jmp/ret
    if (G4_Inst_Table[op].instType == InstTypeFlow)
    {
        // TODO: remove this path
        MUST_BE_TRUE(!sat, "saturation not defined on branching ops");
        i = new (mem)G4_InstCF(*this, prd, op, mod, execSize, dst, src0, options);
    }
    else
    {
        i = new (mem)G4_INST(*this, prd, op, mod, sat, execSize, dst, src0, src1, options);
    }

    if (addToInstList)
    {
        i->setCISAOff(curCISAOffset);

        if (m_options->getOption(vISA_EmitLocation))
        {
            i->setLocation(allocateMDLocation(curLine, curFile));
        }

        instList.push_back(i);
    }

    // Tracked for destructor invocation regardless of list membership.
    instAllocList.push_back(i);

    return i;
}
1537 
1538 // same as above, except we don't add it to the Builder's instList
createInternalInst(G4_Predicate * prd,G4_opcode op,G4_CondMod * mod,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_InstOpts options)1539 G4_INST* IR_Builder::createInternalInst(
1540     G4_Predicate* prd,
1541     G4_opcode op,
1542     G4_CondMod* mod,
1543     G4_Sat sat,
1544     G4_ExecSize execSize,
1545     G4_DstRegRegion* dst,
1546     G4_Operand* src0,
1547     G4_Operand* src1,
1548     G4_InstOpts options)
1549 {
1550     MUST_BE_TRUE(op != G4_math, "IR_Builder::createInternalInst should not be used to create math instructions");
1551 
1552     auto ii = createInst(prd, op, mod, sat, execSize, dst, src0, src1, options, false);
1553 
1554     return ii;
1555 }
1556 
// Create a nop instruction (internal only -- never appended to the
// builder's instList).
G4_INST* IR_Builder::createNop(G4_InstOpts instOpt)
{
    return createInternalInst(
        nullptr, G4_nop, nullptr, g4::NOSAT, g4::SIMD1,
        nullptr, nullptr, nullptr, instOpt);
}
1563 
1564 // sync inst are always internal, so no option to append it to instList.
1565 // Also currently don't take any InstOpt
createSync(G4_opcode syncOp,G4_Operand * src)1566 G4_INST* IR_Builder::createSync(G4_opcode syncOp, G4_Operand* src)
1567 {
1568     assert(G4_INST::isSyncOpcode(syncOp) && "expect a sync op");
1569     return createInternalInst(
1570         nullptr, syncOp, nullptr, g4::NOSAT, g4::SIMD1,
1571         nullptr, src, nullptr, InstOpt_NoOpt);
1572 }
1573 
createMov(G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_InstOpts options,bool appendToInstList)1574 G4_INST* IR_Builder::createMov(
1575     G4_ExecSize execSize,
1576     G4_DstRegRegion* dst, G4_Operand* src0, G4_InstOpts options,
1577     bool appendToInstList)
1578 {
1579     G4_INST* newInst = nullptr;
1580     if (appendToInstList)
1581     {
1582         newInst = createInst(
1583             nullptr, G4_mov, nullptr, g4::NOSAT, execSize,
1584             dst, src0, nullptr, options, true);
1585     }
1586     else
1587     {
1588         newInst = createInternalInst(
1589             nullptr, G4_mov, nullptr, g4::NOSAT, execSize,
1590             dst, src0, nullptr, options);
1591     }
1592     return newInst;
1593 }
1594 
createBinOp(G4_Predicate * pred,G4_opcode op,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_InstOpts options,bool appendToInstList)1595 G4_INST* IR_Builder::createBinOp(
1596     G4_Predicate *pred, G4_opcode op, G4_ExecSize execSize,
1597     G4_DstRegRegion* dst, G4_Operand* src0, G4_Operand* src1,
1598     G4_InstOpts options,
1599     bool appendToInstList)
1600 {
1601     if (appendToInstList)
1602     {
1603         return createInst(
1604             pred, op, nullptr, g4::NOSAT, execSize,
1605             dst, src0, src1, options, true);
1606     }
1607     else
1608     {
1609         return createInternalInst(
1610             pred, op, nullptr, g4::NOSAT, execSize,
1611             dst, src0, src1, options);
1612     }
1613 }
1614 
1615 // mach creates both implicit acc and src using the supplied accType. AccWrCtrl is turned on.
1616 // acc0.0 is always used
createMach(G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_InstOpts options,G4_Type accType)1617 G4_INST* IR_Builder::createMach(
1618     G4_ExecSize execSize,
1619     G4_DstRegRegion* dst, G4_Operand* src0, G4_Operand* src1,
1620     G4_InstOpts options, G4_Type accType)
1621 {
1622     auto machInst = createInternalInst(
1623         nullptr, G4_mach, nullptr, g4::NOSAT, execSize,
1624         dst, src0, src1, options);
1625     const RegionDesc* rd = execSize > g4::SIMD1 ? getRegionStride1() : getRegionScalar();
1626     auto accSrc = createSrc(phyregpool.getAcc0Reg(), 0, 0, rd, accType);
1627     machInst->setImplAccSrc(accSrc);
1628     auto accDSt = createDst(phyregpool.getAcc0Reg(), 0, 0, 1, accType);
1629     machInst->setImplAccDst(accDSt);
1630     machInst->setOptionOn(InstOpt_AccWrCtrl);
1631     return machInst;
1632 }
1633 
1634 // macl creates an implicit src using the supplied the accType. AccWrCtrl is not set.
1635 // acc0.0 is always used
createMacl(G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_InstOpts options,G4_Type accType)1636 G4_INST* IR_Builder::createMacl(
1637     G4_ExecSize execSize,
1638     G4_DstRegRegion* dst, G4_Operand* src0, G4_Operand* src1,
1639     G4_InstOpts options, G4_Type accType)
1640 {
1641     auto maclInst = createInternalInst(
1642         nullptr, G4_mach, nullptr, g4::NOSAT, execSize, dst, src0, src1, options);
1643     const RegionDesc* rd = execSize > g4::SIMD1 ? getRegionStride1() : getRegionScalar();
1644     auto accSrc = createSrc(phyregpool.getAcc0Reg(), 0, 0, rd, accType);
1645     maclInst->setImplAccSrc(accSrc);
1646     return maclInst;
1647 }
1648 
createMadm(G4_Predicate * pred,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_SrcRegRegion * src0,G4_SrcRegRegion * src1,G4_SrcRegRegion * src2,G4_InstOpts options)1649 G4_INST* IR_Builder::createMadm(
1650     G4_Predicate* pred,
1651     G4_ExecSize execSize,
1652     G4_DstRegRegion* dst,
1653     G4_SrcRegRegion* src0, G4_SrcRegRegion* src1, G4_SrcRegRegion* src2,
1654     G4_InstOpts options)
1655 {
1656     // madm is currently only created in vISA->Gen IR translation
1657         return createInst(
1658             pred, G4_madm, nullptr, g4::NOSAT, execSize,
1659             dst, src0, src1, src2, options, true);
1660 }
1661 
createIf(G4_Predicate * prd,G4_ExecSize execSize,G4_InstOpts options)1662 G4_INST* IR_Builder::createIf(G4_Predicate* prd, G4_ExecSize execSize, G4_InstOpts options)
1663 {
1664     auto inst = createCFInst(prd, G4_if, execSize, nullptr, nullptr, options, true);
1665     return inst;
1666 }
1667 
createElse(G4_ExecSize execSize,G4_InstOpts options)1668 G4_INST* IR_Builder::createElse(G4_ExecSize execSize, G4_InstOpts options)
1669 {
1670     auto inst = createCFInst(nullptr, G4_else, execSize, nullptr, nullptr, options, true);
1671     return inst;
1672 }
1673 
createEndif(G4_ExecSize execSize,G4_InstOpts options)1674 G4_INST* IR_Builder::createEndif(G4_ExecSize execSize, G4_InstOpts options)
1675 {
1676     auto inst = createCFInst(nullptr, G4_endif, execSize, nullptr, nullptr, options, true);
1677     return inst;
1678 }
1679 
createLabelInst(G4_Label * label,bool appendToInstList)1680 G4_INST* IR_Builder::createLabelInst(G4_Label* label, bool appendToInstList)
1681 {
1682     if (appendToInstList)
1683     {
1684         return createInst(nullptr, G4_label, nullptr, g4::NOSAT, g4::SIMD_UNDEFINED,
1685             nullptr, label, nullptr, InstOpt_NoOpt, true);
1686     }
1687     else
1688     {
1689         return createInternalInst(
1690             nullptr, G4_label, nullptr, g4::NOSAT, g4::SIMD_UNDEFINED,
1691             nullptr, label, nullptr, 0,
1692             0);
1693     }
1694 }
1695 
1696 // jmpTarget may be either a label (direct jmp) or scalar operand (indirect jmp)
createJmp(G4_Predicate * pred,G4_Operand * jmpTarget,G4_InstOpts options,bool appendToInstList)1697 G4_INST* IR_Builder::createJmp(
1698     G4_Predicate* pred,
1699     G4_Operand* jmpTarget, G4_InstOpts options,
1700     bool appendToInstList)
1701 {
1702     if (appendToInstList)
1703     {
1704         return createInst(pred, G4_jmpi, nullptr, g4::NOSAT, g4::SIMD1,
1705             nullptr, jmpTarget, nullptr, options, true);
1706     }
1707     else
1708     {
1709         return createInternalInst(pred, G4_jmpi, nullptr, g4::NOSAT, g4::SIMD1,
1710             nullptr, jmpTarget, nullptr, options);
1711     }
1712 }
1713 
createInternalCFInst(G4_Predicate * prd,G4_opcode op,G4_ExecSize execSize,G4_Label * jip,G4_Label * uip,G4_InstOpts options)1714 G4_INST* IR_Builder::createInternalCFInst(
1715     G4_Predicate* prd, G4_opcode op, G4_ExecSize execSize,
1716     G4_Label* jip, G4_Label* uip,
1717     G4_InstOpts options)
1718 {
1719     MUST_BE_TRUE(G4_Inst_Table[op].instType == InstTypeFlow,
1720         "IR_Builder::createInternalCFInst must be used with InstTypeFlow instruction class");
1721 
1722     auto ii = createCFInst(prd, op, execSize, jip, uip, options, false);
1723     return ii;
1724 }
1725 
createCFInst(G4_Predicate * prd,G4_opcode op,G4_ExecSize execSize,G4_Label * jip,G4_Label * uip,G4_InstOpts options,bool addToInstList)1726 G4_INST* IR_Builder::createCFInst(
1727     G4_Predicate* prd, G4_opcode op, G4_ExecSize execSize,
1728     G4_Label* jip, G4_Label* uip,
1729     G4_InstOpts options,
1730     bool addToInstList)
1731 {
1732     MUST_BE_TRUE(G4_Inst_Table[op].instType == InstTypeFlow,
1733         "IR_Builder::createCFInst must be used with InstTypeFlow instruction class");
1734 
1735     G4_InstCF* ii = new (mem)G4_InstCF(*this, prd, op, execSize, jip, uip, options);
1736 
1737     if (addToInstList)
1738     {
1739         ii->setCISAOff(curCISAOffset);
1740 
1741         if (m_options->getOption(vISA_EmitLocation))
1742         {
1743             ii->setLocation(allocateMDLocation(curLine, curFile));
1744         }
1745         instList.push_back(ii);
1746     }
1747 
1748     instAllocList.push_back(ii);
1749 
1750     return ii;
1751 }
1752 
createDpasInst(G4_opcode opc,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_Operand * src3,G4_InstOpts options,GenPrecision A,GenPrecision W,uint8_t D,uint8_t C,bool addToInstList)1753 G4_INST* IR_Builder::createDpasInst(
1754     G4_opcode opc, G4_ExecSize execSize,
1755     G4_DstRegRegion* dst, G4_Operand* src0, G4_Operand* src1, G4_Operand* src2,
1756     G4_Operand* src3,
1757     G4_InstOpts options,
1758     GenPrecision A,
1759     GenPrecision W,
1760     uint8_t      D,
1761     uint8_t      C,
1762     bool addToInstList)
1763 {
1764     G4_INST* i = new (mem)G4_InstDpas(*this,
1765         opc, execSize, dst, src0, src1, src2, src3, options, A, W, D, C);
1766 
1767     if (addToInstList)
1768     {
1769         i->setCISAOff(curCISAOffset);
1770         if (m_options->getOption(vISA_EmitLocation))
1771         {
1772             i->setLocation(allocateMDLocation(curLine, curFile));
1773         }
1774         instList.push_back(i);
1775     }
1776 
1777     instAllocList.push_back(i);
1778 
1779 
1780     return i;
1781 }
1782 
createInternalDpasInst(G4_opcode opc,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_Operand * src3,G4_InstOpts options,GenPrecision A,GenPrecision W,uint8_t D,uint8_t C)1783 G4_INST* IR_Builder::createInternalDpasInst(
1784     G4_opcode opc,
1785     G4_ExecSize execSize,
1786     G4_DstRegRegion* dst,
1787     G4_Operand* src0,
1788     G4_Operand* src1,
1789     G4_Operand* src2,
1790     G4_Operand* src3,
1791     G4_InstOpts options,
1792     GenPrecision A,
1793     GenPrecision W,
1794     uint8_t D,
1795     uint8_t C)
1796 {
1797     auto ii = createDpasInst(opc, execSize, dst, src0, src1, src2,
1798         nullptr, options, A, W, D, C, false);
1799 
1800     return ii;
1801 }
1802 
createBfnInst(uint8_t booleanFuncCtrl,G4_Predicate * prd,G4_CondMod * mod,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_InstOpts options,bool addToInstList)1803 G4_INST* IR_Builder::createBfnInst(
1804     uint8_t booleanFuncCtrl,
1805     G4_Predicate* prd,
1806     G4_CondMod* mod,
1807     G4_Sat sat,
1808     G4_ExecSize execSize,
1809     G4_DstRegRegion* dst,
1810     G4_Operand* src0,
1811     G4_Operand* src1,
1812     G4_Operand* src2,
1813     G4_InstOpts options,
1814     bool addToInstList)
1815 {
1816     G4_INST* i = new (mem)G4_InstBfn(*this,
1817         prd, mod, sat, execSize, dst, src0, src1, src2, options, booleanFuncCtrl);
1818 
1819     if (addToInstList)
1820     {
1821         i->setCISAOff(curCISAOffset);
1822 
1823         if (m_options->getOption(vISA_EmitLocation))
1824         {
1825             i->setLocation(allocateMDLocation(curLine, curFile));
1826         }
1827         instList.push_back(i);
1828     }
1829 
1830     instAllocList.push_back(i);
1831 
1832     return i;
1833 }
1834 
createInternalBfnInst(uint8_t booleanFuncCtrl,G4_Predicate * prd,G4_CondMod * mod,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_InstOpts options)1835 G4_INST* IR_Builder::createInternalBfnInst(
1836     uint8_t booleanFuncCtrl,
1837     G4_Predicate* prd,
1838     G4_CondMod* mod,
1839     G4_Sat sat,
1840     G4_ExecSize execSize,
1841     G4_DstRegRegion* dst,
1842     G4_Operand* src0,
1843     G4_Operand* src1,
1844     G4_Operand* src2,
1845     G4_InstOpts options)
1846 {
1847     auto ii = createBfnInst(
1848         booleanFuncCtrl, prd, mod, sat, execSize, dst, src0, src1, src2, options, false);
1849 
1850     return ii;
1851 }
1852 
1853 //scratch surfaces, write r0.5 to message descriptor
1854 //exdesc holds the value of the extended message descriptor for bit [0:11]
1855 // kernel entry:
1856 //      and (1) tmp<1>:ud r0.5<0;1,0>:ud 0xFFFFFC00:ud {NoMask}
1857 // before send message:
1858 //  shl (1) a0.0<1>:ud tmp<1>:ud 0x2 {NoMask}
1859 //  (for old exDesc format) add (1) a0.0<1>:ud tmp<1>:ud exDesc:ud {NoMask}
1860 // returns a0.0<0;1,0>:ud
createScratchExDesc(uint32_t exdesc)1861 G4_SrcRegRegion* IR_Builder::createScratchExDesc(uint32_t exdesc)
1862 {
1863     const char* buf = getNameString(mem, 20, "ExDesc%d", num_temp_dcl++);
1864     G4_Declare* exDescDecl = createDeclareNoLookup(buf, G4_ADDRESS, 1, 1, Type_UD);
1865     exDescDecl->setSubRegAlign(Four_Word);
1866 
1867     // copy r0.5[10:31] to a0[12:31] or a0[6:31] for the new format
1868     initScratchSurfaceOffset();
1869 
1870     if (!useNewExtDescFormat())
1871     {
1872 
1873         // (W) shl (1) a0.0 sso 0x2
1874         auto shlSrc0 = createSrcRegRegion(scratchSurfaceOffset, getRegionScalar());
1875         auto shlDst = createDstRegRegion(exDescDecl, 1);
1876         createBinOp(G4_shl, g4::SIMD1, shlDst, shlSrc0, createImm(0x2, Type_UW), InstOpt_WriteEnable, true);
1877 
1878         G4_DstRegRegion* dst = createDstRegRegion(exDescDecl, 1);
1879         createBinOp(G4_add, g4::SIMD1, dst, createSrcRegRegion(exDescDecl, getRegionScalar()),
1880             createImm(exdesc, Type_UD), InstOpt_WriteEnable, true);
1881     }
1882     else
1883     {
1884         // (W) shr (1) a0.0 ss0 0x4
1885         auto shrSrc0 = createSrcRegRegion(scratchSurfaceOffset, getRegionScalar());
1886         auto shrDst = createDstRegRegion(exDescDecl, 1);
1887         createBinOp(G4_shr, g4::SIMD1, shrDst, shrSrc0, createImm(0x4, Type_UW), InstOpt_WriteEnable, true);
1888     }
1889     return createSrcRegRegion(exDescDecl, getRegionScalar());
1890 }
1891 
createInst(G4_Predicate * prd,G4_opcode op,G4_CondMod * mod,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_InstOpts options,bool addToInstList)1892 G4_INST* IR_Builder::createInst(
1893     G4_Predicate* prd,
1894     G4_opcode op,
1895     G4_CondMod* mod,
1896     G4_Sat sat,
1897     G4_ExecSize execSize,
1898     G4_DstRegRegion* dst,
1899     G4_Operand* src0,
1900     G4_Operand* src1,
1901     G4_Operand* src2,
1902     G4_InstOpts options,
1903     bool addToInstList)
1904 {
1905     MUST_BE_TRUE(op != G4_math && G4_Inst_Table[op].instType != InstTypeFlow,
1906         "IR_Builder::createInst should not be used to create math/CF instructions");
1907 
1908     if (op == G4_madw)
1909     {
1910         MUST_BE_TRUE(getPlatform() >= GENX_PVC || execSize != g4::SIMD32, "SIMD32 is not supported on this platform for madw");
1911     }
1912 
1913     G4_INST* i = NULL;
1914 
1915     i = new (mem)G4_INST(*this, prd, op, mod, sat, execSize, dst, src0, src1, src2, options);
1916 
1917     if (addToInstList)
1918     {
1919         i->setCISAOff(curCISAOffset);
1920 
1921         if (m_options->getOption(vISA_EmitLocation))
1922         {
1923             i->setLocation(allocateMDLocation(curLine, curFile));
1924         }
1925 
1926         instList.push_back(i);
1927     }
1928 
1929     instAllocList.push_back(i);
1930 
1931     return i;
1932 }
1933 
1934 // same as above, except we don't add it to the Builder's instList
createInternalInst(G4_Predicate * prd,G4_opcode op,G4_CondMod * mod,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_InstOpts options)1935 G4_INST* IR_Builder::createInternalInst(
1936     G4_Predicate* prd,
1937     G4_opcode op,
1938     G4_CondMod* mod,
1939     G4_Sat sat,
1940     G4_ExecSize execSize,
1941     G4_DstRegRegion* dst,
1942     G4_Operand* src0,
1943     G4_Operand* src1,
1944     G4_Operand* src2,
1945     G4_InstOpts options)
1946 {
1947     auto ii = createInst(
1948         prd, op, mod, sat, execSize,
1949         dst, src0, src1, src2, options, false);
1950 
1951     return ii;
1952 
1953 }
1954 
createSendInst(G4_Predicate * prd,G4_opcode op,G4_ExecSize execSize,G4_DstRegRegion * postDst,G4_SrcRegRegion * currSrc,G4_Operand * msg,G4_InstOpts options,G4_SendDesc * msgDesc,bool addToInstList)1955 G4_InstSend* IR_Builder::createSendInst(
1956     G4_Predicate* prd,
1957     G4_opcode op,
1958     G4_ExecSize execSize,
1959     G4_DstRegRegion* postDst,
1960     G4_SrcRegRegion* currSrc,
1961     G4_Operand* msg,
1962     G4_InstOpts options,
1963     G4_SendDesc *msgDesc,
1964     bool addToInstList)
1965 {
1966 
1967     assert (msgDesc && "msgDesc must not be null");
1968     G4_InstSend* m = new (mem)G4_InstSend(
1969         *this, prd, op, execSize, postDst, currSrc, msg, options, msgDesc);
1970 
1971     if (addToInstList)
1972     {
1973         m->setCISAOff(curCISAOffset);
1974 
1975         if (m_options->getOption(vISA_EmitLocation))
1976         {
1977             m->setLocation(allocateMDLocation(curLine, curFile));
1978         }
1979 
1980         instList.push_back(m);
1981     }
1982 
1983     instAllocList.push_back(m);
1984 
1985     return m;
1986 }
1987 
createInternalSendInst(G4_Predicate * prd,G4_opcode op,G4_ExecSize execSize,G4_DstRegRegion * postDst,G4_SrcRegRegion * currSrc,G4_Operand * msg,G4_InstOpts options,G4_SendDesc * msgDesc)1988 G4_InstSend* IR_Builder::createInternalSendInst(
1989     G4_Predicate* prd,
1990     G4_opcode op,
1991     G4_ExecSize execSize,
1992     G4_DstRegRegion* postDst,
1993     G4_SrcRegRegion* currSrc,
1994     G4_Operand* msg,
1995     G4_InstOpts options,
1996     G4_SendDesc *msgDesc)
1997 {
1998     auto ii = createSendInst(prd, op, execSize,
1999         postDst, currSrc,
2000         msg, options, msgDesc, false);
2001 
2002     return ii;
2003 }
2004 
2005 //
2006 // Create a split send (sends) instruction
2007 // sends (size) dst src0 src1 exDesc msgDesc
2008 //
2009 
createSplitSendInst(G4_Predicate * prd,G4_opcode op,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_SrcRegRegion * src0,G4_SrcRegRegion * src1,G4_Operand * msg,G4_InstOpts options,G4_SendDesc * msgDesc,G4_Operand * src3,bool addToInstList)2010 G4_InstSend* IR_Builder::createSplitSendInst(
2011     G4_Predicate* prd,
2012     G4_opcode op,
2013     G4_ExecSize execSize,
2014     G4_DstRegRegion* dst,
2015     G4_SrcRegRegion* src0, // can be header
2016     G4_SrcRegRegion* src1,
2017     G4_Operand* msg,       // msg descriptor: imm or vec
2018     G4_InstOpts options,
2019     G4_SendDesc* msgDesc,
2020     G4_Operand* src3,      // ext msg desciptor: imm or vec
2021     bool addToInstList)
2022 {
2023 
2024     if (!src1)
2025     {
2026         // src1 may be null if we need to force generate split send (e.g., for bindless surfaces)
2027         MUST_BE_TRUE(msgDesc->getSrc1LenRegs() == 0, "src1 length must be 0 if it is null");
2028         src1 = createNullSrc(Type_UD);
2029     }
2030     if (!src3 && msgDesc->isRaw())
2031     {
2032         src3 = createImm(((G4_SendDescRaw *)msgDesc)->getExtendedDesc(), Type_UD);
2033     }
2034     G4_InstSend* m = new (mem) G4_InstSend(
2035         *this, prd, op, execSize, dst, src0, src1, msg, src3, options, msgDesc);
2036 
2037     if (addToInstList)
2038     {
2039         m->setCISAOff(curCISAOffset);
2040 
2041         if (m_options->getOption(vISA_EmitLocation))
2042         {
2043             m->setLocation(allocateMDLocation(curLine, curFile));
2044         }
2045         instList.push_back(m);
2046     }
2047 
2048     instAllocList.push_back(m);
2049 
2050     return m;
2051 }
2052 
createInternalSplitSendInst(G4_ExecSize execSize,G4_DstRegRegion * dst,G4_SrcRegRegion * src0,G4_SrcRegRegion * src1,G4_Operand * msg,G4_InstOpts options,G4_SendDesc * msgDesc,G4_Operand * src3)2053 G4_InstSend* IR_Builder::createInternalSplitSendInst(
2054     G4_ExecSize execSize,
2055     G4_DstRegRegion* dst,
2056     G4_SrcRegRegion* src0, // can be header
2057     G4_SrcRegRegion* src1,
2058     G4_Operand* msg,       // msg descriptor: imm or vec
2059     G4_InstOpts options,
2060     G4_SendDesc* msgDesc,
2061     G4_Operand* src3)     // ext msg desciptor: imm or vec)
2062 {
2063     auto ii = createSplitSendInst(nullptr, G4_sends, execSize, dst, src0, src1, msg, options,
2064         msgDesc, src3, false);
2065 
2066     return ii;
2067 }
2068 
2069 //
2070 // Math instruction is like a generic one except:
2071 // -- it takes a G4_MathOp to specify the function control
2072 // -- conditional modifier is not allowed
2073 // -- there are additional restrictions on dst/src regions that will be checked in HW conformity
2074 //
createMathInst(G4_Predicate * prd,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_MathOp mathOp,G4_InstOpts options,bool addToInstList)2075 G4_INST* IR_Builder::createMathInst(
2076     G4_Predicate* prd,
2077     G4_Sat sat,
2078     G4_ExecSize execSize,
2079     G4_DstRegRegion* dst,
2080     G4_Operand* src0,
2081     G4_Operand* src1,
2082     G4_MathOp mathOp,
2083     G4_InstOpts options,
2084     bool addToInstList)
2085 {
2086     G4_INST* i = new (mem)G4_InstMath(
2087         *this, prd, G4_math, NULL, sat, execSize, dst, src0, src1, options, mathOp);
2088 
2089     if (addToInstList)
2090     {
2091         i->setCISAOff(curCISAOffset);
2092 
2093         if (m_options->getOption(vISA_EmitLocation))
2094         {
2095             i->setLocation(allocateMDLocation(curLine, curFile));
2096         }
2097         instList.push_back(i);
2098     }
2099 
2100     instAllocList.push_back(i);
2101 
2102     return i;
2103 }
2104 
createInternalMathInst(G4_Predicate * prd,G4_Sat sat,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_MathOp mathOp,G4_InstOpts options)2105 G4_INST* IR_Builder::createInternalMathInst(
2106     G4_Predicate* prd,
2107     G4_Sat sat,
2108     G4_ExecSize execSize,
2109     G4_DstRegRegion* dst,
2110     G4_Operand* src0,
2111     G4_Operand* src1,
2112     G4_MathOp mathOp,
2113     G4_InstOpts options)
2114 {
2115     auto ii = createMathInst(prd, sat, execSize, dst, src0, src1, mathOp, options, false);
2116     return ii;
2117 }
2118 
createIntrinsicInst(G4_Predicate * prd,Intrinsic intrinId,G4_ExecSize size,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_InstOpts options,bool addToInstList)2119 G4_INST* IR_Builder::createIntrinsicInst(
2120     G4_Predicate* prd, Intrinsic intrinId,
2121     G4_ExecSize size,
2122     G4_DstRegRegion* dst, G4_Operand* src0, G4_Operand* src1, G4_Operand* src2,
2123     G4_InstOpts options, bool addToInstList)
2124 {
2125     G4_INST* i = nullptr;
2126 
2127     if (intrinId == Intrinsic::Spill)
2128         i = new (mem) G4_SpillIntrinsic(*this, prd, intrinId, size, dst, src0, src1, src2, options);
2129     else if (intrinId == Intrinsic::Fill)
2130         i = new (mem) G4_FillIntrinsic(*this, prd, intrinId, size, dst, src0, src1, src2, options);
2131     else
2132         i = new (mem) G4_InstIntrinsic(*this, prd, intrinId, size, dst, src0, src1, src2, options);
2133 
2134     if (addToInstList)
2135     {
2136         i->setCISAOff(curCISAOffset);
2137 
2138         if (m_options->getOption(vISA_EmitLocation))
2139         {
2140             i->setLocation(allocateMDLocation(curLine, curFile));
2141         }
2142 
2143         instList.push_back(i);
2144     }
2145 
2146     instAllocList.push_back(i);
2147 
2148     return i;
2149 }
2150 
createInternalIntrinsicInst(G4_Predicate * prd,Intrinsic intrinId,G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_InstOpts options)2151 G4_INST* IR_Builder::createInternalIntrinsicInst(
2152     G4_Predicate* prd, Intrinsic intrinId, G4_ExecSize execSize,
2153     G4_DstRegRegion* dst, G4_Operand* src0, G4_Operand* src1, G4_Operand* src2,
2154     G4_InstOpts options)
2155 {
2156     auto ii = createIntrinsicInst(prd, intrinId, execSize, dst, src0, src1, src2, options, false);
2157 
2158     return ii;
2159 }
2160 
createIntrinsicAddrMovInst(Intrinsic intrinId,G4_DstRegRegion * dst,G4_Operand * src0,G4_Operand * src1,G4_Operand * src2,G4_Operand * src3,G4_Operand * src4,G4_Operand * src5,G4_Operand * src6,G4_Operand * src7,bool addToInstList)2161 G4_INST* IR_Builder::createIntrinsicAddrMovInst(
2162     Intrinsic intrinId,
2163     G4_DstRegRegion* dst,
2164     G4_Operand* src0, G4_Operand* src1, G4_Operand* src2, G4_Operand* src3,
2165     G4_Operand* src4, G4_Operand* src5, G4_Operand* src6, G4_Operand* src7,
2166     bool addToInstList)
2167 {
2168     G4_INST* i = nullptr;
2169     assert(intrinId == Intrinsic::PseudoAddrMov && "expect pseudo_mov op");
2170 
2171     i = new (mem) G4_PseudoAddrMovIntrinsic(*this, intrinId, dst, src0, src1, src2, src3, src4, src5, src6, src7);
2172 
2173     if (addToInstList)
2174     {
2175         i->setCISAOff(curCISAOffset);
2176 
2177         if (m_options->getOption(vISA_EmitLocation))
2178         {
2179             i->setLocation(allocateMDLocation(curLine, curFile));
2180         }
2181 
2182         instList.push_back(i);
2183     }
2184 
2185     instAllocList.push_back(i);
2186 
2187     return i;
2188 }
2189 
Get_MathFuncCtrl(ISA_Opcode op,G4_Type type)2190 G4_MathOp IR_Builder::Get_MathFuncCtrl(ISA_Opcode op, G4_Type type)
2191 {
2192     switch (op)
2193     {
2194     case ISA_LOG:
2195         return MATH_LOG;
2196     case ISA_MOD:   // remainder of IDIV
2197         return MATH_INT_DIV_REM;
2198     case ISA_POW:
2199         return MATH_POW;
2200     case ISA_SIN:
2201         return MATH_SIN;
2202     case ISA_COS:
2203         return MATH_COS;
2204     case ISA_SQRT:
2205         return MATH_SQRT;
2206     case ISA_RSQRT:
2207         return MATH_RSQ;
2208     case ISA_INV:
2209         return MATH_INV;
2210     case ISA_DIV:
2211         return IS_FTYPE(type) || IS_HFTYPE(type) ? MATH_FDIV : MATH_INT_DIV_QUOT;
2212     case ISA_EXP:
2213         return MATH_EXP;
2214     default:
2215         ASSERT_USER(0, "Illegal math opcode.");
2216         return MATH_RESERVED;
2217     }
2218 }
2219 
2220 // After building IR total number number of rows required
2221 // for arg and retvar become known, so resize the pre-defined
2222 // vars here to the max required in current compilation unit.
resizePredefinedStackVars()2223 void IR_Builder::resizePredefinedStackVars()
2224 {
2225     getStackCallArg()->resizeNumRows(this->getArgSize());
2226     getStackCallRet()->resizeNumRows(this->getRetVarSize());
2227 }
2228 
duplicateOpndImpl(G4_Operand * opnd)2229 G4_Operand* IR_Builder::duplicateOpndImpl(G4_Operand* opnd)
2230 {
2231     if (!opnd || opnd->isImm())
2232         return opnd;
2233     if (opnd->isSrcRegRegion()) {
2234         return createSrcRegRegion(*(opnd->asSrcRegRegion()));
2235     }
2236     else if (opnd->isDstRegRegion()) {
2237         return createDstRegRegion(*(opnd->asDstRegRegion()));
2238     }
2239     else if (opnd->isPredicate()) {
2240         return createPredicate(*(opnd->asPredicate()));
2241     }
2242     else if (opnd->isCondMod()) {
2243         return createCondMod(*(opnd->asCondMod()));
2244     }
2245     else {
2246         return opnd;
2247     }
2248 }
2249 
2250 /*
2251 * Create send instruction for specified GenX architecture.
2252 * bti: surface id
2253 * sti: sampler id
2254 */
createSendInst(G4_Predicate * pred,G4_DstRegRegion * postDst,G4_SrcRegRegion * payload,unsigned regs2snd,unsigned regs2rcv,G4_ExecSize execSize,unsigned fc,SFID tf_id,bool header_present,SendAccess access,G4_Operand * bti,G4_Operand * sti,G4_InstOpts options,bool is_sendc)2255 G4_InstSend* IR_Builder::createSendInst(
2256     G4_Predicate* pred,
2257     G4_DstRegRegion *postDst,
2258     G4_SrcRegRegion *payload,
2259     unsigned regs2snd,
2260     unsigned regs2rcv,
2261     G4_ExecSize execSize,
2262     unsigned fc,
2263     SFID tf_id,
2264     bool header_present,
2265     SendAccess access,
2266     G4_Operand* bti,
2267     G4_Operand* sti,
2268     G4_InstOpts options,
2269     bool is_sendc)
2270 {
2271     G4_SendDescRaw* msgDesc =
2272         createSendMsgDesc(fc, regs2rcv, regs2snd, tf_id, 0, 0, access,
2273             bti, sti);
2274 
2275     msgDesc->setHeaderPresent(header_present);
2276 
2277     return createSendInst(
2278         pred, postDst, payload, execSize, msgDesc, options, is_sendc);
2279 }
2280 
2281 //bindless surfaces, write the content of T252 to extended message descriptor
2282 //exdesc holds the value of the extended message descriptor for bit [0:11]
2283 //add (1) a0.2<1>:ud T252<1>:ud exDesc:ud {NoMask}
2284 // returns a0.2<0;1,0>:ud
createBindlessExDesc(uint32_t exdesc)2285 G4_SrcRegRegion* IR_Builder::createBindlessExDesc(uint32_t exdesc)
2286 {
2287     G4_InstOpts dbgOpt = m_options->getOption(vISA_markSamplerMoves) ? InstOpt_BreakPoint : InstOpt_NoOpt;
2288     // virtual var for each exdesc
2289     G4_SrcRegRegion* T252 = createSrcRegRegion(builtinT252, getRegionScalar());
2290     const char* buf = getNameString(mem, 20, "ExDesc%d", num_temp_dcl++);
2291     G4_Declare* exDescDecl = createDeclareNoLookup(buf, G4_ADDRESS, 1, 1, Type_UD);
2292     exDescDecl->setSubRegAlign(Four_Word);
2293     G4_DstRegRegion* dst = createDstRegRegion(exDescDecl, 1);
2294     if (useNewExtDescFormat())
2295     {
2296         createMov(g4::SIMD1, dst, T252, InstOpt_WriteEnable | dbgOpt, true);
2297     }
2298     else
2299     {
2300         createBinOp(G4_add, g4::SIMD1, dst, T252, createImm(exdesc, Type_UD), InstOpt_WriteEnable, true);
2301     }
2302     return createSrcRegRegion(exDescDecl, getRegionScalar());
2303 }
2304 
2305 
2306 /*
2307  *
2308  *  this does two things:
2309  *  -- If send has exec size 16, its destination must have Type W.
2310  *  -- avoid using Q/UQ type on CHV/BXT
2311  */
fixSendDstType(G4_DstRegRegion * dst,G4_ExecSize execSize)2312 static void fixSendDstType(G4_DstRegRegion* dst, G4_ExecSize execSize)
2313 {
2314     MUST_BE_TRUE(dst->getRegAccess() == Direct, "Send dst must be a direct operand");
2315 
2316     MUST_BE_TRUE(dst->getSubRegOff() == 0, "dst may not have a non-zero subreg offset");
2317 
2318     // normally we should create a new alias for dst's declare, but since it's a send
2319     // type mismatch between operand and decl should not matter
2320     if (execSize == g4::SIMD16 && dst->getType() != Type_W && dst->getType() != Type_UW)
2321     {
2322         dst->setType(Type_W);
2323     }
2324 
2325     if (dst->getType() == Type_HF)
2326     {
2327         dst->setType(Type_W);
2328     }
2329 }
2330 
2331 
// Build a send/sendc from a raw descriptor. If the surface (bti) or sampler
// (sti) is a non-immediate operand, ALU instructions are emitted to compose
// the descriptor value in a0.0 and the send then reads its descriptor from
// a0.0; otherwise an immediate descriptor operand is used directly.
G4_InstSend *IR_Builder::createSendInst(
    G4_Predicate *pred,
    G4_DstRegRegion *postDst,
    G4_SrcRegRegion *payload,
    G4_ExecSize execsize,
    G4_SendDescRaw *msgDesc,
    G4_InstOpts option,
    bool is_sendc)
{
    G4_opcode send_opcode= is_sendc ? G4_sendc : G4_send;

    fixSendDstType(postDst, execsize);

    uint32_t desc = msgDesc->getDesc();
    G4_Operand *bti = msgDesc->getSurface();
    G4_Operand *sti = msgDesc->getSti();
    G4_Operand *descOpnd = NULL;

    // A non-immediate, non-bindless sampler index must be folded into the
    // descriptor with explicit ALU ops below.
    bool needSamplerMove = sti && !sti->isImm() && !isBindlessSampler(sti);

    if ((bti && !bti->isImm()) || needSamplerMove)
    {
        // use a0.0 directly
        G4_DstRegRegion* addr_dst_opnd = createDstRegRegion(builtinA0, 1);

        if (bti && !bti->isImm())
        {
            //add (1) a0.0:ud bti:ud desc:ud
            // create source for bti
            createBinOp(
                G4_add,
                g4::SIMD1,
                addr_dst_opnd,
                bti,
                createImm(desc, Type_UD),
                InstOpt_WriteEnable,
                true);
        }

        if (needSamplerMove)
        {
            // Shift the sampler index left by 8 into a temp so it lands in
            // its field of the descriptor value being assembled in a0.0.
            G4_Declare *dcl1 = createTempVar(1, Type_UD, Any);
            G4_DstRegRegion* tmp_dst_opnd = createDstRegRegion(dcl1, 1);

            createBinOp(
                G4_shl,
                g4::SIMD1,
                tmp_dst_opnd,
                sti,
                createImm(8, Type_UD),
                InstOpt_WriteEnable,
                true);

            G4_SrcRegRegion* tmp_src_opnd = createSrcRegRegion(dcl1, getRegionScalar());

            if (!bti || bti->isImm())
            {
                // No bti add was emitted above: combine the shifted sampler
                // index with the immediate descriptor directly into a0.0.
                createBinOp(
                    G4_add,
                    g4::SIMD1,
                    addr_dst_opnd,
                    tmp_src_opnd,
                    createImm(desc, Type_UD),
                    InstOpt_WriteEnable,
                    true);
            }
            else
            {
                // a0.0 already holds bti + desc; accumulate the shifted
                // sampler index on top of it.
                G4_SrcRegRegion* addr_src_opnd = createSrcRegRegion(builtinA0, getRegionScalar());

                createBinOp(
                    G4_add,
                    g4::SIMD1,
                    duplicateOperand(addr_dst_opnd),
                    addr_src_opnd,
                    tmp_src_opnd,
                    InstOpt_WriteEnable,
                    true);
            }
        }

        // The send reads its descriptor from a0.0 assembled above.
        descOpnd = createSrcRegRegion(builtinA0, getRegionScalar());
    }
    else
    {
        // Everything is immediate: use the descriptor value directly.
        descOpnd = createImm(desc, Type_UD);
    }

    return createSendInst(
        pred,
        send_opcode,
        execsize,
        postDst,
        payload,
        descOpnd,
        option,
        msgDesc,
        true);
}
2431 
2432 /*
2433  * Create split send instruction for specified GenX architecture.
2434  * bti: surface id
2435  * sti: sampler id
2436  * Gen9: sends (execsize)     dst,  src1,  src2,  ex_desc,  desc
2437  */
createSplitSendInst(G4_Predicate * pred,G4_DstRegRegion * dst,G4_SrcRegRegion * src1,unsigned regs2snd1,G4_SrcRegRegion * src2,unsigned regs2snd2,unsigned regs2rcv,G4_ExecSize execSize,unsigned fc,SFID tf_id,bool header_present,SendAccess access,G4_Operand * bti,G4_Operand * sti,G4_InstOpts options,bool is_sendc)2438 G4_InstSend* IR_Builder::createSplitSendInst(
2439     G4_Predicate* pred,
2440     G4_DstRegRegion *dst,
2441     G4_SrcRegRegion *src1,
2442     unsigned regs2snd1,
2443     G4_SrcRegRegion *src2,
2444     unsigned regs2snd2,
2445     unsigned regs2rcv,
2446     G4_ExecSize execSize,
2447     unsigned fc,
2448     SFID tf_id,
2449     bool header_present,
2450     SendAccess access,
2451     G4_Operand* bti,
2452     G4_Operand* sti,
2453     G4_InstOpts options,
2454     bool is_sendc)
2455 {
2456     G4_SendDescRaw *msgDesc =
2457         createSendMsgDesc(fc, regs2rcv, regs2snd1, tf_id, regs2snd2,
2458                           0, access, bti, sti);
2459 
2460     msgDesc->setHeaderPresent(header_present);
2461 
2462     return createSplitSendInst(pred, dst, src1, src2, execSize,
2463         msgDesc, options, is_sendc);
2464 }
2465 
2466 // desc, if indirect, is constructed from the BTI/STI values in msgDesc and is always a0.0
createSplitSendInst(G4_Predicate * pred,G4_DstRegRegion * dst,G4_SrcRegRegion * src1,G4_SrcRegRegion * src2,G4_ExecSize execsize,G4_SendDescRaw * msgDesc,G4_InstOpts option,bool is_sendc)2467 G4_InstSend *IR_Builder::createSplitSendInst(
2468     G4_Predicate *pred,
2469     G4_DstRegRegion *dst,
2470     G4_SrcRegRegion *src1,
2471     G4_SrcRegRegion *src2,
2472     G4_ExecSize execsize,
2473     G4_SendDescRaw *msgDesc,
2474     G4_InstOpts option,
2475     bool is_sendc)
2476 {
2477     G4_opcode send_opcode = is_sendc ? G4_sendsc : G4_sends;
2478 
2479     fixSendDstType(dst, execsize);
2480 
2481     uint32_t desc = msgDesc->getDesc();
2482     uint32_t exdesc = msgDesc->getExtendedDesc();
2483     G4_Operand *bti = msgDesc->getSurface();
2484     G4_Operand *sti = msgDesc->getSti();
2485 
2486     G4_Operand* descOpnd = NULL;
2487     G4_SrcRegRegion* extDescOpnd = nullptr;
2488 
2489     bool doAlignBindlessSampler = alignBindlessSampler() && sti && isBindlessSampler(sti);
2490     bool needsSamplerMove = (sti && !sti->isImm() && !isBindlessSampler(sti)) || doAlignBindlessSampler;
2491 
2492     bool needsSurfaceMove = false;
2493     bool needsA0ExDesc = false;
2494 
2495     if (bti && bti->isSrcRegRegion())
2496     {
2497         if (isBindlessSurface(bti))
2498         {
2499             needsA0ExDesc = true;
2500             // set T252 as BTI
2501             if ((desc & 0xFF) != PREDEF_SURF_252)
2502             {
2503                 desc = (desc & ~0xFF) | PREDEF_SURF_252;
2504             }
2505         }
2506         else if (isScratchSpace(bti))
2507         {
2508             // use BTI 251
2509             needsA0ExDesc = true;
2510             desc = (desc & ~0xFF) | 251;
2511         }
2512         else
2513         {
2514             needsSurfaceMove = true;
2515         }
2516     }
2517 
2518     if (needsSurfaceMove)
2519     {
2520         //add (1) a0.0:ud bti:ud desc:ud
2521         G4_DstRegRegion* addrDstOpnd = createDstRegRegion(builtinA0, 1);
2522 
2523         createBinOp(G4_add, g4::SIMD1, addrDstOpnd, bti,
2524             createImm(desc, Type_UD), InstOpt_WriteEnable, true);
2525     }
2526 
2527     if (needsSamplerMove)
2528     {
2529         G4_Declare *dcl1 = createTempVar(1, Type_UD, Any);
2530 
2531         if (doAlignBindlessSampler)
2532         {
2533             // check if address is 32-byte aligned
2534             // use STI = 0 for 32-byte aligned address, STI = 1 otherwise
2535             // (W) and (1) (nz)f0.0 null S31 0x10:uw
2536             G4_Declare* tmpFlag = createTempFlag(1);
2537             G4_CondMod* condMod = createCondMod(Mod_nz, tmpFlag->getRegVar(), 0);
2538             createInst(nullptr, G4_and, condMod, g4::NOSAT, g4::SIMD1, createNullDst(Type_UD),
2539                 createSrcRegRegion(*(sti->asSrcRegRegion())), createImm(0x10, Type_UW), InstOpt_WriteEnable, true);
2540             // (W) (f0.0) sel (1) tmp:ud 0x100 0x0
2541             G4_Predicate* pred = createPredicate(PredState_Plus, tmpFlag->getRegVar(), 0);
2542             createInst(pred, G4_sel, nullptr, g4::NOSAT, g4::SIMD1, createDstRegRegion(dcl1, 1),
2543                 createImm(0x100, Type_UW), createImm(0x0, Type_UW), InstOpt_WriteEnable, true);
2544         }
2545         else
2546         {
2547             // shl (1) tmp:ud sti:ud 0x8:uw
2548             G4_DstRegRegion* tmpDstOpnd = createDstRegRegion(dcl1, 1);
2549             createBinOp(G4_shl, g4::SIMD1, tmpDstOpnd, sti,
2550                 createImm(8, Type_UD), InstOpt_WriteEnable, true);
2551         }
2552 
2553         G4_SrcRegRegion* tmpSrcOpnd = createSrcRegRegion(dcl1, getRegionScalar());
2554         G4_DstRegRegion* addrDstOpnd = createDstRegRegion(builtinA0, 1);
2555         if (!needsSurfaceMove)
2556         {
2557             // add (1) a0.0 tmp:ud desc:ud
2558             createBinOp(G4_add, g4::SIMD1, addrDstOpnd, tmpSrcOpnd,
2559                 createImm(desc, Type_UD),
2560                 InstOpt_WriteEnable,
2561                 true);
2562         }
2563         else
2564         {
2565             // add (1) a0.0 a0.0:ud tmp:ud
2566             G4_SrcRegRegion* addrSrcOpnd = createSrcRegRegion(builtinA0, getRegionScalar());
2567             createBinOp(G4_add, g4::SIMD1, addrDstOpnd, addrSrcOpnd,
2568                 tmpSrcOpnd, InstOpt_WriteEnable, true);
2569         }
2570     }
2571 
2572     if (needsSurfaceMove || needsSamplerMove)
2573     {
2574         descOpnd = createSrcRegRegion(builtinA0, getRegionScalar());
2575     }
2576     else
2577     {
2578         descOpnd = createImm(desc, Type_UD);
2579     }
2580 
2581     if (needsA0ExDesc)
2582     {
2583         extDescOpnd = isBindlessSurface(bti) ? createBindlessExDesc(exdesc) : createScratchExDesc(exdesc);
2584     }
2585     else
2586     {
2587         // do nothing as the extended msg desc will just be a null operand
2588     }
2589 
2590     return createSplitSendInst(pred, send_opcode, execsize,
2591         dst, src1, src2,
2592         descOpnd,
2593         option, msgDesc, extDescOpnd, true);
2594 }
2595 
// Builds a raw send descriptor for an LSC (load/store cache) message:
// encodes opcode, address/data sizes, vector size or channel mask, caching
// controls and address model into desc/exDesc, computes src0/src1/dst
// register counts from the execution size and data shape, and returns the
// resulting G4_SendDescRaw. Block2D operations are explicitly rejected.
G4_SendDescRaw* IR_Builder::createLscMsgDesc(
    LSC_OP                      op,
    LSC_SFID                    lscSfid,
    VISA_Exec_Size              execSizeEnum,
    LSC_CACHE_OPTS              cacheOpts,
    LSC_ADDR                    addr,
    LSC_DATA_SHAPE              shape,
    G4_Operand                 *surface,
    uint32_t                    dstLen,
    uint32_t                    addrRegs)
{
    //   Desc[5:0] = OPCODE {LOAD{_BLOCK,_QUAD},STORE{_BLOCK,_QUAD},ATOMIC*}
    //   Desc[8:7] = addr size
    //   Desc[11:9] = data size
    //   Desc[15:12] = data vector size (or cmask if *_QUAD)
    //   Desc[19:17] = caching controls (see the table for allowable combinations)
    //   Desc[30:29] = addr model (BTI = 3, SS = 2, BSS = 1, FLAT = 0)
    // NOTE(review): status collects encoding errors from the lscEncode*
    // helpers below but is not currently checked here — confirm intended.
    int status = VISA_SUCCESS;
    uint32_t desc = 0;
    uint32_t exDesc = 0;
    const auto opInfo = LscOpInfoGet(op);
    MUST_BE_TRUE(!opInfo.isBlock2D(), "block2d has a different layout");
    desc |= opInfo.encoding; // Desc[5:0]

    lscEncodeAddrSize(addr.size, desc, status); // Desc[8:7]

    int dataSizeBits = lscEncodeDataSize(shape.size, desc, status); // Desc[11:9]

    // Desc[15:12]
    int vecSize; // definitely assigned
    if (!opInfo.hasChMask())
    {
        vecSize = lscEncodeDataElems(shape.elems, desc, status);
        lscEncodeDataOrder(shape.order, desc, status);
    }
    else
    {
        // *_QUAD ops: Desc[15:12] is a channel mask; vecSize counts the
        // number of enabled channels
        MUST_BE_TRUE(shape.chmask, "channel mask must not be empty");
        vecSize = 0;
        if (shape.chmask & LSC_DATA_CHMASK_X)
        {
            desc |= 1 << 12;
            vecSize++;
        }
        if (shape.chmask & LSC_DATA_CHMASK_Y)
        {
            desc |= 1 << 13;
            vecSize++;
        }
        if (shape.chmask & LSC_DATA_CHMASK_Z)
        {
            desc |= 1 << 14;
            vecSize++;
        }
        if (shape.chmask & LSC_DATA_CHMASK_W)
        {
            desc |= 1 << 15;
            vecSize++;
        }
    }

    lscEncodeCachingOpts(opInfo, cacheOpts, desc, status); // Desc[19:17]
    lscEncodeAddrType(addr.type, desc, status);  // Desc[30:29]

    desc |= dstLen << 20;   // Desc[24:20]  dst len
    desc |= addrRegs << 25; // Desc[29:25]  src0 len

    // promote any immediate surface to the extended descriptor
    // ExDesc[31:12]
    if (surface && surface->isImm()) {
        auto surfaceImm = (uint32_t)surface->asImm()->getImm();
        if (addr.type == LSC_ADDR_TYPE_BTI) {
            // promote the immediate BTI to the descriptor
            exDesc |= surfaceImm << 24;
            surface = nullptr;
        }
        else if (
            addr.type == LSC_ADDR_TYPE_BSS ||
            addr.type == LSC_ADDR_TYPE_SS)
        {
            // surface-state offset can only be promoted if its low 10 bits
            // are zero (they have no room in ExDesc)
            if ((surfaceImm & 0x3FF) == 0) {
                exDesc |= surfaceImm;
                surface = nullptr;
            }
        }
        else {
            // flat address type
            MUST_BE_TRUE(surface->isNullReg() ||
                surfaceImm == PREDEFINED_SURFACE_SLM ||
                surfaceImm == PREDEFINED_SURFACE_T255, // not sure what's up here
                "flat address type must have null reg (or 0)");
            surface = nullptr;
        }
    }

    // immediate address offsets are not supported by this encoding
    MUST_BE_TRUE(addr.immOffset == 0,
        "invalid address immediate offset");

    SFID sfid = LSC_SFID_To_SFID(lscSfid);

    const unsigned execSize = Get_VISA_Exec_Size(execSizeEnum);
    int src1Len = 0;
    uint32_t dataRegs = 1;
    bool isBlock2D =
        op == LSC_OP::LSC_LOAD_BLOCK2D ||
        op == LSC_OP::LSC_STORE_BLOCK2D;
    MUST_BE_TRUE(!isBlock2D, "block2d not implemented yet");

    if (shape.order == LSC_DATA_ORDER_NONTRANSPOSE) {
        // Non-transpose case is the typical case.
        //
        // ceil[ SIMT32*dataSize(b)/512(b/REG) ] * vecSize
        //   units = (b/b*REG) = REG
        dataRegs = std::max<uint32_t>(1,
            execSize*dataSizeBits / 8 / COMMON_ISA_GRF_REG_SIZE)*vecSize;
    }
    else
    { // if (shape.transpose == LSC_DATA_TRANSPOSE) {
           // The transpose case is a little odder
           // So the data size is the SIMD size (ExecSize) times the number of
           // registers consumed by each vector sequence (always a full
           // register number per seq).
        uint32_t regsPerVec = vecSize * dataSizeBits / 8 / COMMON_ISA_GRF_REG_SIZE;
        if (vecSize*dataSizeBits / 8 % COMMON_ISA_GRF_REG_SIZE)
            regsPerVec++; // pad out to full reg
        dataRegs = regsPerVec * execSize;
    }

    // override sizes for special cases
    if (op == LSC_OP::LSC_LOAD_STATUS)
    {
        dataRegs = 1; // just returns a bitset
    }

    // loads carry no data payload in src1; stores send all data registers
    if (opInfo.isLoad())
    {
        src1Len = 0;
    }
    else if (opInfo.isStore())
    {
        src1Len = (int)dataRegs;
    }

    SendAccess access = opInfo.isLoad() && opInfo.isStore() ?
        SendAccess::READ_WRITE : (opInfo.isLoad() ? SendAccess::READ_ONLY : SendAccess::WRITE_ONLY);

    G4_SendDescRaw *g4desc = createSendMsgDesc(
        sfid,
        desc,
        exDesc,
        src1Len,
        access,
        surface);
    return g4desc;
}
2751 
createLscDesc(SFID sfid,uint32_t desc,uint32_t extDesc,int src1Len,SendAccess access,G4_Operand * bti)2752 G4_SendDescRaw * IR_Builder::createLscDesc(
2753     SFID sfid,
2754     uint32_t desc,
2755     uint32_t extDesc,
2756     int src1Len,
2757     SendAccess access,
2758     G4_Operand* bti)
2759 {
2760     auto msgDesc = new (mem) G4_SendDescRaw(sfid, desc, extDesc, src1Len, access, bti, true);
2761     return msgDesc;
2762 }
2763 
createLscSendInst(G4_Predicate * pred,G4_DstRegRegion * dst,G4_SrcRegRegion * src0,G4_SrcRegRegion * src1,G4_ExecSize execSize,G4_SendDescRaw * msgDesc,G4_InstOpts option,LSC_ADDR_TYPE addrType,bool emitA0RegDef)2764 G4_InstSend *IR_Builder::createLscSendInst(
2765     G4_Predicate *pred,
2766     G4_DstRegRegion *dst,
2767     G4_SrcRegRegion *src0,
2768     G4_SrcRegRegion *src1,
2769     G4_ExecSize execSize,
2770     G4_SendDescRaw *msgDesc,
2771     G4_InstOpts option,
2772     LSC_ADDR_TYPE addrType,
2773     bool emitA0RegDef)
2774 {
2775     uint32_t exDesc = msgDesc->getExtendedDesc();
2776     G4_Operand *surface = msgDesc->getSurface();   // BTI or SS/BSS
2777     G4_Operand *exDescOpnd = nullptr;
2778 
2779     if (surface && surface->isSrcRegRegion()) {
2780         if (emitA0RegDef)
2781         {
2782             // This path is taken when caller hasnt defined a0.2 register for use
2783             // as ext msg descriptor of lsc. Currently, spill module defines a0.2
2784             // once per BB and reuses it in all spill msgs for that BB. Without this
2785             // check, each spill/fill msg would get its own computation of a0.2
2786             // which is wasteful.
2787             if (addrType == LSC_ADDR_TYPE_BTI) {
2788                 // .declare shifted_bti v_type=T num_elts=1
2789                 // ...
2790                 // (surface is the BTI)
2791                 //   shl    tmp        surface      24
2792                 G4_Declare* tmpDecl = createTempVar(1, Type_UD, Any);
2793                 G4_DstRegRegion* tmpDst = createDstRegRegion(tmpDecl, 1);
2794                 createBinOp(G4_shl, g4::SIMD1, tmpDst, surface,
2795                     createImm(24, Type_UD), InstOpt_WriteEnable, true);
2796                 auto tmpSrc = createSrcRegRegion(tmpDecl, getRegionScalar());
2797                 // set src1.length into exDesc. BTI message is required to be on ExBSO=0
2798                 // mode, so the src.length is part of exDesc
2799                 exDesc = (exDesc & (~0x7FF)) | (msgDesc->extMessageLength() << 6);
2800                 G4_DstRegRegion* addrDstOpnd = createDstRegRegion(builtinA0Dot2, 1);
2801                 // add a0.2 tmpSrc exdesc
2802                 createBinOp(G4_add, g4::SIMD1, addrDstOpnd, tmpSrc,
2803                     createImm(exDesc, Type_UD), InstOpt_WriteEnable, true);
2804             }
2805             else {
2806                 // SS or BSS
2807                 G4_DstRegRegion* addrDstOpnd = createDstRegRegion(builtinA0Dot2, 1);
2808                 if ((addrType == LSC_ADDR_TYPE_BSS) || (addrType == LSC_ADDR_TYPE_SS))
2809                 {
2810                     //   mov    a0.2  surface
2811                     createMov(g4::SIMD1, addrDstOpnd, surface, InstOpt_WriteEnable, true);
2812                 }
2813                 else
2814                 {
2815                     assert(false && "FLAT have surface == nullptr here");
2816                 }
2817             }
2818         }
2819 
2820         exDescOpnd = createSrcRegRegion(builtinA0Dot2, getRegionScalar());
2821         msgDesc->setSurface(exDescOpnd); // link a0.2 to the send descriptor
2822     } else if (surface && surface->isImm()) {
2823         // If by some chance the surface is an immediate value that didn't fold
2824         // to ExDesc (c.f. lscTryPromoteSurfaceImmToExDesc),
2825         // we can still possibly move it to a0.2 and use that way.
2826         // This enables us to access the full ExDesc[31:5] rather than
2827         // ExDesc[31:12] (the send instruction lacks room encode [11:6])
2828         // This can happen for BSS/SS, for example, with a small
2829         // surface state offset.
2830         //
2831         // Callers that fold the ExDesc value into an immediate descriptor
2832         // should pass nullptr as the surface.
2833         G4_DstRegRegion* addrDstOpnd = createDstRegRegion(builtinA0Dot2, 1);
2834         if (addrType == LSC_ADDR_TYPE_BSS || addrType == LSC_ADDR_TYPE_SS) {
2835             //   mov    a0.2   SurfaceAddrImm
2836             auto imm = surface->asImm()->getImm();
2837             assert(
2838                 (imm & 0x1F) == 0 &&
2839                 (imm & 0xFFFFFFFF00000000LL) == 0 && "ExDesc can only access [31:5]");
2840             createMov(g4::SIMD1, addrDstOpnd,
2841                 createImm(imm, Type_UD), InstOpt_WriteEnable, true);
2842 
2843             exDescOpnd = createSrcRegRegion(builtinA0Dot2, getRegionScalar());
2844             msgDesc->setSurface(exDescOpnd); // link a0.2 to the send descriptor
2845         }
2846         else
2847         {
2848             // BTI is in ExDesc[31:24] and that is always available.
2849             assert(false && "BTI/FLAT should not reach this. "
2850                 "FLAT should have surface == nullptr and"
2851                 "BTI should either use a register for a variable BTI or have "
2852                 "folded the immediate vlaue into ExDesc"
2853                 " (and thus surface==nullptr here)");
2854         }
2855     } else {
2856         exDescOpnd = createImm(exDesc, Type_UD);
2857     }
2858 
2859     return createSplitSendInst(
2860         pred, G4_sends, execSize, dst, src0, src1,
2861         createImm(msgDesc->getDesc(), Type_UD),
2862         option, msgDesc, exDescOpnd, true);
2863 }
2864 
2865 //Using r0.8:ud to save and restore a0.2
getScratchSurfaceStatusIndex()2866 G4_SrcRegRegion* IR_Builder::getScratchSurfaceStatusIndex()
2867 {
2868     auto dst = createDst(builtinR0->getRegVar(), 0, 8, 1, Type_UD);
2869     auto src0 = createSrcRegRegion(builtinA0Dot2, getRegionScalar());
2870     createMov(g4::SIMD1, dst, src0, InstOpt_WriteEnable, true);
2871 
2872     G4_SrcRegRegion* R0_5 = createSrc(builtinR0->getRegVar(), 0, 5, getRegionScalar(), Type_UD);
2873     G4_DstRegRegion* A02Dst = createDstRegRegion(builtinA0Dot2, 1);
2874     createMov(g4::SIMD1, A02Dst, R0_5, InstOpt_WriteEnable, true);
2875     return createSrcRegRegion(builtinA0Dot2, getRegionScalar());
2876 }
2877 
RestoreA0()2878 void IR_Builder::RestoreA0()
2879 {
2880     auto dst = createDstRegRegion(builtinA0Dot2, 1);
2881     auto src0 = createSrc(builtinR0->getRegVar(), 0, 8, getRegionStride1(), Type_UD);
2882     createMov(g4::SIMD1, dst, src0, InstOpt_WriteEnable, true);
2883 }
2884 
createLscSendInstToScratch(G4_Predicate * pred,G4_DstRegRegion * dst,G4_SrcRegRegion * src0,G4_SrcRegRegion * src1,G4_ExecSize execSize,G4_SendDescRaw * msgDesc,G4_InstOpts options,bool usesBti)2885 G4_InstSend *IR_Builder::createLscSendInstToScratch(
2886     G4_Predicate *pred,
2887     G4_DstRegRegion *dst,
2888     G4_SrcRegRegion *src0,
2889     G4_SrcRegRegion *src1,
2890     G4_ExecSize execSize,
2891     G4_SendDescRaw *msgDesc,
2892     G4_InstOpts options,
2893     bool usesBti)
2894 {
2895     uint32_t desc = msgDesc->getDesc();
2896     G4_Operand *surface = msgDesc->getSurface();   // BTI or SS/BSS
2897     G4_Operand *exDescOpnd = nullptr;
2898 
2899     if (isScratchSpace(surface))
2900     {
2901         desc = (desc & ~0xFF) | 251;
2902     }
2903     exDescOpnd = getScratchSurfaceStatusIndex();
2904 
2905     G4_InstSend* inst = createSplitSendInst(
2906         pred, G4_sends, execSize, dst, src0, src1,
2907         createImm(desc, Type_UD),
2908         options, msgDesc, exDescOpnd, true);
2909     RestoreA0();
2910 
2911     return inst;
2912 }
2913 
// for render target messages,
2915 // desc has a constant BTI value (i.e., no bindless) and no STI
2916 // extDesc may be indirect (MRT and other bits) and is passed in
createSplitSendToRenderTarget(G4_Predicate * pred,G4_DstRegRegion * dst,G4_SrcRegRegion * src1,G4_SrcRegRegion * src2,G4_SrcRegRegion * extDescOpnd,G4_ExecSize execSize,G4_SendDescRaw * msgDesc,G4_InstOpts option)2917 G4_InstSend *IR_Builder::createSplitSendToRenderTarget(
2918     G4_Predicate *pred,
2919     G4_DstRegRegion *dst,
2920     G4_SrcRegRegion *src1,
2921     G4_SrcRegRegion *src2,
2922     G4_SrcRegRegion *extDescOpnd,
2923     G4_ExecSize execSize,
2924     G4_SendDescRaw *msgDesc,
2925     G4_InstOpts option)
2926 {
2927     G4_opcode send_opcode = G4_sendsc;
2928 
2929     fixSendDstType(dst, execSize);
2930 
2931     uint32_t desc = msgDesc->getDesc();
2932     G4_Operand* descOpnd = nullptr;
2933     G4_Operand *bti = msgDesc->getSurface();
2934 
2935     if (bti && bti->isSrcRegRegion())
2936     {
2937         //add (1) a0.0:ud bti:ud desc:ud
2938         G4_DstRegRegion* addrDstOpnd = createDstRegRegion(builtinA0, 1);
2939         createBinOp(G4_add, g4::SIMD1, addrDstOpnd, bti,
2940             createImm(desc, Type_UD), InstOpt_WriteEnable, true);
2941         descOpnd = createSrcRegRegion(builtinA0, getRegionScalar());
2942     }
2943     else
2944     {
2945         descOpnd = createImm(desc, Type_UD);
2946     }
2947 
2948     return createSplitSendInst(pred, send_opcode, execSize,
2949         dst, src1, src2, descOpnd,
2950         option, msgDesc, extDescOpnd, true);
2951 }
2952 
2953 // create a declare for send payload
createSendPayloadDcl(unsigned num_elt,G4_Type type)2954 G4_Declare* IR_Builder::createSendPayloadDcl(unsigned num_elt, G4_Type type)
2955 {
2956     const char* name = getNameString(mem, 16, "M%u", ++num_temp_dcl);
2957     const uint16_t sizeOfType = TypeSize(type);
2958     unsigned short numRow = (num_elt * sizeOfType - 1) / numEltPerGRF<Type_UB>() + 1;
2959     unsigned short numElt = (numRow == 1) ? num_elt : (numEltPerGRF<Type_UB>()/sizeOfType);
2960     G4_Declare *dcl = createDeclareNoLookup(
2961         name,
2962         G4_GRF,
2963         numElt,
2964         numRow,
2965         type);
2966     return dcl;
2967 }
2968 
createMovR0Inst(G4_Declare * dcl,short regOff,short subregOff,bool use_nomask,G4_InstOpts options)2969 void IR_Builder::createMovR0Inst(G4_Declare* dcl, short regOff, short subregOff, bool use_nomask, G4_InstOpts options)
2970 {
2971     G4_DstRegRegion* dst1_opnd = createDst(
2972         dcl->getRegVar(),
2973         regOff,
2974         subregOff,
2975         1,
2976         dcl->getElemType());
2977 
2978     // create r0 src
2979     G4_SrcRegRegion* r0_src_opnd = createSrcRegRegion(builtinR0, getRegionStride1());
2980     // create inst
2981     createMov(
2982         G4_ExecSize(GENX_DATAPORT_IO_SZ),
2983         dst1_opnd,
2984         r0_src_opnd,
2985         (use_nomask ? InstOpt_WriteEnable | options : options),
2986         true);
2987 }
2988 
createAddInst(G4_Declare * dcl,short regOff,short subregOff,G4_ExecSize execsize,G4_Predicate * pred,G4_CondMod * condMod,G4_Operand * src0_opnd,G4_Operand * src1_opnd,G4_InstOption options)2989 void IR_Builder::createAddInst(
2990     G4_Declare* dcl, short regOff, short subregOff, G4_ExecSize execsize,
2991     G4_Predicate* pred, G4_CondMod* condMod,
2992     G4_Operand* src0_opnd, G4_Operand* src1_opnd, G4_InstOption options)
2993 {
2994     auto dst = createDst(dcl->getRegVar(), regOff, subregOff, 1, dcl->getElemType());
2995 
2996     if (src0_opnd->isImm() && src0_opnd->asImm()->isZero())
2997     {
2998         createInst(pred, G4_mov, condMod, g4::NOSAT, execsize, dst, src1_opnd, NULL, options, true);
2999     }
3000     else if (src1_opnd->isImm() && src1_opnd->asImm()->isZero())
3001     {
3002         createInst(pred, G4_mov, condMod, g4::NOSAT, execsize, dst, src0_opnd, NULL, options, true);
3003     }
3004     else if (src0_opnd->isImm() && !src1_opnd->isImm())
3005     {
3006         createInst(pred, G4_add, condMod, g4::NOSAT, execsize, dst, src1_opnd, src0_opnd, options, true);
3007     }
3008     else
3009     {
3010         createInst(pred, G4_add, condMod, g4::NOSAT, execsize, dst, src0_opnd, src1_opnd, options, true);
3011     }
3012 }
3013 
3014 // Currently this function is mostly used in dataport intrinsic translation functions.
3015 // If it is used in some other places, Qtrctrl should be added in options if needed.
createMovInst(G4_Declare * dcl,short regOff,short subregOff,G4_ExecSize execSize,G4_Predicate * pred,G4_CondMod * condMod,G4_Operand * src_opnd,bool use_nomask,G4_InstOpts options)3016 void IR_Builder::createMovInst(
3017     G4_Declare* dcl,
3018     short regOff,
3019     short subregOff,
3020     G4_ExecSize execSize,
3021     G4_Predicate* pred,
3022     G4_CondMod* condMod,
3023     G4_Operand* src_opnd,
3024     bool use_nomask,
3025     G4_InstOpts options)
3026 {
3027     G4_DstRegRegion* dst2_opnd = createDst(
3028         dcl->getRegVar(),
3029         regOff,
3030         subregOff,
3031         1,
3032         dcl->getElemType());
3033 
3034     createInst(
3035         pred,
3036         G4_mov,
3037         condMod,
3038         g4::NOSAT,
3039         execSize,
3040         dst2_opnd,
3041         src_opnd,
3042         NULL,
3043         use_nomask ? (InstOpt_WriteEnable | options) : options,
3044         true);
3045 }
3046 
3047 // send payload preparation.
3048 // dcl: decl for send payload
3049 // num_dword: number of DW to send
3050 // src_opnd: send src, its size may be several GRFs
createMovSendSrcInst(G4_Declare * dcl,short regoff,short subregoff,unsigned num_dword,G4_Operand * src_opnd,G4_InstOpts options)3051 void IR_Builder::createMovSendSrcInst(
3052     G4_Declare* dcl,
3053     short regoff,
3054     short subregoff,
3055     unsigned num_dword,
3056     G4_Operand* src_opnd,
3057     G4_InstOpts options)
3058 {
3059     // since src_opnd is raw source in CISA, it is aligned to GRF, so there is no subRegOff.
3060     unsigned remained_dword = num_dword;
3061     // if data type of src_opnd is not UD, change it to UD
3062     // assumption: size of src_opnd is multiple of UD
3063     short dst_regoff = regoff, dst_subregoff = subregoff;
3064     G4_ExecSize execsize = g4::SIMD1;
3065     G4_DstRegRegion* dst = NULL;
3066     //G4_SrcRegRegion* src = NULL;
3067     G4_Operand* src = NULL;
3068     const RegionDesc *rd = NULL;
3069     G4_Declare *dst_dcl = dcl;
3070     short src_regoff = 0, src_subregoff = 0;
3071     bool non_ud_scalar = false;
3072     bool scalar_src = (src_opnd->isImm() || num_dword == 1);
3073 
3074     if (scalar_src && src_opnd->getType() != Type_UD) {
3075         // change the type of dst dcl to src type
3076         remained_dword = num_dword * (TypeSize(Type_UD)/src_opnd->getTypeSize());
3077         dst_dcl = createSendPayloadDcl(remained_dword, src_opnd->getType());
3078         dst_dcl->setAliasDeclare(dcl, regoff * numEltPerGRF<Type_UB>() + subregoff * TypeSize(Type_UD));
3079         dst_regoff = 0;
3080         dst_subregoff = 0;
3081         non_ud_scalar = true;
3082     }
3083 
3084     src_regoff = src_opnd->asSrcRegRegion()->getRegOff();
3085     src_subregoff = src_opnd->asSrcRegRegion()->getSubRegOff();
3086     src_subregoff = src_subregoff * src_opnd->getTypeSize() / dst_dcl->getElemSize();
3087 
3088     auto getMaxEsize = [](uint32_t opt)
3089     {
3090         unsigned maskOption = (opt & InstOpt_QuarterMasks);
3091         switch (maskOption)
3092         {
3093         case InstOpt_M4:
3094         case InstOpt_M12:
3095         case InstOpt_M20:
3096         case InstOpt_M28:
3097             return 4;
3098         case InstOpt_M8:
3099         case InstOpt_M24:
3100             return 8;
3101         case InstOpt_M16:
3102             return 16;
3103         default:
3104             return 32;
3105         }
3106     };
3107     G4_ExecSize maxEsize(getMaxEsize(options));
3108 
3109     // here remained_dword is not the number of DW, but the number of dst data type.
3110     while (remained_dword)
3111     {
3112         if (non_ud_scalar && src_opnd->getTypeSize() != TypeSize(Type_UD))
3113         {
3114             if (remained_dword >= 32)
3115             {
3116                 execsize = g4::SIMD32;
3117             }
3118             else if (remained_dword >= 16)
3119             {
3120                 execsize = g4::SIMD16;
3121             }
3122             else
3123             {
3124                 execsize = G4_ExecSize((uint8_t)Round_Down_Pow2(remained_dword));
3125             }
3126 
3127             execsize = (execsize > maxEsize) ? maxEsize :  execsize;
3128             if (execsize == g4::SIMD1)
3129             {
3130                 rd = getRegionScalar();
3131             }
3132             else
3133             {
3134                 rd = getRegionStride1();
3135             }
3136         }
3137         else
3138         {
3139             if (remained_dword >= 16)
3140             {
3141                 execsize = g4::SIMD16;
3142             }
3143             else if (remained_dword >= 8)
3144             {
3145                 execsize = g4::SIMD8;
3146             }
3147             else
3148             {
3149                 execsize = G4_ExecSize(Round_Down_Pow2(remained_dword));
3150             }
3151             execsize = (execsize > maxEsize) ? maxEsize :  execsize;
3152             if (execsize == g4::SIMD1)
3153             {
3154                 rd = getRegionScalar();
3155             }
3156             else
3157             {
3158                 rd = getRegionStride1();
3159             }
3160         }
3161 
3162         dst = createDst(
3163             dst_dcl->getRegVar(),
3164             dst_regoff,
3165             dst_subregoff,
3166             1,
3167             dst_dcl->getElemType());
3168 
3169         if (scalar_src && src_opnd->isImm())
3170         {
3171             src = src_opnd->asImm();
3172         }
3173         else
3174         {
3175             src = createSrc(
3176                 src_opnd->asSrcRegRegion()->getBase(),
3177                 src_regoff,
3178                 src_subregoff,
3179                 rd,
3180                 dst_dcl->getElemType());
3181         }
3182 
3183         createMov(
3184             execsize,
3185             dst,
3186             src,
3187             options,
3188             true);
3189 
3190         // update offset in decl
3191         if (remained_dword >= execsize) {
3192             remained_dword -= execsize;
3193             if (execsize * dst_dcl->getElemSize() == 2 * numEltPerGRF<Type_UB>()) {
3194                 dst_regoff += 2;
3195                 if (!scalar_src) {
3196                     src_regoff += 2;
3197                 }
3198             }
3199             else if (execsize * dst_dcl->getElemSize() == numEltPerGRF<Type_UB>()) {
3200                 dst_regoff += 1;
3201                 if (!scalar_src) {
3202                     src_regoff += 1;
3203                 }
3204             }
3205             else {
3206                 dst_subregoff += execsize;
3207                 if (dst_subregoff > ((int)numEltPerGRF<Type_UB>() / dst_dcl->getElemSize())) {
3208                     dst_regoff++;
3209                     dst_subregoff -= numEltPerGRF<Type_UB>() / dst_dcl->getElemSize();
3210                 }
3211                 if (!scalar_src) {
3212                     src_subregoff += execsize;
3213                     if (src_subregoff > (short)(numEltPerGRF<Type_UB>() / TypeSize(Type_UD))) {
3214                         src_regoff++;
3215                         src_subregoff -= numEltPerGRF<Type_UB>() / TypeSize(Type_UD);
3216                     }
3217                 }
3218             }
3219         }
3220     }
3221 }
3222 // create an opnd without regpoff and subregoff
createDstRegRegion(G4_Declare * dcl,unsigned short hstride)3223 G4_DstRegRegion* IR_Builder::createDstRegRegion(
3224     G4_Declare* dcl, unsigned short hstride)
3225 {
3226     return createDst(
3227         dcl->getRegVar(),
3228         0,
3229         0,
3230         hstride,
3231         dcl->getElemType());
3232 }
3233 // create an opnd without regpoff and subregoff
createSrcRegRegion(G4_Declare * dcl,const RegionDesc * rd)3234 G4_SrcRegRegion* IR_Builder::createSrcRegRegion(
3235     G4_Declare* dcl, const RegionDesc* rd)
3236 {
3237     return createSrcRegRegion(
3238         Mod_src_undef,
3239         Direct,
3240         dcl->getRegVar(),
3241         0,
3242         0,
3243         rd,
3244         dcl->getElemType());
3245 }
3246 
createNullDst(G4_Type dstType)3247 G4_DstRegRegion* IR_Builder::createNullDst(G4_Type dstType)
3248 {
3249     return createDst(
3250         phyregpool.getNullReg(),
3251         0,
3252         0,
3253         1,
3254         dstType);
3255 }
3256 
createNullSrc(G4_Type srcType)3257 G4_SrcRegRegion* IR_Builder::createNullSrc(G4_Type srcType)
3258 {
3259     return createSrcRegRegion(Mod_src_undef,
3260                                Direct,
3261                                phyregpool.getNullReg(),
3262                                0,
3263                                0,
3264                                getRegionScalar(),
3265                                srcType);
3266 }
3267 
3268 // check if the dst opnd align to GRF.
3269 // if it is not aligned to GRF
3270 // 1. change align of var dcl to GRF if the dst size is smaller than GRF size,
3271 //    no alias or alias offset is 0.
3272 // 2. otherwise, create a temp operand and return it.
checkSendDst(G4_DstRegRegion * dst_opnd)3273 G4_DstRegRegion* IR_Builder::checkSendDst(G4_DstRegRegion *dst_opnd)
3274 {
3275     //FIXME: This function seems to be bogus
3276     G4_DstRegRegion* d;
3277     // check if dst is align to GRF
3278 
3279     const unsigned short SIZEOF_DW = 4;
3280     if (dst_opnd->getTypeSize() > 1)
3281     {
3282         d = dst_opnd;
3283     }
3284     else
3285     {
3286         // change type of dcl and offset in it
3287         short new_SubRegOff = dst_opnd->getSubRegOff();
3288         if (dst_opnd->getRegAccess() == Direct)
3289         {
3290             new_SubRegOff = dst_opnd->getSubRegOff() / SIZEOF_DW;
3291         }
3292         G4_DstRegRegion new_dst(
3293             dst_opnd->getRegAccess(),
3294             dst_opnd->getBase(),
3295             dst_opnd->getRegOff(),
3296             new_SubRegOff,
3297             1,
3298             Type_UD);
3299         d = createDstRegRegion(new_dst);
3300     }
3301 
3302     return d;
3303 }
3304 
addInputArg(input_info_t * inpt)3305 void IR_Builder::addInputArg(input_info_t * inpt)
3306 {
3307     m_inputVect.push_back(inpt);
3308 }
3309 
// Return the input argument stored at the given position.
// NOTE(review): no bounds checking is performed; callers are expected to
// pass index < getInputCount().
input_info_t * IR_Builder::getInputArg(unsigned int index) const
{
    return m_inputVect[index];
}
3314 
getInputCount() const3315 unsigned int IR_Builder::getInputCount() const
3316 {
3317     return (uint32_t)m_inputVect.size();
3318 }
3319 
getRetIPArg() const3320 input_info_t *IR_Builder::getRetIPArg() const {
3321     // TODO: So far, we assume the last argument of caller of callable kernel
3322     // or callable kernel is the RetIP argument. If required, extra attribute
3323     // will be added to specify which QWORD argument is used as RetIP argument
3324     // and the code will traverse all argument to find that one.
3325     input_info_t *RetIP = getInputArg(getInputCount() - 1);
3326     // More sanity check on the argument.
3327     ASSERT_USER(IS_QTYPE(RetIP->dcl->getElemType()), "RetIP needs to be QWORD!");
3328     ASSERT_USER(RetIP->dcl->getNumElems() == 1, "RetIP needs to be QWORD!");
3329     return RetIP;
3330 }
3331 
vISAPredicateToG4Predicate(VISA_PREDICATE_CONTROL control,G4_ExecSize execSize)3332 G4_Predicate_Control IR_Builder::vISAPredicateToG4Predicate(
3333     VISA_PREDICATE_CONTROL control, G4_ExecSize execSize)
3334 {
3335     switch (control)
3336     {
3337     case PRED_CTRL_NON:
3338         return PRED_DEFAULT;
3339     case PRED_CTRL_ANY:
3340     {
3341         if (!predCtrlHasWidth())
3342         {
3343             return PRED_ANY_WHOLE;
3344         }
3345         switch (execSize)
3346         {
3347         case 1: return PRED_DEFAULT;
3348         case 2: return PRED_ANY2H;
3349         case 4: return PRED_ANY4H;
3350         case 8: return PRED_ANY8H;
3351         case 16: return PRED_ANY16H;
3352         case 32: return PRED_ANY32H;
3353         default:
3354             MUST_BE_TRUE(0, "Invalid predicate control group size.");
3355             return PRED_DEFAULT;
3356         }
3357     }
3358     case PRED_CTRL_ALL:
3359     {
3360         if (!predCtrlHasWidth())
3361         {
3362             return PRED_ALL_WHOLE;
3363         }
3364         switch (execSize)
3365         {
3366         case 1: return PRED_DEFAULT;
3367         case 2: return PRED_ALL2H;
3368         case 4: return PRED_ALL4H;
3369         case 8: return PRED_ALL8H;
3370         case 16: return PRED_ALL16H;
3371         case 32: return PRED_ALL32H;
3372         default:
3373             MUST_BE_TRUE(0, "Invalid predicate control group size.");
3374             return PRED_DEFAULT;
3375         }
3376     }
3377     default:
3378         MUST_BE_TRUE(0, "Invalid predicate control.");
3379         return PRED_DEFAULT;
3380     }
3381 }
3382 
3383 
3384 // helper function to fold BinOp with two immediate operands
3385 // supported opcodes are given below in doConsFolding
3386 // returns nullptr if the two constants may not be folded
G4_Imm* IR_Builder::foldConstVal(G4_Imm* const1, G4_Imm* const2, G4_opcode op)
{
    // Folding is restricted to integer immediates narrower than 64 bits;
    // float and Q/UQ operands are never folded.
    bool isNonQInt = IS_TYPE_INT(const1->getType()) && IS_TYPE_INT(const2->getType()) &&
        !IS_QTYPE(const1->getType()) && !IS_QTYPE(const2->getType());

    if (!isNonQInt)
    {
        return nullptr;
    }

    G4_Type src0T = const1->getType(), src1T = const2->getType(), resultType = src0T;

    if (op == G4_mul || op == G4_add || op == G4_and || op == G4_xor || op == G4_or)
    {
        // Commutative ops: compute in 64 bits, then only fold if the result
        // fits the common D/UD type chosen by findConstFoldCommonType.
        resultType = findConstFoldCommonType(src0T, src1T);
        if (resultType == Type_UNDEF)
        {
            return nullptr;
        }

        int64_t res;
        switch (op)
        {
        case G4_and:
            res = (int64_t)(const1->getInt()) & (int64_t)(const2->getInt());
            break;

        case G4_xor:
            res = (int64_t)(const1->getInt()) ^ (int64_t)(const2->getInt());
            break;

        case G4_or:
            res = (int64_t)(const1->getInt()) | (int64_t)(const2->getInt());
            break;

        case G4_add:
            res = (int64_t)(const1->getInt()) + (int64_t)(const2->getInt());
            break;

        case G4_mul:
            res = (int64_t)(const1->getInt()) * (int64_t)(const2->getInt());
            break;

        default:
            return nullptr;
        }

        // result type is either D or UD
        // don't fold if the value overflows D/UD
        if (!G4_Imm::isInTypeRange(res, resultType))
        {
            return nullptr;
        }
        return createImmWithLowerType(res, resultType);
    }
    else
    {
        // Shift ops: the shift count wraps at 32, matching 32-bit shift
        // behavior of the target.
        uint32_t shift = const2->getInt() % 32;

        if (op == G4_shl || op == G4_shr)
        {
            uint32_t value = (uint32_t)const1->getInt();
            // set result type to D/UD as it may overflow W. If the value fits the type will be lowered later
            // source type matters here since it affects sign extension
            resultType = IS_SIGNED_INT(resultType) ? Type_D : Type_UD;
            // NOTE(review): shr is computed on the 32-bit unsigned value, i.e.
            // a logical right shift; arithmetic right shift is G4_asr below.
            // shl widens to 64 bits first so an out-of-range result is
            // detected and the fold rejected rather than wrapped.
            int64_t res = op == G4_shl ?
                ((int64_t)value) << shift :
                value >> shift;
            if (!G4_Imm::isInTypeRange(res, resultType))
            {
                return nullptr;
            }

            return createImmWithLowerType(res, resultType);
        }
        else if (op == G4_asr)
        {
            // Arithmetic shift right: the signedness of the source type
            // (still held in resultType here) picks sign-propagating vs
            // zero-filling behavior.
            if (IS_SIGNED_INT(resultType))
            {
                int64_t value = const1->getInt();
                int64_t res = value >> shift;
                return createImmWithLowerType(res, resultType);
            }
            else
            {
                uint64_t value = const1->getInt();
                uint64_t res = value >> shift;
                return createImmWithLowerType(res, resultType);
            }
        }
    }
    return nullptr;
}
3480 
3481 
3482 // Currently constant folding is done for the following code patterns:
3483 //
3484 // - op v, imm, imm
3485 //    where op is shl, shr, asr, or, xor, and, add, mul
3486 // Restrictions:
3487 // - operand type cannot be float or Q/UQ
3488 // - saturation is not allowed
doConsFolding(G4_INST * inst)3489 void IR_Builder::doConsFolding(G4_INST *inst)
3490 {
3491     if (inst->getSaturate())
3492         return; // TODO: we could do this if we wanted to bad enough
3493 
3494     auto srcIsFoldableImm = [](const G4_Operand *op) {
3495         return op && op->isImm() && !op->isRelocImm();
3496     };
3497 
3498     if (inst->getNumSrc() == 2) {
3499         G4_Operand *src0 = inst->getSrc(0);
3500         G4_Operand *src1 = inst->getSrc(1);
3501         if (srcIsFoldableImm(src0) && srcIsFoldableImm(src1)) {
3502             G4_Imm *foldedImm =
3503                 foldConstVal(src0->asImm(), src1->asImm(), inst->opcode());
3504             if (foldedImm)
3505             {
3506                 // change instruction into a MOV
3507                 inst->setOpcode(G4_mov);
3508                 inst->setSrc(foldedImm, 0);
3509                 inst->setSrc(nullptr, 1);
3510             }
3511         }
3512     } else if (inst->getNumSrc() == 3) {
3513         G4_Operand *src0 = inst->getSrc(0);
3514         G4_Operand *src1 = inst->getSrc(1);
3515         G4_Operand *src2 = inst->getSrc(2);
3516         if (inst->opcode() == G4_add3) {
3517             // always fold the variable into src0
3518             G4_Imm *foldedImm = nullptr;
3519             G4_Operand *otherSrc = nullptr;
3520             if (srcIsFoldableImm(src0) && srcIsFoldableImm(src1)) {
3521                 foldedImm = foldConstVal(src0->asImm(), src1->asImm(), G4_add);
3522                 otherSrc = src2;
3523             } else if (srcIsFoldableImm(src0) && srcIsFoldableImm(src2)) {
3524                 foldedImm = foldConstVal(src0->asImm(), src2->asImm(), G4_add);
3525                 otherSrc = src1;
3526             } else if (srcIsFoldableImm(src1) && srcIsFoldableImm(src2)) {
3527                 foldedImm = foldConstVal(src1->asImm(), src2->asImm(), G4_add);
3528                 otherSrc = src0;
3529             }
3530             if (foldedImm) {
3531                 // always put the possible register in src0
3532                 inst->setOpcode(G4_add);
3533                 if (otherSrc != src0) {
3534                     inst->setSrc(otherSrc, 0);
3535                     inst->swapDefUse(
3536                         Opnd_src0,
3537                         otherSrc == src1 ? Opnd_src1 : Opnd_src2);
3538                 }
3539                 inst->setSrc(foldedImm, 1);
3540                 inst->setSrc(nullptr, 2);
3541                 // recurse for possible fold again
3542                 doConsFolding(inst);
3543             }
3544         } // TODO: integer mad, bfn, bfi, bfe
3545     }
3546 }
3547 // Do the following algebraic simplification:
3548 // - mul v, src0, 0 ==> 0, commutative
3549 // - and v, src0, 0 ==> 0, commutative
3550 // - mul v, src0, 1 ==> src0, commutative
3551 // - shl v, src0, 0 ==> src0
3552 // - shr v, src0, 0 ==> src0
3553 // - asr v, src0, 0 ==> src0
3554 // - add v, src0, 0 ==> src0, commutative
void IR_Builder::doSimplification(G4_INST *inst)
{
    // Just handle following commonly used ops for now.
    if (inst->opcode() != G4_mul && inst->opcode() != G4_and &&
        inst->opcode() != G4_add && inst->opcode() != G4_shl &&
        inst->opcode() != G4_shr && inst->opcode() != G4_asr &&
        inst->opcode() != G4_mov)
    {
        return;
    }


    // Perform 'mov' to 'movi' transform when it's a 'mov' of
    // - simd8
    // - it's a raw mov
    // - dst is within a single GRF.
    // - src uses VxH indirect access.
    // - src is within one GRF.
    // - indices to src are all within src.
    // - destination stride in bytes must be equal to the source element size in bytes.
    bool canConvertMovToMovi = inst->opcode() == G4_mov && inst->getExecSize() == g4::SIMD8 &&
        inst->isRawMov() && inst->getDst() &&
        !inst->getDst()->asDstRegRegion()->isCrossGRFDst() &&
        inst->getSrc(0) && inst->getSrc(0)->isSrcRegRegion() &&
        inst->getSrc(0)->asSrcRegRegion()->isIndirect() &&
        inst->getSrc(0)->asSrcRegRegion()->getRegion()->isRegionWH() &&
        inst->getSrc(0)->asSrcRegRegion()->getRegion()->width == 1 &&
        inst->getSrc(0)->getTypeSize() == inst->getDst()->getTypeSize() * inst->getDst()->asDstRegRegion()->getHorzStride();
    if (canConvertMovToMovi)
    {
        // Convert 'mov' to 'movi' if the following conditions are met.

        // Find the unique def of the given operand number on UI, or nullptr
        // if it has zero or more than one def.
        auto getSingleDefInst = [](G4_INST *UI,
            Gen4_Operand_Number OpndNum)
            -> G4_INST * {
            G4_INST *Def = nullptr;
            for (auto I = UI->def_begin(), E = UI->def_end(); I != E; ++I) {
                if (I->second != OpndNum)
                    continue;
                if (Def) {
                    // Not single defined, bail out
                    Def = nullptr;
                    break;
                }
                Def = I->first;
            }
            return Def;
        };

        unsigned SrcSizeInBytes =
            inst->getExecSize() * inst->getSrc(0)->getTypeSize();
        // Only half-GRF- or full-GRF-sized sources are candidates.
        if (SrcSizeInBytes == numEltPerGRF<Type_UB>()/2 ||
            SrcSizeInBytes == numEltPerGRF<Type_UB>())
        {
            // The address must come from a single 'add' of a constant base
            // address expression (&dcl + offset) and a 'uv' immediate vector.
            G4_INST *LEA = getSingleDefInst(inst, Opnd_src0);
            if (LEA && LEA->opcode() == G4_add &&
                LEA->getExecSize() == inst->getExecSize()) {
                G4_Operand *Op0 = LEA->getSrc(0);
                G4_Operand *Op1 = LEA->getSrc(1);
                G4_Declare *Dcl = nullptr;
                int Offset = 0;
                if (Op0->isAddrExp()) {
                    G4_AddrExp *AE = Op0->asAddrExp();
                    Dcl = AE->getRegVar()->getDeclare();
                    Offset = AE->getOffset();
                }
                if (Dcl && (Offset % SrcSizeInBytes) == 0 &&
                    Op1->isImm() && Op1->getType() == Type_UV) {
                    // Immediates in 'uv' ensures each element is a
                    // byte-offset within half-GRF.
                    G4_SubReg_Align SubAlign = GRFALIGN;
                    if (SrcSizeInBytes <= numEltPerGRF<Type_UB>()/2u)
                        SubAlign = (G4_SubReg_Align)(numEltPerGRF<Type_UW>()/2);
                    inst->setOpcode(G4_movi);
                    // Tighten the source variable's alignment so movi's
                    // region constraints are satisfied.
                    if (!Dcl->isEvenAlign() && Dcl->getSubRegAlign() != GRFALIGN)
                    {
                        Dcl->setSubRegAlign(SubAlign);
                    }
                    const RegionDesc *rd = getRegionStride1();
                    inst->getSrc(0)->asSrcRegRegion()->setRegion(rd);
                    // Set subreg alignment for the address variable.
                    Dcl =
                        LEA->getDst()->getBase()->asRegVar()->getDeclare();
                    assert(Dcl->getRegFile() == G4_ADDRESS &&
                        "Address variable is required.");
                    Dcl->setSubRegAlign(Eight_Word);
                }
            }
        }
    }

    // True when opnd is a non-relocatable integer immediate equal to val.
    auto isInteger = [](G4_Operand *opnd, int64_t val)
    {
        if (opnd && IS_TYPE_INT(opnd->getType()) && !opnd->isRelocImm())
        {
            return opnd->isImm() && opnd->asImm()->getInt() == val;
        }
        return false;
    };

    // Algebraic identities (see header comment): pick the surviving source
    // (or an immediate zero) and rewrite the instruction as a MOV of it.
    G4_Operand *src0 = inst->getSrc(0);
    G4_Operand *src1 = inst->getSrc(1);
    G4_Operand *newSrc = nullptr;
    if (inst->opcode() == G4_mul || inst->opcode() == G4_and)
    {
        // x * 0 == 0, x & 0 == 0 (commutative).
        if (isInteger(src1, 0))
        {
            inst->removeDefUse(Opnd_src0);
            newSrc = createImm(0, Type_W);
        }
        else if (isInteger(src0, 0))
        {
            inst->removeDefUse(Opnd_src1);
            newSrc = createImm(0, Type_W);
        }
        else if (inst->opcode() == G4_mul)
        {
            // x * 1 == x (commutative).
            if (isInteger(src1, 1))
            {
                newSrc = src0;
            }
            else if (isInteger(src0, 1))
            {
                inst->swapDefUse();
                newSrc = src1;
            }
        }
    }
    else if (inst->opcode() == G4_shl || inst->opcode() == G4_shr ||
        inst->opcode() == G4_asr || inst->opcode() == G4_add)
    {
        // x shift 0 == x; x + 0 == x (add is also commutative).
        if (isInteger(src1, 0))
        {
            newSrc = src0;
        }
        else if (inst->opcode() == G4_add && isInteger(src0, 0))
        {
            inst->swapDefUse();
            newSrc = src1;
        }
    }

    if (newSrc != nullptr)
    {
        inst->setOpcode(G4_mov);
        if (newSrc != src0)
        {
            inst->setSrc(newSrc, 0);
        }
        inst->setSrc(nullptr, 1);
    }
}
3707 
3708 //  find a common (integer) type for constant folding.  The rules are:
3709 //  -- both types must be int
3710 //  -- Q and UQ are not folded
3711 //  -- UD if one of the type is UD
3712 //  -- D otherwise
3713 //
3714 //  returns Type_UNDEF if no appropriate type can be found
3715 //
findConstFoldCommonType(G4_Type type1,G4_Type type2)3716 G4_Type IR_Builder::findConstFoldCommonType(G4_Type type1, G4_Type type2)
3717 {
3718     if (IS_TYPE_INT(type1) && IS_TYPE_INT(type2))
3719     {
3720         if (TypeSize(type1) == 8 || TypeSize(type2) == 8)
3721         {
3722             return Type_UNDEF;
3723         }
3724         if (type1 == Type_UD || type2 == Type_UD)
3725         {
3726             return Type_UD;
3727         }
3728         else
3729         {
3730             return Type_D;
3731         }
3732     }
3733     return Type_UNDEF;
3734 }
3735