/*========================== begin_copyright_notice ============================

Copyright (C) 2020-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "BuildIR.h"
#include "../Timer.h"

#include <cmath>

using namespace vISA;

G4_ExecSize IR_Builder::lscMinExecSize(LSC_SFID lscSfid) const
{
    const TARGET_PLATFORM P = getPlatform();
    uint32_t minExecSize = (P == GENX_DG2 ? 8 : 16);
    if (!hasLSCEnableHalfSIMD())
    {
        minExecSize *= 2;
    }
    return G4_ExecSize(minExecSize);
}
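// A worked example derived directly from the logic above: DG2 with LSC
// half-SIMD support enabled yields a minimum of SIMD8 (SIMD16 without it);
// any other platform yields SIMD16 (or SIMD32 without half-SIMD support).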

static G4_Operand *lscTryPromoteSurfaceImmToExDesc(
    G4_Operand *surface, LSC_ADDR_TYPE addrModel, uint32_t &exDesc)
{
    if (surface && surface->isImm()) {
        // try and promote any immediate surface to the extended descriptor if
        // possible; we get [31:12] in the EU ISA to work with.
        auto surfaceImm = (uint32_t)surface->asImm()->getImm();
        if (addrModel == LSC_ADDR_TYPE_BTI) {
            // promote the immediate BTI to the descriptor
            exDesc |= surfaceImm << 24;
            surface = nullptr;
        } else if (
            addrModel == LSC_ADDR_TYPE_BSS ||
            addrModel == LSC_ADDR_TYPE_SS)
        {
            if ((surfaceImm & 0x3FF) == 0) {
                exDesc |= surfaceImm;
                surface = nullptr;
            }
        } else {
            // flat address type
            MUST_BE_TRUE(surface->isNullReg() ||
                surfaceImm == PREDEFINED_SURFACE_SLM ||
                surfaceImm == PREDEFINED_SURFACE_T255, // not sure what's up here
                "flat address type must have null reg (or 0)");
            surface = nullptr;
        }
    } else {
        MUST_BE_TRUE(surface || addrModel == LSC_ADDR_TYPE_FLAT,
            "only flat address model may have null surface");
    }
    return surface;
}

static bool isNullOperand(const G4_Operand *opnd) {
    return opnd == nullptr || opnd->isNullReg();
}

static int alignUp(int a, int n) {
    return n + a - 1 - ((n + a - 1) % a);
}
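// e.g. alignUp(32, 33) == 64 and alignUp(32, 64) == 64: n is rounded up to
// the nearest multiple of a (note the alignment is the *first* parameter)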

static int lscBlock2dComputeDataRegs(
    LSC_OP op,
    LSC_DATA_SHAPE_BLOCK2D dataShape2d,
    int dataSizeBits)
{
    const static int BYTES_PER_REG = COMMON_ISA_GRF_REG_SIZE;

    auto roundUpToPowerOf2 =
        [] (int n) {
        while (n & (n-1))
            n++;
        return n;
    };

    // this comes out of the HAS (1408569497)
    // non-transpose
    //   5.1.2.3 non-vnni (HAS pg. 8)
    //   5.1.1.2 vnni (pg. 13) perversely, this comes after 5.1.2.3 in the doc
    // transpose
    //   5.1.3.2 non-vnni (HAS pg. 10)
    //   5.1.2.2 vnni (HAS pg. 15)
    bool transpose = dataShape2d.order == LSC_DATA_ORDER_TRANSPOSE;
    int grfRowPitchElems =
        roundUpToPowerOf2(!transpose ? dataShape2d.width : dataShape2d.height);
    int blockRows = !transpose ? dataShape2d.height : dataShape2d.width;
    int elemsPerGrf = 8*BYTES_PER_REG/dataSizeBits;
    // alignUp needed for padding between blocks; each block pads out to
    // a full GRF
    int regsPerBlock =
        alignUp(elemsPerGrf, blockRows*grfRowPitchElems)/elemsPerGrf;
    //
    int dataRegs = dataShape2d.blocks*regsPerBlock;
    // C.f. DP_LOAD_2DBLOCK_ARRAY
    //   https://gfxspecs.intel.com/Predator/Home/Index/53680
    //
    //   Data payload size, in registers. Destination length of 32 is
    //   encoded as 31.  Data port hardware derives the correct destination
    //   length based on message parameters.
    if (op == LSC_LOAD_BLOCK2D && dataRegs == 32)
        dataRegs = 31;
    return dataRegs;
}
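// A worked example of the computation above (assuming a 64B GRF, i.e.
// COMMON_ISA_GRF_REG_SIZE == 64): a non-transposed d32 block with width=8,
// height=8, blocks=2 gives grfRowPitchElems=8 (already a power of two),
// elemsPerGrf = 8*64/32 = 16, regsPerBlock = alignUp(16, 8*8)/16 = 4,
// and thus dataRegs = 2*4 = 8.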

int IR_Builder::translateLscUntypedInst(
    LSC_OP                      op,
    LSC_SFID                    lscSfid,
    G4_Predicate               *pred,
    VISA_Exec_Size              visaExecSize,
    VISA_EMask_Ctrl             execCtrl,
    LSC_CACHE_OPTS              cacheOpts,
    LSC_ADDR                    addrInfo,
    LSC_DATA_SHAPE              dataShape,
    G4_Operand                 *surface,  // can be G4_Imm or G4_SrcRegRegion
    G4_DstRegRegion            *dstRead,  // dst can be NULL reg (e.g. store)
    G4_SrcRegRegion            *src0Addr, // always the addresses (base for strided)
    G4_Operand                 *src0AddrStride, // only for strided
    G4_SrcRegRegion            *src1Data, // store data/extra atomic operands
    G4_SrcRegRegion            *src2Data  // store data/extra atomic operands
)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    int status = VISA_SUCCESS;
    auto check =
        [&] (bool z, const char *what) {
        if (!z) {
            MUST_BE_TRUE(false, what);
            status = VISA_FAILURE;
        }
    };

    const G4_ExecSize execSize = toExecSize(visaExecSize);
    const G4_InstOpts instOpt = Get_Gen4_Emask(execCtrl, execSize);

    const static uint32_t BYTES_PER_REG = COMMON_ISA_GRF_REG_SIZE;

    // send descriptor
    uint32_t desc = 0;
    uint32_t exDesc = 0;

    // try and promote the surface identifier (e.g. BTI or SS obj) to ex desc
    surface = lscTryPromoteSurfaceImmToExDesc(surface, addrInfo.type, exDesc);
    const auto opInfo = LscOpInfoGet(op);
    MUST_BE_TRUE(!opInfo.isBlock2D(),
        "use translateLscUntypedBlock2DInst for lsc_*_block2d");

    check(
        opInfo.kind == LscOpInfo::LOAD ||
        opInfo.kind == LscOpInfo::STORE ||
        opInfo.kind == LscOpInfo::ATOMIC, "unhandled LSC op class");

    // Desc[5:0] is the message opcode
    desc |= opInfo.encoding; // Desc[5:0]

    // build the descriptor (Sect. 3.3.1 of the HAS)
    // (also https://gfxspecs.intel.com/Predator/Home/Index/53522)
    //
    //   Desc[5:0] = OPCODE {LOAD,STORE,LOAD_BLOCK,STORE_BLOCK,...}
    //   Desc[8:7] = addr size
    //   Desc[11:9] = data size
    //   Desc[15:12] = data vector size (or cmask)
    //   Desc[19:17] = caching controls (see the table for allowable combinations)
    //   Desc[30:29] = addr model (BTI = 3, SS = 2, BSS = 1, FLAT = 0)
    //
    // All other bits are undefined as of now
    //
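    // For instance, a hypothetical transposed d32 a64 flat load with a
    // one-element vector and default caching would encode (per the helpers
    // below) as
    //   Desc[8:7]=0x3 (64b addr), Desc[11:9]=0x2 (32b data),
    //   Desc[14:12]=0x0 (1 elem), Desc[15]=1 (transpose), Desc[30:29]=0 (FLAT),
    // before the dst/src0 lengths are OR'd in further below.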
    const int addrSizeBits = lscEncodeAddrSize(addrInfo.size, desc, status);
    const int dataSizeBits = lscEncodeDataSize(dataShape.size, desc, status);
    //
    int vecSize = 0; // definitely assigned
    if (!opInfo.hasChMask()) {
        vecSize = lscEncodeDataElems(dataShape.elems, desc, status);
        lscEncodeDataOrder(dataShape.order, desc, status);
    } else {
        MUST_BE_TRUE(dataShape.chmask, "channel mask must not be empty");
        vecSize = 0;
        if (dataShape.chmask & LSC_DATA_CHMASK_X) {
            desc |= 1 << 12;
            vecSize++;
        }
        if (dataShape.chmask & LSC_DATA_CHMASK_Y) {
            desc |= 1 << 13;
            vecSize++;
        }
        if (dataShape.chmask & LSC_DATA_CHMASK_Z) {
            desc |= 1 << 14;
            vecSize++;
        }
        if (dataShape.chmask & LSC_DATA_CHMASK_W) {
            desc |= 1 << 15;
            vecSize++;
        }
    }
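    // e.g. a quad message with chmask = X|Z sets Desc[12] and Desc[14] above,
    // giving vecSize = 2 (one unit per enabled channel)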
    lscEncodeCachingOpts(opInfo, cacheOpts, desc, status);
    lscEncodeAddrType(addrInfo.type, desc, status);

    ///////////////////////////////////////////////////////////////////////////
    // address adjustment and extra codegen (adds, shifts, and multiplies)
    // only pass exDesc if it's an immediate field
    auto addrExecSize = execSize;
    auto addrExecCtrl = execCtrl;
    const auto isStrided =
        op == LSC_OP::LSC_LOAD_STRIDED || op == LSC_OP::LSC_STORE_STRIDED;
    if (isStrided) {
        addrExecSize = g4::SIMD1;
        addrExecCtrl = vISA_EMASK_M1_NM;
    }
    src0Addr = lscLoadEffectiveAddress(
        op,
        lscSfid,
        pred, addrExecSize, addrExecCtrl, addrInfo, dataSizeBits / 8,
        surface,
        src0Addr,
        exDesc
    );

    uint32_t dataRegs = 1;
    uint32_t addrRegs = 1;

    G4_ExecSize minExecSize = lscMinExecSize(lscSfid);

    if (dataShape.order == LSC_DATA_ORDER_NONTRANSPOSE) {
        // Non-transpose case is the typical case.
        //
        // ceil[ SIMT32*dataSize(b)/512(b/REG) ] * vecSize
        //   units = (b/b*REG) = REG
        uint32_t width = std::max(execSize, minExecSize);
        dataRegs = std::max<uint32_t>(1,
            width * dataSizeBits / 8 / BYTES_PER_REG) * vecSize;
        addrRegs = std::max<uint32_t>(1,
            width * addrSizeBits / 8 / BYTES_PER_REG);

        if (execSize < minExecSize)
        {
            // we may need to even-align src and data
            auto evenAlignDcl = [this](G4_Operand* opnd)
            {
                G4_Declare* dcl = opnd->getTopDcl()->getRootDeclare();
                if (dcl->getByteSize() <= getGRFSize())
                {
                    dcl->setEvenAlign();
                }
            };

            if ((addrSizeBits / 8) * minExecSize > getGRFSize())
            {
                evenAlignDcl(src0Addr);
            }

            if ((dataSizeBits / 8) * minExecSize > getGRFSize())
            {
                if (!isNullOperand(dstRead))
                {
                    evenAlignDcl(dstRead);
                }
                if (!isNullOperand(src1Data))
                {
                    evenAlignDcl(src1Data);
                }
            }
            // we don't need to align src2 if it exists, as we'd need to generate
            // a temp send payload containing both src1 and src2 anyway
        }
    } else { // i.e. dataShape.order == LSC_DATA_ORDER_TRANSPOSE
        // The transpose case is a little odder.
        //
        // Here the data size is the SIMD size (ExecSize) times the number of
        // registers consumed by each vector sequence (always a full
        // register count per sequence).
        uint32_t regsPerVec = vecSize * dataSizeBits / 8 / BYTES_PER_REG;
        if (vecSize * dataSizeBits / 8 % BYTES_PER_REG)
            regsPerVec++; // pad out to full reg
        dataRegs = regsPerVec * execSize;
    }
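    // Worked example for the transpose branch (again assuming a 64B GRF):
    // a transposed d64v4 message needs 4*64/8 = 32B per address, which pads
    // to one full register, so dataRegs = 1 * execSize.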

    // override sizes for special cases
    if (op == LSC_OP::LSC_LOAD_STATUS) {
        dataRegs = 1; // this message just returns a bitset in the low DW
    }

    // cases that need a payload register built
    if (isStrided) {
        src0Addr = lscBuildStridedPayload(
            pred,
            src0Addr,
            src0AddrStride,
            dataSizeBits / 8,
            vecSize,
            dataShape.order == LSC_DATA_ORDER_TRANSPOSE);
        addrRegs = 1;
    }

    int src1Len = 0;
    uint32_t dstLen = 0;
    uint32_t src0Len = addrRegs;
    if (opInfo.isLoad()) {
        if (isNullOperand(dstRead)) {
            dstLen = 0; // prefetch
        } else {
            dstLen = dataRegs;
        }
        src1Len = 0;
    } else if (opInfo.isStore()) {
        dstLen = 0;
        src0Len = addrRegs;
        src1Len = (int)dataRegs;
    } else if (opInfo.isAtomic()) {
        if (opInfo.extraOperands == 0) { // e.g. lsc_atomic_iinc
            check(isNullOperand(src1Data) && isNullOperand(src2Data),
                "atomic unary must have null src1 and src2");
        } else if (opInfo.extraOperands == 1) { // e.g. lsc_atomic_add
            check(!isNullOperand(src1Data) && isNullOperand(src2Data),
                "atomic binary must have non-null src1 and null src2");
        } else {
            // lsc_atomic_icas/lsc_atomic_fcas: coalesce parameters into one
            check(!isNullOperand(src1Data) && !isNullOperand(src2Data),
                "atomic ternary must have non-null src1 and src2");
            src1Data =
                coalescePayload(
                    BYTES_PER_REG, BYTES_PER_REG,
                    std::max(minExecSize, execSize), execSize,
                    {src1Data, src2Data}, execCtrl);
        }
        src1Len = (int)dataRegs*opInfo.extraOperands;

        if (dstRead->isNullReg()) {
            dstLen = 0;
        } else {
            dstLen = dataRegs;
        }
    } else {
        check(false, "unexpected message type");
    }

    check(dstLen < 32, "too many destination registers (read operand)");
    check(src0Len < 32, "too many src0 registers (address)");
    check(src1Len < 32, "too many src1 registers (write operand)");

    // FIXME: we need to first sort out what the rules are on virtual registers.
    // I initially thought that one was supposed to use an alias over a .decl
    // and have properly sized inputs, but this assumption is proving false.
    auto checkDeclSize =
        [&] (const char *what,
            G4_Declare *dcl,
            int visaRegsInDcl,
            int genRegsNeeded)
    {
        // if (visaRegsInDcl != genRegsNeeded)
        if (visaRegsInDcl < genRegsNeeded) {
            std::stringstream ss;
            ss << what << " register dimensions don't fit data type\n";
            ss << "vISA decl given is: "; dcl->emit(ss);
            ss << " (" << (dcl->getTotalElems()*dcl->getElemSize()) << "B)\n";
            ss << "but payload should be " << genRegsNeeded << " reg(s)\n";
            switch (addrInfo.size) {
            case LSC_ADDR_SIZE_16b: ss << "addr size is 16b"; break;
            case LSC_ADDR_SIZE_32b: ss << "addr size is 32b"; break;
            case LSC_ADDR_SIZE_64b: ss << "addr size is 64b"; break;
            default: ss << "??";
            }
            ss << " x " << (int)execSize << " elem(s) ";
            if (dataShape.order == LSC_DATA_ORDER_TRANSPOSE) {
                ss << "transposed ";
            } else {
                ss << "non-transposed ";
            }
            ss << "and data ";
            switch (dataShape.size) {
            case LSC_DATA_SIZE_8b: ss << "8b"; break;
            case LSC_DATA_SIZE_16b: ss << "16b"; break;
            case LSC_DATA_SIZE_64b: ss << "64b"; break;
            default: ss << "32b"; break; // 32b or the conversion types
            }
            ss << " x " << vecSize;
            check(false, ss.str().c_str());
        }
    };

    // Some sanity checking of vISA region sizes with the computed sizes
    G4_Declare *addrDcl =
        src0Addr->getBase()->asRegVar()->getDeclare()->getRootDeclare();
    // addrDcl->emit(std::cout,true,false);
    check(addrDcl, "cannot find declaration for address register");

    // disable size checks if execSize is < min payload width,
    // since declares are allowed to be smaller than the payload size in this case
    if (execSize >= minExecSize)
    {
        if (addrDcl) {
            auto addrRegSize = addrDcl->getElemSize() * addrDcl->getTotalElems();
            auto visaAddrRegsInDcl =
                std::max<int>(addrRegSize / COMMON_ISA_GRF_REG_SIZE, 1);
            checkDeclSize("address", addrDcl, visaAddrRegsInDcl, addrRegs);
        }

        // loading/storing into the null register is permitted (e.g. prefetch)
        if (!isNullOperand(dstRead)) {
            // sanity check the number of destination operands with the types given
            G4_Declare* dstDcl =
                dstRead->getBase()->asRegVar()->getDeclare()->getRootDeclare();
            check(dstDcl != nullptr, "cannot find declaration for data register");
            unsigned dataRegBytes = dstDcl->getTotalElems() * dstDcl->getElemSize();
            auto visaRegsInDcl =
                std::max<int>(dataRegBytes / COMMON_ISA_GRF_REG_SIZE, 1);
            checkDeclSize("data", dstDcl, visaRegsInDcl, dstLen);
        }
    }

    desc |= dstLen << 20;   // Desc[24:20]  dst len
    desc |= addrRegs << 25; // Desc[28:25]  src0 len

    SFID sfid = SFID::NULL_SFID;
    switch (lscSfid) {
    case LSC_UGM:  sfid = SFID::UGM;  break;
    case LSC_UGML: sfid = SFID::UGML; break;
    case LSC_SLM:  sfid = SFID::SLM;  break;
    default: check(false, "invalid SFID for untyped LSC message");
    }

    G4_SendDescRaw *msgDesc = createLscDesc(
        sfid,
        desc,
        exDesc,
        src1Len,
        getSendAccessType(opInfo.isLoad(), opInfo.isStore()),
        surface);
    createLscSendInst(
        pred,
        dstRead,
        src0Addr,
        src1Data,
        execSize,
        msgDesc,
        instOpt,
        addrInfo.type,
        true);

    return status;
}


int IR_Builder::translateLscUntypedBlock2DInst(
    LSC_OP                      op,
    LSC_SFID                    lscSfid,
    G4_Predicate               *pred,
    VISA_Exec_Size              visaExecSize,
    VISA_EMask_Ctrl             emask,
    LSC_CACHE_OPTS              cacheOpts,
    LSC_DATA_SHAPE_BLOCK2D      dataShape2D,
    G4_DstRegRegion            *dstRead,  // dst can be NULL reg (e.g. store)
    G4_Operand                 *src0Addrs[LSC_BLOCK2D_ADDR_PARAMS], // always the addresses
    G4_SrcRegRegion            *src1Data // store data
)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    int status = VISA_SUCCESS;
    auto check =
        [&](bool z, const char *what) {
        if (!z) {
            MUST_BE_TRUE(false, what);
            status = VISA_FAILURE;
        }
    };

    const auto opInfo = LscOpInfoGet(op);
    MUST_BE_TRUE(opInfo.isBlock2D(), "not an LSC block2d op");

    // send descriptor
    uint32_t desc = 0;
    uint32_t exDesc = 0;

    desc |= opInfo.encoding;
    if (dataShape2D.vnni)
        desc |= (1 << 7); // Desc[7]
    int dataSizeBits =
        lscEncodeDataSize(dataShape2D.size, desc, status);
    if (dataShape2D.order == LSC_DATA_ORDER_TRANSPOSE)
        desc |= (1 << 15);
    lscEncodeCachingOpts(opInfo, cacheOpts, desc, status);
    desc |= (0 << 29); // Desc[30:29] = FLAT

    G4_SrcRegRegion *src0Addr =
        lscBuildBlock2DPayload(dataShape2D, pred, src0Addrs);

    uint32_t dataRegs =
        lscBlock2dComputeDataRegs(op, dataShape2D, dataSizeBits);
    uint32_t addrRegs = 1;

    int src1Len = 0;
    uint32_t dstLen = 0;
    uint32_t src0Len = addrRegs;

    if (opInfo.isLoad()) {
        if (isNullOperand(dstRead)) {
            dstLen = 0; // prefetch
        } else {
            dstLen = dataRegs;
        }
        src1Len = 0;
    } else if (opInfo.isStore()) {
        dstLen = 0;
        src0Len = addrRegs;
        src1Len = (int)dataRegs;
    } else {
        check(false, "unexpected message type");
    }

    desc |= dstLen << 20;   // Desc[24:20]  dst len
    desc |= addrRegs << 25; // Desc[28:25]  src0 len

    SFID sfid = SFID::NULL_SFID;
    switch (lscSfid) {
    case LSC_UGM:  sfid = SFID::UGM;  break;
    case LSC_UGML: sfid = SFID::UGML; break;
    case LSC_SLM:  sfid = SFID::SLM;  break;
    default: check(false, "invalid SFID for untyped block2d LSC message");
    }

    G4_SendDescRaw *msgDesc = createLscDesc(
        sfid,
        desc,
        exDesc,
        src1Len,
        getSendAccessType(opInfo.isLoad(), opInfo.isStore()),
        nullptr);

    const G4_ExecSize execSize = toExecSize(visaExecSize);
    const G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
    G4_InstSend *sendInst = createLscSendInst(
        pred,
        dstRead,
        src0Addr,
        src1Data,
        execSize,
        msgDesc,
        instOpt,
        LSC_ADDR_TYPE_FLAT,
        true);
    (void)sendInst;

    return status;
}


int IR_Builder::translateLscTypedInst(
    LSC_OP                  op,
    G4_Predicate           *pred,
    VISA_Exec_Size          execSizeEnum,
    VISA_EMask_Ctrl         emask,
    LSC_CACHE_OPTS          cacheOpts,
    LSC_ADDR_TYPE           addrModel,
    LSC_ADDR_SIZE           addrSize,
    LSC_DATA_SHAPE          shape,
    G4_Operand             *surface,  // surface/bti
    G4_DstRegRegion        *dstData,  // dst on load/atomic
    G4_SrcRegRegion        *src0AddrUs,
    G4_SrcRegRegion        *src0AddrVs,
    G4_SrcRegRegion        *src0AddrRs,
    G4_SrcRegRegion        *src0AddrLODs,
    G4_SrcRegRegion        *src1Data, // store data/extra atomic operands
    G4_SrcRegRegion        *src2Data  // icas/fcas only
)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    int status = VISA_SUCCESS;

    const uint32_t BYTES_PER_GRF = COMMON_ISA_GRF_REG_SIZE;

    const G4_ExecSize execSize = toExecSize(execSizeEnum);
    const G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);

    const auto opInfo = LscOpInfoGet(op);

    uint32_t desc = opInfo.encoding;
    uint32_t exDesc = 0;

    surface = lscTryPromoteSurfaceImmToExDesc(surface, addrModel, exDesc);

    int numChannels = 0;
    if (opInfo.hasChMask()) {
        if (shape.chmask & LSC_DATA_CHMASK_X) {
            desc |= 1 << 12;
            numChannels++;
        }
        if (shape.chmask & LSC_DATA_CHMASK_Y) {
            desc |= 1 << 13;
            numChannels++;
        }
        if (shape.chmask & LSC_DATA_CHMASK_Z) {
            desc |= 1 << 14;
            numChannels++;
        }
        if (shape.chmask & LSC_DATA_CHMASK_W) {
            desc |= 1 << 15;
            numChannels++;
        }
        MUST_BE_TRUE(numChannels != 0, "empty channel mask");
    } else {
        // atomics are single channel
        numChannels = 1;
    }
    int addrSizeBits = lscEncodeAddrSize(addrSize, desc, status);
    int dataSizeBits = lscEncodeDataSize(shape.size, desc, status);
    (void)addrSizeBits;
    (void)dataSizeBits;

    lscEncodeCachingOpts(opInfo, cacheOpts, desc, status); // Desc[19:17]
    lscEncodeAddrType(addrModel, desc, status);

    auto checkPayloadSize =
        [&] (const char *which,
            const G4_Declare *decl,
            int expectDeclRegs)
    {
        int dclRegs =
            std::max<int>(1,
                decl->getTotalElems()*decl->getElemSize()/BYTES_PER_GRF);
        // if (expectDeclRegs != dclRegs)
        // TODO: need to fix issue with IGC codegen using offsets
        // in raw vars
        if (expectDeclRegs > dclRegs) {
            std::stringstream ss;
            ss << which << " .decl size ";
            decl->emit(ss);
            ss << " (" << dclRegs << ")";
            ss << " mismatches expected number of registers for "
                "payload (" << expectDeclRegs << ")";
            // std::cerr << ss.str();
            MUST_BE_TRUE(false, ss.str().c_str());
        }
    };

    auto checkAddrPayloadSize =
        [&] (const char *which, const G4_SrcRegRegion *srcAddr) {
        if (srcAddr == nullptr || srcAddr->isNullReg()) {
            return;
        }
        const G4_Declare *decl = getDeclare(srcAddr);
        const int regsPerAddrChannel =
            std::max<int>(1, addrSizeBits*(int)execSize/8/BYTES_PER_GRF);
        checkPayloadSize(which, decl, regsPerAddrChannel);
    };
    checkAddrPayloadSize("src0AddrUs", src0AddrUs);
    checkAddrPayloadSize("src0AddrVs", src0AddrVs);
    checkAddrPayloadSize("src0AddrRs", src0AddrRs);
    checkAddrPayloadSize("src0AddrLODs", src0AddrLODs);
    G4_SrcRegRegion *srcAddrs[2] { };
    G4_SrcRegRegion *srcData = nullptr;
    unsigned srcAddrRegs[2] { };
    unsigned srcDataRegs = 0;
    uint32_t dstDataRegs = 0;
    if (opInfo.op == LSC_READ_STATE_INFO) {
        // like fences, send requires *something* (at least one reg) to be
        // sent out; we pick the initial r0 value since it's known to
        // be floating around somewhere until EOT
        const RegionDesc *rd = getRegionStride1();
        G4_Declare *r0 = getBuiltinR0();
        G4_SrcRegRegion *src0Dummy = createSrc(
            r0->getRegVar(),
            0, 0, rd, Type_UD);
        srcAddrRegs[0] = 1;
        srcAddrRegs[1] = 0;
        srcAddrs[0] = src0Dummy;
    } else {
        PayloadSource srcAddrPayloads[4] { }; // U, V, R, LOD
        unsigned numSrcAddrPayloads = 0;
        buildTypedSurfaceAddressPayload(
            src0AddrUs,
            src0AddrVs,
            src0AddrRs,
            src0AddrLODs,
            execSize,
            instOpt,
            srcAddrPayloads,
            numSrcAddrPayloads);
        preparePayload(
            srcAddrs,
            srcAddrRegs,
            execSize,
            false, // not a split send (so all the addrs land in one reg)
            srcAddrPayloads,
            numSrcAddrPayloads);
        MUST_BE_TRUE(srcAddrs[1] == nullptr, "invalid addr split");
        MUST_BE_TRUE(srcAddrRegs[0] < 32, "too many address registers");

        // each channel consumes at least one register (top padding may be 0)
        const int regsPerDataChannel =
            std::max<int>(1, dataSizeBits*(int)execSize/8/BYTES_PER_GRF);
        auto checkDataDeclSize =
            [&](const char *which, const G4_Operand *data) {
            if (data == nullptr || data->isNullReg()) {
                return;
            }
            const G4_Declare *decl = getDeclare(data);
            checkPayloadSize(which, decl, regsPerDataChannel*numChannels);
        };
        checkDataDeclSize("dstData", dstData);
        checkDataDeclSize("src1Data", src1Data);
        checkDataDeclSize("src2Data", src2Data);

        srcData = coalescePayload(
            BYTES_PER_GRF, BYTES_PER_GRF,
            std::max(getNativeExecSize(), execSize), execSize,
            {src1Data, src2Data}, emask);
        srcDataRegs = 0;
        if (!srcData->isNullReg()) {
            const G4_Declare *srcDcl = getDeclare(srcData);
            // srcDcl->emit(std::cerr, false, false);
            srcDataRegs =
                srcDcl->getTotalElems()*srcDcl->getElemSize()/BYTES_PER_GRF;
        }
        dstDataRegs =
            opInfo.isLoad() || (opInfo.isAtomic() && !dstData->isNullReg()) ?
            regsPerDataChannel*numChannels : 0;
    }
    int src1Len = (int)srcDataRegs; // lsc_load_quad.tgm / lsc_atomic_icas.tgm

    if (op == LSC_OP::LSC_LOAD_STATUS ||
        op == LSC_OP::LSC_READ_STATE_INFO)
    {
        dstDataRegs = 1; // just a single DW of bits (padded to 1 reg)
    }
    // MUST_BE_TRUE(dataSrcsRegs == dataRegs, "mismatch in .decls for "
    //     "number of data registers in actual message");
    MUST_BE_TRUE(srcDataRegs < 32, "too many data registers");

    desc |= (srcAddrRegs[0] & 0xF) << 25; // mlen == Desc[28:25]
    if (opInfo.isLoad() || (opInfo.isAtomic() && !dstData->isNullReg())) {
        desc |= (dstDataRegs & 0x1F) << 20; // rlen == Desc[24:20]
    }

    G4_SendDescRaw *msgDesc = createLscDesc(
        SFID::TGM,
        desc,
        exDesc,
        src1Len,
        getSendAccessType(opInfo.isLoad(), opInfo.isStore()),
        surface);
    G4_InstSend *sendInst = createLscSendInst(
        pred,
        dstData,
        srcAddrs[0],
        srcData,
        execSize,
        msgDesc,
        instOpt,
        addrModel,
        true);
    (void)sendInst;

    return status;
}

LSC_DATA_ELEMS IR_Builder::lscGetElementNum(unsigned eNum) const
{
    switch (eNum)
    {
    case 1:
        return LSC_DATA_ELEMS_1;
    case 2:
        return LSC_DATA_ELEMS_2;
    case 3:
        return LSC_DATA_ELEMS_3;
    case 4:
        return LSC_DATA_ELEMS_4;
    case 8:
        return LSC_DATA_ELEMS_8;
    case 16:
        return LSC_DATA_ELEMS_16;
    case 32:
        return LSC_DATA_ELEMS_32;
    case 64:
        return LSC_DATA_ELEMS_64;
    default:
        return LSC_DATA_ELEMS_INVALID;
    }

    return LSC_DATA_ELEMS_INVALID;
}

int IR_Builder::lscEncodeAddrSize(
    LSC_ADDR_SIZE addrSize, uint32_t &desc, int &status) const
{
    int addrSizeBits = 32;
    uint32_t addrSizeEnc = 0;
    switch (addrSize) {
    case LSC_ADDR_SIZE_16b: addrSizeEnc = 0x1; addrSizeBits = 16; break;
    case LSC_ADDR_SIZE_32b: addrSizeEnc = 0x2; addrSizeBits = 32; break;
    case LSC_ADDR_SIZE_64b: addrSizeEnc = 0x3; addrSizeBits = 64; break;
    default: MUST_BE_TRUE(false, "invalid address size"); status = VISA_FAILURE;
    }
    desc |= addrSizeEnc << 7;  // Desc[8:7]
    return addrSizeBits;
}

int IR_Builder::lscEncodeDataSize(
    LSC_DATA_SIZE dataSize, uint32_t &desc, int &status) const
{
    uint32_t dataSizeEnc = 0;
    int dataSizeBits = 32;
    switch (dataSize) {
    case LSC_DATA_SIZE_8b:      dataSizeEnc = 0x0; dataSizeBits =  8; break;
    case LSC_DATA_SIZE_16b:     dataSizeEnc = 0x1; dataSizeBits = 16; break;
    case LSC_DATA_SIZE_32b:     dataSizeEnc = 0x2; dataSizeBits = 32; break;
    case LSC_DATA_SIZE_64b:     dataSizeEnc = 0x3; dataSizeBits = 64; break;
    case LSC_DATA_SIZE_8c32b:   dataSizeEnc = 0x4; dataSizeBits = 32; break;
    case LSC_DATA_SIZE_16c32b:  dataSizeEnc = 0x5; dataSizeBits = 32; break;
    case LSC_DATA_SIZE_16c32bH: dataSizeEnc = 0x6; dataSizeBits = 32; break;
    default: MUST_BE_TRUE(false, "invalid data size"); status = VISA_FAILURE;
    }
    desc |= dataSizeEnc << 9; // Desc[11:9]
    return dataSizeBits;
}

int IR_Builder::lscEncodeDataElems(
    LSC_DATA_ELEMS dataElems, uint32_t &desc, int &status) const
{
    uint32_t vecSizeEnc = 0;
    int vecSize = 1;
    switch (dataElems) {
    case LSC_DATA_ELEMS_1:  vecSizeEnc = 0x0; vecSize =  1; break;
    case LSC_DATA_ELEMS_2:  vecSizeEnc = 0x1; vecSize =  2; break;
    case LSC_DATA_ELEMS_3:  vecSizeEnc = 0x2; vecSize =  3; break;
    case LSC_DATA_ELEMS_4:  vecSizeEnc = 0x3; vecSize =  4; break;
    case LSC_DATA_ELEMS_8:  vecSizeEnc = 0x4; vecSize =  8; break;
    case LSC_DATA_ELEMS_16: vecSizeEnc = 0x5; vecSize = 16; break;
    case LSC_DATA_ELEMS_32: vecSizeEnc = 0x6; vecSize = 32; break;
    case LSC_DATA_ELEMS_64: vecSizeEnc = 0x7; vecSize = 64; break;
    default: MUST_BE_TRUE(false, "invalid number of data elements"); status = VISA_FAILURE;
    }
    desc |= vecSizeEnc << 12; // Desc[14:12] is the vector size
    return vecSize;
}

void IR_Builder::lscEncodeDataOrder(
    LSC_DATA_ORDER order, uint32_t &desc, int &status) const
{
    if (order == LSC_DATA_ORDER_TRANSPOSE) {
        desc |= 1 << 15; // Desc[15] is transpose
    } else if (order != LSC_DATA_ORDER_NONTRANSPOSE) {
        MUST_BE_TRUE(false, "bad transpose value");
        status = VISA_FAILURE;
    }
}

void IR_Builder::lscEncodeCachingOpts(
    const LscOpInfo &opInfo,
    LSC_CACHE_OPTS cacheOpts,
    uint32_t &desc,
    int &status) const
{
    uint32_t cacheEnc = 0;
    if (!LscTryEncodeCacheOpts(opInfo, cacheOpts, cacheEnc, isLSCCacheOpt17_19())) {
        MUST_BE_TRUE(false, "unsupported caching options");
        status = VISA_FAILURE;
    }

    desc |= cacheEnc;
}

void IR_Builder::lscEncodeAddrType(
    LSC_ADDR_TYPE addrModel,
    uint32_t &desc,
    int &status) const
{
    uint32_t addrTypeEnc = 0;
    switch (addrModel) {
    case LSC_ADDR_TYPE_FLAT: addrTypeEnc = 0; break;
    case LSC_ADDR_TYPE_BSS:  addrTypeEnc = 1; break;
    case LSC_ADDR_TYPE_SS:   addrTypeEnc = 2; break;
    case LSC_ADDR_TYPE_BTI:  addrTypeEnc = 3; break;
    default: MUST_BE_TRUE(false, "invalid address model"); status = VISA_FAILURE;
    }
    desc |= addrTypeEnc << 29; // Desc[30:29] addr type
}

G4_SrcRegRegion *IR_Builder::lscBuildStridedPayload(
    G4_Predicate        *pred,
    G4_SrcRegRegion     *src0AddrBase, // output
    G4_Operand          *src0AddrStride,
    int dataSizeBytes, int vecSize, bool transposed)
{
    const uint32_t BYTES_PER_REG = COMMON_ISA_GRF_REG_SIZE;
    // We've been passed in a single value for the address, and we
    // have to generate the address payload register from that value
    // along with the pitch.
    //
    // E.g. we've been passed in the following.
    // .decl VADDR v_type=G type=UD num_elts=1 align=GRF
    //       (VADDR doesn't necessarily need to be GRF aligned)
    //
    // We need to generate:
    //    .decl VADDR_REG_UD v_type=G type=UD num_elts=NUM_PER_GRF(T) align=GRF
    //    .decl VADDR_REG_UQ type=UQ alias=<VADDR_REG_UD,0>
    //
    G4_Declare *addrTmpDeclUd = createSendPayloadDcl(BYTES_PER_REG/4, Type_UD);
    G4_Declare *addrTmpDeclUq = createSendPayloadDcl(BYTES_PER_REG/8, Type_UQ);
    addrTmpDeclUq->setAliasDeclare(addrTmpDeclUd, 0);
    //
    // Then to build the payload we need the following.
    //    ...
    //  [for 64b base addresses]
    //    (P) mov (M1_NM,1) VADDR_REG(0,0)<1>:uq  VADDR(0,0)<0;1,0>:T
    //  [for 32b base addresses]
    //    (P) mov (M1_NM,1) VADDR_REG(0,0)<1>:ud  VADDR(0,0)<0;1,0>:T
    //  ...
    //    (P) mov (M1_NM,1) VADDR_REG(0,2)<1>:ud  sizeof(T):ud
    //    (P) send (M1_NM,1) VDATA  VADDR_REG  null  lsc_load_block....
    //
    if (src0AddrBase->getType() == Type_UQ ||
        src0AddrBase->getType() == Type_Q)
    {
        G4_DstRegRegion
            *payloadDstAddrUq = createDst(
                addrTmpDeclUq->getRegVar(), 0, 0, 1, Type_UQ);
        createInst(
            pred, G4_mov, nullptr, g4::NOSAT, g4::SIMD1,
            payloadDstAddrUq, src0AddrBase, nullptr,
            Get_Gen4_Emask(vISA_EMASK_M1_NM, g4::SIMD1), true);
    }
    else
    {
        G4_DstRegRegion
            *payloadDstAddrUd = createDst(
                addrTmpDeclUd->getRegVar(), 0, 0, 1, Type_UD);
        createInst(
            pred, G4_mov, nullptr, g4::NOSAT, g4::SIMD1,
            payloadDstAddrUd, src0AddrBase, nullptr,
            Get_Gen4_Emask(vISA_EMASK_M1_NM, g4::SIMD1), true);
    }
    //
    G4_DstRegRegion
        *payloadDstPitch = createDst(
            addrTmpDeclUd->getRegVar(), 0, 2, 1, Type_UD);
    if (src0AddrStride == nullptr) {
        int defaultPitch = dataSizeBytes;
        if (!transposed)
            defaultPitch *= vecSize;
        src0AddrStride = createImmWithLowerType(defaultPitch, Type_UD);
    }
    createInst(
        pred, G4_mov, nullptr, g4::NOSAT, g4::SIMD1,
        payloadDstPitch, src0AddrStride, nullptr,
        Get_Gen4_Emask(vISA_EMASK_M1_NM, g4::SIMD1), true);
    //
    return createSrc(
        addrTmpDeclUd->getRegVar(), 0, 0,
        getRegionScalar(), Type_UD);
}

G4_SrcRegRegion *IR_Builder::lscBuildBlock2DPayload(
    LSC_DATA_SHAPE_BLOCK2D   dataShape2D,
    G4_Predicate            *pred,
    G4_Operand              *src0Addrs[6])
{
    // Similar to lscBuildStridedPayload, but this formats the payload
    // as follows.
    //
    // https://gfxspecs.intel.com/Predator/Home/Index/53567
    // A2DBLOCK_PAYLOAD:
    //   [31:0]:    base address lo (32b)
    //   [63:32]:   base address hi (32b)
    //   [95:64]:   surface width minus 1 (32b)
    //   [127:96]:  surface height minus 1 (32b)
    //   [159:128]: surface pitch minus 1 (32b)
    //   [191:160]: block X (32b)
    //   [223:192]: block Y (32b)
    //   [231:224]: block width (8b)
    //   [239:232]: block height (8b)
    //   [243:240]: array length (4b)
    //   [255:244]: UNDEFINED
    //
    // [StartX:s32, StartY:s32, Width:u32, Height:u32, ArrayLenMinus1:u4]
    // ArrayLenMinus1 is at [131:128]
    //
    // We generate the following (the block width, height, and count are
    // immediates and get packed into a single DWord):
    //
    //   .decl VADDR_REG_UD v_type=G type=UD num_elts=NUM_PER_GRF(T) align=GRF
    //   .decl VADDR_REG_UQ type=UQ alias=<VADDR_REG_UD,0>
    //   mov (M1_NM,1) ADDR(0,0):d   src0AddrX
    //   mov (M1_NM,1) ADDR(0,1):d   src0AddrY
    //   mov (M1_NM,1) ADDR(0,1):uq  ((blockWidth << 32)|blockHeight):uq
    //   mov (M1_NM,1) ADDR(0,4):d   arrayLen:uw
    const uint32_t BYTES_PER_REG = COMMON_ISA_GRF_REG_SIZE;
    G4_Declare *addrTmpDeclUd = createSendPayloadDcl(BYTES_PER_REG/4, Type_UD);
    G4_Declare *addrTmpDeclUq = createSendPayloadDcl(BYTES_PER_REG/8, Type_UQ);
    addrTmpDeclUq->setAliasDeclare(addrTmpDeclUd, 0);
    ///////////////////////
    auto movUQ =
        [&](int dstSubReg, G4_Operand *src) {
        G4_DstRegRegion
            *payloadDstAddr_0_Q = createDst(
                addrTmpDeclUq->getRegVar(),
                0, dstSubReg,
                1,
                Type_UQ);
        createInst(
            pred, G4_mov, nullptr, g4::NOSAT, g4::SIMD1,
            payloadDstAddr_0_Q,
            src, nullptr, Get_Gen4_Emask(vISA_EMASK_M1_NM, g4::SIMD1), true);
    };
    auto movUD =
        [&](int dstSubReg, G4_Operand *src) {
        G4_DstRegRegion
            *payloadDst = createDst(
                addrTmpDeclUd->getRegVar(), 0, dstSubReg, 1, Type_UD);
        createInst(
            pred, G4_mov, nullptr, g4::NOSAT, g4::SIMD1,
            payloadDst, src, nullptr,
            Get_Gen4_Emask(vISA_EMASK_M1_NM, g4::SIMD1), true);
    };
    auto movImmUD =
        [&](int dstSubReg, uint32_t imm) {
        movUD(dstSubReg, createImmWithLowerType(imm, Type_UD));
    };

    ///////////////////////////////////
    //   .decl ADDR v_type=G type=UD num_elts=NUM_PER_GRF(T) align=GRF
    //   .decl ADDR type=UQ alias=<VADDR_REG_UD,0>
    //   mov (M1_NM,1) ADDR(0,0):uq   src0AddrBase[0]:uq
    //   mov (M1_NM,1) ADDR(0,2):ud   src0AddrBase[1]:ud
    //   mov (M1_NM,1) ADDR(0,3):ud   src0AddrBase[2]:ud
    //   mov (M1_NM,1) ADDR(0,4):ud   src0AddrBase[3]:ud
    //   mov (M1_NM,1) ADDR(0,5):ud   src0AddrBase[4]:ud
    //   mov (M1_NM,1) ADDR(0,6):ud   src0AddrBase[5]:ud
    //   mov (M1_NM,1) ADDR(0,7):ud   (width x height x blocks):ud
    //
    // bottom 64b
    movUQ(0, src0Addrs[0]); // surface address
    // these start at REG.2:d
    movUD(2, src0Addrs[1]); // surface width - 1
    movUD(3, src0Addrs[2]); // surface height - 1
    movUD(4, src0Addrs[3]); // surface pitch - 1
    movUD(5, src0Addrs[4]); // block x
    movUD(6, src0Addrs[5]); // block y
    uint32_t blockSize =
        (dataShape2D.width - 1) |
        ((dataShape2D.height - 1) << 8) |
        ((dataShape2D.blocks - 1) << 16);
    movImmUD(7, blockSize);
    //
    return createSrc(
        addrTmpDeclUd->getRegVar(), 0, 0,
        getRegionScalar(), Type_UD);
}

G4_SrcRegRegion *IR_Builder::lscLoadEffectiveAddress(
    LSC_OP                    lscOp,
    LSC_SFID                  lscSfid,
    G4_Predicate             *pred,
    G4_ExecSize               execSize,
    VISA_EMask_Ctrl           execCtrl,
    LSC_ADDR                  addrInfo,
    int                       bytesPerDataElem,
    const G4_Operand         *surface,
    G4_SrcRegRegion          *addr,
    uint32_t                 &exDesc
)
{
    MUST_BE_TRUE(addrInfo.immScale == 1, "address scaling not supported yet");
    // The address may need scaling and offset adjustment
    //    NEW_ADDR = SCALE*ADDR + OFF
    //
    // e.g. lsc_load.ugm.d32.a64 ... [4*ADDR - 0x100]
    //

    // emulate scale and add if necessary
    return lscMulAdd(
        pred, execSize, execCtrl,
        addr, (int16_t)addrInfo.immScale, addrInfo.immOffset);
}


G4_SrcRegRegion *IR_Builder::lscCheckRegion(
    G4_Predicate             *pred,
    G4_ExecSize               execSize,
    VISA_EMask_Ctrl           execCtrl,
    G4_SrcRegRegion          *src)
{
    const G4_Type srcType = src->getType();
    // A later extension could repack and make these cases work; for now,
    // throw a tantrum if they give us
    // ... VAR<2;1,0>
    // we do permit VAR<0;1,0>
    MUST_BE_TRUE(
        src->getRegion()->isPackedRegion() || src->getRegion()->isScalar(),
        "input must be scalar/packed");
    MUST_BE_TRUE(src->getSubRegOff() == 0 || src->getRegion()->isScalar(),
        "vector operands must be register aligned");
    return src;
}

G4_SrcRegRegion *IR_Builder::lscMulAdd(
    G4_Predicate             *pred,
    G4_ExecSize               execSize,
    VISA_EMask_Ctrl           execCtrl,
    G4_SrcRegRegion          *src,
    int16_t                   mulImm16,
    int64_t                   addImm64)
{
    if (mulImm16 == 1 && addImm64 == 0) {
        // no op
        return src;
    } else if (mulImm16 == 1 && addImm64 != 0) {
        // reduces to an add
        return lscAdd(pred, execSize, execCtrl, src, addImm64);
    } else if (mulImm16 != 1 && addImm64 == 0) {
        // reduces to a multiply
        return lscMul(pred, execSize, execCtrl, src, mulImm16);
    } else {
        MUST_BE_TRUE(false, "multiply not supported yet");
        return nullptr;
        /*
        // hard cases...
        auto srcType = src->getElemType();
        if (srcType == Type_UQ || srcType == Type_Q) {
            // harder case: sub-optimal code for now will
            // flip the lo32/hi32 pairs around twice
            auto *tmpVar = lscMul(pred, execSize, execCtrl, src, mulImm16);
            return lscAdd(pred, execSize, execCtrl, tmpVar, addImm64);
        } else {
            G4_Imm *addImmOpnd;
            if (srcType == Type_UD || srcType == Type_D) {
                MUST_BE_TRUE(
                    addImm64 >= std::numeric_limits<int32_t>::min() &&
                    addImm64 <= std::numeric_limits<int32_t>::max(),
                    "imm offset for A32 must fit in 32b");
                addImmOpnd = createImmWithLowerType(addImm64, srcType);
            } else {
                MUST_BE_TRUE(
                    addImm64 >= std::numeric_limits<int16_t>::min() &&
                    addImm64 <= std::numeric_limits<int16_t>::max(),
                    "imm offset for A16 must fit in 16b");
                addImmOpnd = createImmWithLowerType(addImm64, srcType);
            }
            // can use 32b + 32b x 16b mad (all platforms) (in place)
            // create a new register in case there's aliasing
            G4_Declare *result = createTempVar(execSize, srcType, GRFALIGN);
            G4_DstRegRegion *dstRgn =
                createDst(result->getRegVar(), 0, 0, 1, srcType);
            const auto *srcRgnVal = execSize == 1 ? getRegionScalar() : getRegionStride1();
            G4_SrcRegRegion *srcRgn =
                createSrcRegRegion(src->getRegVar(), 0, 0, srcRgnVal, srcType);
            //
            G4_Operand *mulImmOp = createImm(mulImm16, Type_W);
            createInst(pred, G4_mad, nullptr, false, execSize,
                dstRgn, addImmOpnd, srcRgn, mulImmOp,
                Get_Gen4_Emask(execCtrl, execSize));
            //
            return result;
        }
        */
    }
}


static bool isPow2(int x)
{
    return (x & (x - 1)) == 0;
}
static int intLog2(int x)
{
    int shiftAmt = 0;
    while (x > 1) {
        x >>= 1;
        shiftAmt++;
    }
    return shiftAmt;
}
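// NB: these helpers are only meaningful for positive inputs: intLog2
// computes a floor (e.g. intLog2(6) == 2) and isPow2(0) degenerately
// returns true.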

G4_SrcRegRegion *IR_Builder::lscMul(
    G4_Predicate             *pred,
    G4_ExecSize               execSize,
    VISA_EMask_Ctrl           execCtrl,
    G4_SrcRegRegion          *src0,
    int16_t                   mulImm)
{
    if (mulImm == 1)
        return src0;

    const auto srcType = src0->getType();
    if (srcType == Type_UQ || srcType == Type_Q) {
        return lscMul64Aos(pred, execSize, execCtrl, src0, mulImm);
    } else {
        /*
        G4_Declare *result = createTempVar(execSize, srcType, GRFALIGN);
        G4_DstRegRegion *dst =
            createDst(result->getRegVar(), 0, 0, 1, srcType);
        const auto *srcRgn = execSize == 1 ?
            getRegionScalar() : getRegionStride1();
        G4_SrcRegRegion *src0 =
            createSrcRegRegion(srcVar->getRegVar(), 0, 0, srcRgn, srcType);
        G4_Operand *mulImmOp = createImm(mulImm, Type_W);
        createInst(
            duplicateOperand(pred),
            G4_mul, nullptr, false,
            execSize, dst, src0, mulImmOp, execCtrl);
        return result;
        */
        MUST_BE_TRUE(false, "lscMul unsupported");
        return nullptr;
    }
}


G4_SrcRegRegion *IR_Builder::lscAdd(
    G4_Predicate             *pred,
    G4_ExecSize               execSize,
    VISA_EMask_Ctrl           execCtrl,
    G4_SrcRegRegion          *src0,
    int64_t                   addImm64)
{
    if (addImm64 == 0)
        return src0;

    const G4_Type srcType = src0->getType();
    MUST_BE_TRUE(
        srcType == Type_UQ || srcType == Type_Q ||
        srcType == Type_UD || srcType == Type_D ||
        srcType == Type_UW || srcType == Type_W,
        "function only supports integer types");

    src0 = lscCheckRegion(pred, execSize, execCtrl, src0);

    if (srcType == Type_UQ || srcType == Type_Q) {
        if (hasInt64Add()) {
            return lscAdd64AosNative(pred, execSize, execCtrl, src0, addImm64);
        } else {
            return lscAdd64AosEmu(pred, execSize, execCtrl, src0, addImm64);
        }
    } else if ((int32_t)addImm64 != addImm64) {
        MUST_BE_TRUE(false, "<64b add must not use >32b imm off");
    } else if ((srcType == Type_UW || srcType == Type_W) &&
        (int16_t)addImm64 != addImm64)
    {
        MUST_BE_TRUE(false, "16b add must not use >16b imm off");
    }

    // we can do this in one instruction
    G4_Declare *result = createTempVar(execSize, srcType, GRFALIGN);
    G4_DstRegRegion *dst = createDst(result->getRegVar(), srcType);
    const auto *srcRgn = execSize == g4::SIMD1 ? getRegionScalar() : getRegionStride1();
    G4_Operand *immOp = createImmWithLowerType(addImm64, srcType);
    createInst(
        duplicateOperand(pred),
        G4_add, nullptr, g4::NOSAT, execSize,
        dst, src0, immOp, Get_Gen4_Emask(execCtrl, execSize), true);

    return createSrc(result->getRegVar(), 0, 0, srcRgn, srcType);
}

G4_SrcRegRegion *IR_Builder::lscAdd64AosNative(
    G4_Predicate             *pred,
    G4_ExecSize               execSize,
    VISA_EMask_Ctrl           execCtrl,
    G4_SrcRegRegion          *srcReg64,
    int64_t                   addImm64)
{
    if (addImm64 == 0)
        return srcReg64;
    // we can assume this is only called on >=PVC (has LSC and DG2 lacks native int64)
    const auto *srcRgn1 = execSize == g4::SIMD1 ? getRegionScalar() : getRegionStride1();
    const G4_Type srcType = srcReg64->getType();
    MUST_BE_TRUE(
        srcType == Type_UQ || srcType == Type_Q,
        "this function only supports Q/UQ types");
    G4_Declare *result = createTempVar(execSize, srcType, GRFALIGN);
    G4_DstRegRegion *dst =
        createDst(result->getRegVar(), 0, 0, 1, Type_Q);
    MUST_BE_TRUE(
        addImm64 >= std::numeric_limits<int32_t>::min() &&
        addImm64 <= std::numeric_limits<int32_t>::max(), "offset too big");
    G4_Imm *srcImm = createImm((int32_t)addImm64, Type_D);
    createInst(
        duplicateOperand(pred),
        G4_add, nullptr, g4::NOSAT, execSize,
        dst, srcReg64, srcImm, Get_Gen4_Emask(execCtrl, execSize), true);

    return createSrc(result->getRegVar(), 0, 0, srcRgn1, srcReg64->getType());
}

G4_SrcRegRegion *IR_Builder::lscAdd64AosEmu(
    G4_Predicate             *pred,
    G4_ExecSize               execSize,
    VISA_EMask_Ctrl           execCtrl,
    G4_SrcRegRegion          *srcReg64,
    int64_t                   addImm64)
{
    if (addImm64 == 0)
        return srcReg64;

    const auto *srcRgn1 = execSize == g4::SIMD1 ? getRegionScalar() : getRegionStride1();
    const auto *srcRgn2 = execSize == g4::SIMD1 ? getRegionScalar() : getRegionStride2();
    int dstRgnHz2 = execSize == g4::SIMD1 ? 1 : 2;

    const G4_Type srcType = srcReg64->getType();
    MUST_BE_TRUE(
        srcType == Type_UQ || srcType == Type_Q,
        "this function only supports Q/UQ types");

    // Given REG64.K<1;1,0>:q we need to split this into the low and high
    // halves: REG32.(2*K)<2;1,0>:d and REG32.(2*K+1)<2;1,0>:d
    // (scalar gets scalar regions)
    //
    // These are lambdas because we have to extract these regions repeatedly
    // for each pass (walking them forward)
    auto getSrcReg32 = [&] (int pass, short evenOdd) {
        // walk the base register forward if the input is vector
        int passRegOff = srcReg64->getRegion()->isScalar() ? 0 : 2 * pass;
        G4_SrcRegRegion *srcReg32 =
            createSrc(
                srcReg64->getBase(),
                srcReg64->getRegOff() + passRegOff,
                2 * srcReg64->getSubRegOff() + evenOdd,
                srcRgn2,
                Type_UD);
        return srcReg32;
    };

    // DST = SRC + IMM64
    // (W) addc (..|M0) TMP0<1>   SRC.0<2>  LO32(imm64)         {AccWrEn}
    // (W) addX (..|M0) TMP1<1>   SRC.1<2>  [HI32(imm64)] acc0
    // (P) mov  (..|MX) DST.0<2>  TMP1.0<1> // mux it back out
    // (P) mov  (..|MX) DST.1<2>  TMP2.0<1>
    G4_Declare *result = createTempVar(execSize, srcType, GRFALIGN);
    //
    VISA_EMask_Ctrl passExecCtrl = execCtrl;
    const G4_ExecSize passExecSize = std::min<G4_ExecSize>(execSize, getNativeExecSize());
    const int passes = std::max<int>(1, execSize/getNativeExecSize());
    //
    // shared immediate operands
    G4_Imm *srcImmLo32 = createImm(addImm64 & 0xFFFFFFFF, Type_UD);
    uint32_t hi32Bits = (uint32_t)(addImm64 >> 32);
    G4_Imm *srcImmHi32 = (hi32Bits != 0) ? createImm(hi32Bits, Type_UD) : nullptr;
    //
    for (int pass = 0; pass < passes; pass++)
    {
        // e.g. someone tries to do a SIMD32 starting at M16
        MUST_BE_TRUE(passExecCtrl != vISA_NUM_EMASK, "invalid exec mask");
        //
        G4_Declare *TMP_LO32 = createTempVar(passExecSize, Type_UD, GRFALIGN);
        G4_DstRegRegion *dstAddcLo =
            createDst(TMP_LO32->getRegVar(), 0, 0, 1, Type_UD);
        G4_SrcRegRegion *srcAddcLo = getSrcReg32(pass, 0);
        G4_INST* addLoInst = createInst(
            duplicateOperand(pred),
            G4_addc, nullptr, g4::NOSAT, passExecSize,
            dstAddcLo, srcAddcLo, srcImmLo32,
            Get_Gen4_Emask(vISA_EMASK_M1_NM, passExecSize) | InstOpt_AccWrCtrl, true);
        G4_DstRegRegion *dstAcc0 = createDst(phyregpool.getAcc0Reg(), 0, 0, 1, Type_UD);
        addLoInst->setImplAccDst(dstAcc0);
        //
        G4_Declare *TMP_HI32 = createTempVar(passExecSize, Type_UD, GRFALIGN);
        G4_DstRegRegion *dstAddHi =
            createDst(TMP_HI32->getRegVar(), 0, 0, 1, Type_UD);
        G4_SrcRegRegion *srcAddHi = getSrcReg32(pass, 1);
        G4_SrcRegRegion *srcAcc0 =
            createSrc(phyregpool.getAcc0Reg(), 0, 0, srcRgn1, Type_UD);
        if (srcImmHi32) {
            createInst(
                duplicateOperand(pred),
                G4_add3, nullptr, g4::NOSAT, passExecSize,
                dstAddHi, srcAcc0, srcAddHi, srcImmHi32,
                Get_Gen4_Emask(vISA_EMASK_M1_NM, passExecSize), true);
        } else {
            createInst(
                duplicateOperand(pred),
                G4_add, nullptr, g4::NOSAT, passExecSize,
                dstAddHi, srcAcc0, srcAddHi,
                Get_Gen4_Emask(vISA_EMASK_M1_NM, passExecSize), true);
        }
        //
        G4_DstRegRegion *resultLo =
            createDst(result->getRegVar(), 2*pass, 0, dstRgnHz2, Type_UD);
        G4_SrcRegRegion *tmpLoSrc =
            createSrc(TMP_LO32->getRegVar(), 0, 0, srcRgn1, Type_UD);
        createInst(
            duplicateOperand(pred),
            G4_mov, nullptr, g4::NOSAT, passExecSize,
            resultLo, tmpLoSrc, nullptr, Get_Gen4_Emask(passExecCtrl, passExecSize), true);
        //
        G4_DstRegRegion *resultHi =
            createDst(result->getRegVar(), 2*pass, 1, dstRgnHz2, Type_UD);
        G4_SrcRegRegion *tmpHiSrc =
            createSrc(TMP_HI32->getRegVar(), 0, 0, srcRgn1, Type_UD);
        createInst(
            duplicateOperand(pred),
            G4_mov, nullptr, g4::NOSAT, passExecSize,
            resultHi, tmpHiSrc, nullptr, Get_Gen4_Emask(passExecCtrl, passExecSize), true);
        //
        passExecCtrl = Get_Next_EMask(passExecCtrl, passExecSize);
    }

    return createSrc(result->getRegVar(), 0, 0, srcRgn1, srcReg64->getType());
}

G4_SrcRegRegion *IR_Builder::lscMul64Aos(
    G4_Predicate             *pred,
    G4_ExecSize               execSize,
    VISA_EMask_Ctrl           execCtrl,
    G4_SrcRegRegion          *src0,
    int16_t                   mulImm)
{
    if (mulImm == 1)
        return src0;

    MUST_BE_TRUE(false, "mul64-aos not supported yet");
    return nullptr;

    /*
    const auto *srcRgn1 = execSize == g4::SIMD1 ? getRegionScalar() : getRegionStride1();
    const auto *srcRgn2 = execSize == g4::SIMD1 ? getRegionScalar() : getRegionStride2();
    int dstRgnHz2 = execSize == g4::SIMD1 ? 1 : 2;

    // 64b x 16b multiply using 32b math
    G4_Declare *srcVar = src0->getBase()->asRegVar()->getDeclare();
    const G4_Type srcType = srcVar->getElemType();
    MUST_BE_TRUE(srcType == Type_UQ || srcType == Type_Q, "type should be 64b");
    //
    // either way below we need the accumulator, so we're limited to using
    // multiple passes to perform the math
    const G4_ExecSize passExecSize = std::min<G4_ExecSize>(execSize, getNativeExecSize());
    const int passes = std::max<int>(1, execSize/getNativeExecSize());

    G4_Declare *result = createTempVar(execSize, srcType, GRFALIGN);
    if (isPow2(mulImm)) {
        // e.g. SIMD32 when SIMD8 is the max HW size requires four passes
        int shlAmt = intLog2(mulImm);
        VISA_EMask_Ctrl passExecCtrl = execCtrl;
        for (int pass = 0; pass < passes; pass++)
        {
            // e.g. someone tries to do a SIMD32 starting at M16
            MUST_BE_TRUE(passExecCtrl != vISA_NUM_EMASK, "invalid exec mask");
            //
            // (with shlAmt = 3 and 32 - shlAmt = 29:)
            // shr (E|M0)  TMP.0<1>:ud     SRC.0<2>       29
            // shl (E|M0)  DST.0<2>:ud     SRC.0<2>        3
            // shl (E|M0)  DST.1<2>:ud     SRC.1<2>        3
            // or  (E|M0)  DST.1<2>:ud     DST.1<2>      TMP
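            // per lane (scalar model): with s = shlAmt,
            //   dst.lo = src.lo << s;
            //   dst.hi = (src.hi << s) | (src.lo >> (32 - s));
            // i.e. a 64b left shift by s synthesized from the 32b halves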
            auto passInstOpt = Get_Gen4_Emask(passExecCtrl, passExecSize);
            G4_Declare *TMP = createTempVar(passExecSize, Type_UD, GRFALIGN);
            G4_DstRegRegion *dstTMP =
                createDst(TMP->getRegVar(), 0, 0, 1, Type_UD);
            G4_SrcRegRegion *srcLo32a =
                createSrcRegRegion(srcVar->getRegVar(), 2*pass, 0, srcRgn2, Type_UD);
            G4_Operand *shrImmAmt = createImm(32 - shlAmt, Type_W);
            createBinOp(
                duplicateOperand(pred),
                G4_shr, passExecSize,
                dstTMP, srcLo32a, shrImmAmt, passInstOpt);
            //
            G4_Operand *shlImmAmt = createImm(shlAmt, Type_W);
            G4_DstRegRegion *dstLo32 =
                createDst(result->getRegVar(), 2*pass, 0, dstRgnHz2, Type_UD);
            G4_SrcRegRegion *srcLo32b =
                createSrcRegRegion(srcVar->getRegVar(), 2*pass, 0, srcRgn2, Type_UD);
            createBinOp(
                duplicateOperand(pred),
                G4_shl, passExecSize, dstLo32, srcLo32b, shlImmAmt, passInstOpt);
            //
            G4_DstRegRegion *dstHi32a =
                createDst(result->getRegVar(), 2*pass, 1, dstRgnHz2, Type_UD);
            G4_SrcRegRegion *srcHi32a =
                createSrcRegRegion(srcVar->getRegVar(), 2*pass, 1, srcRgn2, Type_UD);
            createBinOp(
                duplicateOperand(pred),
                G4_shl, passExecSize, dstHi32a, srcHi32a, shlImmAmt, passInstOpt);
            //
            // or the shifted-out low bits into the already-shifted high half
            G4_DstRegRegion *dstHi32b =
                createDst(result->getRegVar(), 2*pass, 1, dstRgnHz2, Type_UD);
            G4_SrcRegRegion *srcHi32b =
                createSrcRegRegion(result->getRegVar(), 2*pass, 1, srcRgn2, Type_UD);
            G4_SrcRegRegion *srcTMP =
                createSrcRegRegion(TMP->getRegVar(), 0, 0, srcRgn1, Type_UD);
            createBinOp(
                duplicateOperand(pred),
                G4_or, passExecSize, dstHi32b, srcHi32b, srcTMP,
                passInstOpt);

            passExecCtrl = Get_Next_EMask(passExecCtrl, passExecSize);
        }
    } else {
        // have to use mul/mach
        // SOA version
        //     mul  (8|M0)   DST_LO32<1>:ud   SRC.lo32:ud      imm16:uw
        // (W) mul  (8|M0)   acc0.0<1>:ud     SRC.lo32:ud      imm16:uw
        //     mach (8|M0)   TMP0.0<1>:d      SRC.lo32:ud      imm16:ud {AccWrEn}
        //     mul  (8|M0)   TMP1.0<1>:d      SRC.hi32:d       imm16:uw
        //     add  (8|M0)   DST_HI32<1>:d    TMP0:d           TMP1:d
        // AOS version: the pass execution size is bounded by the number of
        // dwords acc0 holds; PO is the pass's channel offset
        //     mul  (P|PO)   DST.0<2>:ud    SRC.0<2>:ud  imm16:uw
        // (W) mul  (P|M0)   acc0.0<1>:ud   SRC.0<2>:ud  imm16:uw
        // (W) mach (P|M0)   TMP0<1>:d      SRC.0<2>:ud  imm16:ud {AccWrEn}
        //     mul  (P|PO)   TMP1<1>:d      SRC.1<2>:d   imm16:uw
        //     add  (P|PO)   DST.1<2>:d     TMP0:d       TMP1:d
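        // per lane (scalar model): the 64b x 16b product decomposes as
        //   uint64_t p = (uint64_t)src.lo * imm16; // mul keeps the low 32b,
        //                                          // mach the high 32b (acc0)
        //   dst.lo = (uint32_t)p;
        //   dst.hi = (uint32_t)(p >> 32) + src.hi * imm16;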
        VISA_EMask_Ctrl passExecCtrl = execCtrl;
        G4_Operand *srcImm16 = createImm(mulImm, Type_UW);

        for (int pass = 0; pass < passes; pass++)
        {
            // e.g. someone tries to do a SIMD32 starting at M16
            MUST_BE_TRUE(passExecCtrl != vISA_NUM_EMASK, "invalid exec mask");
            //
            G4_DstRegRegion *dstMul1 =
                createDst(result->getRegVar(), 2*pass, 0, dstRgnHz2, Type_UD);
            G4_SrcRegRegion *srcMul1 =
                createSrcRegRegion(srcVar->getRegVar(), 2*pass, 0, srcRgn2, Type_UD);
            createInst(
                duplicateOperand(pred),
                G4_mul, nullptr, g4::NOSAT, passExecSize,
                dstMul1, srcMul1, srcImm16,
                Get_Gen4_Emask(passExecCtrl, passExecSize), true);
            //
            G4_Declare *TMP0 = createTempVar(passExecSize, Type_UD, GRFALIGN);
            G4_DstRegRegion *dstMul2 =
                createDst(phyregpool.getAcc0Reg(), 0, 0, 1, Type_UD);
            G4_SrcRegRegion *srcMul2 = duplicateOperand(srcMul1);
            createInst(
                duplicateOperand(pred),
                G4_mul, nullptr, g4::NOSAT, passExecSize,
                dstMul2, srcMul2, srcImm16,
                Get_Gen4_Emask(vISA_EMASK_M1_NM, passExecSize), true);
            //
            G4_DstRegRegion *dstMach3 =
                createDst(TMP0->getRegVar(), 0, 0, 1, Type_D);
            G4_SrcRegRegion *srcMach3 = duplicateOperand(srcMul1);
            G4_INST *i = createInst(
                duplicateOperand(pred),
                G4_mach, nullptr, g4::NOSAT, passExecSize,
                dstMach3, srcMach3, srcImm16,
                Get_Gen4_Emask(vISA_EMASK_M1_NM, passExecSize) | InstOpt_AccWrCtrl, true);
            G4_SrcRegRegion *srcImplAcc =
                createSrcRegRegion(phyregpool.getAcc0Reg(), 0, 0, srcRgn1, Type_D);
            i->setImplAccSrc(srcImplAcc);
            //
            G4_Declare *TMP1 = createTempVar(passExecSize, Type_D, GRFALIGN);
            G4_DstRegRegion *dstMul4 =
                createDst(TMP1->getRegVar(), 0, 0, 1, Type_D);
            G4_SrcRegRegion *srcMul4 =
                createSrcRegRegion(srcVar->getRegVar(), 2*pass, 1, srcRgn2, Type_D);
            createInst(
                duplicateOperand(pred),
                G4_mul, nullptr, g4::NOSAT, passExecSize,
                dstMul4, srcMul4, srcImm16,
                Get_Gen4_Emask(passExecCtrl, passExecSize), true);
            //
            G4_DstRegRegion *dstAdd5 =
                createDst(result->getRegVar(), 2*pass, 1, dstRgnHz2, Type_D);
            G4_SrcRegRegion *src0Add5 =
                createSrcRegRegion(TMP0->getRegVar(), 0, 0, srcRgn1, Type_D);
            G4_SrcRegRegion *src1Add5 =
                createSrcRegRegion(TMP1->getRegVar(), 0, 0, srcRgn1, Type_D);
            createInst(
                duplicateOperand(pred),
                G4_add, nullptr, g4::NOSAT, passExecSize,
                dstAdd5, src0Add5, src1Add5,
                Get_Gen4_Emask(passExecCtrl, passExecSize), true);
            //
            passExecCtrl = Get_Next_EMask(passExecCtrl, passExecSize);
        }
    }
    return createSrc(result->getRegVar(), 0, 0, srcRgn1, srcType);
    */
}