1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2020-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "BuildIR.h"
10 #include "../Timer.h"
11
12 #include <cmath>
13
14 using namespace vISA;
15
lscMinExecSize(LSC_SFID lscSfid) const16 G4_ExecSize IR_Builder::lscMinExecSize(LSC_SFID lscSfid) const
17 {
18 const TARGET_PLATFORM P = getPlatform();
19 uint32_t minExecSize = (P == GENX_DG2 ? 8 : 16);
20 if (!hasLSCEnableHalfSIMD())
21 {
22 minExecSize *= 2;
23 }
24 return G4_ExecSize(minExecSize);
25 }
26
lscTryPromoteSurfaceImmToExDesc(G4_Operand * surface,LSC_ADDR_TYPE addrModel,uint32_t & exDesc)27 static G4_Operand *lscTryPromoteSurfaceImmToExDesc(
28 G4_Operand *surface, LSC_ADDR_TYPE addrModel, uint32_t &exDesc)
29 {
30 if (surface && surface->isImm()) {
31 // try and promote any immediate surface to the extended descriptor if
32 // possible; we get [31:12] in the EU ISA to work with.
33 auto surfaceImm = (uint32_t)surface->asImm()->getImm();
34 if (addrModel == LSC_ADDR_TYPE_BTI) {
35 // promote the immediate BTI to the descriptor
36 exDesc |= surfaceImm << 24;
37 surface = nullptr;
38 } else if (
39 addrModel == LSC_ADDR_TYPE_BSS ||
40 addrModel == LSC_ADDR_TYPE_SS)
41 {
42 if ((surfaceImm & 0x3FF) == 0) {
43 exDesc |= surfaceImm;
44 surface = nullptr;
45 }
46 } else {
47 // flat address type
48 MUST_BE_TRUE(surface->isNullReg() ||
49 surfaceImm == PREDEFINED_SURFACE_SLM ||
50 surfaceImm == PREDEFINED_SURFACE_T255, // not sure what's up here
51 "flat address type must have null reg (or 0)");
52 surface = nullptr;
53 }
54 } else {
55 MUST_BE_TRUE(surface || addrModel == LSC_ADDR_TYPE_FLAT,
56 "only flat address model may have null surface");
57 }
58 return surface;
59 }
60
isNullOperand(const G4_Operand * opnd)61 static bool isNullOperand(const G4_Operand *opnd) {
62 return opnd == nullptr || opnd->isNullReg();
63 }
64
// Rounds n up to the nearest multiple of a (a must be non-zero).
static int alignUp(int a, int n) {
    return ((n + a - 1) / a) * a;
}
68
lscBlock2dComputeDataRegs(LSC_OP op,LSC_DATA_SHAPE_BLOCK2D dataShape2d,int dataSizeBits)69 static int lscBlock2dComputeDataRegs(
70 LSC_OP op,
71 LSC_DATA_SHAPE_BLOCK2D dataShape2d,
72 int dataSizeBits)
73 {
74 const static int BYTES_PER_REG = COMMON_ISA_GRF_REG_SIZE;
75
76 auto roundUpToPowerOf2 =
77 [] (int n) {
78 while (n & (n-1))
79 n++;
80 return n;
81 };
82
83 // this comes out of the HAS (1408569497)
84 // non-transpose
85 // 5.1.2.3 non-vnni (HAS pg. 8)
86 // 5.1.1.2 vnni (pg.13) perversely, this comes after 5.1.2.3 in the doc
87 // transpose
88 // 5.1.3.2 non-vnni (HAS pg. 10)
89 // 5.1.2.2 vnni (HAS pg. 15)
90 bool transpose = dataShape2d.order == LSC_DATA_ORDER_TRANSPOSE;
91 int grfRowPitchElems =
92 roundUpToPowerOf2(!transpose ? dataShape2d.width : dataShape2d.height);
93 int blockRows = !transpose ? dataShape2d.height : dataShape2d.width;
94 int elemsPerGrf = 8*BYTES_PER_REG/dataSizeBits;
95 // alignUp needed for padding between blocks; each block pads out to
96 // a full GRF
97 int regsPerBlock =
98 alignUp(elemsPerGrf, blockRows*grfRowPitchElems)/elemsPerGrf;
99 //
100 int dataRegs = dataShape2d.blocks*regsPerBlock;
101 // C.f. DP_LOAD_2DBLOCK_ARRAY
102 // https://gfxspecs.intel.com/Predator/Home/Index/53680
103 //
104 // Data payload size, in registers. Destination length of 32 is
105 // encoded as 31. Data port hardware derives the correct destination
106 // length based on message parameters.
107 if (op == LSC_LOAD_BLOCK2D && dataRegs == 32)
108 dataRegs = 31;
109 return dataRegs;
110 }
111
translateLscUntypedInst(LSC_OP op,LSC_SFID lscSfid,G4_Predicate * pred,VISA_Exec_Size visaExecSize,VISA_EMask_Ctrl execCtrl,LSC_CACHE_OPTS cacheOpts,LSC_ADDR addrInfo,LSC_DATA_SHAPE dataShape,G4_Operand * surface,G4_DstRegRegion * dstRead,G4_SrcRegRegion * src0Addr,G4_Operand * src0AddrStride,G4_SrcRegRegion * src1Data,G4_SrcRegRegion * src2Data)112 int IR_Builder::translateLscUntypedInst(
113 LSC_OP op,
114 LSC_SFID lscSfid,
115 G4_Predicate *pred,
116 VISA_Exec_Size visaExecSize,
117 VISA_EMask_Ctrl execCtrl,
118 LSC_CACHE_OPTS cacheOpts,
119 LSC_ADDR addrInfo,
120 LSC_DATA_SHAPE dataShape,
121 G4_Operand *surface, // can be G4_Imm or G4_SrcRegRegion
122 G4_DstRegRegion *dstRead, // dst can be NULL reg (e.g store)
123 G4_SrcRegRegion *src0Addr, // always the addresses (base for strided)
124 G4_Operand *src0AddrStride, // only for strided
125 G4_SrcRegRegion *src1Data, // store data/extra atomic operands
126 G4_SrcRegRegion *src2Data // store data/extra atomic operands
127 )
128 {
129 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
130
131 int status = VISA_SUCCESS;
132 auto check =
133 [&] (bool z, const char *what) {
134 if (!z) {
135 MUST_BE_TRUE(false, what);
136 status = VISA_FAILURE;
137 }
138 };
139
140 const G4_ExecSize execSize = toExecSize(visaExecSize);
141 const G4_InstOpts instOpt = Get_Gen4_Emask(execCtrl, execSize);
142
143 const static uint32_t BYTES_PER_REG = COMMON_ISA_GRF_REG_SIZE;
144
145 // send descriptor
146 uint32_t desc = 0;
147 uint32_t exDesc = 0;
148
149 // try and promote the surface identifier (e.g. BTI or SS obj) to ex desc
150 surface = lscTryPromoteSurfaceImmToExDesc(surface, addrInfo.type, exDesc);
151 const auto opInfo = LscOpInfoGet(op);
152 MUST_BE_TRUE(!opInfo.isBlock2D(),
153 "use translateLscUntypedBlock2DInst for lsc_*_block2d");
154
155 check(
156 opInfo.kind == LscOpInfo::LOAD ||
157 opInfo.kind == LscOpInfo::STORE ||
158 opInfo.kind == LscOpInfo::ATOMIC, "unhandled LSC op class");
159
160 // Desc[5:0] is the message opcode
161 desc |= opInfo.encoding; // Desc[5:0]
162
163 // build the descriptor (Sect. 3.3.1 of the HAS)
164 // (also https://gfxspecs.intel.com/Predator/Home/Index/53522)
165 //
166 // Desc[5:0] = OPCODE {LOAD,STORE,LOAD_BLOCK,STORE_BLOCK,...}
167 // Desc[8:7] = addr size
168 // Desc[11:9] = data size
169 // Desc[15:12] = data vector size (or cmask)
170 // Desc[19:17] = caching controls (see the table for allowable combinations)
171 // Desc[30:29] = addr model (BTI = 3, SS = 2, BSS = 1, FLAT = 0)
172 //
173 // All other bits are undefined as of now
174 //
175 const int addrSizeBits = lscEncodeAddrSize(addrInfo.size, desc, status);
176 const int dataSizeBits = lscEncodeDataSize(dataShape.size, desc, status);
177 //
178 int vecSize = 0; // definitely assigned
179 if (!opInfo.hasChMask()) {
180 vecSize = lscEncodeDataElems(dataShape.elems, desc, status);
181 lscEncodeDataOrder(dataShape.order, desc, status);
182 } else {
183 MUST_BE_TRUE(dataShape.chmask, "channel mask must not be empty");
184 vecSize = 0;
185 if (dataShape.chmask & LSC_DATA_CHMASK_X) {
186 desc |= 1 << 12;
187 vecSize++;
188 }
189 if (dataShape.chmask & LSC_DATA_CHMASK_Y) {
190 desc |= 1 << 13;
191 vecSize++;
192 }
193 if (dataShape.chmask & LSC_DATA_CHMASK_Z) {
194 desc |= 1 << 14;
195 vecSize++;
196 }
197 if (dataShape.chmask & LSC_DATA_CHMASK_W) {
198 desc |= 1 << 15;
199 vecSize++;
200 }
201 }
202 lscEncodeCachingOpts(opInfo, cacheOpts, desc, status);
203 lscEncodeAddrType(addrInfo.type, desc, status);
204
205 ///////////////////////////////////////////////////////////////////////////
206 // address adjustment and extra codegen (adds, shifts, and multiplies)
207 // only pass exDesc if it's an immediate field
208 auto addrExecSize = execSize;
209 auto addrExecCtrl = execCtrl;
210 const auto isStrided =
211 op == LSC_OP::LSC_LOAD_STRIDED || op == LSC_OP::LSC_STORE_STRIDED;
212 if (isStrided) {
213 addrExecSize = g4::SIMD1;
214 addrExecCtrl = vISA_EMASK_M1_NM;
215 }
216 src0Addr = lscLoadEffectiveAddress(
217 op,
218 lscSfid,
219 pred, addrExecSize, addrExecCtrl, addrInfo, dataSizeBits / 8,
220 surface,
221 src0Addr,
222 exDesc
223 );
224
225 uint32_t dataRegs = 1;
226 uint32_t addrRegs = 1;
227
228 G4_ExecSize minExecSize = lscMinExecSize(lscSfid);
229
230 if (dataShape.order == LSC_DATA_ORDER_NONTRANSPOSE) {
231 // Non-transpose case is the typical case.
232 //
233 // ceil[ SIMT32*dataSize(b)/512(b/REG) ] * vecSize
234 // units = (b/b*REG) = REG
235 uint32_t width = std::max(execSize, minExecSize);
236 dataRegs = std::max<uint32_t>(1,
237 width * dataSizeBits / 8 / BYTES_PER_REG) * vecSize;
238 addrRegs = std::max<uint32_t>(1,
239 width * addrSizeBits / 8 / BYTES_PER_REG);
240
241 if (execSize < minExecSize)
242 {
243 // we may need to even-align src and data
244 auto evenAlignDcl = [](G4_Operand* opnd)
245 {
246 G4_Declare* dcl = opnd->getTopDcl()->getRootDeclare();
247 if (dcl->getByteSize() <= getGRFSize())
248 {
249 dcl->setEvenAlign();
250 }
251 };
252
253 if ((addrSizeBits / 8) * minExecSize > getGRFSize())
254 {
255 evenAlignDcl(src0Addr);
256 }
257
258 if ((dataSizeBits / 8) * minExecSize > getGRFSize())
259 {
260 if (!isNullOperand(dstRead))
261 {
262 evenAlignDcl(dstRead);
263 }
264 if (!isNullOperand(src1Data))
265 {
266 evenAlignDcl(src1Data);
267 }
268 }
269 // we don't need to align src2 if it exists, as we'd need to generate
270 // a temp send payload containing both src1 and src2 anyway
271 }
272 } else { // if (dataShape.order == LSC_DATA_TRANSPOSE) {
273 // The transpose case is a little odder
274 //
275 // So the data size is the SIMD size (ExecSize) times the number of
276 // registers consumed by each vector sequence (always a full
277 // register number per seq).
278 uint32_t regsPerVec = vecSize * dataSizeBits / 8 / BYTES_PER_REG;
279 if (vecSize * dataSizeBits / 8 % BYTES_PER_REG)
280 regsPerVec++; // pad out to full reg
281 dataRegs = regsPerVec * execSize;
282 }
283
284 // override sizes for special cases
285 if (op == LSC_OP::LSC_LOAD_STATUS) {
286 dataRegs = 1; // this message just returns a bitset in the low DW
287 }
288
289 // cases that need a payload register built
290 if (isStrided) {
291 src0Addr = lscBuildStridedPayload(
292 pred,
293 src0Addr,
294 src0AddrStride,
295 dataSizeBits / 8,
296 vecSize,
297 dataShape.order == LSC_DATA_ORDER_TRANSPOSE);
298 addrRegs = 1;
299 }
300
301 int src1Len = 0;
302 uint32_t dstLen = 0;
303 uint32_t src0Len = addrRegs;
304 if (opInfo.isLoad()) {
305 if (isNullOperand(dstRead)) {
306 dstLen = 0; // prefetch
307 } else {
308 dstLen = dataRegs;
309 }
310 src1Len = 0;
311 } else if (opInfo.isStore()) {
312 dstLen = 0;
313 src0Len = addrRegs;
314 src1Len = (int)dataRegs;
315 } else if (opInfo.isAtomic()) {
316 if (opInfo.extraOperands == 0) { // e.g. lsc_atomic_iinc
317 check(isNullOperand(src1Data) && isNullOperand(src2Data),
318 "atmoic unary must have null src1 and src2");
319 } else if (opInfo.extraOperands == 1) { // e.g. lsc_atomic_add
320 check(!isNullOperand(src1Data) && isNullOperand(src2Data),
321 "atmoic binary must have non-null src1 and null src2");
322 } else {
323 // lsc_atomic_icas/lsc_atomic_fcas: coalesce parmeters into one
324 check(!isNullOperand(src1Data) && !isNullOperand(src2Data),
325 "atmoic ternary must have non-null src1 and src2");
326 src1Data =
327 coalescePayload(
328 BYTES_PER_REG, BYTES_PER_REG,
329 std::max(minExecSize, execSize), execSize,
330 {src1Data, src2Data}, execCtrl);
331 }
332 src1Len = (int)dataRegs*opInfo.extraOperands;
333
334 if (dstRead->isNullReg()) {
335 dstLen = 0;
336 } else {
337 dstLen = dataRegs;
338 }
339 } else {
340 check(false, "unexpected message type");
341 }
342
343 check(dstLen < 32, "too many destination registers (read operand)");
344 check(src0Len < 32, "too many src0 registers (address)");
345 check(src1Len < 32, "too many src1 registers (write operand)");
346
347 // FIXME: we need to first sort out what the rules are on virtual registers
348 // I initially thought that one was supposed to use an alias over a .decl
349 // And have properly sized inputs, but this assumption is proving false.
350 auto checkDeclSize =
351 [&] (const char *what,
352 G4_Declare *dcl,
353 int visaRegsInDcl,
354 int genRegsNeeded)
355 {
356 // if (visaRegsInDcl != genRegsNeeded)
357 if (visaRegsInDcl < genRegsNeeded) {
358 std::stringstream ss;
359 ss << what << " register dimensions don't fit data type\n";
360 ss << "vISA decl given is: "; dcl->emit(ss);
361 ss << " (" << (dcl->getTotalElems()*dcl->getElemSize()) << "B)\n";
362 ss << "but payload should be " << genRegsNeeded << " reg(s)\n";
363 switch (addrInfo.size) {
364 case LSC_ADDR_SIZE_16b: ss << "addr size is 16b"; break;
365 case LSC_ADDR_SIZE_32b: ss << "addr size is 32b"; break;
366 case LSC_ADDR_SIZE_64b: ss << "addr size is 64b"; break;
367 default: ss << "??";
368 }
369 ss << " x " << (int)execSize << " elem(s) ";
370 if (dataShape.order == LSC_DATA_ORDER_TRANSPOSE) {
371 ss << "transposed ";
372 } else {
373 ss << "non-transposed ";
374 }
375 ss << " and data ";
376 switch (dataShape.size) {
377 case LSC_DATA_SIZE_8b: ss << "8b"; break;
378 case LSC_DATA_SIZE_16b: ss << "16b"; break;
379 case LSC_DATA_SIZE_64b: ss << "64b"; break;
380 default: ss << "32b"; break; // 32b or the conversion types
381 }
382 ss << " x " << vecSize;
383 check(false, ss.str().c_str());
384 }
385 };
386
387 // Some sanity checking of vISA region sizes with the computed sizes
388 G4_Declare *addrDcl =
389 src0Addr->getBase()->asRegVar()->getDeclare()->getRootDeclare();
390 // addrDcl->emit(std::cout,true,false);
391 check(addrDcl,"cannot find declaration for address register");
392
393 // disable size checks if execSize is < min payload width,
394 // since declares is allowed to be smaller than payload size in this case
395 if (execSize >= minExecSize)
396 {
397 if (addrDcl) {
398 auto addrRegSize = addrDcl->getElemSize() * addrDcl->getTotalElems();
399 auto visaAddrRegsInDcl =
400 std::max<int>(addrRegSize / COMMON_ISA_GRF_REG_SIZE, 1);
401 checkDeclSize("address", addrDcl, visaAddrRegsInDcl, addrRegs);
402 }
403
404 // loading/store into the null register for prefetch
405 if (!isNullOperand(dstRead)) {
406 // sanity check the number of destination operands with the types given
407 G4_Declare* dstDcl =
408 dstRead->getBase()->asRegVar()->getDeclare()->getRootDeclare();
409 check(dstDcl != nullptr, "cannot find declaration for data register");
410 unsigned dataRegBytes = dstDcl->getTotalElems() * dstDcl->getElemSize();
411 auto visaRegsInDcl =
412 std::max<int>(dataRegBytes / COMMON_ISA_GRF_REG_SIZE, 1);
413 checkDeclSize("data", dstDcl, visaRegsInDcl, dstLen);
414 }
415 }
416
417 desc |= dstLen << 20; // Desc[24:20] dst len
418 desc |= addrRegs << 25; // Desc[29:25] src0 len
419
420 SFID sfid = SFID::NULL_SFID;
421 switch (lscSfid) {
422 case LSC_UGM: sfid = SFID::UGM; break;
423 case LSC_UGML: sfid = SFID::UGML; break;
424 case LSC_SLM: sfid = SFID::SLM; break;
425 default: check(false,"invalid SFID for untyped LSC message");
426 }
427
428 G4_SendDescRaw *msgDesc = createLscDesc(
429 sfid,
430 desc,
431 exDesc,
432 src1Len,
433 getSendAccessType(opInfo.isLoad(), opInfo.isStore()),
434 surface);
435 createLscSendInst(
436 pred,
437 dstRead,
438 src0Addr,
439 src1Data,
440 execSize,
441 msgDesc,
442 instOpt,
443 addrInfo.type,
444 true);
445
446 return status;
447 }
448
449
450
// Translates an LSC block2d (lsc_load_block2d / lsc_store_block2d)
// vISA instruction into a raw send over UGM/UGML/SLM.
//
// Block2d messages always use the FLAT address model; the address
// payload (base, surface dims, block X/Y, block shape) is packed into
// a single register by lscBuildBlock2DPayload.
//
// Returns VISA_SUCCESS, or VISA_FAILURE if a check fails (checks also
// assert in debug builds).
int IR_Builder::translateLscUntypedBlock2DInst(
    LSC_OP op,
    LSC_SFID lscSfid,
    G4_Predicate *pred,
    VISA_Exec_Size visaExecSize,
    VISA_EMask_Ctrl emask,
    LSC_CACHE_OPTS cacheOpts,
    LSC_DATA_SHAPE_BLOCK2D dataShape2D,
    G4_DstRegRegion *dstRead, // dst can be NULL reg (e.g store)
    G4_Operand *src0Addrs[LSC_BLOCK2D_ADDR_PARAMS], // always the addresses
    G4_SrcRegRegion *src1Data // store data
    )
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    int status = VISA_SUCCESS;
    // asserts in debug; downgrades to a VISA_FAILURE status in release
    auto check =
        [&](bool z, const char *what) {
            if (!z) {
                MUST_BE_TRUE(false, what);
                status = VISA_FAILURE;
            }
        };

    const auto opInfo = LscOpInfoGet(op);
    MUST_BE_TRUE(opInfo.isBlock2D(), "not an LSC block2d op");

    // send descriptor
    uint32_t desc = 0;
    uint32_t exDesc = 0;

    desc |= opInfo.encoding; // Desc[5:0] opcode
    if (dataShape2D.vnni)
        desc |= (1 << 7); // Desc[7]
    int dataSizeBits =
        lscEncodeDataSize(dataShape2D.size, desc, status); // Desc[11:9]
    if (dataShape2D.order == LSC_DATA_ORDER_TRANSPOSE)
        desc |= (1 << 15); // Desc[15] transpose
    lscEncodeCachingOpts(opInfo, cacheOpts, desc, status); // Desc[19:17]
    desc |= (0 << 29); // Desc[30:29] = FLAT

    // pack base/dims/block coordinates into the single-GRF payload
    G4_SrcRegRegion *src0Addr =
        lscBuildBlock2DPayload(dataShape2D, pred, src0Addrs);

    uint32_t dataRegs =
        lscBlock2dComputeDataRegs(op, dataShape2D, dataSizeBits);
    uint32_t addrRegs = 1; // block2d address payload is always one reg

    int src1Len = 0;
    uint32_t dstLen = 0;
    uint32_t src0Len = addrRegs;

    if (opInfo.isLoad()) {
        if (isNullOperand(dstRead)) {
            dstLen = 0; // prefetch
        } else {
            dstLen = dataRegs;
        }
        src1Len = 0;
    } else if (opInfo.isStore()) {
        dstLen = 0;
        src0Len = addrRegs;
        src1Len = (int)dataRegs;
    } else {
        check(false, "unexpected message type");
    }

    desc |= dstLen << 20;   // Desc[24:20]  dst len
    desc |= addrRegs << 25; // Desc[28:25]  src0 len

    SFID sfid = SFID::NULL_SFID;
    switch (lscSfid) {
    case LSC_UGM:  sfid = SFID::UGM;  break;
    case LSC_UGML: sfid = SFID::UGML; break;
    case LSC_SLM:  sfid = SFID::SLM;  break;
    default: check(false, "invalid SFID for untyped block2d LSC message");
    }

    G4_SendDescRaw * msgDesc = createLscDesc(
        sfid,
        desc,
        exDesc,
        src1Len,
        getSendAccessType(opInfo.isLoad(), opInfo.isStore()),
        nullptr);

    const G4_ExecSize execSize = toExecSize(visaExecSize);
    const G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
    G4_InstSend *sendInst = createLscSendInst(
        pred,
        dstRead,
        src0Addr,
        src1Data,
        execSize,
        msgDesc,
        instOpt,
        LSC_ADDR_TYPE_FLAT,
        true);
    (void)sendInst;

    return status;
}
553
554
translateLscTypedInst(LSC_OP op,G4_Predicate * pred,VISA_Exec_Size execSizeEnum,VISA_EMask_Ctrl emask,LSC_CACHE_OPTS cacheOpts,LSC_ADDR_TYPE addrModel,LSC_ADDR_SIZE addrSize,LSC_DATA_SHAPE shape,G4_Operand * surface,G4_DstRegRegion * dstData,G4_SrcRegRegion * src0AddrUs,G4_SrcRegRegion * src0AddrVs,G4_SrcRegRegion * src0AddrRs,G4_SrcRegRegion * src0AddrLODs,G4_SrcRegRegion * src1Data,G4_SrcRegRegion * src2Data)555 int IR_Builder::translateLscTypedInst(
556 LSC_OP op,
557 G4_Predicate *pred,
558 VISA_Exec_Size execSizeEnum,
559 VISA_EMask_Ctrl emask,
560 LSC_CACHE_OPTS cacheOpts,
561 LSC_ADDR_TYPE addrModel,
562 LSC_ADDR_SIZE addrSize,
563 LSC_DATA_SHAPE shape,
564 G4_Operand *surface, // surface/bti
565 G4_DstRegRegion *dstData, // dst on load/atomic
566 G4_SrcRegRegion *src0AddrUs,
567 G4_SrcRegRegion *src0AddrVs,
568 G4_SrcRegRegion *src0AddrRs,
569 G4_SrcRegRegion *src0AddrLODs,
570 G4_SrcRegRegion *src1Data, // store data/extra atomic operands
571 G4_SrcRegRegion *src2Data // icas/fcas only
572 )
573 {
574 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
575
576 int status = VISA_SUCCESS;
577
578 const uint32_t BYTES_PER_GRF = COMMON_ISA_GRF_REG_SIZE;
579
580 const G4_ExecSize execSize = toExecSize(execSizeEnum);
581 const G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
582
583 const auto opInfo = LscOpInfoGet(op);
584
585 uint32_t desc = opInfo.encoding;
586 uint32_t exDesc = 0;
587
588 surface = lscTryPromoteSurfaceImmToExDesc(surface, addrModel, exDesc);
589
590 int numChannels = 0;
591 if (opInfo.hasChMask()) {
592 if (shape.chmask & LSC_DATA_CHMASK_X) {
593 desc |= 1 << 12;
594 numChannels++;
595 }
596 if (shape.chmask & LSC_DATA_CHMASK_Y) {
597 desc |= 1 << 13;
598 numChannels++;
599 }
600 if (shape.chmask & LSC_DATA_CHMASK_Z) {
601 desc |= 1 << 14;
602 numChannels++;
603 }
604 if (shape.chmask & LSC_DATA_CHMASK_W) {
605 desc |= 1 << 15;
606 numChannels++;
607 }
608 MUST_BE_TRUE(numChannels != 0, "empty channel mask");
609 } else {
610 // atomics are single channel
611 numChannels = 1;
612 }
613 int addrSizeBits = lscEncodeAddrSize(addrSize, desc, status);
614 int dataSizeBits = lscEncodeDataSize(shape.size, desc, status);
615 (void)addrSizeBits;
616 (void)dataSizeBits;
617
618 lscEncodeCachingOpts(opInfo, cacheOpts, desc, status); // Desc[19:17]
619 lscEncodeAddrType(addrModel, desc, status);
620
621 auto checkPayloadSize =
622 [&] (const char *which,
623 const G4_Declare *decl,
624 int expectDeclRegs)
625 {
626 int dclRegs =
627 std::max<int>(1,
628 decl->getTotalElems()*decl->getElemSize()/BYTES_PER_GRF);
629 // if (expectDeclRegs != dclRegs)
630 // TODO: need to fix issue with IGC codegen using offsets
631 // in raw vars
632 if (expectDeclRegs > dclRegs) {
633 std::stringstream ss;
634 ss << which << " .decl size ";
635 decl->emit(ss);
636 ss << " (" << dclRegs << ")";
637 ss << " mismatches expected number of registers for "
638 "payload (" << expectDeclRegs << ")";
639 // std::cerr << ss.str();
640 MUST_BE_TRUE(false,ss.str().c_str());
641 }
642 };
643
644 auto checkAddrPayloadSize =
645 [&] (const char *which, const G4_SrcRegRegion *srcAddr) {
646 if (srcAddr == nullptr || srcAddr->isNullReg()) {
647 return;
648 }
649 const G4_Declare *decl = getDeclare(srcAddr);
650 const int regsPerAddrChannel =
651 std::max<int>(1,addrSizeBits*(int)execSize/8/BYTES_PER_GRF);
652 checkPayloadSize(which, decl, regsPerAddrChannel);
653 };
654 checkAddrPayloadSize("src0AddrUs", src0AddrUs);
655 checkAddrPayloadSize("src0AddrVs", src0AddrVs);
656 checkAddrPayloadSize("src0AddrRs", src0AddrRs);
657 checkAddrPayloadSize("src0AddrLODs", src0AddrLODs);
658
659 G4_SrcRegRegion *srcAddrs[2] { };
660 G4_SrcRegRegion *srcData = nullptr;
661 unsigned srcAddrRegs[2]{ };
662 unsigned srcDataRegs = 0;
663 uint32_t dstDataRegs = 0;
664 if (opInfo.op == LSC_READ_STATE_INFO) {
665 // like fences, send requires *something* (at least one reg) to be
666 // sent out; we pick the initial r0 value since it's known to
667 // be floating around somewhere until EOT
668 const RegionDesc *rd = getRegionStride1();
669 G4_Declare *r0 = getBuiltinR0();
670 G4_SrcRegRegion *src0Dummy = createSrc(
671 r0->getRegVar(),
672 0, 0, rd, Type_UD);
673 srcAddrRegs[0] = 1;
674 srcAddrRegs[1] = 0;
675 srcAddrs[0] = src0Dummy;
676 } else {
677 PayloadSource srcAddrPayloads[4] { }; // U, V, R, LOD
678 unsigned numSrcAddrPayloads = 0;
679 buildTypedSurfaceAddressPayload(
680 src0AddrUs,
681 src0AddrVs,
682 src0AddrRs,
683 src0AddrLODs,
684 execSize,
685 instOpt,
686 srcAddrPayloads,
687 numSrcAddrPayloads);
688 preparePayload(
689 srcAddrs,
690 srcAddrRegs,
691 execSize,
692 false, // not a split send (so all the addrs lands in one reg)
693 srcAddrPayloads,
694 numSrcAddrPayloads);
695 MUST_BE_TRUE(srcAddrs[1] == nullptr, "invalid addr split");
696 MUST_BE_TRUE(srcAddrRegs[0] < 32, "too many address registers");
697
698 // each channel consumes at least one register (top padding may be 0)
699 const int regsPerDataChannel =
700 std::max<int>(1, dataSizeBits*(int)execSize/8/BYTES_PER_GRF);
701 auto checkDataDeclSize =
702 [&](const char *which, const G4_Operand *data) {
703 if (data == nullptr || data->isNullReg()) {
704 return;
705 }
706 const G4_Declare *decl = getDeclare(data);
707 checkPayloadSize(which, decl, regsPerDataChannel*numChannels);
708 };
709 checkDataDeclSize("dstData", dstData);
710 checkDataDeclSize("src1Data", src1Data);
711 checkDataDeclSize("src2Data", src2Data);
712
713 srcData = coalescePayload(
714 BYTES_PER_GRF, BYTES_PER_GRF, std::max(getNativeExecSize(), execSize), execSize, {src1Data, src2Data}, emask);
715 srcDataRegs = 0;
716 if (!srcData->isNullReg()) {
717 const G4_Declare *srcDcl = getDeclare(srcData);
718 // srcDcl->emit(std::cerr, false, false);
719 srcDataRegs =
720 srcDcl->getTotalElems()*srcDcl->getElemSize()/BYTES_PER_GRF;
721 }
722 dstDataRegs =
723 opInfo.isLoad() || (opInfo.isAtomic() && !dstData->isNullReg()) ?
724 regsPerDataChannel*numChannels : 0;
725 }
726 int src1Len = (int)srcDataRegs; // lsc_load_quad.tgm / lsc_atomic_icas.tgm
727
728 if (op == LSC_OP::LSC_LOAD_STATUS ||
729 op == LSC_OP::LSC_READ_STATE_INFO)
730 {
731 dstDataRegs = 1; // just a single DW of bits (padded to 1 reg)
732 }
733 // MUST_BE_TRUE(dataSrcsRegs == dataRegs, "mismatch in .decls for "
734 // "number of data registers in actual message");
735 MUST_BE_TRUE(srcDataRegs < 32, "too many data registers");
736
737 desc |= (srcAddrRegs[0] & 0xF) << 25; // mlen == Desc[28:25]
738 if (opInfo.isLoad() || (opInfo.isAtomic() && !dstData->isNullReg())) {
739 desc |= (dstDataRegs & 0x1F) << 20; // rlen == Desc[24:20]
740 }
741
742 G4_SendDescRaw *msgDesc = createLscDesc(
743 SFID::TGM,
744 desc,
745 exDesc,
746 src1Len,
747 getSendAccessType(opInfo.isLoad(), opInfo.isStore()),
748 surface);
749 G4_InstSend *sendInst = createLscSendInst(
750 pred,
751 dstData,
752 srcAddrs[0],
753 srcData,
754 execSize,
755 msgDesc,
756 instOpt,
757 addrModel,
758 true);
759 (void)sendInst;
760
761 return status;
762 }
763
lscGetElementNum(unsigned eNum) const764 LSC_DATA_ELEMS IR_Builder::lscGetElementNum(unsigned eNum) const
765 {
766 switch (eNum)
767 {
768 case 1:
769 return LSC_DATA_ELEMS_1;
770 case 2:
771 return LSC_DATA_ELEMS_2;
772 case 3:
773 return LSC_DATA_ELEMS_3;
774 case 4:
775 return LSC_DATA_ELEMS_4;
776 case 8:
777 return LSC_DATA_ELEMS_8;
778 case 16:
779 return LSC_DATA_ELEMS_16;
780 case 32:
781 return LSC_DATA_ELEMS_32;
782 case 64:
783 return LSC_DATA_ELEMS_64;
784 default:
785 return LSC_DATA_ELEMS_INVALID;
786 };
787
788 return LSC_DATA_ELEMS_INVALID;
789 }
790
lscEncodeAddrSize(LSC_ADDR_SIZE addrSize,uint32_t & desc,int & status) const791 int IR_Builder::lscEncodeAddrSize(
792 LSC_ADDR_SIZE addrSize, uint32_t &desc, int &status) const
793 {
794 int addrSizeBits = 32;
795 uint32_t addrSizeEnc = 0;
796 switch (addrSize) {
797 case LSC_ADDR_SIZE_16b: addrSizeEnc = 0x1; addrSizeBits = 16; break;
798 case LSC_ADDR_SIZE_32b: addrSizeEnc = 0x2; addrSizeBits = 32; break;
799 case LSC_ADDR_SIZE_64b: addrSizeEnc = 0x3; addrSizeBits = 64; break;
800 default: MUST_BE_TRUE(false, "invalid address size"); status = VISA_FAILURE;
801 }
802 desc |= addrSizeEnc << 7; // Desc[8:7]
803 return addrSizeBits;
804 }
805
lscEncodeDataSize(LSC_DATA_SIZE dataSize,uint32_t & desc,int & status) const806 int IR_Builder::lscEncodeDataSize(
807 LSC_DATA_SIZE dataSize, uint32_t &desc, int &status) const
808 {
809 uint32_t dataSizeEnc = 0;
810 int dataSizeBits = 32;
811 switch (dataSize) {
812 case LSC_DATA_SIZE_8b: dataSizeEnc = 0x0; dataSizeBits = 8; break;
813 case LSC_DATA_SIZE_16b: dataSizeEnc = 0x1; dataSizeBits = 16; break;
814 case LSC_DATA_SIZE_32b: dataSizeEnc = 0x2; dataSizeBits = 32; break;
815 case LSC_DATA_SIZE_64b: dataSizeEnc = 0x3; dataSizeBits = 64; break;
816 case LSC_DATA_SIZE_8c32b: dataSizeEnc = 0x4; dataSizeBits = 32; break;
817 case LSC_DATA_SIZE_16c32b: dataSizeEnc = 0x5; dataSizeBits = 32; break;
818 case LSC_DATA_SIZE_16c32bH: dataSizeEnc = 0x6; dataSizeBits = 32; break;
819 default: MUST_BE_TRUE(false, "invalid data size"); status = VISA_FAILURE;
820 }
821 desc |= dataSizeEnc << 9; // Desc[11:9]
822 return dataSizeBits;
823 }
824
lscEncodeDataElems(LSC_DATA_ELEMS dataElems,uint32_t & desc,int & status) const825 int IR_Builder::lscEncodeDataElems(
826 LSC_DATA_ELEMS dataElems, uint32_t &desc, int &status) const
827 {
828 uint32_t vecSizeEnc = 0;
829 int vecSize = 1;
830 switch (dataElems) {
831 case LSC_DATA_ELEMS_1: vecSizeEnc = 0x0; vecSize = 1; break;
832 case LSC_DATA_ELEMS_2: vecSizeEnc = 0x1; vecSize = 2; break;
833 case LSC_DATA_ELEMS_3: vecSizeEnc = 0x2; vecSize = 3; break;
834 case LSC_DATA_ELEMS_4: vecSizeEnc = 0x3; vecSize = 4; break;
835 case LSC_DATA_ELEMS_8: vecSizeEnc = 0x4; vecSize = 8; break;
836 case LSC_DATA_ELEMS_16: vecSizeEnc = 0x5; vecSize = 16; break;
837 case LSC_DATA_ELEMS_32: vecSizeEnc = 0x6; vecSize = 32; break;
838 case LSC_DATA_ELEMS_64: vecSizeEnc = 0x7; vecSize = 64; break;
839 default: MUST_BE_TRUE(false, "number of data elements"); status = VISA_FAILURE;
840 }
841 desc |= vecSizeEnc << 12; // desc[14:12] is the vector size
842 return vecSize;
843 }
844
lscEncodeDataOrder(LSC_DATA_ORDER order,uint32_t & desc,int & status) const845 void IR_Builder::lscEncodeDataOrder(
846 LSC_DATA_ORDER order, uint32_t &desc, int &status) const
847 {
848 if (order == LSC_DATA_ORDER_TRANSPOSE) {
849 desc |= 1 << 15; // desc[15] is transpose
850 } else if (order != LSC_DATA_ORDER_NONTRANSPOSE) {
851 MUST_BE_TRUE(false, "bad transpose value");
852 status = VISA_FAILURE;
853 }
854 }
855
lscEncodeCachingOpts(const LscOpInfo & opInfo,LSC_CACHE_OPTS cacheOpts,uint32_t & desc,int & status) const856 void IR_Builder::lscEncodeCachingOpts(
857 const LscOpInfo &opInfo,
858 LSC_CACHE_OPTS cacheOpts,
859 uint32_t &desc,
860 int &status) const
861 {
862 uint32_t cacheEnc = 0;
863 if (!LscTryEncodeCacheOpts(opInfo, cacheOpts, cacheEnc, isLSCCacheOpt17_19())) {
864 MUST_BE_TRUE(false, "unsupported caching options");
865 status = VISA_FAILURE;
866 }
867
868 desc |= cacheEnc;
869 }
870
lscEncodeAddrType(LSC_ADDR_TYPE addrModel,uint32_t & desc,int & status) const871 void IR_Builder::lscEncodeAddrType(
872 LSC_ADDR_TYPE addrModel,
873 uint32_t &desc,
874 int &status) const
875 {
876 uint32_t addrTypeEnc = 0;
877 switch (addrModel) {
878 case LSC_ADDR_TYPE_FLAT: addrTypeEnc = 0; break;
879 case LSC_ADDR_TYPE_BSS: addrTypeEnc = 1; break;
880 case LSC_ADDR_TYPE_SS: addrTypeEnc = 2; break;
881 case LSC_ADDR_TYPE_BTI: addrTypeEnc = 3; break;
882 default: MUST_BE_TRUE(false, "invalid address model"); status = VISA_FAILURE;
883 }
884 desc |= addrTypeEnc << 29; // [30:29] addr size
885 }
886
lscBuildStridedPayload(G4_Predicate * pred,G4_SrcRegRegion * src0AddrBase,G4_Operand * src0AddrStride,int dataSizeBytes,int vecSize,bool transposed)887 G4_SrcRegRegion *IR_Builder::lscBuildStridedPayload(
888 G4_Predicate *pred,
889 G4_SrcRegRegion *src0AddrBase, // output
890 G4_Operand *src0AddrStride,
891 int dataSizeBytes, int vecSize, bool transposed)
892 {
893 const uint32_t BYTES_PER_REG = COMMON_ISA_GRF_REG_SIZE;
894 // We've been passed in a single value for the address, and we
895 // have to generate the address payload register from that value
896 // along with the pitch.
897 //
898 // E.g. we've been passed in the following.
899 // .decl VADDR v_type=G type=UD num_elts=1 align=GRF
900 // (VADDR doesn't necessarily need to be GRF aligned)
901 //
902 // We need to generate:
903 // .decl VADDR_REG_UD v_type=G type=UD num_elts=NUM_PER_GRF(T) align=GRF
904 // .decl VADDR_REG_UQ type=UQ alias=<VADDR_REG_UD,0>
905 //
906 G4_Declare *addrTmpDeclUd = createSendPayloadDcl(BYTES_PER_REG/4, Type_UD);
907 G4_Declare *addrTmpDeclUq = createSendPayloadDcl(BYTES_PER_REG/8, Type_UQ);
908 addrTmpDeclUq->setAliasDeclare(addrTmpDeclUd, 0);
909 //
910 // Then to build the payload we need the following.
911 // ...
912 // [for 64b base addresses]
913 // (P) mov (M1_NM,1) VADDR_REG(0,0)<1>:uq VADDR(0,0)<0;1,0>:T
914 // [for 32b base addresses]
915 // (P) mov (M1_NM,1) VADDR_REG(0,0)<1>:ud VADDR(0,0)<0;1,0>:T
916 // ...
917 // (P) mov (M1_NM,1) VADDR_REG(0,2)<1>:ud sizeof(T):ud
918 // (P) send (M1_NM,1) VDATA VADDR_REG null lsc_load_block....
919 //
920 if (src0AddrBase->getType() == Type_UQ ||
921 src0AddrBase->getType() == Type_Q)
922 {
923 G4_DstRegRegion
924 *payloadDstAddrUq = createDst(
925 addrTmpDeclUq->getRegVar(), 0, 0, 1, Type_UQ);
926 createInst(
927 pred, G4_mov, nullptr, g4::NOSAT, g4::SIMD1,
928 payloadDstAddrUq, src0AddrBase, nullptr,
929 Get_Gen4_Emask(vISA_EMASK_M1_NM, g4::SIMD1), true);
930 }
931 else
932 {
933 G4_DstRegRegion
934 *payloadDstAddrUd = createDst(
935 addrTmpDeclUd->getRegVar(), 0, 0, 1, Type_UD);
936 createInst(
937 pred, G4_mov, nullptr, g4::NOSAT, g4::SIMD1,
938 payloadDstAddrUd, src0AddrBase, nullptr,
939 Get_Gen4_Emask(vISA_EMASK_M1_NM, g4::SIMD1), true);
940 }
941 //
942 G4_DstRegRegion
943 *payloadDstPitch = createDst(
944 addrTmpDeclUd->getRegVar(), 0, 2, 1, Type_UD);
945 if (src0AddrStride == nullptr) {
946 int defaultPitch = dataSizeBytes;
947 if (!transposed)
948 defaultPitch *= vecSize;
949 src0AddrStride = createImmWithLowerType(defaultPitch, Type_UD);
950 }
951 createInst(
952 pred, G4_mov, 0, g4::NOSAT, g4::SIMD1, payloadDstPitch, src0AddrStride, nullptr,
953 Get_Gen4_Emask(vISA_EMASK_M1_NM, g4::SIMD1), true);
954 //
955 return createSrc(
956 addrTmpDeclUd->getRegVar(), 0, 0,
957 getRegionScalar(), Type_UD);
958 }
959
lscBuildBlock2DPayload(LSC_DATA_SHAPE_BLOCK2D dataShape2D,G4_Predicate * pred,G4_Operand * src0Addrs[6])960 G4_SrcRegRegion *IR_Builder::lscBuildBlock2DPayload(
961 LSC_DATA_SHAPE_BLOCK2D dataShape2D,
962 G4_Predicate *pred,
963 G4_Operand *src0Addrs[6])
964 {
965 // Similar to lscBuildStridedPayload, but this formats the payload
966 // as follows.
967 //
968 // https://gfxspecs.intel.com/Predator/Home/Index/53567
969 // A2DBLOCK_PAYLOAD:
970 // [31:0]: base address lo (32b)
971 // [63:32]: base address hi (32b)
972 // [95:64]: surface width minus 1 (32b)
973 // [127:96]: surface height minus 1 (32b)
974 // [159:128]: surface pitch minus 1 (32b)
975 // [191:160]: block X (32b)
976 // [223:192]: block Y (32b)
977 // [231:224]: block width (8b)
978 // [239:232]: block height (8b)
979 // [243:240]: array length (4b)
980 // [255:244]: UNDEFINED
981 //
982 // [StartX:s32, StartY:s32, Width:u32, Height:u32, ArrayLenMinus1:u4]
983 // ArrayLenMinus1 is at [131:128]
984 //
985 // We generate the following. Since the width and height are immediate
986 //
987 // .decl VADDR_REG_UD v_type=G type=UD num_elts=NUM_PER_GRF(T) align=GRF
988 // .decl VADDR_REG_UQ type=UQ alias=<VADDR_REG_UD,0>
989 // mov (M1_NM,1) ADDR(0,0):d src0AddrX
990 // mov (M1_NM,1) ADDR(0,1):d src0AddrY
991 // mov (M1_NM,1) ADDR(0,1):uq ((blockWidth << 32)|blockHeight):uq
992 // mov (M1_NM,1) ADDR(0,4):d arrayLen:uw
993 const uint32_t BYTES_PER_REG = COMMON_ISA_GRF_REG_SIZE;
994 G4_Declare *addrTmpDeclUd = createSendPayloadDcl(BYTES_PER_REG/4, Type_UD);
995 G4_Declare *addrTmpDeclUq = createSendPayloadDcl(BYTES_PER_REG/8, Type_UQ);
996 addrTmpDeclUq->setAliasDeclare(addrTmpDeclUd, 0);
997 ///////////////////////
998 auto movUQ =
999 [&](int dstSubReg, G4_Operand *src) {
1000 G4_DstRegRegion
1001 *payloadDstAddr_0_Q = createDst(
1002 addrTmpDeclUq->getRegVar(),
1003 0, dstSubReg,
1004 1,
1005 Type_UQ);
1006 createInst(
1007 pred, G4_mov, nullptr, g4::NOSAT, g4::SIMD1,
1008 payloadDstAddr_0_Q,
1009 src, nullptr, Get_Gen4_Emask(vISA_EMASK_M1_NM, g4::SIMD1), true);
1010 };
1011 auto movUD =
1012 [&](int dstSubReg, G4_Operand *src) {
1013 G4_DstRegRegion
1014 *payloadDst = createDst(
1015 addrTmpDeclUd->getRegVar(), 0, dstSubReg, 1, Type_UD);
1016 createInst(
1017 pred, G4_mov, nullptr, g4::NOSAT, g4::SIMD1,
1018 payloadDst, src, nullptr,
1019 Get_Gen4_Emask(vISA_EMASK_M1_NM, g4::SIMD1), true);
1020 };
1021 auto movImmUD =
1022 [&](int dstSubReg, uint32_t imm) {
1023 movUD(dstSubReg, createImmWithLowerType(imm, Type_UD));
1024 };
1025
1026 ///////////////////////////////////
1027 // .decl ADDR v_type=G type=UD num_elts=NUM_PER_GRF(T) align=GRF
1028 // .decl ADDR type=UQ alias=<VADDR_REG_UD,0>
1029 // mov (M1_NM,1) ADDR(0,0):uq src0AddrBase[0]:uq
1030 // mov (M1_NM,1) ADDR(0,2):ud src0AddrBase[1]:ud
1031 // mov (M1_NM,1) ADDR(0,3):ud src0AddrBase[2]:ud
1032 // mov (M1_NM,1) ADDR(0,4):ud src0AddrBase[3]:ud
1033 // mov (M1_NM,1) ADDR(0,5):ud src0AddrBase[4]:ud
1034 // mov (M1_NM,1) ADDR(0,6):ud src0AddrBase[5]:ud
1035 // mov (M1_NM,1) ADDR(0,7):ud (width x height x blocks):ud
1036 //
1037 // bottom 64b
1038 movUQ(0, src0Addrs[0]); // surface address
1039 // these start at REG.2:d
1040 movUD(2, src0Addrs[1]); // surface width - 1
1041 movUD(3, src0Addrs[2]); // surface height - 1
1042 movUD(4, src0Addrs[3]); // surface pitch - 1
1043 movUD(5, src0Addrs[4]); // block x
1044 movUD(6, src0Addrs[5]); // block y
1045 uint32_t blockSize =
1046 (dataShape2D.width - 1) |
1047 ((dataShape2D.height - 1) << 8) |
1048 ((dataShape2D.blocks - 1) << 16);
1049 movImmUD(7, blockSize);
1050 //
1051 return createSrc(
1052 addrTmpDeclUd->getRegVar(), 0, 0,
1053 getRegionScalar(), Type_UD);
1054 }
1055
lscLoadEffectiveAddress(LSC_OP lscOp,LSC_SFID lscSfid,G4_Predicate * pred,G4_ExecSize execSize,VISA_EMask_Ctrl execCtrl,LSC_ADDR addrInfo,int bytesPerDataElem,const G4_Operand * surface,G4_SrcRegRegion * addr,uint32_t & exDesc)1056 G4_SrcRegRegion *IR_Builder::lscLoadEffectiveAddress(
1057 LSC_OP lscOp,
1058 LSC_SFID lscSfid,
1059 G4_Predicate *pred,
1060 G4_ExecSize execSize,
1061 VISA_EMask_Ctrl execCtrl,
1062 LSC_ADDR addrInfo,
1063 int bytesPerDataElem,
1064 const G4_Operand *surface,
1065 G4_SrcRegRegion *addr,
1066 uint32_t &exDesc
1067 )
1068 {
1069 MUST_BE_TRUE(addrInfo.immScale == 1, "address scaling not supported yet");
1070 // The address may need scaling and offset adjustment
1071 // NEW_ADDR = SCALE*ADDR + OFF
1072 //
1073 // e.g. lsc_load.ugm.d32.a64 ... [4*ADDR - 0x100]
1074 //
1075
1076 // emulate scale and add if necessary
1077 return lscMulAdd(
1078 pred, execSize, execCtrl,
1079 addr, (int16_t)addrInfo.immScale, addrInfo.immOffset);
1080 }
1081
1082
lscCheckRegion(G4_Predicate * pred,G4_ExecSize execSize,VISA_EMask_Ctrl execCtrl,G4_SrcRegRegion * src)1083 G4_SrcRegRegion *IR_Builder::lscCheckRegion(
1084 G4_Predicate *pred,
1085 G4_ExecSize execSize,
1086 VISA_EMask_Ctrl execCtrl,
1087 G4_SrcRegRegion *src)
1088 {
1089 const G4_Type srcType = src->getType();
1090 // Later extension could repack and work in these case,
1091 // for now throw a tantrum if they give us
1092 // ... VAR<2;1,0>
1093 // we do permit VAR<0;1,0>
1094 MUST_BE_TRUE(
1095 src->getRegion()->isPackedRegion() || src->getRegion()->isScalar(),
1096 "input must be scalar/packed");
1097 MUST_BE_TRUE(src->getSubRegOff() == 0 || src->getRegion()->isScalar(),
1098 "vector operands must be register aligned");
1099 return src;
1100 }
1101
lscMulAdd(G4_Predicate * pred,G4_ExecSize execSize,VISA_EMask_Ctrl execCtrl,G4_SrcRegRegion * src,int16_t mulImm16,int64_t addImm64)1102 G4_SrcRegRegion *IR_Builder::lscMulAdd(
1103 G4_Predicate *pred,
1104 G4_ExecSize execSize,
1105 VISA_EMask_Ctrl execCtrl,
1106 G4_SrcRegRegion *src,
1107 int16_t mulImm16,
1108 int64_t addImm64)
1109 {
1110 if (mulImm16 == 1 && addImm64 == 0) {
1111 // no op
1112 return src;
1113 } else if (mulImm16 == 1 && addImm64 != 0) {
1114 // reduces to an add
1115 return lscAdd(pred, execSize, execCtrl, src, addImm64);
1116 } else if (mulImm16 != 1 && addImm64 == 0) {
1117 // reduces to a multiply
1118 return lscMul(pred, execSize, execCtrl, src, mulImm16);
1119 } else {
1120 MUST_BE_TRUE(false, "multiply not supported yet");
1121 return nullptr;
1122 /*
1123 // hard cases...
1124 auto srcType = src->getElemType();
1125 if (srcType == Type_UQ || srcType == Type_Q) {
1126 // harder case: sub-optimal code for now will
1127 // flip the lo32/hi32 pairs around twice
1128 auto *tmpVar = lscMul(pred, execSize, execCtrl, src, mulImm16);
1129 return lscAdd(pred, execSize, execCtrl, tmpVar, addImm64);
1130 } else {
1131 G4_Imm *addImmOpnd;
1132 if (srcType == Type_UD || srcType == Type_D) {
1133 MUST_BE_TRUE(
1134 addImm64 >= std::numeric_limits<int32_t>::min() &&
1135 addImm64 <= std::numeric_limits<int32_t>::max(),
1136 "imm offset for A32 must fit in 32b");
1137 addImmOpnd = createImmWithLowerType(addImm64, srcType);
1138 } else {
1139 MUST_BE_TRUE(
1140 addImm64 >= std::numeric_limits<int16_t>::min() &&
1141 addImm64 <= std::numeric_limits<int16_t>::max(),
1142 "imm offset for A16 must fit in 16b");
1143 addImmOpnd = createImmWithLowerType(addImm64, srcType);
1144 }
1145 // can use 32b + 32b x 16b mad (all platforms) (in place)
1146 // create a new register in case there's aliasing
1147 G4_Declare *result = createTempVar(execSize, srcType, GRFALIGN);
1148 G4_DstRegRegion *dstRgn =
1149 createDst(result->getRegVar(), 0, 0, 1, srcType);
1150 const auto *srcRgnVal = execSize == 1 ? getRegionScalar() : getRegionStride1();
1151 G4_SrcRegRegion *srcRgn =
1152 createSrcRegRegion(src->getRegVar(), 0, 0, srcRgnVal, srcType);
1153 //
1154 G4_Operand *mulImmOp = createImm(mulImm16, Type_W);
1155 createInst(pred, G4_mad, nullptr, false, execSize,
1156 dstRgn, addImmOpnd, srcRgn, mulImmOp,
1157 Get_Gen4_Emask(execCtrl, execSize));
1158 //
1159 return result;
1160 }
1161 */
1162 }
1163 }
1164
1165
1166
// Returns true iff x is a (positive) power of two.
//
// Fixes two defects in the original `(x & (x - 1)) == 0` form:
//   * it returned true for x == 0, which is not a power of two
//   * x - 1 on INT_MIN is signed-integer overflow (undefined behavior)
// The x > 0 guard short-circuits both cases before the bit trick runs.
static bool isPow2(int x)
{
    return x > 0 && (x & (x - 1)) == 0;
}
// Floor of log2(x) for x > 1; returns 0 for any x <= 1.
// Callers are expected to pass a power of two (see isPow2 above),
// in which case this is the exact shift amount.
static int intLog2(int x)
{
    int log = 0;
    for (; x > 1; x >>= 1)
        ++log;
    return log;
}
1180
lscMul(G4_Predicate * pred,G4_ExecSize execSize,VISA_EMask_Ctrl execCtrl,G4_SrcRegRegion * src0,int16_t mulImm)1181 G4_SrcRegRegion *IR_Builder::lscMul(
1182 G4_Predicate *pred,
1183 G4_ExecSize execSize,
1184 VISA_EMask_Ctrl execCtrl,
1185 G4_SrcRegRegion *src0,
1186 int16_t mulImm)
1187 {
1188 if (mulImm == 1)
1189 return src0;
1190
1191 const auto srcType = src0->getType();
1192 if (srcType == Type_UQ || srcType == Type_Q) {
1193 return lscMul64Aos(pred, execSize, execCtrl, src0, mulImm);
1194 } else {
1195 /*
1196 G4_Declare *result = createTempVar(execSize, srcType, GRFALIGN);
1197 G4_DstRegRegion *dst =
1198 createDst(result->getRegVar(), 0, 0, 1, srcType);
1199 const auto *srcRgn = execSize == 1 ?
1200 getRegionScalar() : getRegionStride1();
1201 G4_SrcRegRegion *src0 =
1202 createSrcRegRegion(srcVar->getRegVar(), 0, 0, srcRgn, srcType);
1203 G4_Operand *mulImmOp = createImm(mulImm, Type_W);
1204 createInst(
1205 duplicateOperand(pred),
1206 G4_mul, nullptr, false,
1207 execSize, dst, src0, mulImmOp, execCtrl);
1208 return result;
1209 */
1210 MUST_BE_TRUE(false, "lscMul unsupported");
1211 return nullptr;
1212 }
1213 }
1214
1215
lscAdd(G4_Predicate * pred,G4_ExecSize execSize,VISA_EMask_Ctrl execCtrl,G4_SrcRegRegion * src0,int64_t addImm64)1216 G4_SrcRegRegion *IR_Builder::lscAdd(
1217 G4_Predicate *pred,
1218 G4_ExecSize execSize,
1219 VISA_EMask_Ctrl execCtrl,
1220 G4_SrcRegRegion *src0,
1221 int64_t addImm64)
1222 {
1223 if (addImm64 == 0)
1224 return src0;
1225
1226 const G4_Type srcType = src0->getType();
1227 MUST_BE_TRUE(
1228 srcType == Type_UQ || srcType == Type_Q ||
1229 srcType == Type_UD || srcType == Type_D ||
1230 srcType == Type_UW || srcType == Type_W,
1231 "function only supports integer types");
1232
1233 src0 = lscCheckRegion(pred, execSize, execCtrl, src0);
1234
1235 if (srcType == Type_UQ || srcType == Type_Q) {
1236 if (hasInt64Add()) {
1237 return lscAdd64AosNative(pred, execSize, execCtrl, src0, addImm64);
1238 } else {
1239 return lscAdd64AosEmu(pred, execSize, execCtrl, src0, addImm64);
1240 }
1241 } else if ((int32_t)addImm64 != addImm64) {
1242 MUST_BE_TRUE(false, "<64b add must not use >32b imm off");
1243 } else if ((srcType == Type_UW || srcType == Type_W) &&
1244 (int16_t)addImm64 != addImm64)
1245 {
1246 MUST_BE_TRUE(false, "16b add must not use >16b imm off");
1247 }
1248
1249 // we can do this in one instruction
1250 G4_Declare *result = createTempVar(execSize, srcType, GRFALIGN);
1251 G4_DstRegRegion *dst = createDst(result->getRegVar(), srcType);
1252 const auto *srcRgn = execSize == g4::SIMD1 ? getRegionScalar() : getRegionStride1();
1253 G4_Operand *immOp = createImmWithLowerType(addImm64, srcType);
1254 createInst(
1255 duplicateOperand(pred),
1256 G4_add, nullptr, g4::NOSAT, execSize,
1257 dst, src0, immOp, Get_Gen4_Emask(execCtrl, execSize), true);
1258
1259 return createSrc(result->getRegVar(), 0, 0, srcRgn, srcType);
1260 }
1261
lscAdd64AosNative(G4_Predicate * pred,G4_ExecSize execSize,VISA_EMask_Ctrl execCtrl,G4_SrcRegRegion * srcReg64,int64_t addImm64)1262 G4_SrcRegRegion *IR_Builder::lscAdd64AosNative(
1263 G4_Predicate *pred,
1264 G4_ExecSize execSize,
1265 VISA_EMask_Ctrl execCtrl,
1266 G4_SrcRegRegion *srcReg64,
1267 int64_t addImm64)
1268 {
1269 if (addImm64 == 0)
1270 return srcReg64;
1271 // we can assume this is only called on >=PVC (has LSC and DG2 lacks native int64)
1272 const auto *srcRgn1 = execSize == g4::SIMD1 ? getRegionScalar() : getRegionStride1();
1273 const G4_Type srcType = srcReg64->getType();
1274 MUST_BE_TRUE(
1275 srcType == Type_UQ || srcType == Type_Q,
1276 "this function only supports Q/UQ types");
1277 G4_Declare *result = createTempVar(execSize, srcType, GRFALIGN);
1278 G4_DstRegRegion *dst =
1279 createDst(result->getRegVar(), 0, 0, 1, Type_Q);
1280 MUST_BE_TRUE(
1281 addImm64 >= std::numeric_limits<int32_t>::min() &&
1282 addImm64 <= std::numeric_limits<int32_t>::max(), "offset too big");
1283 G4_Imm *srcImm = createImm((int32_t)addImm64, Type_D);
1284 createInst(
1285 duplicateOperand(pred),
1286 G4_add, nullptr, g4::NOSAT, execSize,
1287 dst, srcReg64, srcImm, Get_Gen4_Emask(execCtrl, execSize), true);
1288
1289 return createSrc(result->getRegVar(), 0, 0, srcRgn1, srcReg64->getType());
1290 }
1291
G4_SrcRegRegion *IR_Builder::lscAdd64AosEmu(
    G4_Predicate *pred,
    G4_ExecSize execSize,
    VISA_EMask_Ctrl execCtrl,
    G4_SrcRegRegion *srcReg64,
    int64_t addImm64)
{
    // Emulated 64b AOS add for platforms without a native int64 add:
    // each Q/UQ lane is split into its lo/hi 32b halves and summed with
    // an addc (carry via acc0) followed by an add/add3 on the high half.
    if (addImm64 == 0)
        return srcReg64;

    const auto *srcRgn1 = execSize == g4::SIMD1 ? getRegionScalar() : getRegionStride1();
    const auto *srcRgn2 = execSize == g4::SIMD1 ? getRegionScalar() : getRegionStride2();
    int dstRgnHz2 = execSize == g4::SIMD1 ? 1 : 2;

    const G4_Type srcType = srcReg64->getType();
    MUST_BE_TRUE(
        srcType == Type_UQ || srcType == Type_Q,
        "this function only supports integer types");

    // Given REG64.K<1;1,0>:q we need to split this into the low and high
    // halves: REG32.(2*K)<2,1,0>:d and REG32.(2*K+1)<2,1,0>:d
    // (scalar gets scalar regions)
    //
    // These are lambdas because we have to extract these regions repeatedly
    // for each pass (walking them forward)
    auto getSrcReg32 = [&] (int pass, short evenOdd) {
        // walk the base register forward if the input is vector
        // (a scalar input re-reads the same lane every pass)
        int passRegOff = srcReg64->getRegion()->isScalar() ? 0 : 2 * pass;
        G4_SrcRegRegion *srcReg32 =
            createSrc(
                srcReg64->getBase(),
                srcReg64->getRegOff() + passRegOff,
                srcReg64->getSubRegOff()/2 + evenOdd,
                srcRgn2,
                Type_UD);
        return srcReg32;
    };

    // DST = SRC + IMM64
    // (W) addc (..|M0) TMP0<1> SRC.0<2> LO32(imm64) {AccWrEn}
    // (W) addX (..|M0) TMP1<1> SRC.1<2> [HI32(imm64)] acc0
    // (P) mov (..|MX) DST.0<2> TMP1.0<1> // mux it back out
    // (P) mov (..|MX) DST.1<2> TMP2.0<1>
    G4_Declare *result = createTempVar(execSize, srcType, GRFALIGN);
    //
    // execution sizes wider than the native size are split into several
    // passes of getNativeExecSize() lanes, walking the emask forward
    VISA_EMask_Ctrl passExecCtrl = execCtrl;
    const G4_ExecSize passExecSize = std::min<G4_ExecSize>(execSize, getNativeExecSize());
    const int passes = std::max<int>(1, execSize/getNativeExecSize());
    //
    // shared immediate operands
    G4_Imm *srcImmLo32 = createImm(addImm64 & 0xFFFFFFFF, Type_UD);
    uint32_t hi32Bits = (uint32_t)(addImm64 >> 32);
    // a zero high half lets us drop one source from the hi-add below
    G4_Imm *srcImmHi32 = (hi32Bits != 0) ? createImm(hi32Bits, Type_UD) : nullptr;
    //
    for (int pass = 0; pass < passes; pass++)
    {
        // e.g. someone tries to do a SIMD32 starting at M16
        MUST_BE_TRUE(passExecCtrl != vISA_NUM_EMASK, "invalid exec mask");
        //
        // low half: addc writes the carry into acc0 (AccWrEn)
        G4_Declare *TMP_LO32 = createTempVar(passExecSize, Type_UD, GRFALIGN);
        G4_DstRegRegion *dstAddcLo =
            createDst(TMP_LO32->getRegVar(), 0, 0, 1, Type_UD);
        G4_SrcRegRegion *srcAddcLo = getSrcReg32(pass, 0);
        G4_INST* addLoInst = createInst(
            duplicateOperand(pred),
            G4_addc, nullptr, g4::NOSAT, passExecSize,
            dstAddcLo, srcAddcLo, srcImmLo32,
            Get_Gen4_Emask(vISA_EMASK_M1_NM, passExecSize) | InstOpt_AccWrCtrl, true);
        G4_DstRegRegion *dstAcc0 = createDst(phyregpool.getAcc0Reg(), 0, 0, 1, Type_UD);
        addLoInst->setImplAccDst(dstAcc0);
        //
        // high half: hi(SRC) + carry (+ hi(imm) via add3 when nonzero)
        G4_Declare *TMP_HI32 = createTempVar(passExecSize, Type_UD, GRFALIGN);
        G4_DstRegRegion *dstAddHi =
            createDst(TMP_HI32->getRegVar(), 0, 0, 1, Type_UD);
        G4_SrcRegRegion *srcAddHi = getSrcReg32(pass, 1);
        G4_SrcRegRegion *srcAcc0 =
            createSrc(phyregpool.getAcc0Reg(), 0, 0, srcRgn1, Type_UD);
        if (srcImmHi32) {
            createInst(
                duplicateOperand(pred),
                G4_add3, nullptr, g4::NOSAT, passExecSize,
                dstAddHi, srcAcc0, srcAddHi, srcImmHi32,
                Get_Gen4_Emask(vISA_EMASK_M1_NM, passExecSize), true);
        } else {
            createInst(
                duplicateOperand(pred),
                G4_add, nullptr, g4::NOSAT, passExecSize,
                dstAddHi, srcAcc0, srcAddHi,
                Get_Gen4_Emask(vISA_EMASK_M1_NM, passExecSize), true);
        }
        //
        // interleave the lo/hi temporaries back into the AOS result;
        // these movs use the caller's emask (the adds above ran NoMask)
        G4_DstRegRegion *resultLo =
            createDst(result->getRegVar(), 2*pass, 0, dstRgnHz2, Type_UD);
        G4_SrcRegRegion *tmpLoSrc =
            createSrc(TMP_LO32->getRegVar(), 0, 0, srcRgn1, Type_UD);
        createInst(
            duplicateOperand(pred),
            G4_mov, nullptr, g4::NOSAT, passExecSize,
            resultLo, tmpLoSrc, nullptr, Get_Gen4_Emask(passExecCtrl, passExecSize), true);
        //
        G4_DstRegRegion *resultHi =
            createDst(result->getRegVar(), 2*pass, 1, dstRgnHz2, Type_UD);
        G4_SrcRegRegion *tmpHiSrc =
            createSrc(TMP_HI32->getRegVar(), 0, 0, srcRgn1, Type_UD);
        createInst(
            duplicateOperand(pred),
            G4_mov, nullptr, g4::NOSAT, passExecSize,
            resultHi, tmpHiSrc, nullptr, Get_Gen4_Emask(passExecCtrl, passExecSize), true);
        //
        // advance the emask to the next channel group for the next pass
        passExecCtrl = Get_Next_EMask(passExecCtrl, passExecSize);
    }

    return createSrc(result->getRegVar(), 0, 0, srcRgn1, srcReg64->getType());
}
1406
G4_SrcRegRegion *IR_Builder::lscMul64Aos(
    G4_Predicate *pred,
    G4_ExecSize execSize,
    VISA_EMask_Ctrl execCtrl,
    G4_SrcRegRegion *src0,
    int16_t mulImm)
{
    // 64b AOS multiply by a 16b immediate.  Only the multiply-by-one
    // identity is currently handled; any other factor asserts.
    if (mulImm == 1)
        return src0;

    MUST_BE_TRUE(false, "mul64-aos not supported yet");
    return nullptr;

    // NOTE(review): the reference implementation below is preserved for
    // future work.  Its final instruction in the mul/mach path emits
    // G4_mul where the surrounding pseudo-code comments say "add" —
    // verify before enabling.
    /*
    const auto *srcRgn1 = execSize == 1 ? getRegionScalar() : getRegionStride1();
    const auto *srcRgn2 = execSize == 1 ? getRegionScalar() : getRegionStride2();
    int dstRgnHz2 = execSize == 1 ? 1 : 2;

    // int64 vs 16b multiply with int32 math
    auto srcType = srcVar->getElemType();
    MUST_BE_TRUE(srcType == Type_UQ || srcType == Type_Q, "type should be 64b");
    //
    // either way below we need the accumulator, so we're limited to using
    // multiple passes to perform the math
    const auto passExecSize = std::min<unsigned>(execSize, getNativeExecSize());
    const int passes = std::max<int>(1, execSize/getNativeExecSize());

    G4_Declare *result = createTempVar(execSize, srcType, GRFALIGN);
    if (isPow2(mulImm)) {
        // e.g. SIMD32 when SIMD8 is max HW size requires four passes
        int shlAmt = intLog2(mulImm);
        VISA_EMask_Ctrl passExecCtrl = execCtrl;
        for (int pass = 0; pass < passes; pass++)
        {
            // e.g. someone tries to do a SIMD32 starting at M16
            MUST_BE_TRUE(passExecCtrl != vISA_NUM_EMASK, "invalid exec mask");
            //
            // shr (E|M0) TMP.1<1>:ud  SRC.0<2> 29
            // shl (E|M0) DST.0<2>:ud  SRC.0<2> 3
            // shl (E|M0) DST.1<2>:ud  SRC.0<2> 3
            // or  (E|M0) DST.1<2>:ud  DST.1<2> TMP
            int passInstOpt = Get_Gen4_Emask(passExecCtrl, passExecSize);
            G4_Declare *TMP = createTempVar(passExecSize, Type_UD, GRFALIGN);
            G4_DstRegRegion *dstTMP =
                createDst(TMP->getRegVar(), 0, 0, 1, Type_UD);
            G4_SrcRegRegion *srcLo32a =
                createSrcRegRegion(srcVar->getRegVar(), 2*pass, 0, srcRgn2, Type_UD);
            G4_Operand *shrImmAmt = createImm(32 - shlAmt, Type_W);
            createBinOp(
                duplicateOperand(pred),
                G4_shr, passExecSize,
                dstTMP, srcLo32a, shrImmAmt, passInstOpt);
            //
            G4_Operand *shlImmAmt = createImm(shlAmt, Type_W);
            G4_DstRegRegion *dstLo32 =
                createDst(result->getRegVar(), 2*pass, 0, dstRgnHz2, Type_UD);
            G4_SrcRegRegion *srcLo32b =
                createSrcRegRegion(srcVar->getRegVar(), 2*pass, 0, srcRgn2, Type_UD);
            createBinOp(
                duplicateOperand(pred),
                G4_shl, passExecSize, dstLo32, srcLo32b, shlImmAmt, passInstOpt);
            //
            G4_DstRegRegion *dstHi32a =
                createDst(result->getRegVar(), 2*pass, 1, dstRgnHz2, Type_UD);
            G4_SrcRegRegion *srcHi32a =
                createSrcRegRegion(srcVar->getRegVar(), 2*pass, 1, srcRgn2, Type_UD);
            createBinOp(
                duplicateOperand(pred),
                G4_shl, passExecSize, dstHi32a, srcHi32a, shlImmAmt, passInstOpt);
            //
            G4_DstRegRegion *dstHi32b =
                createDst(result->getRegVar(), 2*pass, 1, dstRgnHz2, Type_UD);
            G4_SrcRegRegion *srcHi32b =
                createSrcRegRegion(srcVar->getRegVar(), 2*pass, 1, srcRgn2, Type_UD);
            G4_SrcRegRegion *srcTMP =
                createSrcRegRegion(TMP->getRegVar(), 0, 1, srcRgn1, Type_UD);
            createBinOp(
                duplicateOperand(pred),
                G4_or, passExecSize, dstHi32b, srcHi32b, srcTMP,
                passInstOpt);

            passExecCtrl = Get_Next_EMask(execCtrl, (int)passExecSize);
        }
    } else {
        // have to use mul/mach
        // SOA version
        //     mul (8|M0)  DST_LO32<1>:ud  SRC.lo32:ud  imm16:uw
        // (W) mul (8|M0)  acc0.0<1>:ud    SRC.lo32:ud  imm16:uw
        //     mach (8|M0) TMP0.0<1>:d     SRC.lo32:ud  imm16:ud {AccWrEn}
        //     mul (8|M0)  TMP1.0<1>:d     SRC.hi32:d   imm16:uw
        //     add (8|M0)  DST_HI32<1>:d   TMP0:d       TMP:d
        // AOS version: pass execution size is int sizeof(acc0), with pass offset PO
        //     mul (P|PO)  DST.0<2>:ud   SRC.0<2>:ud  imm16:uw
        // (W) mul (P|M0)  acc0.0<1>:ud  SRC.0<2>:ud  imm16:uw
        // (W) mach (P|M0) TMP0<1>:d     SRC.0<2>:ud  imm16:ud {AccWrEn}
        //     mul (P|PO)  TMP1<1>:d     SRC.1<2>:d   imm16:uw
        //     add (P|PO)  DST.1<2>:d    TMP0:d       TMP1:d
        VISA_EMask_Ctrl passExecCtrl = execCtrl;
        G4_Operand *srcImm16 = createImm(mulImm, Type_UW);

        for (int pass = 0; pass < passes; pass++)
        {
            // e.g. someone tries to do a SIMD32 starting at M16
            MUST_BE_TRUE(passExecCtrl != vISA_NUM_EMASK, "invalid exec mask");
            //
            G4_DstRegRegion *dstMul1 =
                createDst(result->getRegVar(), 2*pass, 0, dstRgnHz2, Type_UD);
            G4_SrcRegRegion *srcMul1 =
                createSrcRegRegion(srcVar->getRegVar(), 2*pass, 0, srcRgn2, Type_UD);
            createInst(
                duplicateOperand(pred),
                G4_mul, nullptr, false, passExecSize,
                dstMul1, srcMul1, srcImm16, Get_Gen4_Emask(passExecCtrl, passExecSize));
            //
            G4_Declare *TMP0 = createTempVar(passExecSize, Type_UD, GRFALIGN);
            G4_DstRegRegion *dstMul2 =
                createDst(phyregpool.getAcc0Reg(), 0, 0, 1, Type_UD);
            G4_SrcRegRegion *srcMul2 = duplicateOperand(srcMul1);
            createInst(
                duplicateOperand(pred),
                G4_mul, nullptr, false, passExecSize,
                dstMul2, srcMul2, srcImm16,
                Get_Gen4_Emask(vISA_EMASK_M1_NM, passExecSize));
            //
            G4_DstRegRegion *dstMach3 =
                createDst(TMP0->getRegVar(), 0, 0, 1, Type_D);
            G4_SrcRegRegion *srcMach3 = duplicateOperand(srcMul1);
            G4_INST *i = createInst(
                duplicateOperand(pred),
                G4_mach, nullptr, false, passExecSize,
                dstMach3, srcMach3, srcImm16,
                Get_Gen4_Emask(vISA_EMASK_M1_NM, passExecSize) | InstOpt_AccWrCtrl);
            G4_SrcRegRegion *srcImplAcc =
                createSrcRegRegion(phyregpool.getAcc0Reg(), 0, 0, srcRgn1, Type_D);
            i->setImplAccSrc(srcImplAcc);
            //
            G4_Declare *TMP1 = createTempVar(passExecSize, Type_D, GRFALIGN);
            G4_DstRegRegion *dstMul4 =
                createDst(TMP1->getRegVar(), 0, 0, 1, Type_D);
            G4_SrcRegRegion *srcMul4 =
                createSrcRegRegion(srcVar->getRegVar(), 2*pass, 1, srcRgn2, Type_D);
            createInst(
                duplicateOperand(pred),
                G4_mul, nullptr, false, passExecSize,
                dstMul4, srcMul4, srcImm16, Get_Gen4_Emask(passExecCtrl, passExecSize));
            //
            G4_DstRegRegion *dstAdd5 =
                createDst(result->getRegVar(), 2*pass, 1, dstRgnHz2, Type_D);
            G4_SrcRegRegion *src0Add5 =
                createSrcRegRegion(TMP0->getRegVar(), 0, 1, srcRgn1, Type_D);
            G4_SrcRegRegion *src1Add5 =
                createSrcRegRegion(TMP1->getRegVar(), 0, 1, srcRgn1, Type_D);
            createInst(
                duplicateOperand(pred),
                G4_mul, nullptr, false, passExecSize,
                dstAdd5, src0Add5, src1Add5, Get_Gen4_Emask(passExecCtrl, passExecSize));
            //
            passExecCtrl = Get_Next_EMask(execCtrl, (int)passExecSize);
        }
    }
    return result;
    */
}
1570