1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2020-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "BuildIR.h"
10 #include "../Timer.h"
11 
12 using namespace vISA;
13 
14 
// Exec size for an OWord block send: 1 or 2 OWords move BLOCK_SIZE*4 DWords
// (SIMD4/SIMD8); anything larger always uses SIMD16.
#define FIX_OWORD_SEND_EXEC_SIZE(BLOCK_SIZE)(((BLOCK_SIZE) > 2)? 16: (BLOCK_SIZE*4))
16 
17 
// Composes the raw message-descriptor bits for a scattered read/write:
// bits [18:14] message type, [11:10] block count, bit 9 set (legacy mode
// bit), bit 8 SIMD8/SIMD16 select.
static uint32_t buildDescForScatter(
    uint32_t msgType, VISA_SVM_Block_Num numBlocks, MDC_SM2 simdMode)
{
    const uint32_t typeBits = (msgType & 0x1F) << 14;
    const uint32_t blockBits = static_cast<uint32_t>(numBlocks) << 10;
    const uint32_t modeBits =
        (1u << 9) | (static_cast<uint32_t>(simdMode) << 8);
    return typeBits | blockBits | modeBits;
}
27 
28 
isMessageHeaderOptional(G4_Operand * surface,G4_Operand * Offset) const29 bool IR_Builder::isMessageHeaderOptional(
30     G4_Operand *surface, G4_Operand *Offset) const
31 {
32     // Message header is require for T255 stateless surface on pre-SKL devices
33     // as a workaround for HW issue.
34     if (needsA32MsgHeader() && isStatelessSurface(surface))
35     {
36         return false;
37     }
38 
39     // Message Header is optional when offset is 0.
40     // When GlobalOffset is 0, message header is optional.
41     // "If the header is not present, behavior is as if the message was sent
42     // with all fields in the header set to zero."
43     return Offset->isImm() && Offset->asImm()->isZero();
44 }
45 
translateVISAQWGatherInst(VISA_Exec_Size execSize,VISA_EMask_Ctrl eMask,G4_Predicate * pred,VISA_SVM_Block_Num numBlocks,G4_SrcRegRegion * surface,G4_SrcRegRegion * addresses,G4_DstRegRegion * dst)46 int IR_Builder::translateVISAQWGatherInst(
47     VISA_Exec_Size execSize,
48     VISA_EMask_Ctrl eMask,
49     G4_Predicate* pred,
50     VISA_SVM_Block_Num numBlocks,
51     G4_SrcRegRegion* surface,
52     G4_SrcRegRegion* addresses,
53     G4_DstRegRegion* dst)
54 {
55     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
56 
57     VISA_Exec_Size instExecSize = execSize;
58     execSize = roundUpExecSize(execSize);
59 
60     unsigned exSize = Get_VISA_Exec_Size(execSize);
61     G4_ExecSize instExSize = G4_ExecSize(Get_VISA_Exec_Size(instExecSize));
62     unsigned int instOpt = Get_Gen4_Emask(eMask, instExSize);
63     uint32_t messageLength = (exSize / 8);
64     uint32_t responseLength = Get_Common_ISA_SVM_Block_Num(numBlocks) * 2 * (exSize / 8);
65 
66     uint32_t desc = buildDescForScatter(DC_QWORD_SCATTERED_READ, numBlocks,
67         (execSize == EXEC_SIZE_8 ? MDC_SM2_SIMD8 : MDC_SM2_SIMD16));
68 
69     createSendInst(
70         pred, dst, addresses, messageLength, responseLength, instExSize, desc,
71         SFID::DP_DC0, false, SendAccess::READ_ONLY, surface, nullptr, instOpt, false);
72 
73     return VISA_SUCCESS;
74 }
75 
// Lowers a vISA QW (64-bit element) scatter into a DC0
// qword-scattered-write send.  The payload (addresses + data) is built by
// preparePayload(); when the platform supports split sends the two parts
// stay separate, otherwise they are merged into one contiguous payload.
int IR_Builder::translateVISAQWScatterInst(
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    G4_Predicate* pred,
    VISA_SVM_Block_Num numBlocks,
    G4_SrcRegRegion* surface,
    G4_SrcRegRegion* addresses,
    G4_SrcRegRegion* src)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    // The send keeps the requested exec size; the message is built with the
    // size rounded up to a legal SIMD mode.
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    G4_ExecSize exSize = toExecSize(execSize);
    G4_ExecSize instExSize = toExecSize(instExecSize);
    unsigned int instOpt = Get_Gen4_Emask(eMask, instExSize);
    bool useSplitSend = useSends();

    PayloadSource sources[2]; // Maximal 2 sources, optional header + offsets
    unsigned len = 0;

    // Source 0: per-lane addresses.
    sources[len].opnd = addresses;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;

    unsigned numElems = Get_Common_ISA_SVM_Block_Num(numBlocks);

    // Source 1: the data to write — numElems elements per lane, hence the
    // widened exec size.
    sources[len].opnd = src;
    sources[len].execSize = G4_ExecSize(exSize * numElems);
    sources[len].instOpt = instOpt;
    ++len;

    G4_SrcRegRegion *msgs[2] {0, 0};
    unsigned sizes[2] {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    uint32_t desc = buildDescForScatter(DC_QWORD_SCATTERED_WRITE, numBlocks,
        execSize == EXEC_SIZE_8 ? MDC_SM2_SIMD8 : MDC_SM2_SIMD16);

    G4_DstRegRegion* dst = createNullDst(Type_UD);
    // preparePayload leaves msgs[1] null when everything was copied into a
    // single payload; emit a plain send then, a split send otherwise.
    if (msgs[1] == 0)
    {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(
            pred, dst,
            msgs[0], sizes[0],
            0, instExSize,
            desc, SFID::DP_DC0,
            false,
            SendAccess::WRITE_ONLY,
            surface, nullptr,
            instOpt, false);
    }
    else
    {
        createSplitSendInst(
            pred, dst,
            msgs[0], sizes[0],
            msgs[1], sizes[1],
            0, instExSize,
            desc, SFID::DP_DC0,
            false,
            SendAccess::WRITE_ONLY,
            surface, nullptr,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
147 
148 // if surface is PRED_SURF_255, lower it to PRED_SURF_253 so that it's non IA-coherent
149 // the surface is not changed otherwise
static G4_Operand* lowerSurface255To253(G4_Operand* surface, IR_Builder& builder)
{
    // disable due to OCL SVM atomics regression
#if 0
    if (surface && surface->isImm() && surface->asImm()->getImm() == PREDEF_SURF_255)
    {
        return builder.createImm(PREDEF_SURF_253, Type_UW);
    }
    else
#endif
    {
        // With the lowering disabled above, the surface is always returned
        // unchanged.
        return surface;
    }
}
164 
BuildStatelessSurfaceMessageHeader(IR_Builder * IRB,G4_Declare * Header)165 static void BuildStatelessSurfaceMessageHeader(IR_Builder *IRB, G4_Declare *Header)
166 {
167     // No need to mask fft id when scratch surface is bindless as
168     // A32 accesses are guaranteed to not be scratch accesses.
169     if (IRB->hasScratchSurface())
170     {
171         // Clear header
172         // Rx (8) = 0
173         auto DstOpnd = IRB->createDst(Header->getRegVar(), 0, 0, 1, Type_UD);
174         auto SrcImm0 = IRB->createImm(0, Type_UD);
175         IRB->createMov(g4::SIMD8, DstOpnd, SrcImm0, InstOpt_WriteEnable, true);
176         return;
177     }
178     // For A32, clearing off scratch space offset or Buffer Base Address is
179     // always required once header is present.
180     G4_Type ElemTy = Header->getElemType();
181 
182     // R0.5<31:10> is defined as Scratch Space Offset.
183     // R0.5<8:0> is defined as FF Thread ID (FFTID) in SKL+ devices.
184     // R0.5<7:0> is defined as FF Thread ID (FFTID) in pre-SKL devices.
185     // We increase the bit range to <9:0> to copy reserved bits as well.
186     const unsigned FFTID_Mask = 0x3ff;
187 
188     // Rx.5[31:0] = 0 | R0.5[9:0]
189     G4_DstRegRegion *DstOpnd = IRB->createDst(Header->getRegVar(), 0, 5, 1, ElemTy);
190     // R0.5
191     G4_SrcRegRegion *SrcOpnd = IRB->createSrc(
192         IRB->getBuiltinR0()->getRegVar(), 0, 5,
193         IRB->getRegionScalar(), ElemTy);
194     // Mask
195     G4_Imm *Mask = IRB->createImm(FFTID_Mask, Type_UD);
196     IRB->createBinOp(G4_and, g4::SIMD1, DstOpnd, SrcOpnd, Mask,
197         InstOpt_WriteEnable, true);
198 }
199 
200 
// TODO: remove
// In-place sets descriptor bits 16-13 (dataport message type) on `dest`.
#define SET_DATAPORT_MESSAGE_TYPE(dest, value)\
    dest |= value << 14;
204 
setOwordForDesc(uint32_t desc,int numOword,bool isSLM) const205 uint32_t IR_Builder::setOwordForDesc(uint32_t desc, int numOword, bool isSLM) const
206 {
207     static const uint32_t MESSAGE_SPECIFIC_CONTROL = 8;
208     switch (numOword)
209     {
210     case 1:
211         return desc;
212     case 2:
213         return desc | (0x2 << MESSAGE_SPECIFIC_CONTROL);
214     case 4:
215         return desc | (0x3 << MESSAGE_SPECIFIC_CONTROL);
216     case 8:
217         return desc | (0x4 << MESSAGE_SPECIFIC_CONTROL);
218     case 16:
219         assert(isSLM && has16OWordSLMBlockRW() && "16OWord block r/w not supported");
220         return desc | (0x5 << MESSAGE_SPECIFIC_CONTROL);
221     default:
222         /// TODO(move to verifier): default: ASSERT_USER(false, "OWord block size must be 1/2/4/8.");
223         return desc;
224     }
225 }
226 
227 
228 /*
229 * Translates OWord Block read CISA inst.
230 *
231 * For GT, assume size is 8 then the code should look like
232 *
233 * .declare  VX Base=m ElementSize=4 Type=ud Total=8
234 * .declare  VY Base=r ElementSize=4 Type=ud Total=8
235 *
236 * mov  (8)     VX(0,0)<1>,  r0:ud
237 * mov  (1)     VX(0,2)<1>,  P
238 * send (8)     VY(0,0)<1>,  VX(0,0),    0x5,  0x02180200
239 * mov  (8)     v(0,0)<1>,   VY(0,0)
240 *
241 * P: M0.2 in the message header (Global offset)
242 *
243 * 0x5 == 0 (Not the EOT)
244 *
245 * 0x02180200 == Bit 31-29: 000 (Reserved)
246 *               Bit 28-25: 0001 (Msg. leng. = 1)
247 *               Bit 24-20: 00001 (Response msg. leng. = 1)
248 *               Bit 19:    1 (Header present)
249 *               Bit 18:    0 (Ignored)
250 *               Bit 17:    0 (Send write commit message; ignored for read message
251 *               Bit 16-13: 0000 (Msg. type = OWord block read - for Render Cache)
252 *               Bit 12-8:  00010 (Block size = 2 OWords) - can only be 1/2/4/8 for sampler/render cache
253 *               Bit 7-0:   00000000 + I (Binding table index)
254 *
255 */
// Lowers a vISA OWord block load (ISA_OWORD_LD / ISA_OWORD_LD_UNALIGNED)
// into a DC0 block-read send with an explicitly built one-GRF message
// header.  See the block comment above for the generated sequence and the
// descriptor bit layout.
int IR_Builder::translateVISAOwordLoadInst(
    ISA_Opcode opcode,
    bool modified,
    G4_Operand* surface,
    VISA_Oword_Num size,
    G4_Operand* offOpnd,
    G4_DstRegRegion* dstOpnd)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    surface = lowerSurface255To253(surface, *this);

    unsigned num_oword = Get_VISA_Oword_Num(size);
    bool unaligned = (opcode == ISA_OWORD_LD_UNALIGNED);

    // create dcl for VX (the one-GRF message header)
    G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);

    if (isStatelessSurface(surface))
    {
        // Build stateless surface message header.
        BuildStatelessSurfaceMessageHeader(this, dcl);
    }

    /* mov (1)      VX(0,2)<1>,    P  */
    if (unaligned && (kernel.major_version == 3 && kernel.minor_version <= 1))
    {
        // for vISA3.1 and earlier
        // the offset for unaligned OW load is in unit of DW, tranlate it into BYTE.
        if (offOpnd->isImm())
        {
            // imm type must be UD as the result of shift could overflow word type
            G4_Imm *new_src_opnd1 = createImm(
                offOpnd->asImm()->getInt() << 2, Type_UD);
            createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, new_src_opnd1, true);
        }
        else
        {
            // NOTE: this local intentionally shadows the 'dstOpnd' parameter;
            // it addresses header DWord 2, not the load's destination.
            G4_DstRegRegion* dstOpnd = createDst(dcl->getRegVar(), 0, 2, 1, dcl->getElemType());
            createBinOp(G4_shl, g4::SIMD1, dstOpnd, offOpnd,
                createImm(2, Type_UW), InstOpt_WriteEnable, true);
        }
    }
    else
    {
        dcl->setCapableOfReuse();
        createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, offOpnd, true);
    }
    // send's operands preparation
    G4_SrcRegRegion* payload = createSrcRegRegion(dcl, getRegionStride1());
    G4_DstRegRegion* d = checkSendDst(dstOpnd->asDstRegRegion());

    uint32_t temp = 0;

    if (unaligned)
    {
        // Unaligned loads select the DC_ALIGNED_OWORD_BLOCK_READ message
        // type (the dataport's unaligned variant despite the name — TODO
        // confirm against the PRM); aligned loads keep message type 0.
        SET_DATAPORT_MESSAGE_TYPE(temp, DC_ALIGNED_OWORD_BLOCK_READ)
    }

    // Set bit 12-8 for the message descriptor
    temp = setOwordForDesc(temp, num_oword, IsSLMSurface(surface));

    // For >2 OWord responses the destination is retyped to Type_W with the
    // subregister offset rescaled — presumably to keep the send's dst
    // region legal at the wider exec size; TODO confirm the motivation.
    if (num_oword > 2)
    {
        // redefine the type and offset of post dst.
        if ((d->getType() != Type_W) &&
            (d->getType() != Type_UW)) {
            short new_SubRegOff = dstOpnd->asDstRegRegion()->getSubRegOff();
            if (dstOpnd->getRegAccess() == Direct) {
                // Rescale the subreg offset from the original element size
                // to word units.
                new_SubRegOff = (dstOpnd->asDstRegRegion()->getSubRegOff() * dstOpnd->getTypeSize()) / TypeSize(Type_W);
            }
            G4_DstRegRegion new_dst(
                dstOpnd->getRegAccess(),
                dstOpnd->asDstRegRegion()->getBase(),
                dstOpnd->asDstRegRegion()->getRegOff(),
                new_SubRegOff,
                1,
                Type_W);
            d = createDstRegRegion(new_dst);
        }
    }

    SFID tf_id =  SFID::DP_DC0;

    G4_ExecSize send_exec_size = G4_ExecSize(FIX_OWORD_SEND_EXEC_SIZE(num_oword));
    bool forceSplitSend = shouldForceSplitSend(surface);

    if (!forceSplitSend)
    {
        // Plain send: 1-GRF header payload; response length is
        // ceil(num_oword * 16 bytes / GRF size).
        createSendInst(
            NULL, d,
            payload,
            1,
            (num_oword * 16 + getGRFSize() - 1) / getGRFSize(),
            send_exec_size,
            temp,
            tf_id,
            true,
            SendAccess::READ_ONLY,
            surface,
            NULL,
            InstOpt_WriteEnable,
            false);
    }
    else {
        // Some surfaces require the split-send form (see
        // shouldForceSplitSend); src1 is a null source of zero length.
        G4_SrcRegRegion *m0 = createSrcRegRegion(dcl, getRegionStride1());
        createSplitSendInst(
            NULL, d, m0, 1,
            createNullSrc(Type_UD), 0,
            (num_oword * 16 + getGRFSize() - 1) / getGRFSize(),
            send_exec_size,
            temp,
            tf_id,
            true,
            SendAccess::READ_ONLY,
            surface,
            nullptr,
            InstOpt_WriteEnable,
            false);
    }

    return VISA_SUCCESS;
}
380 
381 /*
382 * Translates OWord Block write intrinsic.
383 *
384 * write(I, P, vector<int, S> v)
385 *
386 * For GT, assume S = 8 then the code should look like
387 *
388 * .declare  VX Base=m ElementSize=4 Type=ud Total=16
389 * .declare  VY Base=m ElementSize=4 Type=ud Total=8  ALIAS(VX,8)
390 *
391 * mov  (8)     VX(0,0)<1>,  r0:ud
392 * mov  (8)     VY(0,0)<1>,  v       // mov  (8)     VX(1,0)<1>,  v
393 * mov  (1)     VX(0,2)<2>,  P
394 * send (8)     null<1>,  VX(0,0),  0x5,   0x04090200
395 *
396 * P: M0.2 in the message header (Global offset)
397 *
398 * 0x5 == 0 (Not the EOT)
399 *        0101 (Target Function ID: DP Render Cache)
400 *
401 * 0x04090200 == Bit 31-29: 000 (Reserved)
402 *               Bit 28-25: 0010 (Msg. leng. = 2)
403 *               Bit 24-20: 00000 (Response msg. leng. = 0)
404 *               Bit 19:    1 (Header present)
405 *               Bit 18:    0 (Ignored)
406 *               Bit 17:    0 (Send write commit message
407 *               Bit 16-13: 1000 (Msg. type = OWord block read - for Render Cache)
408 *               Bit 12-8:  00010 (Block size = 2 OWords) - can only be 1/2/4/8 for sampler/render cache
409 *               Bit 7-0:   00000000 + I (Binding table index)
410 *
411 */
// Lowers a vISA OWord block store into a DC0 block-write send.  With
// split-send support the header and data travel as separate payload
// halves; otherwise a single contiguous payload (header GRF followed by
// the data GRFs) is assembled.  See the block comment above for the
// generated sequence and descriptor layout.
int IR_Builder::translateVISAOwordStoreInst(
    G4_Operand* surface,
    VISA_Oword_Num size,
    G4_Operand* offOpnd,
    G4_SrcRegRegion* srcOpnd)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    surface = lowerSurface255To253(surface, *this);

    unsigned num_oword = Get_VISA_Oword_Num(size);
    unsigned obj_size = num_oword * 16; // size of obj in bytes

    // Message type (OWord block write) goes in descriptor bits 16-13.
    unsigned funcCtrl = DC_OWORD_BLOCK_WRITE << 14;

    // GRFs occupied by the data payload: ceil(bytes / GRF size).
    uint32_t payloadGRFSize = (num_oword * 16 + getGRFSize() - 1) / getGRFSize();

    // Set bit 12-8 for the message descriptor
    funcCtrl = setOwordForDesc(funcCtrl, num_oword, IsSLMSurface(surface));
    bool forceSplitSend = shouldForceSplitSend(surface);
    if (forceSplitSend || useSends())
    {
        // Split send: src0 = 1-GRF header, src1 = the data (srcOpnd).
        G4_Declare *headerDcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);

        if (isStatelessSurface(surface))
        {
            // Build stateless surface message header.
            BuildStatelessSurfaceMessageHeader(this, headerDcl);
        }

        /* mov (1)     VX(0,2)<1>,   P  */
        createMovInst(headerDcl, 0, 2, g4::SIMD1, nullptr, nullptr, offOpnd, true);

        unsigned msgDesc = funcCtrl;
        unsigned extMsgLength = payloadGRFSize;
        uint16_t extFuncCtrl = 0;

        // message length = 1, response length = 0, header present = 1
        msgDesc += (1 << getSendMsgLengthBitOffset()) + (1 << getSendHeaderPresentBitOffset());

        G4_SendDescRaw* desc = createSendMsgDesc(msgDesc, 0, 1, SFID::DP_DC0,
            extMsgLength, extFuncCtrl, SendAccess::WRITE_ONLY, surface);

        G4_ExecSize sendSize = G4_ExecSize(FIX_OWORD_SEND_EXEC_SIZE(num_oword));

        G4_SrcRegRegion* src0 = createSrcRegRegion(headerDcl, getRegionStride1());
        G4_DstRegRegion* dst = createNullDst(sendSize > 8 ? Type_UW: Type_UD);

        createSplitSendInst(
            nullptr, dst, src0, srcOpnd, sendSize, desc, InstOpt_WriteEnable, false);
    }
    else
    {
        // Single payload: header GRF + obj_size/4 DWords of data.
        uint32_t temp =  obj_size/TypeSize(Type_UD) + GENX_DATAPORT_IO_SZ;

        G4_Declare *dcl = createSendPayloadDcl(temp, Type_UD);

        /* mov  (c*r)    VX(1,0)<1>,  V */
        temp =  obj_size/TypeSize(Type_UD);

        createMovSendSrcInst(dcl, 1, 0, temp, srcOpnd, InstOpt_WriteEnable);

        if (isStatelessSurface(surface)) {
            // Build stateless surface message header.
            BuildStatelessSurfaceMessageHeader(this, dcl);
        } else {
            // Copy R0 header.
            createMovR0Inst(dcl, 0, 0, true);
        }

        /* mov (1)     VX(0,2)<1>,   P  */
        createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, offOpnd, true);

        // send's operands preparation
        /* Size of whole operand in UINT elements */
        G4_SrcRegRegion* payload = createSrcRegRegion(dcl, getRegionStride1());

        unsigned send_size = FIX_OWORD_SEND_EXEC_SIZE(num_oword);
        G4_DstRegRegion *post_dst_opnd = createNullDst(send_size > 8 ? Type_UW: Type_UD);

        // Message length counts the header GRF plus the data GRFs.
        createSendInst(
            NULL,
            post_dst_opnd,
            payload,
            payloadGRFSize + 1,
            0,
            G4_ExecSize(send_size),
            funcCtrl,
            SFID::DP_DC0,
            true,
            SendAccess::WRITE_ONLY,
            surface,
            NULL,
            InstOpt_WriteEnable,
            false);
    }

    return VISA_SUCCESS;
}
511 
// Indexed by VISA_Exec_Size (EXEC_SIZE_1..EXEC_SIZE_32): number of lanes.
static const uint8_t mapExecSizeToNumElts[6] = {1, 2, 4, 8, 16, 32};
513 
514 /*
515 * Translates scattered read intrinsic.
516 *
517 * For GT, assume N = 8 then the code should look like
518 *
519 * .declare  VX Base=m ElementSize=4 Type=ud Total=16
520 * .declare  VY Base=r ElementSize=4 Type=ud Total=8
521 *
522 * mov  (8)     VX(0,0)<1>,  r0:ud
523 * mov  (1)     VX(0,2)<1>,  P
524 * mov  (8)     VX(1,0)<1>,  E
525 * send (8)     VY(0,0)<1>,  VX(0,0),    0x5,  0x0418C200
526 *
527 * P: M0.2 in the message header (Global offset)
528 * E: M1 in the message payload (Element offsets)
529 * 0x5 == 0 (Not the EOT)
530 *        0101 (Target Function ID: DP Render Cache)
531 *
532 * 0x0418C200 == Bit 31-29: 000 (Reserved)
533 *               Bit 28-25: 0010 (Msg. leng. = 2)
534 *               Bit 24-20: 00001 (Response msg. leng. = 1)
535 *               Bit 19:    1 (Header present)
536 *               Bit 18:    0 (Ignored)
537 *               Bit 17:    0 (Send write commit message; ignored for read message
538 *               Bit 16-13: 0110 (Msg. type = DWord Scattered read - for Render Cache)
539 *               Bit 12-10: 010 Specifies the data size for each slot. 0: 1 byte; 1: 2 bytes; 2: 4 bytes; 3: Reserved
540 *               Bit 9-8:  00 (Block size = 8 DWords)
541 *               Bit 7-0:   00000000 + I (Binding table index)
542 *
543 */
// Lowers a vISA gather (scattered read) into a DC0 byte- or
// dword-scattered-read send.  DWord gathers from SLM are redirected to
// untyped gather4 (R channel only); WORD gathers — and SLM gathers other
// than BYTE — are converted to BYTE gathers with offsets scaled to byte
// units.  See the block comment above for the generated sequence and the
// descriptor bit layout.
int IR_Builder::translateVISAGatherInst(
    VISA_EMask_Ctrl emask,
    bool modified,
    GATHER_SCATTER_ELEMENT_SIZE eltSize,
    VISA_Exec_Size executionSize,
    G4_Operand* surface,
    G4_Operand* gOffOpnd,
    G4_SrcRegRegion* eltOffOpnd,
    G4_DstRegRegion* dstOpnd)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    surface = lowerSurface255To253(surface, *this);

    // Before GEN10, we translate DWORD GATHER on SLM to untyped GATHER4 on
    // SLM with only R channel enabled. The later is considered more
    // efficient without recalculating offsets in BYTE.
    if (eltSize == GATHER_SCATTER_DWORD && IsSLMSurface(surface)) {
        return translateVISAGather4Inst(emask, modified,
            ChannelMask::createFromAPI(CHANNEL_MASK_R),
            executionSize, surface, gOffOpnd,
            eltOffOpnd, dstOpnd);
    }

    G4_ExecSize exsize = G4_ExecSize(Get_VISA_Exec_Size(executionSize));
    unsigned int instOpt = Get_Gen4_Emask(emask, exsize);
    bool headerLess = isMessageHeaderOptional(surface, gOffOpnd);
    // Element size in gather/scatter message. Initially, we assume it's the
    // same as the request.
    GATHER_SCATTER_ELEMENT_SIZE msgEltSize = eltSize;

    // SLM access
    //              HEADLESS    BYTE    WORD    DWORD
    // BDW          Opt         YES     NO      NO
    // SKL          Req         YES     NO      NO
    // CNL          Req         YES     NO      YES

    G4_Predicate* pred = NULL; // for SIMD1 gather
    uint8_t numElt = mapExecSizeToNumElts[executionSize];
    // we need to treat simd1 as simd8 in several places during code gen
    uint8_t effectiveNumElt = (numElt == 1 ? 8 : numElt);

    if (!headerLess && noSLMMsgHeader() && IsSLMSurface(surface))
    {
        // From SKL, SLM messages forbid message header. Recalculate offset by
        // adding global offset and force headerLess.
        G4_Declare *dcl = createSendPayloadDcl(numElt, eltOffOpnd->getType());
        dcl->setSubRegAlign(GRFALIGN);
        G4_DstRegRegion *newEltOffOpnd = createDstRegRegion(dcl, 1);
        createBinOp(G4_add, G4_ExecSize(numElt), newEltOffOpnd, eltOffOpnd, gOffOpnd, instOpt, true);
        eltOffOpnd = createSrcRegRegion(dcl, numElt == 1 ? getRegionScalar() : getRegionStride1());
        headerLess = true;
    }

    bool useSplitSend = useSends();
    // When header is not required, split-send is not needed as there's only
    // one part in the message. When header is present, we will split the
    // message as (header, offset).
    if (headerLess)
        useSplitSend = false;

    G4_Declare *header = 0;
    G4_Declare *offset = createSendPayloadDcl(numElt, Type_UD);
    offset->setSubRegAlign(GRFALIGN);

    if (useSplitSend)
    {
        ASSERT_USER(!headerLess, "SplitSend should not be used when header is not required!");
        // Without header, it's unnecessary to split the message.
        header = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
    }
    else if (!headerLess)
    {
        // Single payload: offsets alias the GRF(s) right after the header.
        header = createSendPayloadDcl(GENX_DATAPORT_IO_SZ + effectiveNumElt, Type_UD);
        offset->setAliasDeclare(header, numEltPerGRF<Type_UB>());
    }

    G4_SrcRegRegion* msgSrcOpnd = NULL;

    if (headerLess)
    {
        ASSERT_USER(!header, "'header' should not be allocated when header is not required!");

        if (eltSize == GATHER_SCATTER_WORD ||
            (eltSize != GATHER_SCATTER_BYTE && IsSLMSurface(surface)))
        {
            // Use byte gather for WORD gather as well as SLM surfaces (only supports byte gather)
            // need a shift to make the offset to be byte offset
            // shl (8) tmp<1>:ud elt_off<8;8,1>:ud 0x2:uw
            // Don't do this for Dword because we use the dword scatter message instead
            G4_DstRegRegion* tmpDstOpnd = createDstRegRegion(offset, 1);
            createBinOp(G4_shl, G4_ExecSize(numElt), tmpDstOpnd, eltOffOpnd,
                createImm(unsigned(eltSize), Type_UD), instOpt, true);
            msgSrcOpnd = createSrcRegRegion(offset, getRegionStride1());
            msgEltSize = GATHER_SCATTER_BYTE;
        }
        else
        {
            msgSrcOpnd = eltOffOpnd;
        }
    }
    else
    {
        if (isStatelessSurface(surface)) {
            // Build stateless surface message header.
            BuildStatelessSurfaceMessageHeader(this, header);
        } else {
            // Copy R0 header.
            createMovR0Inst(header, 0, 0, true);
        }

        G4_DstRegRegion* dst1_opnd = createDst(offset->getRegVar(), 0, 0, 1, offset->getElemType());

        if (eltSize == GATHER_SCATTER_WORD || IsSLMSurface(surface))
        {
            // For non-SLM surface, WORD gather/scatter has no hardware
            // support and must be translated into BYTE gather/scatter.
            //
            // SLM surface supports only BYTE gather/scatter
            // support and also needs translating into BYTE gather/scatter.
            //
            /* mov (1)     VX(0,2)<1>,   P  */
            if (gOffOpnd->isImm())
            {
                // Scale the immediate global offset to bytes (x2 for WORD,
                // x4 for DWORD).
                G4_Imm *new_src_opnd1 = createImm(
                    gOffOpnd->asImm()->getInt() * (eltSize == GATHER_SCATTER_WORD ? 2 : 4),
                    gOffOpnd->getType());
                createMovInst(header, 0, 2, g4::SIMD1, NULL, NULL, new_src_opnd1, true);
            }
            else
            {
                G4_DstRegRegion* dst2_opnd = createDst(header->getRegVar(), 0, 2, 1, header->getElemType());

                createBinOp(G4_shl, g4::SIMD1, dst2_opnd, gOffOpnd,
                    createImm((unsigned)eltSize, Type_UD), InstOpt_WriteEnable, true);
            }
            // Scale the per-element offsets to bytes as well.
            createBinOp(G4_shl, G4_ExecSize(numElt), dst1_opnd, eltOffOpnd,
                createImm((unsigned)eltSize, Type_UD), instOpt, true);
            msgEltSize = GATHER_SCATTER_BYTE;
        }
        else
        {
            /* mov (1)     VX(0,2)<1>,   P  */
            createMovInst(header, 0, 2, g4::SIMD1, NULL, NULL, gOffOpnd, true);
            /* mov  (numElt)    VX(1,0)<1>,  E */
            createMov(G4_ExecSize(numElt), dst1_opnd,
                eltOffOpnd, instOpt, true);
        }

        // Create a <8;8,1> src region for the send payload
        msgSrcOpnd = createSrcRegRegion(header, getRegionStride1());
    }

    G4_DstRegRegion* d = dstOpnd->asDstRegRegion();

    SFID tf_id = SFID::DP_DC0;
    unsigned temp = 0;
    // Set bit 9-8 for the message descriptor
    if (msgEltSize == GATHER_SCATTER_DWORD)
    {
        if (effectiveNumElt == 8)
        {
            temp += 2 << 8;
        }
        else {
            temp += 3 << 8;
        }
        temp += DC_DWORD_SCATTERED_READ << 14; // '0011' for DWORD scattered read
    }
    else
    {
        // Byte scattered read: bits 9-8 select SIMD8/SIMD16, bits 11-10
        // carry the requested element size.
        if (effectiveNumElt == 16)
        {
            temp += 1 << 8;
        }
        temp += (unsigned char)eltSize << 10;
        temp += DC_BYTE_SCATTERED_READ << 14;
    }

    if (useSplitSend)
    {
        ASSERT_USER(!headerLess, "SplitSend should only be used when header is required!");

        // src0 = header (1 GRF), src1 = offsets.
        G4_SrcRegRegion *m0 = createSrcRegRegion(header, getRegionStride1());
        G4_SrcRegRegion *m1 = createSrcRegRegion(offset, getRegionStride1());
        createSplitSendInst(pred, d,
            m0, 1,
            m1, effectiveNumElt / GENX_DATAPORT_IO_SZ,
            effectiveNumElt / GENX_DATAPORT_IO_SZ,
            G4_ExecSize(numElt),
            temp,
            tf_id, true,
            SendAccess::READ_ONLY,
            surface, NULL, instOpt, false);
    }
    else
    {
        // Single send; message length includes the header GRF when present.
        createSendInst(
            pred,
            d,
            msgSrcOpnd,
            headerLess ? effectiveNumElt/GENX_DATAPORT_IO_SZ : effectiveNumElt/GENX_DATAPORT_IO_SZ + 1,
            effectiveNumElt/GENX_DATAPORT_IO_SZ,
            G4_ExecSize(numElt),
            temp,
            tf_id,
            !headerLess,
            SendAccess::READ_ONLY,
            surface,
            nullptr,
            instOpt,
            false);
    }

    return VISA_SUCCESS;
}
760 
761 
762 
763 
764 /*
765 * Translates scattered write intrinsic.
766 *
767 * For GT, assume N = 8 then the code should look like
768 *
769 * .declare  VX Base=m ElementSize=4 Type=ud Total=24
770 *
771 * mov  (8)     VX(0,0)<1>,  r0:ud
772 * mov  (1)     VX(0,2)<1>,  P
773 * mov  (8)     VX(1,0)<1>,  E
774 * mov  (8)     VX(2,0)<1>,  V
775 * send (8)     null<1>,     VX(0,0),    0x5,  0x06096200
776 *
777 * P: M0.2 in the message header (Global offset)
778 * E: M1 in the message payload (Element offsets)
779 * v: M2 in the message payload (written data)
780 *
781 * 0x5 == 0 (Not the EOT)
782 *        0101 (Target Function ID: DP Render Cache)
783 *
784 * 0x06096200 == Bit 31-29: 000 (Reserved)
785 *               Bit 28-25: 0011 (Msg. leng. = 3)
786 *               Bit 24-20: 00000 (Response msg. leng. = 0)
787 *               Bit 19:    1 (Header present)
788 *               Bit 18:    0 (Ignored)
789 *               Bit 17:    0 (Send write commit message)
790 *               Bit 16-13: 1011 (Msg. type = DWord Scattered write - for Render Cache)
791 *               Bit 12-8:  00010 (Block size = 8 DWords)
792 *               Bit 7-0:   00000000 + I (Binding table index)
793 *
794 */
// Translate a vISA SCATTER (scattered write) instruction into a byte/dword
// scattered-write send message on the DP_DC0 shared function.
//
// emask         - execution mask control
// eltSize       - requested element size (BYTE/WORD/DWORD)
// executionSize - vISA execution size (treated as SIMD8 when it is 1)
// surface       - surface (binding table index) operand
// gOffOpnd      - global offset, in elements
// eltOffOpnd    - per-lane element offsets
// srcOpnd       - the data to be written
//
// Returns VISA_SUCCESS.
int IR_Builder::translateVISAScatterInst(
    VISA_EMask_Ctrl emask,
    GATHER_SCATTER_ELEMENT_SIZE eltSize,
    VISA_Exec_Size executionSize,
    G4_Operand* surface,
    G4_Operand* gOffOpnd,
    G4_SrcRegRegion* eltOffOpnd,
    G4_SrcRegRegion* srcOpnd)
{
    // Before GEN10, we translate DWORD SCATTER on SLM to untyped GATHER4 on
    // SLM with only R channel enabled. The latter is considered more
    // efficient without recalculating offsets in BYTE.
    if (eltSize == GATHER_SCATTER_DWORD && IsSLMSurface(surface)) {
        return translateVISAScatter4Inst(emask,
            ChannelMask::createFromAPI(CHANNEL_MASK_R),
            executionSize, surface, gOffOpnd,
            eltOffOpnd, srcOpnd);
    }

    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
    surface = lowerSurface255To253(surface, *this);

    G4_ExecSize exsize = G4_ExecSize(Get_VISA_Exec_Size(executionSize));
    G4_InstOpts instOpt = Get_Gen4_Emask(emask, exsize);
    G4_Predicate *pred = NULL;
    // Element size in gather/scatter message. Initially, we assume it's the same as the request.
    GATHER_SCATTER_ELEMENT_SIZE msgEltSize = eltSize;

    uint8_t numElt = mapExecSizeToNumElts[executionSize];
    // we need to treat simd1 as simd8 in several places during code gen
    uint8_t effectiveNumElt = (numElt == 1 ? 8 : numElt);

    bool headerLess = isMessageHeaderOptional(surface, gOffOpnd);
    G4_SrcRegRegion* msgSrcOpnd = NULL;

    // SLM access
    //              HEADLESS    BYTE    WORD    DWORD
    // BDW          Opt         YES     NO      NO
    // SKL          Req         YES     NO      NO
    // CNL          Req         YES     NO      YES

    if (!headerLess && noSLMMsgHeader() && IsSLMSurface(surface)) {
        // From SKL, SLM messages forbid message header. Recalculate offset by
        // adding global offset and force headerLess.
        G4_Declare *dcl = createSendPayloadDcl(numElt, eltOffOpnd->getType());
        G4_DstRegRegion *newEltOffOpnd = createDstRegRegion(dcl, 1);
        createBinOp(G4_add, G4_ExecSize(numElt), newEltOffOpnd, eltOffOpnd, gOffOpnd, instOpt, true);
        eltOffOpnd = createSrcRegRegion(dcl, numElt == 1 ? getRegionScalar() : getRegionStride1());
        headerLess = true;
    }

    if (headerLess)
    {
        // Headerless payload: offsets followed by the write data.
        // payload size = 2 * #elt (offsets + data)
        G4_Declare *dcl = createSendPayloadDcl(effectiveNumElt * 2, Type_UD);
        G4_DstRegRegion* tmpDstOpnd = createDstRegRegion(dcl, 1);
        if (eltSize == GATHER_SCATTER_WORD ||
            (eltSize != GATHER_SCATTER_BYTE && IsSLMSurface(surface)))
        {
            // For non-SLM surface,
            // need a shift to make the offset to be byte offset
            // shl (esize) tmp.0<1>:ud elt_off<8;8,1>:ud 0x2:uw
            // Don't do this for Dword because we use the dword scatter message instead
            //
            // SLM surface has only BYTE scattered
            // read/write support. Always use BYTE scatter.
            createBinOp(G4_shl, G4_ExecSize(numElt), tmpDstOpnd, eltOffOpnd,
                createImm(unsigned(eltSize), Type_UD), instOpt, true);
            msgEltSize = GATHER_SCATTER_BYTE;
        }
        else
        {
            createMov(G4_ExecSize(numElt), tmpDstOpnd, eltOffOpnd, instOpt, true);
        }

        // Copy the write data into the payload right after the offsets.
        createMovSendSrcInst(dcl, effectiveNumElt/8, 0, numElt, srcOpnd, instOpt);
        msgSrcOpnd = createSrcRegRegion(dcl, getRegionStride1());
    }
    else
    {
        // Headered payload: R0-based header + offsets + write data.
        // mov (8)      VX(0,0)<1>,  r0:ud
        // add dcl for VX
        G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ + effectiveNumElt * 2, Type_UD);

        if (isStatelessSurface(surface)) {
            // Build stateless surface message header.
            BuildStatelessSurfaceMessageHeader(this, dcl);
        } else {
            // Copy R0 header.
            createMovR0Inst(dcl, 0, 0, true);
        }

        auto dst1_opnd = createDst(dcl->getRegVar(), 1, 0, 1, dcl->getElemType());

        if (eltSize == GATHER_SCATTER_WORD || IsSLMSurface(surface))
        {
            // For non-SLM surface, WORD gather/scatter has no hardware
            // support and must be translated into BYTE gather/scatter.
            //
            // For SLM surface, gen9 devices has only BYTE gather/scatter
            // support and also needs translating into BYTE gather/scatter.
            //
            /* mov (1)     VX(0,2)<1>,   P  */
            if (gOffOpnd->isImm())
            {
                // Scale the immediate global offset into bytes at compile time.
                G4_Imm *new_src_opnd1 = createImm(
                    gOffOpnd->asImm()->getInt() * (eltSize == GATHER_SCATTER_WORD ? 2 : 4),
                    gOffOpnd->getType());
                createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, new_src_opnd1, true);
            }
            else
            {
                // Scale the global offset into bytes with a shift.
                G4_DstRegRegion* dst2_opnd = createDst(dcl->getRegVar(), 0, 2, 1, dcl->getElemType());
                createBinOp(G4_shl, g4::SIMD1, dst2_opnd, gOffOpnd,
                    createImm((unsigned)eltSize, Type_UD), InstOpt_WriteEnable, true);
            }
            // Scale the per-lane element offsets into bytes as well.
            createBinOp(G4_shl, G4_ExecSize(numElt), dst1_opnd, eltOffOpnd,
                createImm((unsigned)eltSize, Type_UD), instOpt, true);
            msgEltSize = GATHER_SCATTER_BYTE;
        }
        else
        {
            /* mov (1)     VX(0,2)<1>,   P  */
            createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, gOffOpnd, true);
            /* mov  (numElt)    VX(1,0)<1>,  E */
            createMov(G4_ExecSize(numElt), dst1_opnd,
                eltOffOpnd, instOpt, true);
        }

        /* mov  (numElt)    VX(numElt/8+1,0)<1>,  V */
        createMovSendSrcInst(dcl, (effectiveNumElt/8+1), 0, numElt, srcOpnd, instOpt);

        // send's operands preparation
        // create a currDst for VX
        msgSrcOpnd = createSrcRegRegion(dcl, getRegionStride1());
    }

    // Build the message descriptor.
    unsigned temp = 0;

    // Set bit 9-8 for the message descriptor
    if (msgEltSize == GATHER_SCATTER_DWORD)
    {
        if (effectiveNumElt == 8)
        {
            temp += 2 << 8;
        }
        else {
            temp += 3 << 8;
        }
        temp += DC_DWORD_SCATTERED_WRITE << 14;
    }
    else
    {
        if (effectiveNumElt == 16)
        {
            temp += 1 << 8;
        }
        temp += (unsigned char)eltSize << 10;
        temp += DC_BYTE_SCATTERED_WRITE << 14;
    }

    // A scattered write returns no data: null destination.
    G4_DstRegRegion *post_dst_opnd = createNullDst(effectiveNumElt > 8 ? Type_UW : Type_UD);

    createSendInst(
        pred,
        post_dst_opnd,
        msgSrcOpnd,
        // message length: 2 GRFs per 8 lanes (offsets + data), +1 for header
        headerLess ? effectiveNumElt/GENX_DATAPORT_IO_SZ * 2 :
        effectiveNumElt/GENX_DATAPORT_IO_SZ * 2 + 1,
        0,
        G4_ExecSize(numElt),
        temp,
        SFID::DP_DC0,
        !headerLess,
        SendAccess::WRITE_ONLY,
        surface,
        NULL,
        instOpt,
        false);

    return VISA_SUCCESS;
}
977 
978 
BuildUntypedStatelessSurfaceMessageHeader(IR_Builder * IRB,G4_Declare * Header)979 static void BuildUntypedStatelessSurfaceMessageHeader(IR_Builder *IRB, G4_Declare *Header)
980 {
981     // Set PSM (Pixel Sample Mask) in MH1_A32_PSM
982     G4_Type ElemTy = Header->getElemType();
983 
984     // R0.7<31:0> is defined as MHC_PSM where the lower 16 bits specify the
985     // pixel sample mask.
986     const unsigned PSM_Mask = 0xffff;
987 
988     // Rx.7[31:0] = 0xFFFF
989     G4_DstRegRegion *DstOpnd = IRB->createDst(Header->getRegVar(), 0, 7, 1, ElemTy);
990     // Mask
991     G4_Imm *Mask = IRB->createImm(PSM_Mask, Type_UD);
992     IRB->createMov(g4::SIMD1, DstOpnd, Mask, InstOpt_WriteEnable, true);
993 
994     BuildStatelessSurfaceMessageHeader(IRB, Header);
995 }
996 
997 
998 /*
999 * Translates untyped surface read.
1000 *
1001 * For GT, assume N = 8 then the code should look like
1002 *
1003 * .declare  VX Base=m ElementSize=4 Type=ud Total=16
1004 * .declare  VY Base=r ElementSize=4 Type=ud Total=8
1005 *
1006 * mov  (8)     VX(0,0)<1>,  r0:ud
1007 * mov  (8)     VX(1,0)<1>,  P+E
1008 * send (8)     VY(0,0)<1>,  VX(0,0),    0x5,  0x0418C200
1009 *
1010 * E: M1 in the message payload (Element offsets in BYTEs)
1011 * 1010 (Target Function ID: Data Cache)
1012 *
1013 * 0x0418C200 == Bit 31-29: 000 (Reserved)
1014 *               Bit 28-25: 0010 (Msg. leng. = 2)
1015 *               Bit 24-20: 00001 (Response msg. leng. = 1)
1016 *               Bit 19:    1 (Header present)
1017 *               Bit 18:    0 (Ignored)
1018 *               Bit 17-14: 0001 (Msg. type = untyped surface read - for Data Cache)
1019 *               Bit 13-12:  0010 (SIMD mode = 8)
1020 *               Bit 11-8:  0000 (masked channels)
1021 *               Bit 7-0:   00000000 + I (Binding table index)
1022 *
1023 */
// Translate a vISA GATHER4 (untyped surface read) instruction into an
// untyped surface read send message on the DP_DC1 shared function.
//
// emask         - execution mask control
// modified      - not referenced in this translation; kept for interface
//                 compatibility with the caller
// chMask        - enabled channels (R/G/B/A)
// executionSize - vISA execution size
// surface       - surface operand; NULL selects binding table index 0xFE
// gOffOpnd      - global offset, in dwords
// eltOffOpnd    - per-lane element offsets, in dwords
// dstOpnd       - destination for the data read back
//
// Returns VISA_SUCCESS.
int IR_Builder::translateVISAGather4Inst(
    VISA_EMask_Ctrl emask,
    bool modified,
    ChannelMask chMask,
    VISA_Exec_Size executionSize,
    G4_Operand* surface,
    G4_Operand* gOffOpnd,
    G4_SrcRegRegion* eltOffOpnd,
    G4_DstRegRegion* dstOpnd)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    surface = lowerSurface255To253(surface, *this);

    G4_ExecSize exsize = G4_ExecSize(Get_VISA_Exec_Size(executionSize));
    G4_InstOpts instOpt = Get_Gen4_Emask(emask, exsize);
    unsigned int num_channel = chMask.getNumEnabledChannels();

    uint8_t numElt = mapExecSizeToNumElts[executionSize];
    uint8_t hdrSize = 0;

    bool useSplitSend = useSends();

    G4_Declare *header = 0;
    G4_Declare *offset = createSendPayloadDcl(numElt, Type_UD);

    if (surface && isStatelessSurface(surface) && needsA32MsgHeader())
    {
        // Header is required to work around a HW issue on pre-SKL devices.
        hdrSize = GENX_DATAPORT_IO_SZ;
        if (useSplitSend) {
            header = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
        } else {
            // Single payload: header followed by the offsets; alias 'offset'
            // into the second GRF of 'header'.
            header = createSendPayloadDcl(GENX_DATAPORT_IO_SZ + numElt, Type_UD);
            offset->setAliasDeclare(header, numEltPerGRF<Type_UB>());
        }
    } else {
        // When the surface is not stateless one, header is not used and therefore
        // split-send is not used.
        useSplitSend = false;
    }

    if (header) {
        // With 'header' allocated, we need prepare the header for the
        // (stateless) surface.
        ASSERT_USER(isStatelessSurface(surface), "With 'header' allocated, stateless surface is expected!");
        // Build stateless surface message header.
        BuildUntypedStatelessSurfaceMessageHeader(this, header);
    }

    // convert to byte address
    // shl (esize) offset<1>:ud elt_off<8;8,1>:ud 2:uw
    G4_DstRegRegion* dst1_opnd = createDst(offset->getRegVar(), 0, 0, 1, offset->getElemType());

    G4_Declare *tmp_dcl = createTempVar(numElt, Type_UD, GRFALIGN);
    G4_DstRegRegion* dst3_opnd = createDst(tmp_dcl->getRegVar(), 0, 0, 1, tmp_dcl->getElemType());

    createBinOp(G4_shl, G4_ExecSize(numElt), dst3_opnd, eltOffOpnd, createImm(2, Type_UW), instOpt, true);

    G4_SrcRegRegion* src2_opnd = createSrc(tmp_dcl->getRegVar(), 0, 0,
        getRegionStride1(), tmp_dcl->getElemType());

    // As untyped surface message use MH_IGNORE based header, if global offset
    // is non-zero, we need recalculate element offsets.
    if (gOffOpnd->isImm())
    {
        if (gOffOpnd->asImm()->getInt() != 0)
        {
            // Scale the immediate global offset to bytes and add it to every
            // element offset.
            gOffOpnd = createImm(
                gOffOpnd->asImm()->getInt() * 4,
                gOffOpnd->getType());
            createBinOp(G4_add, G4_ExecSize(numElt), dst1_opnd, src2_opnd, gOffOpnd, instOpt, true);
        }
        else
        {
            // Zero global offset: element offsets can be used directly.
            createMov(G4_ExecSize(numElt), dst1_opnd, src2_opnd, instOpt, true);
        }
    }
    else
    {
        // Scale the non-immediate global offset to bytes, then add it to
        // every element offset.
        G4_Declare *tmp_dcl1 = createTempVar(1, gOffOpnd->getType(), Any);
        G4_DstRegRegion* dst2_opnd = createDst(tmp_dcl1->getRegVar(), 0, 0, 1, tmp_dcl1->getElemType());

        createBinOp(G4_shl, g4::SIMD1, dst2_opnd, gOffOpnd, createImm(2, Type_UW), InstOpt_WriteEnable, true);

        G4_SrcRegRegion* src1Opnd = createSrc(tmp_dcl1->getRegVar(), 0, 0,
            getRegionScalar(), tmp_dcl1->getElemType());

        createBinOp(G4_add, G4_ExecSize(numElt), dst1_opnd, src2_opnd, src1Opnd, instOpt, true);
    }

    // send's operands preparation

    G4_DstRegRegion* d = checkSendDst(dstOpnd->asDstRegRegion());

    // Build the message descriptor.
    unsigned temp = 0;

    // Set bit 13-12 for the message descriptor
    if (numElt == 8)
    {
        temp += 2 << 12;
    }
    else
    {
        temp += 1 << 12;
    }

    SFID tf_id = SFID::DP_DC1;
    temp += DC1_UNTYPED_SURFACE_READ << 14;

    // bits 11-8: channel mask
    // HW defines 0 to mean the channel is on, so we have to flip it
    temp += chMask.getHWEncoding() << 8;

    if (surface == NULL)
    {
        // No surface operand: use binding table index 0xFE.
        temp |= 0xFE;
    }

    if (useSplitSend) {
        ASSERT_USER(header, "'header' should be allocated when split-send is to be used.");

        // Split-send: header as the first part, offsets as the second.
        G4_SrcRegRegion *m0 = createSrcRegRegion(header, getRegionStride1());
        G4_SrcRegRegion *m1 = createSrcRegRegion(offset, getRegionStride1());
        createSplitSendInst(NULL, d,
            m0, 1, m1, numElt / GENX_DATAPORT_IO_SZ,
            (numElt / GENX_DATAPORT_IO_SZ)* num_channel,
            G4_ExecSize(numElt), temp, tf_id, hdrSize != 0,
            SendAccess::READ_ONLY,
            surface, NULL, instOpt, false);
    }
    else
    {
        // Single payload ('offset' is aliased into 'header' when present).
        G4_SrcRegRegion* payload = createSrcRegRegion(header ? header : offset, getRegionStride1());
        createSendInst(
            NULL,
            d,
            payload,
            (hdrSize + numElt)/GENX_DATAPORT_IO_SZ,
            (numElt/GENX_DATAPORT_IO_SZ) * num_channel,
            G4_ExecSize(numElt),
            temp,
            tf_id,
            hdrSize != 0,
            SendAccess::READ_ONLY,
            surface,
            NULL,
            instOpt,
            false);
    }

    return VISA_SUCCESS;
}
1177 
1178 
1179 /*
1180 * Translates untyped surface write intrinsic.
1181 *
1182 * For GT, assume N = 8 then the code should look like
1183 *
1184 * .declare  VX Base=m ElementSize=4 Type=ud Total=24
1185 *
1186 * mov  (8)     VX(0,0)<1>,  r0:ud
1187 * mov  (8)     VX(1,0)<1>,  E + P
1188 * mov  (8)     VX(2,0)<1>,  V
1189 * send (8)     null<1>,     VX(0,0),    0x5,  0x06096200
1190 *
1191 * E: M1 in the message payload (Element offsets)
1192 * V: M2 in the message payload (written data)
1193 *
1194 * 1010 (Target Function ID: DP Data Cache)
1195 *
1196 * 0x06096200 == Bit 31-29: 000 (Reserved)
1197 *               Bit 28-25: 0011 (Msg. leng. = 3)
1198 *               Bit 24-20: 00000 (Response msg. leng. = 0)
1199 *               Bit 19:    1 (Header present)
1200 *               Bit 18:    0 (Ignored)
1201 *               Bit 17-14: 1101 (Msg. type = untyped write - for data Cache)
1202 *               Bit 13-12:  0010 (SIMD mode = 8)
1203 *                  Bit 11-8:  0000 (masked channels)
1204 *               Bit 7-0:   00000000 + I (Binding table index)
1205 *
1206 */
translateVISAScatter4Inst(VISA_EMask_Ctrl emask,ChannelMask chMask,VISA_Exec_Size executionSize,G4_Operand * surface,G4_Operand * gOffOpnd,G4_SrcRegRegion * eltOffOpnd,G4_SrcRegRegion * srcOpnd)1207 int IR_Builder::translateVISAScatter4Inst(
1208     VISA_EMask_Ctrl emask,
1209     ChannelMask chMask,
1210     VISA_Exec_Size executionSize,
1211     G4_Operand* surface,
1212     G4_Operand* gOffOpnd,
1213     G4_SrcRegRegion* eltOffOpnd,
1214     G4_SrcRegRegion* srcOpnd)
1215 {
1216     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
1217 
1218     surface = lowerSurface255To253(surface, *this);
1219 
1220     G4_ExecSize exsize = G4_ExecSize(Get_VISA_Exec_Size(executionSize));
1221     G4_InstOpts instOpt = Get_Gen4_Emask(emask, exsize);
1222 
1223     unsigned int num_channel = chMask.getNumEnabledChannels();
1224 
1225     uint8_t numElt = mapExecSizeToNumElts[executionSize];
1226     uint8_t hdrSize = 0;
1227 
1228     unsigned int data_size = numElt * num_channel;
1229     G4_Declare *src_dcl = srcOpnd->asSrcRegRegion()->getBase()->asRegVar()->getDeclare();
1230 
1231     int payload_size = numElt + data_size;
1232 
1233     bool useSplitSend = useSends();
1234 
1235     G4_Declare *header = 0;
1236     G4_Declare *offset = 0;
1237     G4_Declare *data = createSendPayloadDcl(data_size, Type_UD);
1238 
1239     if (surface && isStatelessSurface(surface) && needsA32MsgHeader())
1240     {
1241         // Header is required to work around a HW issue on pre-SKL devices.
1242         hdrSize = GENX_DATAPORT_IO_SZ;
1243         offset = createSendPayloadDcl(numElt, Type_UD);
1244         if (useSplitSend) {
1245             // When header is required, we split the message as
1246             // (header, offset + data) if split-send is supported.
1247             header = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
1248             offset = createSendPayloadDcl(payload_size, Type_UD);
1249             data->setAliasDeclare(offset, (numElt/8) * numEltPerGRF<Type_UB>());
1250         } else {
1251             header = createSendPayloadDcl(GENX_DATAPORT_IO_SZ + payload_size, Type_UD);
1252             offset->setAliasDeclare(header, numEltPerGRF<Type_UB>());
1253             data->setAliasDeclare(header, numEltPerGRF<Type_UB>() * ((numElt/8) + 1));
1254         }
1255     } else {
1256         if (useSplitSend) {
1257             // When header is not required, we split the message as (offset, data)
1258             // if split-send is supported.
1259             offset = createSendPayloadDcl(numElt, Type_UD);
1260         } else {
1261             offset = createSendPayloadDcl(payload_size, Type_UD);
1262             data->setAliasDeclare(offset, (numElt/8) * numEltPerGRF<Type_UB>());
1263         }
1264     }
1265 
1266     if (header) {
1267         // With 'header' allocated, we need prepare the header for the
1268         // (stateless) surface.
1269         ASSERT_USER(isStatelessSurface(surface),
1270             "With 'header' allocated, stateless surface is expected!");
1271         // Build stateless surface message header.
1272         BuildUntypedStatelessSurfaceMessageHeader(this, header);
1273     }
1274 
1275     if (!header && useSplitSend)
1276     {
1277         data = src_dcl;
1278     } else
1279     {
1280         // Copy data from src operand.
1281         for (unsigned i = 0; i != num_channel; ++i)
1282         {
1283             G4_SrcRegRegion* s2_opnd =
1284                 createSrc(
1285                     src_dcl->getRegVar(), (i * numElt) / 8, 0, getRegionStride1(), src_dcl->getElemType());
1286             createMovSendSrcInst(data, (i * numElt) / 8, 0, numElt, s2_opnd, instOpt);
1287         }
1288     }
1289 
1290     // mov  VX(0,0)<1>, r0
1291     // createMovR0Inst(header, 0, 0, true);
1292 
1293     G4_DstRegRegion* dst1_opnd = createDst(offset->getRegVar(), 0, 0, 1, offset->getElemType());
1294 
1295     G4_Declare *tmp_dcl = createTempVar(numElt, Type_UD, GRFALIGN);
1296     G4_DstRegRegion* dst3_opnd = createDst(tmp_dcl->getRegVar(), 0, 0, 1, tmp_dcl->getElemType());
1297 
1298     createBinOp(G4_shl, G4_ExecSize(numElt), dst3_opnd, eltOffOpnd, createImm(2, Type_UW), instOpt, true);
1299 
1300     G4_SrcRegRegion* src2_opnd =
1301         createSrc(
1302             tmp_dcl->getRegVar(), 0, 0, getRegionStride1(), tmp_dcl->getElemType());
1303 
1304     if (gOffOpnd->isImm())
1305     {
1306         if (gOffOpnd->asImm()->getInt() != 0)
1307         {
1308             gOffOpnd = createImm(
1309                 gOffOpnd->asImm()->getInt() * 4,
1310                 gOffOpnd->getType());
1311             createBinOp(G4_add, G4_ExecSize(numElt), dst1_opnd, src2_opnd, gOffOpnd, instOpt, true);
1312         }
1313         else
1314         {
1315             createMov(G4_ExecSize(numElt), dst1_opnd, src2_opnd, instOpt, true);
1316         }
1317     }
1318     else
1319     {
1320         G4_Declare *tmp_dcl1 = createTempVar(1, gOffOpnd->getType(), Any);
1321         G4_DstRegRegion* dst2_opnd = createDst(tmp_dcl1->getRegVar(), 0, 0, 1, tmp_dcl1->getElemType());
1322 
1323         createBinOp(G4_shl, g4::SIMD1, dst2_opnd, gOffOpnd, createImm(2, Type_UW), InstOpt_WriteEnable, true);
1324 
1325         G4_SrcRegRegion* src1Opnd = createSrc(tmp_dcl1->getRegVar(), 0, 0,
1326             getRegionScalar(), tmp_dcl1->getElemType());
1327 
1328         createBinOp(G4_add, G4_ExecSize(numElt), dst1_opnd, src2_opnd, src1Opnd, instOpt, true);
1329     }
1330 
1331     // send's operands preparation
1332     unsigned temp = 0;
1333 
1334     // Set bit 13-12 for the message descriptor
1335     if (numElt == 8) {
1336         temp += 2 << 12;
1337     } else {
1338         temp += 1 << 12;
1339     }
1340 
1341     SFID tf_id = SFID::DP_DC1;
1342     temp += DC1_UNTYPED_SURFACE_WRITE << 14;
1343     // bits 11-8: channel mask
1344     temp += chMask.getHWEncoding() << 8;
1345 
1346     // Set bit 9-8 for the message descriptor
1347 
1348     if (surface == NULL)
1349     {
1350         temp |= 0xFF - 1;
1351     }
1352 
1353     G4_DstRegRegion *post_dst_opnd = createNullDst(numElt > 8 ? Type_UW : Type_UD);
1354 
1355     if (useSplitSend) {
1356         G4_SrcRegRegion *m0 = 0; unsigned m0Len = 0;
1357         G4_SrcRegRegion *m1 = 0; unsigned m1Len = 0;
1358         if (header) {
1359             m0 = createSrcRegRegion(header, getRegionStride1());
1360             m0Len = 1;
1361             m1 = createSrcRegRegion(offset, getRegionStride1());
1362             m1Len = payload_size / GENX_DATAPORT_IO_SZ;
1363         } else {
1364             m0 = createSrcRegRegion(offset, getRegionStride1());
1365             m0Len = numElt / GENX_DATAPORT_IO_SZ;
1366             m1 = createSrcRegRegion(data, getRegionStride1());
1367             m1Len = data_size / GENX_DATAPORT_IO_SZ;
1368         }
1369         createSplitSendInst(NULL, post_dst_opnd,
1370             m0, m0Len, m1, m1Len, 0,
1371             G4_ExecSize(numElt),
1372             temp, tf_id, hdrSize != 0,
1373             SendAccess::WRITE_ONLY,
1374             surface, NULL,
1375             instOpt, false);
1376     }
1377     else
1378     {
1379         G4_SrcRegRegion* payload = createSrcRegRegion(header ? header : offset, getRegionStride1());
1380         createSendInst(
1381             NULL,
1382             post_dst_opnd,
1383             payload,
1384             (numElt * (num_channel + 1) + hdrSize)/GENX_DATAPORT_IO_SZ,
1385             0,
1386             G4_ExecSize(numElt),
1387             temp,
1388             tf_id,
1389             hdrSize != 0,
1390             SendAccess::WRITE_ONLY,
1391             surface,
1392             NULL,
1393             instOpt,
1394             false);
1395     }
1396 
1397     return VISA_SUCCESS;
1398 }
1399 
IsFloatAtomicOps(VISAAtomicOps op)1400 static bool IsFloatAtomicOps(VISAAtomicOps op)
1401 {
1402     return op == ATOMIC_FMAX || op == ATOMIC_FMIN || op == ATOMIC_FCMPWR ||
1403         op == ATOMIC_FADD || op == ATOMIC_FSUB;
1404 }
1405 
BuildMH1_A32_PSM(IR_Builder * IRB,G4_Declare * header)1406 static void BuildMH1_A32_PSM(IR_Builder *IRB, G4_Declare *header) {
1407     // Clear header. Ignore PSM so far.
1408     G4_DstRegRegion *h = IRB->createDst(header->getRegVar(),
1409         0, 0, 1, Type_UD);
1410     IRB->createMov(g4::SIMD8, h, IRB->createImm(0, Type_UD), InstOpt_WriteEnable, true);
1411     // Set PSM to all 1s.
1412     G4_DstRegRegion *h0_7 =
1413         IRB->createDst(header->getRegVar(), 0, 7, 1, Type_UD);
1414     G4_Imm *Mask = IRB->createImm(0xFFFF, Type_UD);
1415     IRB->createMov(g4::SIMD1, h0_7, Mask, InstOpt_WriteEnable, true);
1416 }
1417 
1418 
BuildMH1_BTS_PSM(IR_Builder * IRB,G4_Declare * header)1419 static void BuildMH1_BTS_PSM(IR_Builder *IRB, G4_Declare *header) {
1420     // Clear header
1421     G4_DstRegRegion* h = IRB->createDst(header->getRegVar(),
1422         0, 0, 1, Type_UD);
1423     IRB->createMov(g4::SIMD8, h, IRB->createImm(0, Type_UD), InstOpt_WriteEnable, true);
1424     // Set PSM to 0xFFFF so far.
1425     G4_Operand* maskImm = IRB->createImm(0xFFFF, Type_UD);
1426     G4_DstRegRegion* pitchDst = IRB->createDst(
1427         header->getRegVar(),
1428         0, 7, 1, Type_UD);
1429     IRB->createMov(g4::SIMD1, pitchDst, maskImm, InstOpt_WriteEnable, true);
1430 }
1431 
1432 
1433 // This version takes byte offsets and predicates
// Translate a vISA dword (or 16-bit) untyped atomic into an untyped atomic
// send message on the DP_DC1 shared function.
//
// atomicOp - the atomic sub-operation (ADD, CMPXCHG, FMAX, ...)
// is16Bit  - select the "half" (16-bit) variants of the atomic message types
// pred     - optional predicate for the send
// execSize - vISA execution size; may be 1 for scalar atomics
// eMask    - execution mask control
// surface  - surface operand
// offsets  - per-lane byte offsets
// src0     - first atomic operand, may be null depending on atomicOp
// src1     - second atomic operand, may be null depending on atomicOp
// dst      - return value; a null reg means no response is expected
//
// Returns VISA_SUCCESS.
int IR_Builder::translateVISADwordAtomicInst(
    VISAAtomicOps atomicOp,
    bool is16Bit,
    G4_Predicate *pred,
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    G4_Operand* surface,
    G4_SrcRegRegion* offsets,
    G4_SrcRegRegion* src0,
    G4_SrcRegRegion* src1,
    G4_DstRegRegion* dst)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(!IsFloatAtomicOps(atomicOp) || hasFloatAtomics(),
        "Float atomic operations are only supported on SKL+ devices");

    ASSERT_USER(getPlatform() >= XeHP_SDV || ((atomicOp != ATOMIC_FADD) && (atomicOp != ATOMIC_FSUB)),
        "FADD/FSUB atomic operations are only supported on this devices");

    surface = lowerSurface255To253(surface, *this);

    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    // always 8 or 16
    G4_ExecSize exSize = toExecSize(execSize);
    // can be 1 for scalar atomics
    G4_ExecSize instExSize = toExecSize(instExecSize);
    G4_InstOpts instOpt = Get_Gen4_Emask(eMask, instExSize);
    unsigned subOpc = Get_Atomic_Op(atomicOp);

    bool useSplitSend = useSends();
    bool hasRet = !dst->isNullReg();

    // For CMPXCHG the two sources are swapped so the payload carries them in
    // the order the message expects (opposite of the vISA operand order).
    if (atomicOp == ATOMIC_CMPXCHG)
    {
        std::swap(src0, src1);
    }

    PayloadSource sources[4]; // optional header + offsets + [src0] + [src1]
    unsigned len = 0;

    // Optional header, required for stateless surfaces on pre-SKL devices.
    bool useHeader = needsA32MsgHeader() && surface && isStatelessSurface(surface);
    if (useHeader) {
        G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);

        BuildMH1_A32_PSM(this, dcl);

        G4_SrcRegRegion *header
            = createSrcRegRegion(dcl, getRegionStride1());
        sources[len].opnd = header;
        sources[len].execSize = g4::SIMD8;
        sources[len].instOpt = InstOpt_WriteEnable;
        ++len;
    }

    // Per-lane byte offsets.
    sources[len].opnd = offsets;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;

    // Atomic operand(s), when present for this sub-operation.
    if (src0 && !src0->isNullReg()) {
        sources[len].opnd = src0;
        sources[len].execSize = exSize;
        sources[len].instOpt = instOpt;
        ++len;
    }

    if (src1 && !src1->isNullReg()) {
        sources[len].opnd = src1;
        sources[len].execSize = exSize;
        sources[len].instOpt = instOpt;
        ++len;
    }

    // Pack the collected sources into at most two payload chunks.
    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    SFID sfid = SFID::DP_DC1;
    unsigned MD = 0;
    bool IsFloatOp = IsFloatAtomicOps(atomicOp);

    // Bit 12 specifies the SIMD mode.
    MD |= (execSize == EXEC_SIZE_8 ? MDC_SM2R_SIMD8 : MDC_SM2R_SIMD16) << 12;
    // Message type (at bit 14) is selected by data size and int/float.
    if (is16Bit)
    {
        MD |= (IsFloatOp ? static_cast<unsigned>(DC1_UNTYPED_HALF_FLOAT_ATOMIC)
            : static_cast<unsigned>(DC1_UNTYPED_HALF_INTEGER_ATOMIC))
            << 14;
    }
    else
    {
        MD |= (IsFloatOp ? static_cast<unsigned>(DC1_UNTYPED_FLOAT_ATOMIC)
            : static_cast<unsigned>(DC1_UNTYPED_ATOMIC))
            << 14;
    }
    // Bit 13: return data expected; atomic sub-opcode starts at bit 8.
    MD |= (hasRet ? 1 : 0) << 13;
    MD |= subOpc << 8;

    unsigned resLen = hasRet ? (exSize / GENX_DATAPORT_IO_SZ) : 0;
    bool forceSplitSend = shouldForceSplitSend(surface);
    if (msgs[1] == 0 && !forceSplitSend) {
        // The whole payload fits in one chunk: use an ordinary send.
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            resLen,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::READ_WRITE,
            surface, NULL,
            instOpt, false);
    } else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            resLen,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::READ_WRITE,
            surface, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
1562 
1563 
1564 // build the address payload for typed messages (read/write/atomic)
1565 // sources stores the address payload, and its length len is also updated
buildTypedSurfaceAddressPayload(G4_SrcRegRegion * uOffsetOpnd,G4_SrcRegRegion * vOffsetOpnd,G4_SrcRegRegion * rOffsetOpnd,G4_SrcRegRegion * lodOpnd,G4_ExecSize exSize,G4_InstOpts instOpt,PayloadSource sources[],uint32_t & len)1566 void IR_Builder::buildTypedSurfaceAddressPayload(
1567     G4_SrcRegRegion* uOffsetOpnd,
1568     G4_SrcRegRegion* vOffsetOpnd,
1569     G4_SrcRegRegion* rOffsetOpnd,
1570     G4_SrcRegRegion* lodOpnd,
1571     G4_ExecSize exSize,
1572     G4_InstOpts instOpt,
1573     PayloadSource sources[],
1574     uint32_t& len)
1575 {
1576     // Valid address payload pattern are listed below:
1577     // (* means the parameter is ignored by HW but must be included in payload)
1578     // U
1579     // U, V
1580     // U, V, R
1581     // U, *, *, LOD
1582     // U, V, *, LOD
1583     // U, V, R, LOD
1584 
1585     // Append U
1586     sources[len].opnd = uOffsetOpnd;
1587     sources[len].execSize = exSize;
1588     sources[len].instOpt = instOpt;
1589     ++len;
1590 
1591     // Append V if any.
1592     if (!vOffsetOpnd->isNullReg()) {
1593         sources[len].opnd = vOffsetOpnd;
1594         sources[len].execSize = exSize;
1595         sources[len].instOpt = instOpt;
1596         ++len;
1597     }
1598     else if (!lodOpnd->isNullReg()) {
1599         G4_SrcRegRegion *nullVOffset = createNullSrc(Type_UD);
1600         sources[len].opnd = nullVOffset;
1601         sources[len].execSize = exSize;
1602         sources[len].instOpt = instOpt;
1603         ++len;
1604     }
1605 
1606     // Append R if any.
1607     if (!rOffsetOpnd->isNullReg()) {
1608         ASSERT_USER(!vOffsetOpnd->isNullReg(),
1609             "r offset must be NULL if v offset is NULL");
1610         sources[len].opnd = rOffsetOpnd;
1611         sources[len].execSize = exSize;
1612         sources[len].instOpt = instOpt;
1613         ++len;
1614     }
1615     else if (!lodOpnd->isNullReg()) {
1616         G4_SrcRegRegion *nullROffset = createNullSrc(Type_UD);
1617         sources[len].opnd = nullROffset;
1618         sources[len].execSize = exSize;
1619         sources[len].instOpt = instOpt;
1620         ++len;
1621     }
1622 
1623     // Append LOD if any.
1624     if (!lodOpnd->isNullReg()) {
1625         sources[len].opnd = lodOpnd;
1626         sources[len].execSize = exSize;
1627         sources[len].instOpt = instOpt;
1628         ++len;
1629     }
1630 }
1631 
1632 
1633 // u must not be V0. v and r are allowed to be V0, in which case they will be
1634 // skipped in payload.
translateVISAGather4TypedInst(G4_Predicate * pred,VISA_EMask_Ctrl emask,ChannelMask chMask,G4_Operand * surface,VISA_Exec_Size executionSize,G4_SrcRegRegion * uOffsetOpnd,G4_SrcRegRegion * vOffsetOpnd,G4_SrcRegRegion * rOffsetOpnd,G4_SrcRegRegion * lodOpnd,G4_DstRegRegion * dstOpnd)1635 int IR_Builder::translateVISAGather4TypedInst(
1636     G4_Predicate *pred,
1637     VISA_EMask_Ctrl emask,
1638     ChannelMask chMask,
1639     G4_Operand *surface,
1640     VISA_Exec_Size executionSize,
1641     G4_SrcRegRegion *uOffsetOpnd,
1642     G4_SrcRegRegion *vOffsetOpnd,
1643     G4_SrcRegRegion *rOffsetOpnd,
1644     G4_SrcRegRegion *lodOpnd,
1645     G4_DstRegRegion *dstOpnd)
1646 {
1647     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
1648 
1649     G4_ExecSize exSize = executionSize == EXEC_SIZE_16 ? g4::SIMD16 : g4::SIMD8;
1650     assert((exSize == 8 || hasSIMD16TypedRW()) && "only simd8 is supported");
1651     G4_InstOpts instOpt = Get_Gen4_Emask(emask, exSize);
1652     int numEnabledChannels = chMask.getNumEnabledChannels();
1653 
1654     bool useSplitSend = useSends();
1655 
1656     bool hasHeader = getPlatform() == GENX_BDW;
1657 
1658     PayloadSource sources[5]; // (maybe header) + maximal 4 addresses
1659     unsigned len = 0;
1660 
1661     if (hasHeader)
1662     {
1663         // Build header
1664         G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
1665         BuildMH1_BTS_PSM(this, dcl);
1666 
1667         // Append header
1668         G4_SrcRegRegion *header
1669             = createSrcRegRegion(dcl, getRegionStride1());
1670         sources[len].opnd = header;
1671         sources[len].execSize = g4::SIMD8;
1672         sources[len].instOpt = InstOpt_WriteEnable;
1673         ++len;
1674     }
1675 
1676     buildTypedSurfaceAddressPayload(uOffsetOpnd, vOffsetOpnd, rOffsetOpnd, lodOpnd, exSize, instOpt, sources, len);
1677     G4_SrcRegRegion *msgs[2] = {0, 0};
1678     unsigned sizes[2] = {0, 0};
1679     preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);
1680 
1681     //bit 8-11: RGBA channel enable
1682     unsigned msgDesc = chMask.getHWEncoding() << 8;
1683     SFID sfId;
1684 
1685     // DC1
1686     // bit14-17: 0101 (read), 1101 (write)
1687     msgDesc |= DC1_TYPED_SURFACE_READ << 14;
1688     // bit12-13: 01 (use low 8 slot)
1689     msgDesc |= MDC_SG3_SG8L << 12;
1690     sfId = SFID::DP_DC1;
1691 
1692     bool forceSplitSend = shouldForceSplitSend(surface);
1693     if (msgs[1] == 0 && !forceSplitSend) {
1694         ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
1695         createSendInst(pred, dstOpnd,
1696             msgs[0], sizes[0],
1697             numEnabledChannels,
1698             exSize,
1699             msgDesc, sfId,
1700             hasHeader,
1701             SendAccess::READ_ONLY,
1702             surface, nullptr,
1703             instOpt, false);
1704     } else {
1705         createSplitSendInst(pred, dstOpnd,
1706             msgs[0], sizes[0], msgs[1], sizes[1],
1707             numEnabledChannels,
1708             exSize,
1709             msgDesc, sfId,
1710             hasHeader,
1711             SendAccess::READ_ONLY,
1712             surface, nullptr,
1713             instOpt, false);
1714     }
1715 
1716     return VISA_SUCCESS;
1717 }
1718 
1719 // u must not be V0. v and r are allowed to be V0, in which case they will be
1720 // skipped in payload.
translateVISAScatter4TypedInst(G4_Predicate * pred,VISA_EMask_Ctrl emask,ChannelMask chMask,G4_Operand * surface,VISA_Exec_Size executionSize,G4_SrcRegRegion * uOffsetOpnd,G4_SrcRegRegion * vOffsetOpnd,G4_SrcRegRegion * rOffsetOpnd,G4_SrcRegRegion * lodOpnd,G4_SrcRegRegion * srcOpnd)1721 int IR_Builder::translateVISAScatter4TypedInst(
1722     G4_Predicate *pred,
1723     VISA_EMask_Ctrl emask,
1724     ChannelMask chMask,
1725     G4_Operand *surface,
1726     VISA_Exec_Size executionSize,
1727     G4_SrcRegRegion *uOffsetOpnd,
1728     G4_SrcRegRegion *vOffsetOpnd,
1729     G4_SrcRegRegion *rOffsetOpnd,
1730     G4_SrcRegRegion *lodOpnd,
1731     G4_SrcRegRegion *srcOpnd)
1732 {
1733     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
1734 
1735     G4_ExecSize exSize = executionSize == EXEC_SIZE_16 ? g4::SIMD16 : g4::SIMD8;
1736     assert((exSize == g4::SIMD8 || hasSIMD16TypedRW()) && "only simd8 is supported");
1737     G4_InstOpts instOpt = Get_Gen4_Emask(emask, exSize);
1738     int numEnabledChannels = chMask.getNumEnabledChannels();
1739 
1740     bool useSplitSend = useSends();
1741 
1742     bool hasHeader = getPlatform() == GENX_BDW;
1743 
1744     PayloadSource sources[6]; // (maybe header) + maximal 4 addresses + source
1745     unsigned len = 0;
1746 
1747     if (hasHeader)
1748     {
1749         // Build header
1750         G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
1751         BuildMH1_BTS_PSM(this, dcl);
1752 
1753         // Append header
1754         G4_SrcRegRegion *header
1755             = createSrcRegRegion(dcl, getRegionStride1());
1756         sources[len].opnd = header;
1757         sources[len].execSize = g4::SIMD8;
1758         sources[len].instOpt = InstOpt_WriteEnable;
1759         ++len;
1760     }
1761 
1762     buildTypedSurfaceAddressPayload(uOffsetOpnd, vOffsetOpnd, rOffsetOpnd, lodOpnd, exSize, instOpt, sources, len);
1763 
1764     // Append source
1765     sources[len].opnd = srcOpnd;
1766     sources[len].execSize = G4_ExecSize(exSize * numEnabledChannels);
1767     sources[len].instOpt = instOpt;
1768     ++len;
1769 
1770     G4_SrcRegRegion *msgs[2] = {0, 0};
1771     unsigned sizes[2] = {0, 0};
1772     preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);
1773 
1774     //bit 8-11: RGBA channel enable
1775     unsigned msgDesc = 0;
1776     SFID sfId;
1777 
1778     // DC1
1779     // bit14-17: 0101 (read), 1101 (write)
1780     msgDesc |= DC1_TYPED_SURFACE_WRITE << 14;
1781     // bit12-13: 01 (use low 8 slot)
1782     msgDesc |= MDC_SG3_SG8L << 12;
1783     sfId = SFID::DP_DC1;
1784 
1785     msgDesc |= chMask.getHWEncoding() << 8;
1786 
1787     G4_DstRegRegion* dstOpnd = createNullDst(Type_UD);
1788 
1789     bool forceSplitSend = shouldForceSplitSend(surface);
1790     if (msgs[1] == 0 && !forceSplitSend) {
1791         ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
1792         createSendInst(
1793             pred, dstOpnd,
1794             msgs[0], sizes[0],
1795             0,
1796             exSize,
1797             msgDesc, sfId,
1798             hasHeader,
1799             SendAccess::WRITE_ONLY,
1800             surface, NULL,
1801             instOpt, false);
1802     } else
1803     {
1804         createSplitSendInst(
1805             pred, dstOpnd,
1806             msgs[0], sizes[0], msgs[1], sizes[1],
1807             0,
1808             exSize,
1809             msgDesc, sfId,
1810             hasHeader,
1811             SendAccess::WRITE_ONLY,
1812             surface, NULL,
1813             instOpt, false);
1814     }
1815 
1816     return VISA_SUCCESS;
1817 }
1818 
// Translate a vISA typed (surface) atomic operation into a DC1 typed-atomic
// send. The address payload follows the same U/V/R/LOD rules as typed
// read/write; src0/src1 (when present) follow the addresses in the payload.
// Returns VISA_SUCCESS.
int IR_Builder::translateVISATypedAtomicInst(
    VISAAtomicOps atomicOp,
    bool is16Bit,
    G4_Predicate *pred,
    VISA_EMask_Ctrl emask,
    VISA_Exec_Size execSize,
    G4_Operand *surface,
    G4_SrcRegRegion *uOffsetOpnd,
    G4_SrcRegRegion *vOffsetOpnd,
    G4_SrcRegRegion *rOffsetOpnd,
    G4_SrcRegRegion *lodOpnd,
    G4_SrcRegRegion *src0,
    G4_SrcRegRegion *src1,
    G4_DstRegRegion *dst)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    VISA_Exec_Size instExecSize = execSize;
    assert(execSize <= (getNativeExecSize() == g4::SIMD8 ? EXEC_SIZE_8 : EXEC_SIZE_16) &&
        "send exec size must not exceed the platform's native execution size");

    // HW encoding of the atomic operation (goes into descriptor bits 8-12).
    unsigned op = Get_Atomic_Op(atomicOp);

    G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
    G4_ExecSize instExSize {Get_VISA_Exec_Size(instExecSize)};
    G4_InstOpts instOpt = Get_Gen4_Emask(emask, instExSize);

    if (atomicOp == ATOMIC_CMPXCHG)
    {
        // we have to swap src0 and src1 since vISA has them in different order from HW
        G4_SrcRegRegion* tmp = src0;
        src0 = src1;
        src1 = tmp;
    }

    bool useSplitSend = useSends();

    PayloadSource sources[6]; // u, v, r, lod, src0, src1
    unsigned len = 0;

    // Addresses first (U/V/R/LOD with null placeholders as required) ...
    buildTypedSurfaceAddressPayload(uOffsetOpnd, vOffsetOpnd, rOffsetOpnd, lodOpnd, exSize, instOpt, sources, len);

    // ... then the atomic operands, skipping absent ones.
    if (src0 != nullptr && !src0->isNullReg())
    {
        sources[len].opnd = src0;
        sources[len].execSize = exSize;
        sources[len].instOpt = instOpt;
        ++len;
    }

    if (src1 != nullptr && !src1->isNullReg())
    {
        sources[len].opnd = src1;
        sources[len].execSize = exSize;
        sources[len].instOpt = instOpt;
        ++len;
    }

    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    // Response length: one chunk when the old value is returned, zero when
    // the destination is null (fire-and-forget atomic).
    unsigned dstLength = dst->isNullReg() ? 0 : 1;

    unsigned msgDesc = 0;
    // BTI is filled later
    msgDesc |= op << 8;
    // Bit 13: "return data" control.
    msgDesc |= (dstLength != 0 ? 1 : 0) << 13;

    // Bits 14-18: message type — half-integer vs full typed atomic.
    if (is16Bit)
    {
        msgDesc |= DC1_TYPED_HALF_INTEGER_ATOMIC << 14;
    }
    else
    {
        msgDesc |= DC1_TYPED_ATOMIC << 14;
    }

    bool forceSplitSend = shouldForceSplitSend(surface);
    if (msgs[1] == 0 && !forceSplitSend)
    {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0], dstLength, exSize,
            msgDesc, SFID::DP_DC1,
            false,
            SendAccess::READ_WRITE,
            surface, nullptr,
            instOpt, false);
    }
    else
    {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            dstLength, exSize,
            msgDesc, SFID::DP_DC1,
            false,
            SendAccess::READ_WRITE,
            surface, nullptr,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
1923 
BuildMH2_A32_PSM(IR_Builder * IRB,G4_Declare * header,uint16_t scale,G4_Operand * globalOffset)1924 static void BuildMH2_A32_PSM(
1925     IR_Builder *IRB, G4_Declare *header, uint16_t scale, G4_Operand *globalOffset)
1926 {
1927     // Clear header
1928     G4_DstRegRegion *h = IRB->createDst(header->getRegVar(),
1929         0, 0, 1, Type_UD);
1930     IRB->createMov(g4::SIMD8, h, IRB->createImm(0, Type_UD), InstOpt_WriteEnable, true);
1931     // Copy global offset if necessary.
1932     if (!(globalOffset->isImm() && globalOffset->asImm()->isZero())) {
1933         G4_DstRegRegion* gOffDst = IRB->createDst(
1934             header->getRegVar(),
1935             0, 5, 1, Type_UD);
1936         IRB->createMov(g4::SIMD1, gOffDst, globalOffset, InstOpt_WriteEnable, true);
1937     }
1938     // Copy scale pitch if necessary.
1939     if (scale != 0) {
1940         G4_Operand* scaleImm = IRB->createImm(scale, Type_UD);
1941         G4_DstRegRegion* pitchDst = IRB->createDst(
1942             header->getRegVar(),
1943             0, 0, 1, Type_UD);
1944         IRB->createMov(g4::SIMD1, pitchDst, scaleImm, InstOpt_WriteEnable, true);
1945     }
1946     // Copy PSM which is set to 0xFFFF so far.
1947     G4_Operand* maskImm = IRB->createImm(0xFFFF, Type_UD);
1948     G4_DstRegRegion* pitchDst = IRB->createDst(
1949         header->getRegVar(),
1950         0, 7, 1, Type_UD);
1951     IRB->createMov(g4::SIMD1, pitchDst, maskImm, InstOpt_WriteEnable, true);
1952 }
1953 
1954 // apply the sideband offset (can be either imm or variable) to the message descriptor
applySideBandOffset(G4_Operand * sideBand,const G4_SendDescRaw * sendMsgDesc)1955 void IR_Builder::applySideBandOffset(
1956     G4_Operand* sideBand, const G4_SendDescRaw* sendMsgDesc)
1957 {
1958 #define SIDEBAND_OFFSET_IN_EXDESC 12
1959 
1960     if (sideBand->isImm())
1961     {
1962         // mov (1) a0.0 sideband << 0xC
1963         uint32_t sidebandInDesc = (uint32_t)(sideBand->asImm()->getImm() << SIDEBAND_OFFSET_IN_EXDESC);
1964         G4_DstRegRegion* dst = createDstRegRegion(builtinA0, 1);
1965         createMov(g4::SIMD1, dst, createImm(sidebandInDesc, Type_UD), InstOpt_WriteEnable, true);
1966     }
1967     else
1968     {
1969         MUST_BE_TRUE(sideBand->isSrcRegRegion(), "sideband offset should be a srcRegRegion");
1970         // shl (1) a0.0 sideband 0xC
1971         G4_DstRegRegion* dst = createDstRegRegion(builtinA0, 1);
1972         createBinOp(G4_shl, g4::SIMD1, dst, sideBand,
1973             createImm(SIDEBAND_OFFSET_IN_EXDESC, Type_UW), InstOpt_WriteEnable, true);
1974     }
1975 
1976     // add (1) a0.0 a0.0 MD
1977     G4_DstRegRegion* a0Dst = createDstRegRegion(builtinA0, 1);
1978     G4_SrcRegRegion* a0Src = createSrcRegRegion(builtinA0, getRegionScalar());
1979     createBinOp(G4_add, g4::SIMD1, a0Dst, a0Src,
1980         createImm(sendMsgDesc->getExtendedDesc(), Type_UD), InstOpt_WriteEnable, true);
1981 }
1982 
BuildMH2_A32(IR_Builder * IRB,G4_Declare * header,uint16_t scale,G4_Operand * globalOffset)1983 static void BuildMH2_A32(IR_Builder *IRB, G4_Declare *header,
1984     uint16_t scale, G4_Operand *globalOffset) {
1985     // Clear header
1986     G4_DstRegRegion *h = IRB->createDst(header->getRegVar(),
1987         0, 0, 1, Type_UD);
1988     IRB->createMov(g4::SIMD8, h, IRB->createImm(0, Type_UD), InstOpt_WriteEnable, true);
1989     // Copy global offset if necessary.
1990     if (!(globalOffset->isImm() && globalOffset->asImm()->isZero())) {
1991         G4_DstRegRegion* gOffDst = IRB->createDst(
1992             header->getRegVar(),
1993             0, 5, 1, Type_UD);
1994         IRB->createMov(g4::SIMD1, gOffDst, globalOffset, InstOpt_WriteEnable, true);
1995     }
1996     // Copy scale pitch if necessary.
1997     if (scale != 0) {
1998         G4_Operand* scaleImm = IRB->createImm(scale, Type_UD);
1999         G4_DstRegRegion* pitchDst = IRB->createDst(
2000             header->getRegVar(),
2001             0, 0, 1, Type_UD);
2002         IRB->createMov(g4::SIMD1, pitchDst, scaleImm, InstOpt_WriteEnable, true);
2003     }
2004 }
2005 
translateVISAGather4ScaledInst(G4_Predicate * pred,VISA_Exec_Size execSize,VISA_EMask_Ctrl eMask,ChannelMask chMask,G4_Operand * surface,G4_Operand * globalOffset,G4_SrcRegRegion * offsets,G4_DstRegRegion * dst)2006 int IR_Builder::translateVISAGather4ScaledInst(
2007     G4_Predicate     *pred,
2008     VISA_Exec_Size    execSize,
2009     VISA_EMask_Ctrl   eMask,
2010     ChannelMask       chMask,
2011     G4_Operand       *surface,
2012     G4_Operand       *globalOffset,
2013     G4_SrcRegRegion  *offsets,
2014     G4_DstRegRegion  *dst)
2015 {
2016     surface = lowerSurface255To253(surface, *this);
2017     return translateGather4Inst(pred, execSize, eMask, chMask, surface,
2018         globalOffset, offsets, dst);
2019 }
2020 
translateVISAScatter4ScaledInst(G4_Predicate * pred,VISA_Exec_Size execSize,VISA_EMask_Ctrl eMask,ChannelMask chMask,G4_Operand * surface,G4_Operand * globalOffset,G4_SrcRegRegion * offsets,G4_SrcRegRegion * src)2021 int IR_Builder::translateVISAScatter4ScaledInst(
2022     G4_Predicate     *pred,
2023     VISA_Exec_Size    execSize,
2024     VISA_EMask_Ctrl   eMask,
2025     ChannelMask       chMask,
2026     G4_Operand       *surface,
2027     G4_Operand       *globalOffset,
2028     G4_SrcRegRegion  *offsets,
2029     G4_SrcRegRegion  *src)
2030 {
2031     surface = lowerSurface255To253(surface, *this);
2032     return translateScatter4Inst(pred, execSize, eMask, chMask, surface,
2033         globalOffset, offsets, src);
2034 }
2035 
// Translate a gather4 (untyped surface read with RGBA channel mask) into a
// DC1 untyped-surface-read send. Execution sizes below SIMD8 are rounded up
// to the send's slot granularity. Returns VISA_SUCCESS.
int IR_Builder::translateGather4Inst(
    G4_Predicate     *pred,
    VISA_Exec_Size    execSize,
    VISA_EMask_Ctrl   eMask,
    ChannelMask       chMask,
    G4_Operand       *surface,
    G4_Operand       *globalOffset,
    G4_SrcRegRegion  *offsets,
    G4_DstRegRegion  *dst)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 ||
        execSize == EXEC_SIZE_4 || execSize == EXEC_SIZE_8 ||
        execSize == EXEC_SIZE_16,
        "Only support SIMD1, SIMD2, SIMD4, SIMD8 or SIMD16!");

    // Keep the original size for the instruction itself; the payload is
    // built at the rounded-up size.
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    G4_ExecSize exSize = toExecSize(execSize);
    G4_ExecSize instExSize = toExecSize(instExecSize);
    unsigned instOpt = Get_Gen4_Emask(eMask, exSize);

    bool useSplitSend = useSends();
    // Header is only needed for stateless surfaces on platforms that
    // require an A32 message header.
    bool useHeader = needsA32MsgHeader() && surface && isStatelessSurface(surface);

    // In case non-zero global offset is specified, we need to recalculate
    // offsets.
    if (!globalOffset->isImm() || globalOffset->asImm()->getImm() != 0) {
        G4_Declare *dcl = createSendPayloadDcl(exSize, offsets->getType());
        G4_DstRegRegion *tmp = createDstRegRegion(dcl, 1);
        createInst(pred, G4_add, 0, g4::NOSAT, instExSize, tmp, offsets, globalOffset, instOpt, true);
        offsets = createSrcRegRegion(dcl, getRegionStride1());
    }

    PayloadSource sources[2]; // Maximal 2 sources, optional header + offsets
    unsigned len = 0;

    if (useHeader) {
        G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);

        BuildMH1_A32_PSM(this, dcl);

        G4_SrcRegRegion *header
            = createSrcRegRegion(dcl, getRegionStride1());
        sources[len].opnd = header;
        sources[len].execSize = g4::SIMD8;
        sources[len].instOpt = InstOpt_WriteEnable;
        ++len;
    }

    // Offsets follow the (optional) header in the payload.
    sources[len].opnd = offsets;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;

    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    SFID sfid = SFID::DP_DC1;

    unsigned MD = 0;
    // Leave sidebind scale offset 0 as it is not used now.
    MD |= DC1_UNTYPED_SURFACE_READ << 14;
    MD |= (execSize == EXEC_SIZE_8 ? MDC_SM3_SIMD8 : MDC_SM3_SIMD16) << 12;
    MD |= chMask.getHWEncoding() << 8;

    // Response length: one chunk per enabled channel, scaled by SIMD width.
    unsigned resLen = (exSize / GENX_DATAPORT_IO_SZ) *
        chMask.getNumEnabledChannels();

    bool forceSplitSend = shouldForceSplitSend(surface);
    if (msgs[1] == 0 && !forceSplitSend) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            resLen,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::READ_ONLY,
            surface, NULL,
            instOpt, false);
    } else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            resLen,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::READ_ONLY,
            surface, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
2134 
2135 
// Translate a scatter4 (untyped surface write with RGBA channel mask) into a
// DC1 untyped-surface-write send. Mirrors translateGather4Inst, with the
// write data appended after the offsets. Returns VISA_SUCCESS.
int IR_Builder::translateScatter4Inst(
    G4_Predicate          *pred,
    VISA_Exec_Size         execSize,
    VISA_EMask_Ctrl        eMask,
    ChannelMask            chMask,
    G4_Operand             *surface,
    G4_Operand             *globalOffset,
    G4_SrcRegRegion        *offsets,
    G4_SrcRegRegion        *src)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(
        execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 ||
        execSize == EXEC_SIZE_4 || execSize == EXEC_SIZE_8 ||
        execSize == EXEC_SIZE_16,
        "Only support SIMD1, SIMD2, SIMD4, SIMD8 or SIMD16!");

    // Keep the original size for the instruction itself; the payload is
    // built at the rounded-up size.
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    G4_ExecSize exSize = toExecSize(execSize);
    G4_ExecSize instExSize = toExecSize(instExecSize);
    unsigned instOpt = Get_Gen4_Emask(eMask, exSize);

    bool useSplitSend = useSends();
    // Header is only needed for stateless surfaces on platforms that
    // require an A32 message header.
    bool useHeader = needsA32MsgHeader() && surface && isStatelessSurface(surface);

    // In case non-zero global offset is specified, we need to recalculate
    // offsets.
    if (!globalOffset->isImm() || globalOffset->asImm()->getImm() != 0) {
        G4_Declare *dcl = createSendPayloadDcl(exSize, offsets->getType());
        G4_DstRegRegion *tmp = createDstRegRegion(dcl, 1);
        createInst(pred, G4_add, 0, g4::NOSAT, instExSize, tmp, offsets, globalOffset, instOpt, true);
        offsets = createSrcRegRegion(dcl, getRegionStride1());
    }

    PayloadSource sources[3]; // Maximal 3 sources, optional header + offsets + src
    unsigned len = 0;

    if (useHeader) {
        G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);

        // TODO: Get PSM supported on demand.
        BuildMH1_A32_PSM(this, dcl);

        G4_SrcRegRegion *header
            = createSrcRegRegion(dcl, getRegionStride1());
        sources[len].opnd = header;
        sources[len].execSize = g4::SIMD8;
        sources[len].instOpt = InstOpt_WriteEnable;
        ++len;
    }

    // Offsets, then write data: one exSize-wide chunk per enabled channel.
    sources[len].opnd = offsets;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;
    sources[len].opnd = src;
    sources[len].execSize = G4_ExecSize(exSize * chMask.getNumEnabledChannels());
    sources[len].instOpt = instOpt;
    ++len;

    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    SFID sfid = SFID::DP_DC1;

    unsigned MD = 0;
    // Leave sidebind scale offset 0 as it is not used now.
    MD |= DC1_UNTYPED_SURFACE_WRITE << 14;
    MD |= (execSize == EXEC_SIZE_8 ? MDC_SM3_SIMD8 : MDC_SM3_SIMD16) << 12;
    MD |= chMask.getHWEncoding() << 8;

    // Writes have no response payload.
    G4_DstRegRegion *dst = createNullDst(Type_UD);
    bool forceSplitSend = shouldForceSplitSend(surface);
    if (msgs[1] == 0 && !forceSplitSend) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            0,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::WRITE_ONLY,
            surface, NULL,
            instOpt, false);
    }
    else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            0,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::WRITE_ONLY,
            surface, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
2239 
2240 /// GetNumBatch() - return the number of batches required to copy the raw
2241 /// operand to message payload
GetNumBatch(VISA_SVM_Block_Type blockSize,VISA_SVM_Block_Num numBlocks)2242 static unsigned GetNumBatch(
2243     VISA_SVM_Block_Type blockSize, VISA_SVM_Block_Num  numBlocks)
2244 {
2245     switch (blockSize) {
2246     case SVM_BLOCK_TYPE_BYTE:
2247         switch (numBlocks) {
2248         case SVM_BLOCK_NUM_1:
2249         case SVM_BLOCK_NUM_2:
2250         case SVM_BLOCK_NUM_4:
2251             return 1;
2252         case SVM_BLOCK_NUM_8:
2253             return 2;
2254         }
2255         break;
2256     case SVM_BLOCK_TYPE_DWORD:
2257         return Get_Common_ISA_SVM_Block_Num(numBlocks);
2258     case SVM_BLOCK_TYPE_QWORD:
2259         return Get_Common_ISA_SVM_Block_Num(numBlocks);
2260     }
2261     ASSERT_USER(false, "Unhandled sizes/numbers of block/element!");
2262     return 0;
2263 }
2264 
translateVISAGatherScaledInst(G4_Predicate * pred,VISA_Exec_Size execSize,VISA_EMask_Ctrl eMask,VISA_SVM_Block_Num numBlocks,G4_Operand * surface,G4_Operand * globalOffset,G4_SrcRegRegion * offsets,G4_DstRegRegion * dst)2265 int IR_Builder::translateVISAGatherScaledInst(
2266     G4_Predicate              *pred,
2267     VISA_Exec_Size             execSize,
2268     VISA_EMask_Ctrl            eMask,
2269     VISA_SVM_Block_Num         numBlocks,
2270     G4_Operand                *surface,
2271     G4_Operand                *globalOffset,
2272     G4_SrcRegRegion           *offsets,
2273     G4_DstRegRegion           *dst)
2274 {
2275     surface = lowerSurface255To253(surface, *this);
2276 
2277     return translateByteGatherInst(pred, execSize, eMask, numBlocks,
2278         surface, globalOffset, offsets, dst);
2279 }
2280 
translateVISAScatterScaledInst(G4_Predicate * pred,VISA_Exec_Size execSize,VISA_EMask_Ctrl eMask,VISA_SVM_Block_Num numBlocks,G4_Operand * surface,G4_Operand * globalOffset,G4_SrcRegRegion * offsets,G4_SrcRegRegion * src)2281 int IR_Builder::translateVISAScatterScaledInst(
2282     G4_Predicate              *pred,
2283     VISA_Exec_Size             execSize,
2284     VISA_EMask_Ctrl            eMask,
2285     VISA_SVM_Block_Num         numBlocks,
2286     G4_Operand                *surface,
2287     G4_Operand                *globalOffset,
2288     G4_SrcRegRegion           *offsets,
2289     G4_SrcRegRegion           *src)
2290 {
2291 
2292     surface = lowerSurface255To253(surface, *this);
2293     return translateByteScatterInst(pred, execSize, eMask, numBlocks,
2294         surface, globalOffset, offsets, src);
2295 }
2296 
BuildMH_A32_GO(IR_Builder * IRB,G4_Declare * header,G4_Operand * globalOffset=0)2297 static void BuildMH_A32_GO(
2298     IR_Builder *IRB, G4_Declare *header, G4_Operand *globalOffset = 0)
2299 {
2300     // Clear header
2301     G4_DstRegRegion *h = IRB->createDst(header->getRegVar(),
2302         0, 0, 1, Type_UD);
2303     IRB->createMov(g4::SIMD8, h, IRB->createImm(0, Type_UD), InstOpt_WriteEnable, true);
2304     // Copy global offset if necessary.
2305     if (globalOffset &&
2306         !(globalOffset->isImm() &&
2307             globalOffset->asImm()->isZero())) {
2308         G4_DstRegRegion* gOffDst = IRB->createDst(
2309             header->getRegVar(),
2310             0, 2, 1, Type_UD);
2311         IRB->createMov(g4::SIMD1, gOffDst, globalOffset, InstOpt_WriteEnable, true);
2312     }
2313 }
2314 
// Translate a byte gather (1/2/4 bytes per slot) into a DC0
// byte-scattered-read send. Execution sizes below SIMD8 are rounded up to
// the send's slot granularity. Returns VISA_SUCCESS.
int IR_Builder::translateByteGatherInst(
    G4_Predicate *pred,
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    VISA_SVM_Block_Num numBlocks,
    G4_Operand *surface,
    G4_Operand *globalOffset,
    G4_SrcRegRegion *offsets,
    G4_DstRegRegion *dst)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 ||
        execSize == EXEC_SIZE_4 || execSize == EXEC_SIZE_8 ||
        execSize == EXEC_SIZE_16,
        "Only support SIMD1, SIMD2, SIMD4, SIMD8 or SIMD16!");
    ASSERT_USER(numBlocks == SVM_BLOCK_NUM_1 ||
        numBlocks == SVM_BLOCK_NUM_2 ||
        numBlocks == SVM_BLOCK_NUM_4,
        "Byte gather ONLY supports 1, 2, and 4 elements per slot!");

    // Keep the original size for the instruction itself; the payload is
    // built at the rounded-up size.
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
    G4_ExecSize instExSize {Get_VISA_Exec_Size(instExecSize)};
    G4_InstOpts instOpt = Get_Gen4_Emask(eMask, instExSize);
    unsigned numBatch = GetNumBatch(SVM_BLOCK_TYPE_BYTE, numBlocks);

    bool isSLM = IsSLMSurface(surface);
    // SLM forbids header. Header is optional in A32 when both scale and global
    // offset are 0s.
    bool useHeader = !isSLM && needsA32MsgHeader();
    bool useSplitSend = useSends();

    // In case non-zero global offset is specified, we need to recalculate
    // offsets.
    //
    // NOTE: Even though pre-SKL devices require header, eliminating global
    //       offset by adjusting offsets will simplify the header generation.
    if (!globalOffset->isImm() || globalOffset->asImm()->getImm() != 0)
    {
        G4_Declare *dcl = createSendPayloadDcl(exSize, offsets->getType());
        G4_DstRegRegion *tmp = createDstRegRegion(dcl, 1);
        createBinOp(G4_add, instExSize, tmp, offsets, globalOffset, instOpt, true);
        offsets = createSrcRegRegion(dcl, getRegionStride1());
    }

    PayloadSource sources[2]; // Maximal 2 sources, optional header + offsets
    unsigned len = 0;

    if (useHeader) {
        G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);

        // TODO: Get BTS supported on demand.
        BuildMH_A32_GO(this, dcl);

        G4_SrcRegRegion *header
            = createSrcRegRegion(dcl, getRegionStride1());
        sources[len].opnd = header;
        sources[len].execSize = g4::SIMD8;
        sources[len].instOpt = InstOpt_WriteEnable;
        ++len;
    }

    // Offsets follow the (optional) header in the payload.
    sources[len].opnd = offsets;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;

    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    SFID sfid = SFID::DP_DC0;

    // Message descriptor: type (bits 14+), block count (bits 10+),
    // SIMD mode (bits 8-9).
    unsigned MD = 0;
    MD |= DC_BYTE_SCATTERED_READ << 14;
    MD |= numBlocks << 10;
    MD |= (execSize == EXEC_SIZE_8 ? MDC_SM2_SIMD8 : MDC_SM2_SIMD16) << 8;

    // Response length: batches per slot, scaled by SIMD width.
    unsigned resLen = (exSize / GENX_DATAPORT_IO_SZ) * numBatch;
    bool forceSplitSend = shouldForceSplitSend(surface);
    if (msgs[1] == 0 && !forceSplitSend) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            resLen,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::READ_ONLY,
            surface, NULL,
            instOpt, false);
    }
    else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            resLen,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::READ_ONLY,
            surface, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
2424 
// Translates a vISA byte scattered write into a DC0 byte-scattered-write
// send message.
//
// pred         - optional predicate applied to the send.
// execSize     - vISA execution size; SIMD1/2/4 are rounded up to SIMD8.
// eMask        - execution-mask control.
// numBlocks    - bytes written per lane (1, 2, or 4).
// surface      - surface operand (binding table index).
// globalOffset - offset added to every entry of 'offsets'.
// offsets      - per-lane element offsets.
// src          - per-lane write data.
int IR_Builder::translateByteScatterInst(
    G4_Predicate *pred,
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    VISA_SVM_Block_Num numBlocks,
    G4_Operand *surface,
    G4_Operand *globalOffset,
    G4_SrcRegRegion *offsets,
    G4_SrcRegRegion *src)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 ||
        execSize == EXEC_SIZE_4 || execSize == EXEC_SIZE_8 ||
        execSize == EXEC_SIZE_16,
        "Only support SIMD1, SIMD2, SIMD4, SIMD8 or SIMD16!");
    ASSERT_USER(numBlocks == SVM_BLOCK_NUM_1 ||
        numBlocks == SVM_BLOCK_NUM_2 ||
        numBlocks == SVM_BLOCK_NUM_4,
        "Byte scatter ONLY supports 1, 2, and 4 elements per slot!");

    // Keep the original size for the send instruction itself; the rounded-up
    // size is used for payload layout.
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
    G4_ExecSize instExSize {Get_VISA_Exec_Size(instExecSize)};
    // NOTE(review): the emask here is derived from the rounded-up exSize,
    // while the corresponding gather path derives it from instExSize --
    // confirm the asymmetry is intentional.
    G4_InstOpts instOpt = Get_Gen4_Emask(eMask, exSize);
    unsigned numBatch = GetNumBatch(SVM_BLOCK_TYPE_BYTE, numBlocks);

    bool isSLM = IsSLMSurface(surface);
    // SLM forbids header. Header is optional in A32 when both scale and global
    // offset are 0s.
    bool useHeader = !isSLM && needsA32MsgHeader();
    bool useSplitSend = useSends();

    // In case non-zero global offset is specified, we need to recalculate
    // offsets.
    //
    // NOTE: Even though pre-SKL devices require header, eliminating global
    //       offset by adjusting offsets will simplify the header generation.
    if (!globalOffset->isImm() || globalOffset->asImm()->getImm() != 0)
    {
        G4_Declare *dcl = createSendPayloadDcl(exSize, offsets->getType());
        G4_DstRegRegion *tmp = createDstRegRegion(dcl, 1);
        createBinOp(G4_add, instExSize, tmp, offsets, globalOffset, instOpt, true);
        offsets = createSrcRegRegion(dcl, getRegionStride1());
    }

    PayloadSource sources[3]; // Maximal 3 sources: optional header + offsets + src
    unsigned len = 0;

    if (useHeader) {
        G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);

        // TODO: Get BTS supported on demand.
        BuildMH_A32_GO(this, dcl);

        G4_SrcRegRegion *header
            = createSrcRegRegion(dcl, getRegionStride1());
        sources[len].opnd = header;
        sources[len].execSize = g4::SIMD8;
        sources[len].instOpt = InstOpt_WriteEnable;
        ++len;
    }

    sources[len].opnd = offsets;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;
    // Write data spans numBatch batches of exSize elements.
    sources[len].opnd = src;
    sources[len].execSize = G4_ExecSize(exSize * numBatch);
    sources[len].instOpt = instOpt;
    ++len;

    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    SFID sfid = SFID::DP_DC0;

    // Message descriptor: message type at bit 14, block count at bit 10,
    // SIMD mode at bit 8.
    unsigned MD = 0;
    // Leave sidebind scale offset 0 as it is not used now.
    MD |= DC_BYTE_SCATTERED_WRITE << 14;
    MD |= numBlocks << 10;
    MD |= (execSize == EXEC_SIZE_8 ? MDC_SM2_SIMD8 : MDC_SM2_SIMD16) << 8;

    // Scattered write has no writeback; use a null destination.
    G4_DstRegRegion *dst = createNullDst(Type_UD);
    bool forceSplitSend = shouldForceSplitSend(surface);
    if (msgs[1] == 0 && !forceSplitSend) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            0,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::WRITE_ONLY,
            surface, NULL,
            instOpt, false);
    } else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            0,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::WRITE_ONLY,
            surface, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
2538 
2539 
2540 ///
2541 /// Bits 31-29: Reserved
2542 /// Bits 28-25: Message Length: Total 256bit registers expected to be sent.
2543 /// Bits 24-20: Response Length: Total 256bit registers expected in response.
2544 /// Bit  19:    Does this Message Descriptor have a header? 1 Yes, 0 No.
2545 /// Bits 18-14: Message Type: 10100: A64 Block Read, 10101: A64 Block Write
2546 /// Bit  13:    Ignore
2547 /// Bits 12-11: Message sub-type (00 for OWord Block Read/Write, 01 for Unaligned OWord Block Read/Write)
2548 /// Bits 10-8:  Block Size, 000 for 1 OWord, 001 for 2 OWords, 010 for 4 OWords, 100 for 8 OWords.
/// Bits 7-0:   Binding Table Index: Set to 0xFF for stateless memory space used by A64 SVM Data Port.
// Translates a vISA SVM block (oword) read into an A64 block read message
// (descriptor layout documented above).
//
// size      - number of owords to read
// unaligned - if true, use the oword-unaligned read sub-type
// address   - scalar 64-bit start address
// dst       - destination region receiving the data (retyped to UD)
int IR_Builder::translateVISASVMBlockReadInst(
    VISA_Oword_Num size,
    bool unaligned,
    G4_Operand* address,
    G4_DstRegRegion* dst)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    unsigned numOword = Get_VISA_Oword_Num(size);
    // Message header payload; the 64-bit address goes into its first qword.
    G4_Declare* dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
    if (noInt64())
    {
        // No native int64: copy the address as two dword moves (low, high).
        G4_SrcRegRegion *region = address->asSrcRegRegion();
        G4_SrcRegRegion *tmp;
        tmp = createSrcRegRegion(Mod_src_undef,
            region->getRegAccess(),
            region->getBase(),
            region->getRegOff(),
            region->getSubRegOff() * 2,
            region->getRegion(), Type_UD);
        createMovInst(dcl, 0, 0, g4::SIMD1, NULL, NULL, tmp, true);
        tmp = createSrcRegRegion(Mod_src_undef,
            region->getRegAccess(),
            region->getBase(),
            region->getRegOff(),
            region->getSubRegOff() * 2 + 1,
            region->getRegion(), Type_UD);
        createMovInst(dcl, 0, 1, g4::SIMD1, NULL, NULL, tmp, true);
    }
    else
    {
        // Native int64: one qword move through a UQ alias of the header.
        G4_Declare* dclAsUQ = createSendPayloadDcl(GENX_DATAPORT_IO_SZ / 2, Type_UQ);
        dclAsUQ->setAliasDeclare(dcl, 0);
        createMovInst(dclAsUQ, 0, 0, g4::SIMD1, NULL, NULL, address, true);
    }

    G4_SrcRegRegion* src = createSrcRegRegion(dcl, getRegionStride1());

    DATA_CACHE1_MESSAGES msgSubOpcode = DC1_A64_BLOCK_READ;
    // Response length in GRFs: 16 bytes per oword, rounded up.
    unsigned rspLength = ((numOword * 16 - 1) / getGRFSize() + 1);

    unsigned desc = getA64BTI() |
        (unaligned ? A64_BLOCK_MSG_OWORD_UNALIGNED_READ : A64_BLOCK_MSG_OWORD_RW) << A64_BLOCK_MSG_SUBTYPE_OFFSET |
        msgSubOpcode << SEND_GT_MSG_TYPE_BIT;

    desc = setOwordForDesc(desc, numOword);

    G4_ExecSize sendExecSize {FIX_OWORD_SEND_EXEC_SIZE(numOword)};
    dst->setType(Type_UD);

    createSendInst(
        NULL, dst, src, 1, rspLength, sendExecSize, desc,
        SFID::DP_DC1, true, SendAccess::READ_ONLY, NULL, NULL, InstOpt_WriteEnable, false);

    return VISA_SUCCESS;
}
2606 
// Translates a vISA SVM block (oword) write into an A64 block write message
// (same descriptor layout as the block read above).
//
// size    - number of owords to write
// address - scalar 64-bit start address
// src     - region holding the data to be written
int IR_Builder::translateVISASVMBlockWriteInst(
    VISA_Oword_Num size,
    G4_Operand* address,
    G4_SrcRegRegion* src)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    unsigned numOword = Get_VISA_Oword_Num(size);
    // Number of GRFs occupied by the write data: 16 bytes per oword, rounded up.
    unsigned srcNumGRF = (numOword * 16 + getGRFSize() - 1) / getGRFSize();
    G4_ExecSize sendExecSize {FIX_OWORD_SEND_EXEC_SIZE(numOword)};

    // FIXME: may want to apply this to FIX_OWORD_SEND_EXEC_SIZE instead
    if (sendExecSize < g4::SIMD8)
    {
        sendExecSize = g4::SIMD8;
    }

    // Message header; the 64-bit address goes into its first qword.
    G4_Declare* dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
    if (noInt64())
    {
        // No native int64: copy the address as two dword moves (low, high).
        G4_SrcRegRegion *region = address->asSrcRegRegion();
        G4_SrcRegRegion *tmp;
        tmp = createSrcRegRegion(Mod_src_undef,
            region->getRegAccess(),
            region->getBase(),
            region->getRegOff(),
            region->getSubRegOff() * 2,
            region->getRegion(), Type_UD);
        createMovInst(dcl, 0, 0, g4::SIMD1, NULL, NULL, tmp, true);
        tmp = createSrcRegRegion(Mod_src_undef,
            region->getRegAccess(),
            region->getBase(),
            region->getRegOff(),
            region->getSubRegOff() * 2 + 1,
            region->getRegion(), Type_UD);
        createMovInst(dcl, 0, 1, g4::SIMD1, NULL, NULL, tmp, true);
    } else {
        // Native int64: one qword move through a UQ alias of the header.
        G4_Declare* dclAsUQ = createSendPayloadDcl(GENX_DATAPORT_IO_SZ / 2, Type_UQ);
        dclAsUQ->setAliasDeclare(dcl, 0);
        createMovInst(dclAsUQ, 0, 0, g4::SIMD1, NULL, NULL, address, true);
    }

    bool useSplitSend = useSends();
    PayloadSource sources[2]; // header + write data
    unsigned len = 0;

    // Source 0: the message header.
    sources[len].opnd = createSrcRegRegion(dcl, getRegionStride1());
    sources[len].execSize = g4::SIMD8;
    sources[len].instOpt = InstOpt_WriteEnable;
    ++len;

    if (src->getElemSize() < TypeSize(Type_UD))
    {
        // use D for size computation. Src is guaranteed to be GRF-aligned per vISA spec
        src->setType(Type_UD);
    }
    sources[len].opnd = src;

    G4_ExecSize movExecSize {0};

    // Number of lanes per GRF for the source element size.
    auto scale = getGRFSize() / src->getElemSize();
    switch (src->getElemSize())
    {
    case 4:
        sources[len].execSize = G4_ExecSize(scale * srcNumGRF);
        movExecSize = G4_ExecSize(scale);
        break;
    case 8:
        sources[len].execSize = G4_ExecSize(scale * srcNumGRF);
        movExecSize = G4_ExecSize(scale);
        break;
    }

    sources[len].instOpt = InstOpt_WriteEnable;
    ++len;

    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, movExecSize, useSplitSend, sources, len);

    DATA_CACHE1_MESSAGES msgSubOpcode = DC1_A64_BLOCK_WRITE;

    unsigned desc = getA64BTI() |
        A64_BLOCK_MSG_OWORD_RW << A64_BLOCK_MSG_SUBTYPE_OFFSET |
        msgSubOpcode << SEND_GT_MSG_TYPE_BIT;

    desc = setOwordForDesc(desc, numOword);

    // Block write has no writeback; use a null destination.
    G4_DstRegRegion* sendDst = createNullDst(Type_UD);

    if (msgs[1] == 0)
    {
        createSendInst(NULL, sendDst,
            msgs[0], sizes[0],
            0, sendExecSize,
            desc, SFID::DP_DC1,
            true,
            SendAccess::WRITE_ONLY,
            NULL, NULL,
            InstOpt_WriteEnable, false);
    }
    else
    {
        createSplitSendInst(NULL, sendDst,
            msgs[0], sizes[0],
            msgs[1], sizes[1],
            0, sendExecSize,
            desc, SFID::DP_DC1,
            true,
            SendAccess::WRITE_ONLY,
            NULL, NULL,
            InstOpt_WriteEnable, false);
    }

    return VISA_SUCCESS;
}
2723 
// Translates a vISA SVM scattered read (gather) into an A64 scattered read
// send message.
//
// execSize  - vISA execution size; SIMD1/2/4 are rounded up to SIMD8
// eMask     - execution-mask control
// pred      - optional predicate applied to the send
// blockSize - element size per block (byte/dword/qword)
// numBlocks - number of blocks per lane
// addresses - per-lane 64-bit addresses
// dst       - destination receiving the gathered data
int IR_Builder::translateVISASVMScatterReadInst(
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    G4_Predicate* pred,
    VISA_SVM_Block_Type blockSize,
    VISA_SVM_Block_Num numBlocks,
    G4_SrcRegRegion* addresses,
    G4_DstRegRegion* dst)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 ||
        execSize == EXEC_SIZE_4 || execSize == EXEC_SIZE_8 ||
        execSize == EXEC_SIZE_16,
        "Only support SIMD1, SIMD2, SIMD4, SIMD8 or SIMD16!");

    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    bool is8ByteMsg = blockSize == SVM_BLOCK_TYPE_BYTE && numBlocks == SVM_BLOCK_NUM_8;
    assert((!is8ByteMsg || has8ByteA64Gather()) && "A64 8-byte scatter not supported on this platform");

    G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
    G4_ExecSize instExSize {Get_VISA_Exec_Size(instExecSize)};
    G4_InstOpts instOpt = Get_Gen4_Emask(eMask, instExSize);

    // Address payload size in GRFs: 8 bytes per lane.
    uint32_t messageLength = (8 * exSize) / getGRFSize();
    uint32_t numDWperLane = 0;

    // ToDo: remove this as it should be done in HWConformity
    if (instExSize < 8 && WaDisableSendSrcDstOverlap())
    {
        // as message length is set to 2 (HW requirements),
        // we have to even align both src/dst to satisfy the WA
        G4_Declare* srcDcl = addresses->getTopDcl()->getRootDeclare();
        if (srcDcl->getByteSize() <= numEltPerGRF<Type_UB>())
        {
            srcDcl->setEvenAlign();
        }
        G4_Declare* dstDcl = dst->getTopDcl()->getRootDeclare();
        if (dstDcl->getByteSize() <= numEltPerGRF<Type_UB>())
        {
            dstDcl->setEvenAlign();
        }
    }

    // Writeback size in dwords per lane, by block type/count.
    switch (blockSize)
    {
    case SVM_BLOCK_TYPE_BYTE:
        numDWperLane = (numBlocks == SVM_BLOCK_NUM_8) ? 2 : 1;
        break;
    case SVM_BLOCK_TYPE_DWORD:
        numDWperLane = Get_Common_ISA_SVM_Block_Num(numBlocks);
        break;
    case SVM_BLOCK_TYPE_QWORD:
        numDWperLane = Get_Common_ISA_SVM_Block_Num(numBlocks) * 2;
        break;
    default:
        MUST_BE_TRUE(false, "Illegal SVM block type");
    }
    uint32_t responseLength = (numDWperLane * 4 * exSize) / getGRFSize();

    // Descriptor: BTI, block size at bit 8, block count at bit 10,
    // SIMD mode at bit 12, message type at bit 14.
    unsigned desc = 0;
    desc |= getA64BTI();
    desc |= blockSize << 8;
    desc |= numBlocks << 10;
    desc |= (exSize == 8 ? 0 : 1) << 12;
    desc |= DC1_A64_SCATTERED_READ << 14;

    createSendInst(pred, dst, addresses, messageLength, responseLength, instExSize, desc,
        SFID::DP_DC1, false, SendAccess::READ_ONLY, NULL, NULL, instOpt, false);

    return VISA_SUCCESS;
}
2798 
// Translates a vISA SVM scattered write into an A64 scattered write send
// message.
//
// execSize  - vISA execution size; SIMD1/2/4 are rounded up to SIMD8
// eMask     - execution-mask control
// pred      - optional predicate applied to the send
// blockSize - element size per block (byte/dword/qword)
// numBlocks - number of blocks per lane
// addresses - per-lane 64-bit addresses
// src       - per-lane write data
int IR_Builder::translateVISASVMScatterWriteInst(
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    G4_Predicate* pred,
    VISA_SVM_Block_Type blockSize,
    VISA_SVM_Block_Num numBlocks,
    G4_SrcRegRegion* addresses,
    G4_SrcRegRegion* src)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 ||
        execSize == EXEC_SIZE_4 || execSize == EXEC_SIZE_8 ||
        execSize == EXEC_SIZE_16,
        "Only support SIMD1, SIMD2, SIMD4, SIMD8 or SIMD16!");

    bool is8ByteMsg = blockSize == SVM_BLOCK_TYPE_BYTE && numBlocks == SVM_BLOCK_NUM_8;
    assert((!is8ByteMsg || has8ByteA64Gather()) && "A64 8-byte scatter not supported on this platform");
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
    G4_ExecSize instExSize {Get_VISA_Exec_Size(instExecSize)};
    G4_InstOpts instOpt = Get_Gen4_Emask(eMask, instExSize);

    bool useSplitSend = useSends();

    PayloadSource sources[2]; // Maximal 2 sources: addresses + src
    unsigned len = 0;

    sources[len].opnd = addresses;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;

    unsigned numElems = 1;
    // NOTE that BYTE scatter always has numElems set to 1 as
    // - when the number of data elements is 1, 2, or 4, the writeback payload
    //   is always 1 MDP_DW_SIMD8/_SIMD16.
    // - when the number of data elements is 8, the write payload is always 1
    //   MDP_QW_SIMD8/_SIMD16.
    // This ALSO implies the RAW operand should be in type of UQ when the
    // number of data elements is 8.
    if (blockSize != SVM_BLOCK_TYPE_BYTE)
        numElems = Get_Common_ISA_SVM_Block_Num(numBlocks);

    sources[len].opnd = src;
    sources[len].execSize = G4_ExecSize(exSize * numElems);
    sources[len].instOpt = instOpt;
    ++len;

    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};

    // adjust src type
    // preparePayload takes the src type to compute the src1 size; the type
    // has to be DW for 1- and 2-byte scattered accesses.
    G4_Type srcType = src->getType();
    if ((blockSize == SVM_BLOCK_TYPE_BYTE) &&
        (numBlocks == SVM_BLOCK_NUM_1 || numBlocks == SVM_BLOCK_NUM_2) &&
        (TypeSize(srcType) != 4))
        src->setType(Type_UD);

    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    // set the type back in case we changed it for preparePayload
    src->setType(srcType);

    // Descriptor: BTI, block size at bit 8, block count at bit 10,
    // SIMD mode at bit 12, message type at bit 14.
    unsigned desc = 0;
    desc |= getA64BTI();
    desc |= blockSize << 8;
    desc |= numBlocks << 10;
    desc |= (exSize == 8 ? 0 : 1) << 12;
    desc |= DC1_A64_SCATTERED_WRITE << 14;

    // Scattered write has no writeback; use a null destination.
    G4_DstRegRegion* dst = createNullDst(Type_UD);
    if (msgs[1] == 0) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            0, instExSize,
            desc, SFID::DP_DC1,
            false,
            SendAccess::WRITE_ONLY,
            NULL, NULL,
            instOpt, false);
    }
    else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0],
            msgs[1], sizes[1],
            0, instExSize,
            desc, SFID::DP_DC1,
            false,
            SendAccess::WRITE_ONLY,
            NULL, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
2900 
2901 
2902 
2903 
2904 // is16Bit indicates if this is a 16bit atomic op. The input source (if
2905 // any) and the writeback (if any) have the same datalayout as dword messages.
2906 // Only the lower 16 bits of each dword is used.
2907 //
FillSVMAtomicMsgDesc(bool is16Bit,bool isFloatOp,uint32_t & msgDesc)2908 static void FillSVMAtomicMsgDesc(bool is16Bit, bool isFloatOp, uint32_t &msgDesc)
2909 {
2910     if (is16Bit)
2911     {
2912         if (isFloatOp)
2913         {
2914             msgDesc |= DC1_A64_UNTYPED_HALF_FLOAT_ATOMIC << 14;
2915         }
2916         else
2917         {
2918             msgDesc |= DC1_A64_UNTYPED_HALF_INTEGER_ATOMIC << 14;
2919         }
2920     }
2921     else
2922     {
2923         if (isFloatOp)
2924         {
2925             msgDesc |= DC1_A64_UNTYPED_FLOAT_ATOMIC << 14;
2926         }
2927         else
2928         {
2929             msgDesc |= DC1_A64_ATOMIC << 14;
2930         }
2931     }
2932 }
2933 
2934 
translateVISASVMAtomicInst(VISAAtomicOps atomicOp,unsigned short bitwidth,VISA_Exec_Size execSize,VISA_EMask_Ctrl emask,G4_Predicate * pred,G4_SrcRegRegion * addresses,G4_SrcRegRegion * src0,G4_SrcRegRegion * src1,G4_DstRegRegion * dst)2935 int IR_Builder::translateVISASVMAtomicInst(
2936     VISAAtomicOps atomicOp,
2937     unsigned short bitwidth,
2938     VISA_Exec_Size execSize,
2939     VISA_EMask_Ctrl emask,
2940     G4_Predicate* pred,
2941     G4_SrcRegRegion* addresses,
2942     G4_SrcRegRegion* src0,
2943     G4_SrcRegRegion* src1,
2944     G4_DstRegRegion* dst)
2945 {
2946     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
2947 
2948     MUST_BE_TRUE(bitwidth == 16 || bitwidth == 32 || bitwidth == 64,
2949         "bitwidth must be 16/32/64");
2950 
2951     ASSERT_USER(getPlatform() >= XeHP_SDV || ((atomicOp != ATOMIC_FADD) && (atomicOp != ATOMIC_FSUB)),
2952         "FADD/FSUB atomic operations are only supported on this devices");
2953 
2954     VISA_Exec_Size instExecSize = execSize;
2955     execSize = roundUpExecSize(execSize);
2956 
2957     unsigned op = Get_Atomic_Op(atomicOp);
2958 
2959     G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
2960     G4_ExecSize instExSize {Get_VISA_Exec_Size(instExecSize)};
2961     G4_InstOpts instOpt = Get_Gen4_Emask(emask, instExSize);
2962 
2963     if (atomicOp == ATOMIC_CMPXCHG)
2964     {
2965         // we have to swap src0 and src1 since vISA has them in different order from HW
2966         G4_SrcRegRegion* tmp = src0;
2967         src0 = src1;
2968         src1 = tmp;
2969     }
2970 
2971     bool useSplitSend = useSends();
2972 
2973     PayloadSource sources[3]; // addresses, src0, and src1
2974     unsigned len = 0;
2975 
2976     sources[len].opnd = addresses;
2977     sources[len].execSize = exSize;
2978     sources[len].instOpt = instOpt;
2979     ++len;
2980 
2981     if (src0 != NULL && !src0->isNullReg())
2982     {
2983         sources[len].opnd = src0;
2984         sources[len].execSize = exSize;
2985         sources[len].instOpt = instOpt;
2986         ++len;
2987     }
2988 
2989     if (src1 != NULL && !src1->isNullReg())
2990     {
2991         sources[len].opnd = src1;
2992         sources[len].execSize = exSize;
2993         sources[len].instOpt = instOpt;
2994         ++len;
2995     }
2996 
2997     G4_SrcRegRegion *msgs[2] = {0, 0};
2998     unsigned sizes[2] = {0, 0};
2999     preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);
3000     unsigned dstLength = dst->isNullReg() ? 0 : ((bitwidth == 16 || bitwidth == 32) ? 1 : 2);
3001     unsigned msgDesc = 0;
3002     msgDesc |= getA64BTI();
3003     msgDesc |= op << 8;
3004 #define A64_ATOMIC_RETURN_DATA_CONTROL_BIT 13
3005     msgDesc |= (dstLength ? 1 : 0) << A64_ATOMIC_RETURN_DATA_CONTROL_BIT;
3006     msgDesc |= ((bitwidth == 16 || bitwidth == 32) ? 0 : 1) << 12;
3007 
3008     // Fill remaining bits.
3009     FillSVMAtomicMsgDesc(bitwidth == 16, IsFloatAtomicOps(atomicOp), msgDesc);
3010 
3011     if (msgs[1] == 0) {
3012         createSendInst(pred, dst,
3013             msgs[0], sizes[0], dstLength,
3014             instExSize,
3015             msgDesc, SFID::DP_DC1,
3016             false,
3017             SendAccess::READ_WRITE,
3018             NULL, NULL,
3019             instOpt, false);
3020     }
3021     else {
3022         createSplitSendInst(pred, dst,
3023             msgs[0], sizes[0],
3024             msgs[1], sizes[1],
3025             dstLength,
3026             instExSize,
3027             msgDesc, SFID::DP_DC1,
3028             false,
3029             SendAccess::READ_WRITE,
3030             NULL, NULL,
3031             instOpt, false);
3032     }
3033 
3034     return VISA_SUCCESS;
3035 }
3036 
// Computes per-lane 64-bit addresses: result = offsets + globalOffset.
// The add is issued as SIMD8 qword adds under 'pred' with emask 'mask';
// for exSize == 16 a second SIMD8 add covers the upper eight lanes under
// the split-high emask.
//
// Returns a stride-1 source region over the freshly computed addresses.
G4_SrcRegRegion* IR_Builder::getSVMOffset(
    G4_Operand* globalOffset, G4_SrcRegRegion* offsets, uint16_t exSize,
    G4_Predicate* pred, uint32_t mask)
{
    G4_Declare* dcl = createSendPayloadDcl(exSize, offsets->getType());
    G4_DstRegRegion* tmp = createDstRegRegion(dcl, 1);
    // First (or only) SIMD8 chunk of the add.
    createInst(pred, G4_add, 0, g4::NOSAT, g4::SIMD8, tmp, offsets, globalOffset, mask, true);
    if (exSize == 16)
    {
        // do second half of the 64-bit add
        // Register offset of the upper eight qword lanes.
        int offset = (8 * sizeof(uint64_t)) / getGRFSize();
        auto dst = createDst(dcl->getRegVar(), offset, 0, 1, offsets->getType());
        auto src = createSrc(offsets->getBase(),
            offsets->getRegOff() + offset, offsets->getSubRegOff(), getRegionStride1(), offsets->getType());
        createInst(duplicateOperand(pred), G4_add, 0, g4::NOSAT, g4::SIMD8, dst, src,
            duplicateOperand(globalOffset), getSplitHiEMask(16, mask), true);
    }
    return createSrcRegRegion(dcl, getRegionStride1());
}
3056 
// Translates an SVM gather4 (A64 untyped surface read with channel mask)
// into a DC1 send message.
//
// execSize     - SIMD8 or SIMD16 only
// eMask        - execution-mask control
// chMask       - enabled channels; one SIMD-sized chunk returned per channel
// pred         - optional predicate applied to the send
// globalOffset - offset added to every entry of 'offsets'
// offsets      - per-lane 64-bit addresses
// dst          - destination receiving the gathered channels
int IR_Builder::translateSVMGather4Inst(
    VISA_Exec_Size          execSize,
    VISA_EMask_Ctrl         eMask,
    ChannelMask             chMask,
    G4_Predicate            *pred,
    G4_Operand              *globalOffset,
    G4_SrcRegRegion         *offsets,
    G4_DstRegRegion         *dst)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_8 || execSize == EXEC_SIZE_16,
        "Only support SIMD8 or SIMD16!");

    G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
    G4_InstOpts instOpt = Get_Gen4_Emask(eMask, exSize);

    bool useSplitSend = useSends();

    // In case non-zero global offset is specified, we need to recalculate
    // offsets.
    if (!globalOffset->isImm() || globalOffset->asImm()->getImm() != 0)
    {
        offsets = getSVMOffset(globalOffset, offsets, exSize, pred, instOpt);
    }

    PayloadSource sources[1]; // Maximal 1 sources, offsets
    unsigned len = 0;

    sources[len].opnd = offsets;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;

    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    SFID sfid = SFID::DP_DC1;

    // Descriptor: message type at bit 14, SIMD mode at bit 12,
    // channel mask at bit 8, BTI in the low bits.
    unsigned FC = 0;
    // Leave sidebind scaled offset 0 as it is not used now.
    FC |= DC1_A64_UNTYPED_SURFACE_READ << 14;
    FC |= (execSize == EXEC_SIZE_8 ? MDC_SM3_SIMD8 : MDC_SM3_SIMD16) << 12;
    FC |= chMask.getHWEncoding() << 8;
    FC |= getA64BTI();

    // Response: one SIMD-sized chunk per enabled channel.
    unsigned resLen = (exSize / GENX_DATAPORT_IO_SZ) *
        chMask.getNumEnabledChannels();
    if (msgs[1] == 0) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            resLen,
            exSize,
            FC, sfid,
            false,
            SendAccess::READ_ONLY,
            NULL, NULL,
            instOpt, false);
    }
    else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            resLen,
            exSize,
            FC, sfid,
            false,
            SendAccess::READ_ONLY,
            NULL, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
3132 
// Translates an SVM scatter4 (A64 untyped surface write with channel mask)
// into a DC1 send message.
//
// execSize     - SIMD8 or SIMD16 only
// eMask        - execution-mask control
// chMask       - enabled channels; one SIMD-sized chunk written per channel
// pred         - optional predicate applied to the send
// globalOffset - offset added to every entry of 'offsets'
// offsets      - per-lane 64-bit addresses
// src          - per-lane write data, one chunk per enabled channel
int IR_Builder::translateSVMScatter4Inst(
    VISA_Exec_Size         execSize,
    VISA_EMask_Ctrl        eMask,
    ChannelMask            chMask,
    G4_Predicate           *pred,
    G4_Operand             *globalOffset,
    G4_SrcRegRegion        *offsets,
    G4_SrcRegRegion        *src)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_8 || execSize == EXEC_SIZE_16,
        "Only support SIMD8 or SIMD16!");

    G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
    G4_InstOpts instOpt = Get_Gen4_Emask(eMask, exSize);
    bool useSplitSend = useSends();

    // In case non-zero global offset is specified, we need to recalculate
    // offsets.
    if (!globalOffset->isImm() || globalOffset->asImm()->getImm() != 0)
    {
        offsets = getSVMOffset(globalOffset, offsets, exSize, pred, instOpt);
    }

    PayloadSource sources[2]; // Maximal 2 sources, offsets + src
    unsigned len = 0;

    sources[len].opnd = offsets;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;
    // Write data: one SIMD-sized chunk per enabled channel.
    sources[len].opnd = src;
    sources[len].execSize = G4_ExecSize(exSize * chMask.getNumEnabledChannels());
    sources[len].instOpt = instOpt;
    ++len;

    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    SFID sfid = SFID::DP_DC1;

    // Descriptor: message type at bit 14, SIMD mode at bit 12,
    // channel mask at bit 8, BTI in the low bits.
    unsigned FC = 0;
    // Leave sidebind scaled offset 0 as it is not used now.
    FC |= DC1_A64_UNTYPED_SURFACE_WRITE << 14;
    FC |= (execSize == EXEC_SIZE_8 ? MDC_SM3_SIMD8 : MDC_SM3_SIMD16) << 12;
    FC |= chMask.getHWEncoding() << 8;
    FC |= getA64BTI();

    // Scatter has no writeback; use a null destination.
    G4_DstRegRegion *dst = createNullDst(Type_UD);
    if (msgs[1] == 0) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            0,
            exSize,
            FC, sfid,
            false,
            SendAccess::WRITE_ONLY,
            NULL, NULL,
            instOpt, false);
    }
    else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            0,
            exSize,
            FC, sfid,
            false,
            SendAccess::WRITE_ONLY,
            NULL, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
3210 
// Scaled-offset variant of SVM gather4. Currently lowered directly to the
// generic SVM gather4 translation (no additional scaling is applied here).
int IR_Builder::translateVISASVMGather4ScaledInst(
    VISA_Exec_Size            execSize,
    VISA_EMask_Ctrl           eMask,
    ChannelMask               chMask,
    G4_Predicate              *pred,
    G4_Operand                *globalOffset,
    G4_SrcRegRegion           *offsets,
    G4_DstRegRegion           *dst)
{
    return translateSVMGather4Inst(execSize, eMask, chMask, pred,
        globalOffset, offsets, dst);
}
3223 
// Scaled-offset variant of SVM scatter4. Currently lowered directly to the
// generic SVM scatter4 translation (no additional scaling is applied here).
int IR_Builder::translateVISASVMScatter4ScaledInst(
    VISA_Exec_Size           execSize,
    VISA_EMask_Ctrl          eMask,
    ChannelMask              chMask,
    G4_Predicate             *pred,
    G4_Operand               *globalOffset,
    G4_SrcRegRegion          *offsets,
    G4_SrcRegRegion          *src)
{
    return translateSVMScatter4Inst(execSize, eMask, chMask, pred,
        globalOffset, offsets, src);
}
3236 
3237