1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2020-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "BuildIR.h"
10 #include "../Timer.h"
11
12 using namespace vISA;
13
14
// Exec size for OWord block sends: blocks larger than 2 OWords use SIMD16;
// 1/2-OWord blocks use 4 lanes per OWord (1 OW -> 4, 2 OW -> 8).
#define FIX_OWORD_SEND_EXEC_SIZE(BLOCK_SIZE)(((BLOCK_SIZE) > 2)? 16: (BLOCK_SIZE*4))
16
17
// Builds the message-descriptor control bits for a QWord scattered
// read/write: message type in [18:14], block count in [12:10], bit 9 always
// set, and the SIMD mode (8 vs 16) in bit 8.
static uint32_t buildDescForScatter(
    uint32_t msgType, VISA_SVM_Block_Num numBlocks, MDC_SM2 simdMode)
{
    const uint32_t typeBits  = (msgType & 0x1F) << 14;
    const uint32_t blockBits = uint32_t(numBlocks) << 10;
    const uint32_t bit9      = 1u << 9;
    const uint32_t simdBits  = uint32_t(simdMode) << 8;
    return typeBits | blockBits | bit9 | simdBits;
}
27
28
isMessageHeaderOptional(G4_Operand * surface,G4_Operand * Offset) const29 bool IR_Builder::isMessageHeaderOptional(
30 G4_Operand *surface, G4_Operand *Offset) const
31 {
32 // Message header is require for T255 stateless surface on pre-SKL devices
33 // as a workaround for HW issue.
34 if (needsA32MsgHeader() && isStatelessSurface(surface))
35 {
36 return false;
37 }
38
39 // Message Header is optional when offset is 0.
40 // When GlobalOffset is 0, message header is optional.
41 // "If the header is not present, behavior is as if the message was sent
42 // with all fields in the header set to zero."
43 return Offset->isImm() && Offset->asImm()->isZero();
44 }
45
// Translates a vISA QWord gather into a single QWord scattered-read send on
// the DP_DC0 shared function (no message header is built here).
int IR_Builder::translateVISAQWGatherInst(
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    G4_Predicate* pred,
    VISA_SVM_Block_Num numBlocks,
    G4_SrcRegRegion* surface,
    G4_SrcRegRegion* addresses,
    G4_DstRegRegion* dst)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    // Round the message width up to a legal SIMD size, but keep the caller's
    // original width for the instruction's execution size/emask.
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    unsigned exSize = Get_VISA_Exec_Size(execSize);
    G4_ExecSize instExSize = G4_ExecSize(Get_VISA_Exec_Size(instExecSize));
    unsigned int instOpt = Get_Gen4_Emask(eMask, instExSize);
    // Payload: exSize/8 GRFs of addresses.
    // Response: numBlocks QWords (2 DWords each) per 8-lane group.
    uint32_t messageLength = (exSize / 8);
    uint32_t responseLength = Get_Common_ISA_SVM_Block_Num(numBlocks) * 2 * (exSize / 8);

    uint32_t desc = buildDescForScatter(DC_QWORD_SCATTERED_READ, numBlocks,
        (execSize == EXEC_SIZE_8 ? MDC_SM2_SIMD8 : MDC_SM2_SIMD16));

    // Headerless read send on DP_DC0.
    createSendInst(
        pred, dst, addresses, messageLength, responseLength, instExSize, desc,
        SFID::DP_DC0, false, SendAccess::READ_ONLY, surface, nullptr, instOpt, false);

    return VISA_SUCCESS;
}
75
// Translates a vISA QWord scatter into a QWord scattered-write send on
// DP_DC0. The payload (addresses followed by the data) is assembled via
// preparePayload, using split-send when the platform supports it.
int IR_Builder::translateVISAQWScatterInst(
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    G4_Predicate* pred,
    VISA_SVM_Block_Num numBlocks,
    G4_SrcRegRegion* surface,
    G4_SrcRegRegion* addresses,
    G4_SrcRegRegion* src)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    // Round the message width up to a legal SIMD size, but keep the caller's
    // original width for the instruction's execution size/emask.
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    G4_ExecSize exSize = toExecSize(execSize);
    G4_ExecSize instExSize = toExecSize(instExecSize);
    unsigned int instOpt = Get_Gen4_Emask(eMask, instExSize);
    bool useSplitSend = useSends();

    PayloadSource sources[2]; // Maximal 2 sources, optional header + offsets
    unsigned len = 0;

    // Source 0: the addresses to scatter to.
    sources[len].opnd = addresses;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;

    unsigned numElems = Get_Common_ISA_SVM_Block_Num(numBlocks);

    // Source 1: the data to write (numElems QWords per lane).
    sources[len].opnd = src;
    sources[len].execSize = G4_ExecSize(exSize * numElems);
    sources[len].instOpt = instOpt;
    ++len;

    G4_SrcRegRegion *msgs[2] {0, 0};
    unsigned sizes[2] {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    uint32_t desc = buildDescForScatter(DC_QWORD_SCATTERED_WRITE, numBlocks,
        execSize == EXEC_SIZE_8 ? MDC_SM2_SIMD8 : MDC_SM2_SIMD16);

    // Writes have no response; use a null destination.
    G4_DstRegRegion* dst = createNullDst(Type_UD);
    if (msgs[1] == 0)
    {
        // Payload coalesced into a single piece -- plain send.
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(
            pred, dst,
            msgs[0], sizes[0],
            0, instExSize,
            desc, SFID::DP_DC0,
            false,
            SendAccess::WRITE_ONLY,
            surface, nullptr,
            instOpt, false);
    }
    else
    {
        // Two-piece payload -- split send.
        createSplitSendInst(
            pred, dst,
            msgs[0], sizes[0],
            msgs[1], sizes[1],
            0, instExSize,
            desc, SFID::DP_DC0,
            false,
            SendAccess::WRITE_ONLY,
            surface, nullptr,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
147
// If surface is PRED_SURF_255, lower it to PRED_SURF_253 so that it's non
// IA-coherent; the surface is not changed otherwise.
// NOTE: currently a no-op -- the lowering is compiled out below.
static G4_Operand* lowerSurface255To253(G4_Operand* surface, IR_Builder& builder)
{
    // disable due to OCL SVM atomics regression
#if 0
    if (surface && surface->isImm() && surface->asImm()->getImm() == PREDEF_SURF_255)
    {
        return builder.createImm(PREDEF_SURF_253, Type_UW);
    }
    else
#endif
    {
        return surface;
    }
}
164
// Builds the message header for a stateless (A32) surface access in
// 'Header': on platforms with a scratch surface the header is simply
// zeroed; otherwise R0.5's FFTID bits are copied with the scratch-space
// offset masked off.
static void BuildStatelessSurfaceMessageHeader(IR_Builder *IRB, G4_Declare *Header)
{
    // No need to mask fft id when scratch surface is bindless as
    // A32 accesses are guaranteed to not be scratch accesses.
    if (IRB->hasScratchSurface())
    {
        // Clear header
        // Rx (8) = 0
        auto DstOpnd = IRB->createDst(Header->getRegVar(), 0, 0, 1, Type_UD);
        auto SrcImm0 = IRB->createImm(0, Type_UD);
        IRB->createMov(g4::SIMD8, DstOpnd, SrcImm0, InstOpt_WriteEnable, true);
        return;
    }
    // For A32, clearing off scratch space offset or Buffer Base Address is
    // always required once header is present.
    G4_Type ElemTy = Header->getElemType();

    // R0.5<31:10> is defined as Scratch Space Offset.
    // R0.5<8:0> is defined as FF Thread ID (FFTID) in SKL+ devices.
    // R0.5<7:0> is defined as FF Thread ID (FFTID) in pre-SKL devices.
    // We increase the bit range to <9:0> to copy reserved bits as well.
    const unsigned FFTID_Mask = 0x3ff;

    // Rx.5[31:0] = 0 | R0.5[9:0]
    G4_DstRegRegion *DstOpnd = IRB->createDst(Header->getRegVar(), 0, 5, 1, ElemTy);
    // R0.5
    G4_SrcRegRegion *SrcOpnd = IRB->createSrc(
        IRB->getBuiltinR0()->getRegVar(), 0, 5,
        IRB->getRegionScalar(), ElemTy);
    // Mask
    G4_Imm *Mask = IRB->createImm(FFTID_Mask, Type_UD);
    IRB->createBinOp(G4_and, g4::SIMD1, DstOpnd, SrcOpnd, Mask,
        InstOpt_WriteEnable, true);
}
199
200
// TODO: remove
// Sets the message-type field (descriptor bits 17:14) in 'dest'.
// NOTE: the trailing ';' is part of the macro body, so call sites omit it.
#define SET_DATAPORT_MESSAGE_TYPE(dest, value)\
    dest |= value << 14;
204
setOwordForDesc(uint32_t desc,int numOword,bool isSLM) const205 uint32_t IR_Builder::setOwordForDesc(uint32_t desc, int numOword, bool isSLM) const
206 {
207 static const uint32_t MESSAGE_SPECIFIC_CONTROL = 8;
208 switch (numOword)
209 {
210 case 1:
211 return desc;
212 case 2:
213 return desc | (0x2 << MESSAGE_SPECIFIC_CONTROL);
214 case 4:
215 return desc | (0x3 << MESSAGE_SPECIFIC_CONTROL);
216 case 8:
217 return desc | (0x4 << MESSAGE_SPECIFIC_CONTROL);
218 case 16:
219 assert(isSLM && has16OWordSLMBlockRW() && "16OWord block r/w not supported");
220 return desc | (0x5 << MESSAGE_SPECIFIC_CONTROL);
221 default:
222 /// TODO(move to verifier): default: ASSERT_USER(false, "OWord block size must be 1/2/4/8.");
223 return desc;
224 }
225 }
226
227
228 /*
229 * Translates OWord Block read CISA inst.
230 *
231 * For GT, assume size is 8 then the code should look like
232 *
233 * .declare VX Base=m ElementSize=4 Type=ud Total=8
234 * .declare VY Base=r ElementSize=4 Type=ud Total=8
235 *
236 * mov (8) VX(0,0)<1>, r0:ud
237 * mov (1) VX(0,2)<1>, P
238 * send (8) VY(0,0)<1>, VX(0,0), 0x5, 0x02180200
239 * mov (8) v(0,0)<1>, VY(0,0)
240 *
241 * P: M0.2 in the message header (Global offset)
242 *
243 * 0x5 == 0 (Not the EOT)
244 *
245 * 0x02180200 == Bit 31-29: 000 (Reserved)
246 * Bit 28-25: 0001 (Msg. leng. = 1)
247 * Bit 24-20: 00001 (Response msg. leng. = 1)
248 * Bit 19: 1 (Header present)
249 * Bit 18: 0 (Ignored)
250 * Bit 17: 0 (Send write commit message; ignored for read message
251 * Bit 16-13: 0000 (Msg. type = OWord block read - for Render Cache)
252 * Bit 12-8: 00010 (Block size = 2 OWords) - can only be 1/2/4/8 for sampler/render cache
253 * Bit 7-0: 00000000 + I (Binding table index)
254 *
255 */
// Translates an OWord block load (aligned or unaligned) into a single
// header-based OWord block-read send on DP_DC0. The header GRF carries the
// global offset in M0.2; see the message layout documented above.
int IR_Builder::translateVISAOwordLoadInst(
    ISA_Opcode opcode,
    bool modified,
    G4_Operand* surface,
    VISA_Oword_Num size,
    G4_Operand* offOpnd,
    G4_DstRegRegion* dstOpnd)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    surface = lowerSurface255To253(surface, *this);

    unsigned num_oword = Get_VISA_Oword_Num(size);
    bool unaligned = (opcode == ISA_OWORD_LD_UNALIGNED);

    // create dcl for VX (the one-GRF message header)
    G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);

    if (isStatelessSurface(surface))
    {
        // Build stateless surface message header.
        BuildStatelessSurfaceMessageHeader(this, dcl);
    }

    // Write the global offset into the header: mov (1) VX(0,2)<1>, P
    if (unaligned && (kernel.major_version == 3 && kernel.minor_version <= 1))
    {
        // For vISA 3.1 and earlier the offset for unaligned OW load is in
        // units of DW; translate it into BYTE (<< 2).
        if (offOpnd->isImm())
        {
            // imm type must be UD as the result of shift could overflow word type
            G4_Imm *new_src_opnd1 = createImm(
                offOpnd->asImm()->getInt() << 2, Type_UD);
            createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, new_src_opnd1, true);
        }
        else
        {
            // NOTE: this local intentionally targets the header dcl; it
            // shadows the 'dstOpnd' parameter.
            G4_DstRegRegion* dstOpnd = createDst(dcl->getRegVar(), 0, 2, 1, dcl->getElemType());
            createBinOp(G4_shl, g4::SIMD1, dstOpnd, offOpnd,
                createImm(2, Type_UW), InstOpt_WriteEnable, true);
        }
    }
    else
    {
        // Offset is already in the right unit; header may be reused by CSE.
        dcl->setCapableOfReuse();
        createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, offOpnd, true);
    }
    // send's operands preparation
    G4_SrcRegRegion* payload = createSrcRegRegion(dcl, getRegionStride1());
    G4_DstRegRegion* d = checkSendDst(dstOpnd->asDstRegRegion());

    uint32_t temp = 0;

    if (unaligned)
    {
        SET_DATAPORT_MESSAGE_TYPE(temp, DC_ALIGNED_OWORD_BLOCK_READ)
    }

    // Set bit 12-8 for the message descriptor
    temp = setOwordForDesc(temp, num_oword, IsSLMSurface(surface));

    // Historical workaround: for responses larger than 2 OWords, retype a
    // non-word destination to :w, rescaling the subreg offset into word
    // units. NOTE(review): original comment here was "!!!WHY???" -- the
    // motivation is not evident from this code; confirm before changing.
    if (num_oword > 2)
    {
        // redefine the type and offset of post dst.
        if ((d->getType() != Type_W) &&
            (d->getType() != Type_UW)) {
            short new_SubRegOff = dstOpnd->asDstRegRegion()->getSubRegOff();
            if (dstOpnd->getRegAccess() == Direct) {
                new_SubRegOff = (dstOpnd->asDstRegRegion()->getSubRegOff() * dstOpnd->getTypeSize()) / TypeSize(Type_W);
            }
            G4_DstRegRegion new_dst(
                dstOpnd->getRegAccess(),
                dstOpnd->asDstRegRegion()->getBase(),
                dstOpnd->asDstRegRegion()->getRegOff(),
                new_SubRegOff,
                1,
                Type_W);
            d = createDstRegRegion(new_dst);
        }
    }

    SFID tf_id = SFID::DP_DC0;

    G4_ExecSize send_exec_size = G4_ExecSize(FIX_OWORD_SEND_EXEC_SIZE(num_oword));
    bool forceSplitSend = shouldForceSplitSend(surface);

    if (!forceSplitSend)
    {
        // Header-only payload; response length = ceil(num_oword * 16B / GRF).
        createSendInst(
            NULL, d,
            payload,
            1,
            (num_oword * 16 + getGRFSize() - 1) / getGRFSize(),
            send_exec_size,
            temp,
            tf_id,
            true,
            SendAccess::READ_ONLY,
            surface,
            NULL,
            InstOpt_WriteEnable,
            false);
    }
    else {
        // Split-send form with a null second source (surface requires it).
        G4_SrcRegRegion *m0 = createSrcRegRegion(dcl, getRegionStride1());
        createSplitSendInst(
            NULL, d, m0, 1,
            createNullSrc(Type_UD), 0,
            (num_oword * 16 + getGRFSize() - 1) / getGRFSize(),
            send_exec_size,
            temp,
            tf_id,
            true,
            SendAccess::READ_ONLY,
            surface,
            nullptr,
            InstOpt_WriteEnable,
            false);
    }

    return VISA_SUCCESS;
}
380
381 /*
382 * Translates OWord Block write intrinsic.
383 *
384 * write(I, P, vector<int, S> v)
385 *
386 * For GT, assume S = 8 then the code should look like
387 *
388 * .declare VX Base=m ElementSize=4 Type=ud Total=16
389 * .declare VY Base=m ElementSize=4 Type=ud Total=8 ALIAS(VX,8)
390 *
391 * mov (8) VX(0,0)<1>, r0:ud
392 * mov (8) VY(0,0)<1>, v // mov (8) VX(1,0)<1>, v
393 * mov (1) VX(0,2)<2>, P
394 * send (8) null<1>, VX(0,0), 0x5, 0x04090200
395 *
396 * P: M0.2 in the message header (Global offset)
397 *
398 * 0x5 == 0 (Not the EOT)
399 * 0101 (Target Function ID: DP Render Cache)
400 *
401 * 0x04090200 == Bit 31-29: 000 (Reserved)
402 * Bit 28-25: 0010 (Msg. leng. = 2)
403 * Bit 24-20: 00000 (Response msg. leng. = 0)
404 * Bit 19: 1 (Header present)
405 * Bit 18: 0 (Ignored)
406 * Bit 17: 0 (Send write commit message
407 * Bit 16-13: 1000 (Msg. type = OWord block read - for Render Cache)
408 * Bit 12-8: 00010 (Block size = 2 OWords) - can only be 1/2/4/8 for sampler/render cache
409 * Bit 7-0: 00000000 + I (Binding table index)
410 *
411 */
// Translates an OWord block store into an OWord block-write send on DP_DC0.
// With split-send support the header and data travel as two sources;
// otherwise a combined header+data payload is assembled in one declare.
int IR_Builder::translateVISAOwordStoreInst(
    G4_Operand* surface,
    VISA_Oword_Num size,
    G4_Operand* offOpnd,
    G4_SrcRegRegion* srcOpnd)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    surface = lowerSurface255To253(surface, *this);

    unsigned num_oword = Get_VISA_Oword_Num(size);
    unsigned obj_size = num_oword * 16; // size of obj in bytes

    // Message type (OWord block write) in descriptor bits 17:14.
    unsigned funcCtrl = DC_OWORD_BLOCK_WRITE << 14;

    uint32_t payloadGRFSize = (num_oword * 16 + getGRFSize() - 1) / getGRFSize();

    // Set bit 12-8 for the message descriptor
    funcCtrl = setOwordForDesc(funcCtrl, num_oword, IsSLMSurface(surface));
    bool forceSplitSend = shouldForceSplitSend(surface);
    if (forceSplitSend || useSends())
    {
        // Split-send path: src0 = one-GRF header, src1 = the data.
        G4_Declare *headerDcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);

        if (isStatelessSurface(surface))
        {
            // Build stateless surface message header.
            BuildStatelessSurfaceMessageHeader(this, headerDcl);
        }

        /* mov (1) VX(0,2)<1>, P */
        createMovInst(headerDcl, 0, 2, g4::SIMD1, nullptr, nullptr, offOpnd, true);

        unsigned msgDesc = funcCtrl;
        unsigned extMsgLength = payloadGRFSize;
        uint16_t extFuncCtrl = 0;

        // message length = 1, response length = 0, header present = 1
        msgDesc += (1 << getSendMsgLengthBitOffset()) + (1 << getSendHeaderPresentBitOffset());

        G4_SendDescRaw* desc = createSendMsgDesc(msgDesc, 0, 1, SFID::DP_DC0,
            extMsgLength, extFuncCtrl, SendAccess::WRITE_ONLY, surface);

        G4_ExecSize sendSize = G4_ExecSize(FIX_OWORD_SEND_EXEC_SIZE(num_oword));

        G4_SrcRegRegion* src0 = createSrcRegRegion(headerDcl, getRegionStride1());
        G4_DstRegRegion* dst = createNullDst(sendSize > 8 ? Type_UW: Type_UD);

        createSplitSendInst(
            nullptr, dst, src0, srcOpnd, sendSize, desc, InstOpt_WriteEnable, false);
    }
    else
    {
        // Monolithic payload: one GRF of header followed by the data GRFs.
        uint32_t temp = obj_size/TypeSize(Type_UD) + GENX_DATAPORT_IO_SZ;

        G4_Declare *dcl = createSendPayloadDcl(temp, Type_UD);

        // Copy the data into the payload: mov (c*r) VX(1,0)<1>, V
        temp = obj_size/TypeSize(Type_UD);

        createMovSendSrcInst(dcl, 1, 0, temp, srcOpnd, InstOpt_WriteEnable);

        if (isStatelessSurface(surface)) {
            // Build stateless surface message header.
            BuildStatelessSurfaceMessageHeader(this, dcl);
        } else {
            // Copy R0 header.
            createMovR0Inst(dcl, 0, 0, true);
        }

        /* mov (1) VX(0,2)<1>, P */
        createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, offOpnd, true);

        // send's operands preparation
        /* Size of whole operand in UINT elements */
        G4_SrcRegRegion* payload = createSrcRegRegion(dcl, getRegionStride1());

        unsigned send_size = FIX_OWORD_SEND_EXEC_SIZE(num_oword);
        G4_DstRegRegion *post_dst_opnd = createNullDst(send_size > 8 ? Type_UW: Type_UD);

        // Message length = data GRFs + 1 header GRF; no response for a write.
        createSendInst(
            NULL,
            post_dst_opnd,
            payload,
            payloadGRFSize + 1,
            0,
            G4_ExecSize(send_size),
            funcCtrl,
            SFID::DP_DC0,
            true,
            SendAccess::WRITE_ONLY,
            surface,
            NULL,
            InstOpt_WriteEnable,
            false);
    }

    return VISA_SUCCESS;
}
511
// Maps a VISA_Exec_Size enum value (0..5) to its lane count (1..32).
static const uint8_t mapExecSizeToNumElts[6] = {1, 2, 4, 8, 16, 32};
513
514 /*
515 * Translates scattered read intrinsic.
516 *
517 * For GT, assume N = 8 then the code should look like
518 *
519 * .declare VX Base=m ElementSize=4 Type=ud Total=16
520 * .declare VY Base=r ElementSize=4 Type=ud Total=8
521 *
522 * mov (8) VX(0,0)<1>, r0:ud
523 * mov (1) VX(0,2)<1>, P
524 * mov (8) VX(1,0)<1>, E
525 * send (8) VY(0,0)<1>, VX(0,0), 0x5, 0x0418C200
526 *
527 * P: M0.2 in the message header (Global offset)
528 * E: M1 in the message payload (Element offsets)
529 * 0x5 == 0 (Not the EOT)
530 * 0101 (Target Function ID: DP Render Cache)
531 *
532 * 0x0418C200 == Bit 31-29: 000 (Reserved)
533 * Bit 28-25: 0010 (Msg. leng. = 2)
534 * Bit 24-20: 00001 (Response msg. leng. = 1)
535 * Bit 19: 1 (Header present)
536 * Bit 18: 0 (Ignored)
537 * Bit 17: 0 (Send write commit message; ignored for read message
538 * Bit 16-13: 0110 (Msg. type = DWord Scattered read - for Render Cache)
539 * Bit 12-10: 010 Specifies the data size for each slot. 0: 1 byte; 1: 2 bytes; 2: 4 bytes; 3: Reserved
540 * Bit 9-8: 00 (Block size = 8 DWords)
541 * Bit 7-0: 00000000 + I (Binding table index)
542 *
543 */
// Translates a vISA gather into a byte/dword scattered-read send on DP_DC0.
// WORD gathers (and SLM non-byte gathers) are lowered to BYTE gathers with
// the offsets scaled into bytes; DWORD SLM gathers are instead delegated to
// untyped gather4 with only the R channel enabled.
int IR_Builder::translateVISAGatherInst(
    VISA_EMask_Ctrl emask,
    bool modified,
    GATHER_SCATTER_ELEMENT_SIZE eltSize,
    VISA_Exec_Size executionSize,
    G4_Operand* surface,
    G4_Operand* gOffOpnd,
    G4_SrcRegRegion* eltOffOpnd,
    G4_DstRegRegion* dstOpnd)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    surface = lowerSurface255To253(surface, *this);

    // Before GEN10, we translate DWORD GATHER on SLM to untyped GATHER4 on
    // SLM with only R channel enabled. The latter is considered more
    // efficient without recalculating offsets in BYTE.
    if (eltSize == GATHER_SCATTER_DWORD && IsSLMSurface(surface)) {
        return translateVISAGather4Inst(emask, modified,
            ChannelMask::createFromAPI(CHANNEL_MASK_R),
            executionSize, surface, gOffOpnd,
            eltOffOpnd, dstOpnd);
    }

    G4_ExecSize exsize = G4_ExecSize(Get_VISA_Exec_Size(executionSize));
    unsigned int instOpt = Get_Gen4_Emask(emask, exsize);
    bool headerLess = isMessageHeaderOptional(surface, gOffOpnd);
    // Element size in gather/scatter message. Initially, we assume it's the
    // same as the request.
    GATHER_SCATTER_ELEMENT_SIZE msgEltSize = eltSize;

    // SLM access
    //          HEADLESS    BYTE    WORD    DWORD
    // BDW      Opt         YES     NO      NO
    // SKL      Req         YES     NO      NO
    // CNL      Req         YES     NO      YES

    G4_Predicate* pred = NULL; // for SIMD1 gather
    uint8_t numElt = mapExecSizeToNumElts[executionSize];
    // we need to treat simd1 as simd8 in several places during code gen
    uint8_t effectiveNumElt = (numElt == 1 ? 8 : numElt);

    if (!headerLess && noSLMMsgHeader() && IsSLMSurface(surface))
    {
        // From SKL, SLM messages forbid message header. Recalculate offset by
        // adding global offset and force headerLess.
        G4_Declare *dcl = createSendPayloadDcl(numElt, eltOffOpnd->getType());
        dcl->setSubRegAlign(GRFALIGN);
        G4_DstRegRegion *newEltOffOpnd = createDstRegRegion(dcl, 1);
        createBinOp(G4_add, G4_ExecSize(numElt), newEltOffOpnd, eltOffOpnd, gOffOpnd, instOpt, true);
        eltOffOpnd = createSrcRegRegion(dcl, numElt == 1 ? getRegionScalar() : getRegionStride1());
        headerLess = true;
    }

    bool useSplitSend = useSends();
    // When header is not required, split-send is not needed as there's only
    // one part in the message. When header is present, we will split the
    // message as (header, offset).
    if (headerLess)
        useSplitSend = false;

    G4_Declare *header = 0;
    G4_Declare *offset = createSendPayloadDcl(numElt, Type_UD);
    offset->setSubRegAlign(GRFALIGN);

    if (useSplitSend)
    {
        ASSERT_USER(!headerLess, "SplitSend should not be used when header is not required!");
        // Without header, it's unnecessary to split the message.
        header = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
    }
    else if (!headerLess)
    {
        // Single payload: header GRF followed by the offsets, so alias the
        // offset declare one GRF into the header declare.
        header = createSendPayloadDcl(GENX_DATAPORT_IO_SZ + effectiveNumElt, Type_UD);
        offset->setAliasDeclare(header, numEltPerGRF<Type_UB>());
    }

    G4_SrcRegRegion* msgSrcOpnd = NULL;

    if (headerLess)
    {
        ASSERT_USER(!header, "'header' should not be allocated when header is not required!");

        if (eltSize == GATHER_SCATTER_WORD ||
            (eltSize != GATHER_SCATTER_BYTE && IsSLMSurface(surface)))
        {
            // Use byte gather for WORD gather as well as SLM surfaces (only supports byte gather)
            // need a shift to make the offset to be byte offset
            // shl (8) tmp<1>:ud elt_off<8;8,1>:ud 0x2:uw
            // Don't do this for Dword because we use the dword scatter message instead
            G4_DstRegRegion* tmpDstOpnd = createDstRegRegion(offset, 1);
            createBinOp(G4_shl, G4_ExecSize(numElt), tmpDstOpnd, eltOffOpnd,
                createImm(unsigned(eltSize), Type_UD), instOpt, true);
            msgSrcOpnd = createSrcRegRegion(offset, getRegionStride1());
            msgEltSize = GATHER_SCATTER_BYTE;
        }
        else
        {
            msgSrcOpnd = eltOffOpnd;
        }
    }
    else
    {
        if (isStatelessSurface(surface)) {
            // Build stateless surface message header.
            BuildStatelessSurfaceMessageHeader(this, header);
        } else {
            // Copy R0 header.
            createMovR0Inst(header, 0, 0, true);
        }

        G4_DstRegRegion* dst1_opnd = createDst(offset->getRegVar(), 0, 0, 1, offset->getElemType());

        if (eltSize == GATHER_SCATTER_WORD || IsSLMSurface(surface))
        {
            // For non-SLM surface, WORD gather/scatter has no hardware
            // support and must be translated into BYTE gather/scatter.
            //
            // SLM surface supports only BYTE gather/scatter
            // support and also needs translating into BYTE gather/scatter.
            //
            // Scale the global offset into bytes: mov (1) VX(0,2)<1>, P
            if (gOffOpnd->isImm())
            {
                G4_Imm *new_src_opnd1 = createImm(
                    gOffOpnd->asImm()->getInt() * (eltSize == GATHER_SCATTER_WORD ? 2 : 4),
                    gOffOpnd->getType());
                createMovInst(header, 0, 2, g4::SIMD1, NULL, NULL, new_src_opnd1, true);
            }
            else
            {
                G4_DstRegRegion* dst2_opnd = createDst(header->getRegVar(), 0, 2, 1, header->getElemType());

                createBinOp(G4_shl, g4::SIMD1, dst2_opnd, gOffOpnd,
                    createImm((unsigned)eltSize, Type_UD), InstOpt_WriteEnable, true);
            }
            // Scale the element offsets into bytes as well.
            createBinOp(G4_shl, G4_ExecSize(numElt), dst1_opnd, eltOffOpnd,
                createImm((unsigned)eltSize, Type_UD), instOpt, true);
            msgEltSize = GATHER_SCATTER_BYTE;
        }
        else
        {
            /* mov (1) VX(0,2)<1>, P */
            createMovInst(header, 0, 2, g4::SIMD1, NULL, NULL, gOffOpnd, true);
            /* mov (numElt) VX(1,0)<1>, E */
            createMov(G4_ExecSize(numElt), dst1_opnd,
                eltOffOpnd, instOpt, true);
        }

        // Create a <8;8,1> src region for the send payload
        msgSrcOpnd = createSrcRegRegion(header, getRegionStride1());
    }

    G4_DstRegRegion* d = dstOpnd->asDstRegRegion();

    SFID tf_id = SFID::DP_DC0;
    unsigned temp = 0;
    // Set bit 9-8 for the message descriptor
    if (msgEltSize == GATHER_SCATTER_DWORD)
    {
        if (effectiveNumElt == 8)
        {
            temp += 2 << 8;
        }
        else {
            temp += 3 << 8;
        }
        temp += DC_DWORD_SCATTERED_READ << 14; // '0011' for DWORD scattered read
    }
    else
    {
        if (effectiveNumElt == 16)
        {
            temp += 1 << 8;
        }
        // Data size per slot (bits 11:10) and BYTE scattered read type.
        temp += (unsigned char)eltSize << 10;
        temp += DC_BYTE_SCATTERED_READ << 14;
    }

    if (useSplitSend)
    {
        ASSERT_USER(!headerLess, "SplitSend should only be used when header is required!");

        G4_SrcRegRegion *m0 = createSrcRegRegion(header, getRegionStride1());
        G4_SrcRegRegion *m1 = createSrcRegRegion(offset, getRegionStride1());
        createSplitSendInst(pred, d,
            m0, 1,
            m1, effectiveNumElt / GENX_DATAPORT_IO_SZ,
            effectiveNumElt / GENX_DATAPORT_IO_SZ,
            G4_ExecSize(numElt),
            temp,
            tf_id, true,
            SendAccess::READ_ONLY,
            surface, NULL, instOpt, false);
    }
    else
    {
        // Monolithic send; add one GRF to the message length for the header
        // when present.
        createSendInst(
            pred,
            d,
            msgSrcOpnd,
            headerLess ? effectiveNumElt/GENX_DATAPORT_IO_SZ : effectiveNumElt/GENX_DATAPORT_IO_SZ + 1,
            effectiveNumElt/GENX_DATAPORT_IO_SZ,
            G4_ExecSize(numElt),
            temp,
            tf_id,
            !headerLess,
            SendAccess::READ_ONLY,
            surface,
            nullptr,
            instOpt,
            false);
    }

    return VISA_SUCCESS;
}
760
761
762
763
764 /*
765 * Translates scattered write intrinsic.
766 *
767 * For GT, assume N = 8 then the code should look like
768 *
769 * .declare VX Base=m ElementSize=4 Type=ud Total=24
770 *
771 * mov (8) VX(0,0)<1>, r0:ud
772 * mov (1) VX(0,2)<1>, P
773 * mov (8) VX(1,0)<1>, E
774 * mov (8) VX(2,0)<1>, V
775 * send (8) null<1>, VX(0,0), 0x5, 0x06096200
776 *
777 * P: M0.2 in the message header (Global offset)
778 * E: M1 in the message payload (Element offsets)
779 * v: M2 in the message payload (written data)
780 *
781 * 0x5 == 0 (Not the EOT)
782 * 0101 (Target Function ID: DP Render Cache)
783 *
784 * 0x06096200 == Bit 31-29: 000 (Reserved)
785 * Bit 28-25: 0011 (Msg. leng. = 3)
786 * Bit 24-20: 00000 (Response msg. leng. = 0)
787 * Bit 19: 1 (Header present)
788 * Bit 18: 0 (Ignored)
789 * Bit 17: 0 (Send write commit message)
790 * Bit 16-13: 1011 (Msg. type = DWord Scattered write - for Render Cache)
791 * Bit 12-8: 00010 (Block size = 8 DWords)
792 * Bit 7-0: 00000000 + I (Binding table index)
793 *
794 */
// Translates a vISA scatter into a byte/dword scattered-write send on
// DP_DC0. Mirrors translateVISAGatherInst: WORD scatters (and SLM non-byte
// scatters) are lowered to BYTE scatters with byte-scaled offsets, and
// DWORD SLM scatters are delegated to untyped scatter4 (R channel only).
int IR_Builder::translateVISAScatterInst(
    VISA_EMask_Ctrl emask,
    GATHER_SCATTER_ELEMENT_SIZE eltSize,
    VISA_Exec_Size executionSize,
    G4_Operand* surface,
    G4_Operand* gOffOpnd,
    G4_SrcRegRegion* eltOffOpnd,
    G4_SrcRegRegion* srcOpnd)
{
    // Before GEN10, we translate DWORD SCATTER on SLM to untyped GATHER4 on
    // SLM with only R channel enabled. The latter is considered more
    // efficient without recalculating offsets in BYTE.
    if (eltSize == GATHER_SCATTER_DWORD && IsSLMSurface(surface)) {
        return translateVISAScatter4Inst(emask,
            ChannelMask::createFromAPI(CHANNEL_MASK_R),
            executionSize, surface, gOffOpnd,
            eltOffOpnd, srcOpnd);
    }

    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
    surface = lowerSurface255To253(surface, *this);

    G4_ExecSize exsize = G4_ExecSize(Get_VISA_Exec_Size(executionSize));
    G4_InstOpts instOpt = Get_Gen4_Emask(emask, exsize);
    G4_Predicate *pred = NULL;
    // Element size in gather/scatter message. Initially, we assume it's the same as the request.
    GATHER_SCATTER_ELEMENT_SIZE msgEltSize = eltSize;

    uint8_t numElt = mapExecSizeToNumElts[executionSize];
    // we need to treat simd1 as simd8 in several places during code gen
    uint8_t effectiveNumElt = (numElt == 1 ? 8 : numElt);

    bool headerLess = isMessageHeaderOptional(surface, gOffOpnd);
    G4_SrcRegRegion* msgSrcOpnd = NULL;

    // SLM access
    //          HEADLESS    BYTE    WORD    DWORD
    // BDW      Opt         YES     NO      NO
    // SKL      Req         YES     NO      NO
    // CNL      Req         YES     NO      YES

    if (!headerLess && noSLMMsgHeader() && IsSLMSurface(surface)) {
        // From SKL, SLM messages forbid message header. Recalculate offset by
        // adding global offset and force headerLess.
        G4_Declare *dcl = createSendPayloadDcl(numElt, eltOffOpnd->getType());
        G4_DstRegRegion *newEltOffOpnd = createDstRegRegion(dcl, 1);
        createBinOp(G4_add, G4_ExecSize(numElt), newEltOffOpnd, eltOffOpnd, gOffOpnd, instOpt, true);
        eltOffOpnd = createSrcRegRegion(dcl, numElt == 1 ? getRegionScalar() : getRegionStride1());
        headerLess = true;
    }

    if (headerLess)
    {
        // Headerless payload: offsets followed by the data (2 * #elt total).
        G4_Declare *dcl = createSendPayloadDcl(effectiveNumElt * 2, Type_UD);
        G4_DstRegRegion* tmpDstOpnd = createDstRegRegion(dcl, 1);
        if (eltSize == GATHER_SCATTER_WORD ||
            (eltSize != GATHER_SCATTER_BYTE && IsSLMSurface(surface)))
        {
            // For non-SLM surface,
            // need a shift to make the offset to be byte offset
            // shl (esize) tmp.0<1>:ud elt_off<8;8,1>:ud 0x2:uw
            // Don't do this for Dword because we use the dword scatter message instead
            //
            // SLM surface has only BYTE scattered
            // read/write support. Always use BYTE scatter.
            createBinOp(G4_shl, G4_ExecSize(numElt), tmpDstOpnd, eltOffOpnd,
                createImm(unsigned(eltSize), Type_UD), instOpt, true);
            msgEltSize = GATHER_SCATTER_BYTE;
        }
        else
        {
            createMov(G4_ExecSize(numElt), tmpDstOpnd, eltOffOpnd, instOpt, true);
        }

        // Copy the data right after the offsets.
        createMovSendSrcInst(dcl, effectiveNumElt/8, 0, numElt, srcOpnd, instOpt);
        msgSrcOpnd = createSrcRegRegion(dcl, getRegionStride1());
    }
    else
    {
        // mov (8) VX(0,0)<1>, r0:ud
        // add dcl for VX: header GRF + offsets + data
        G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ + effectiveNumElt * 2, Type_UD);

        if (isStatelessSurface(surface)) {
            // Build stateless surface message header.
            BuildStatelessSurfaceMessageHeader(this, dcl);
        } else {
            // Copy R0 header.
            createMovR0Inst(dcl, 0, 0, true);
        }

        auto dst1_opnd = createDst(dcl->getRegVar(), 1, 0, 1, dcl->getElemType());

        if (eltSize == GATHER_SCATTER_WORD || IsSLMSurface(surface))
        {
            // For non-SLM surface, WORD gather/scatter has no hardware
            // support and must be translated into BYTE gather/scatter.
            //
            // For SLM surface, gen9 devices has only BYTE gather/scatter
            // support and also needs translating into BYTE gather/scatter.
            //
            // Scale the global offset into bytes: mov (1) VX(0,2)<1>, P
            if (gOffOpnd->isImm())
            {
                G4_Imm *new_src_opnd1 = createImm(
                    gOffOpnd->asImm()->getInt() * (eltSize == GATHER_SCATTER_WORD ? 2 : 4),
                    gOffOpnd->getType());
                createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, new_src_opnd1, true);
            }
            else
            {
                G4_DstRegRegion* dst2_opnd = createDst(dcl->getRegVar(), 0, 2, 1, dcl->getElemType());
                createBinOp(G4_shl, g4::SIMD1, dst2_opnd, gOffOpnd,
                    createImm((unsigned)eltSize, Type_UD), InstOpt_WriteEnable, true);
            }
            // Scale the element offsets into bytes as well.
            createBinOp(G4_shl, G4_ExecSize(numElt), dst1_opnd, eltOffOpnd,
                createImm((unsigned)eltSize, Type_UD), instOpt, true);
            msgEltSize = GATHER_SCATTER_BYTE;
        }
        else
        {
            /* mov (1) VX(0,2)<1>, P */
            createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, gOffOpnd, true);
            /* mov (numElt) VX(1,0)<1>, E */
            createMov(G4_ExecSize(numElt), dst1_opnd,
                eltOffOpnd, instOpt, true);
        }

        /* mov (numElt) VX(numElt/8+1,0)<1>, V */
        createMovSendSrcInst(dcl, (effectiveNumElt/8+1), 0, numElt, srcOpnd, instOpt);

        // send's operands preparation
        // create a currDst for VX
        msgSrcOpnd = createSrcRegRegion(dcl, getRegionStride1());
    }

    unsigned temp = 0;

    // Set bit 9-8 for the message descriptor
    if (msgEltSize == GATHER_SCATTER_DWORD)
    {
        if (effectiveNumElt == 8)
        {
            temp += 2 << 8;
        }
        else {
            temp += 3 << 8;
        }
        temp += DC_DWORD_SCATTERED_WRITE << 14;
    }
    else
    {
        if (effectiveNumElt == 16)
        {
            temp += 1 << 8;
        }
        // Data size per slot (bits 11:10) and BYTE scattered write type.
        temp += (unsigned char)eltSize << 10;
        temp += DC_BYTE_SCATTERED_WRITE << 14;
    }

    G4_DstRegRegion *post_dst_opnd = createNullDst(effectiveNumElt > 8 ? Type_UW : Type_UD);

    // Message length: offsets + data GRFs, plus one header GRF when present;
    // writes have no response.
    createSendInst(
        pred,
        post_dst_opnd,
        msgSrcOpnd,
        headerLess ? effectiveNumElt/GENX_DATAPORT_IO_SZ * 2 :
        effectiveNumElt/GENX_DATAPORT_IO_SZ * 2 + 1,
        0,
        G4_ExecSize(numElt),
        temp,
        SFID::DP_DC0,
        !headerLess,
        SendAccess::WRITE_ONLY,
        surface,
        NULL,
        instOpt,
        false);

    return VISA_SUCCESS;
}
977
978
BuildUntypedStatelessSurfaceMessageHeader(IR_Builder * IRB,G4_Declare * Header)979 static void BuildUntypedStatelessSurfaceMessageHeader(IR_Builder *IRB, G4_Declare *Header)
980 {
981 // Set PSM (Pixel Sample Mask) in MH1_A32_PSM
982 G4_Type ElemTy = Header->getElemType();
983
984 // R0.7<31:0> is defined as MHC_PSM where the lower 16 bits specify the
985 // pixel sample mask.
986 const unsigned PSM_Mask = 0xffff;
987
988 // Rx.7[31:0] = 0xFFFF
989 G4_DstRegRegion *DstOpnd = IRB->createDst(Header->getRegVar(), 0, 7, 1, ElemTy);
990 // Mask
991 G4_Imm *Mask = IRB->createImm(PSM_Mask, Type_UD);
992 IRB->createMov(g4::SIMD1, DstOpnd, Mask, InstOpt_WriteEnable, true);
993
994 BuildStatelessSurfaceMessageHeader(IRB, Header);
995 }
996
997
998 /*
999 * Translates untyped surface read.
1000 *
1001 * For GT, assume N = 8 then the code should look like
1002 *
1003 * .declare VX Base=m ElementSize=4 Type=ud Total=16
1004 * .declare VY Base=r ElementSize=4 Type=ud Total=8
1005 *
1006 * mov (8) VX(0,0)<1>, r0:ud
1007 * mov (8) VX(1,0)<1>, P+E
1008 * send (8) VY(0,0)<1>, VX(0,0), 0x5, 0x0418C200
1009 *
1010 * E: M1 in the message payload (Element offsets in BYTEs)
1011 * 1010 (Target Function ID: Data Cache)
1012 *
1013 * 0x0418C200 == Bit 31-29: 000 (Reserved)
1014 * Bit 28-25: 0010 (Msg. leng. = 2)
1015 * Bit 24-20: 00001 (Response msg. leng. = 1)
1016 * Bit 19: 1 (Header present)
1017 * Bit 18: 0 (Ignored)
 *                        Bit 17-14: 0001 (Msg. type = untyped surface read - for data Cache)
1019 * Bit 13-12: 0010 (SIMD mode = 8)
1020 * Bit 11-8: 0000 (masked channels)
1021 * Bit 7-0: 00000000 + I (Binding table index)
1022 *
1023 */
translateVISAGather4Inst(VISA_EMask_Ctrl emask,bool modified,ChannelMask chMask,VISA_Exec_Size executionSize,G4_Operand * surface,G4_Operand * gOffOpnd,G4_SrcRegRegion * eltOffOpnd,G4_DstRegRegion * dstOpnd)1024 int IR_Builder::translateVISAGather4Inst(
1025 VISA_EMask_Ctrl emask,
1026 bool modified,
1027 ChannelMask chMask,
1028 VISA_Exec_Size executionSize,
1029 G4_Operand* surface,
1030 G4_Operand* gOffOpnd,
1031 G4_SrcRegRegion* eltOffOpnd,
1032 G4_DstRegRegion* dstOpnd)
1033 {
1034 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
1035
1036 surface = lowerSurface255To253(surface, *this);
1037
1038 G4_ExecSize exsize = G4_ExecSize(Get_VISA_Exec_Size(executionSize));
1039 G4_InstOpts instOpt = Get_Gen4_Emask(emask, exsize);
1040 unsigned int num_channel = chMask.getNumEnabledChannels();
1041
1042 uint8_t numElt = mapExecSizeToNumElts[executionSize];
1043 uint8_t hdrSize = 0;
1044
1045 bool useSplitSend = useSends();
1046
1047 G4_Declare *header = 0;
1048 G4_Declare *offset = createSendPayloadDcl(numElt, Type_UD);
1049
1050 if (surface && isStatelessSurface(surface) && needsA32MsgHeader())
1051 {
1052 // Header is required to work around a HW issue on pre-SKL devices.
1053 hdrSize = GENX_DATAPORT_IO_SZ;
1054 if (useSplitSend) {
1055 header = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
1056 } else {
1057 header = createSendPayloadDcl(GENX_DATAPORT_IO_SZ + numElt, Type_UD);
1058 offset->setAliasDeclare(header, numEltPerGRF<Type_UB>());
1059 }
1060 } else {
1061 // When the surface is not stateless one, header is not used and therefore
1062 // split-send is not used.
1063 useSplitSend = false;
1064 }
1065
1066 if (header) {
1067 // With 'header' allocated, we need prepare the header for the
1068 // (stateless) surface.
1069 ASSERT_USER(isStatelessSurface(surface), "With 'header' allocated, stateless surface is expected!");
1070 // Build stateless surface message header.
1071 BuildUntypedStatelessSurfaceMessageHeader(this, header);
1072 }
1073
1074 // convert to byte address
1075 // shl (esize) offset<1>:ud elt_off<8;8,1>:ud 2:uw
1076 G4_DstRegRegion* dst1_opnd = createDst(offset->getRegVar(), 0, 0, 1, offset->getElemType());
1077
1078 G4_Declare *tmp_dcl = createTempVar(numElt, Type_UD, GRFALIGN);
1079 G4_DstRegRegion* dst3_opnd = createDst(tmp_dcl->getRegVar(), 0, 0, 1, tmp_dcl->getElemType());
1080
1081 createBinOp(G4_shl, G4_ExecSize(numElt), dst3_opnd, eltOffOpnd, createImm(2, Type_UW), instOpt, true);
1082
1083 G4_SrcRegRegion* src2_opnd = createSrc(tmp_dcl->getRegVar(), 0, 0,
1084 getRegionStride1(), tmp_dcl->getElemType());
1085
1086 // As untyped surface message use MH_IGNORE based header, if global offset
1087 // is non-zero, we need recalculate element offsets.
1088 if (gOffOpnd->isImm())
1089 {
1090 if (gOffOpnd->asImm()->getInt() != 0)
1091 {
1092 gOffOpnd = createImm(
1093 gOffOpnd->asImm()->getInt() * 4,
1094 gOffOpnd->getType());
1095 createBinOp(G4_add, G4_ExecSize(numElt), dst1_opnd, src2_opnd, gOffOpnd, instOpt, true);
1096 }
1097 else
1098 {
1099 createMov(G4_ExecSize(numElt), dst1_opnd, src2_opnd, instOpt, true);
1100 }
1101 }
1102 else
1103 {
1104 G4_Declare *tmp_dcl1 = createTempVar(1, gOffOpnd->getType(), Any);
1105 G4_DstRegRegion* dst2_opnd = createDst(tmp_dcl1->getRegVar(), 0, 0, 1, tmp_dcl1->getElemType());
1106
1107 createBinOp(G4_shl, g4::SIMD1, dst2_opnd, gOffOpnd, createImm(2, Type_UW), InstOpt_WriteEnable, true);
1108
1109 G4_SrcRegRegion* src1Opnd = createSrc(tmp_dcl1->getRegVar(), 0, 0,
1110 getRegionScalar(), tmp_dcl1->getElemType());
1111
1112 createBinOp(G4_add, G4_ExecSize(numElt), dst1_opnd, src2_opnd, src1Opnd, instOpt, true);
1113 }
1114
1115 // send's operands preparation
1116
1117 G4_DstRegRegion* d = checkSendDst(dstOpnd->asDstRegRegion());
1118
1119 unsigned temp = 0;
1120
1121 // Set bit 13-12 for the message descriptor
1122 if (numElt == 8)
1123 {
1124 temp += 2 << 12;
1125 }
1126 else
1127 {
1128 temp += 1 << 12;
1129 }
1130
1131 SFID tf_id = SFID::DP_DC1;
1132 temp += DC1_UNTYPED_SURFACE_READ << 14;
1133
1134 // bits 11-8: channel mask
1135 // HW defines 0 to mean the channel is on, so we have to flip it
1136 temp += chMask.getHWEncoding() << 8;
1137
1138 if (surface == NULL)
1139 {
1140 temp |= 0xFE;
1141 }
1142
1143 if (useSplitSend) {
1144 ASSERT_USER(header, "'header' should be allocated when split-send is to be used.");
1145
1146 G4_SrcRegRegion *m0 = createSrcRegRegion(header, getRegionStride1());
1147 G4_SrcRegRegion *m1 = createSrcRegRegion(offset, getRegionStride1());
1148 createSplitSendInst(NULL, d,
1149 m0, 1, m1, numElt / GENX_DATAPORT_IO_SZ,
1150 (numElt / GENX_DATAPORT_IO_SZ)* num_channel,
1151 G4_ExecSize(numElt), temp, tf_id, hdrSize != 0,
1152 SendAccess::READ_ONLY,
1153 surface, NULL, instOpt, false);
1154 }
1155 else
1156 {
1157 G4_SrcRegRegion* payload = createSrcRegRegion(header ? header : offset, getRegionStride1());
1158 createSendInst(
1159 NULL,
1160 d,
1161 payload,
1162 (hdrSize + numElt)/GENX_DATAPORT_IO_SZ,
1163 (numElt/GENX_DATAPORT_IO_SZ) * num_channel,
1164 G4_ExecSize(numElt),
1165 temp,
1166 tf_id,
1167 hdrSize != 0,
1168 SendAccess::READ_ONLY,
1169 surface,
1170 NULL,
1171 instOpt,
1172 false);
1173 }
1174
1175 return VISA_SUCCESS;
1176 }
1177
1178
1179 /*
1180 * Translates untyped surface write intrinsic.
1181 *
1182 * For GT, assume N = 8 then the code should look like
1183 *
1184 * .declare VX Base=m ElementSize=4 Type=ud Total=24
1185 *
1186 * mov (8) VX(0,0)<1>, r0:ud
1187 * mov (8) VX(1,0)<1>, E + P
1188 * mov (8) VX(2,0)<1>, V
1189 * send (8) null<1>, VX(0,0), 0x5, 0x06096200
1190 *
1191 * E: M1 in the message payload (Element offsets)
1192 * v: M2 in the message payload (written data)
1193 *
1194 * 1010 (Target Function ID: DP Data Cache)
1195 *
1196 * 0x06096200 == Bit 31-29: 000 (Reserved)
1197 * Bit 28-25: 0011 (Msg. leng. = 3)
1198 * Bit 24-20: 00000 (Response msg. leng. = 0)
1199 * Bit 19: 1 (Header present)
1200 * Bit 18: 0 (Ignored)
1201 * Bit 17-14: 1101 (Msg. type = untyped write - for data Cache)
1202 * Bit 13-12: 0010 (SIMD mode = 8)
1203 * Bit 11-8: 0000 (masked channels)
1204 * Bit 7-0: 00000000 + I (Binding table index)
1205 *
1206 */
translateVISAScatter4Inst(VISA_EMask_Ctrl emask,ChannelMask chMask,VISA_Exec_Size executionSize,G4_Operand * surface,G4_Operand * gOffOpnd,G4_SrcRegRegion * eltOffOpnd,G4_SrcRegRegion * srcOpnd)1207 int IR_Builder::translateVISAScatter4Inst(
1208 VISA_EMask_Ctrl emask,
1209 ChannelMask chMask,
1210 VISA_Exec_Size executionSize,
1211 G4_Operand* surface,
1212 G4_Operand* gOffOpnd,
1213 G4_SrcRegRegion* eltOffOpnd,
1214 G4_SrcRegRegion* srcOpnd)
1215 {
1216 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
1217
1218 surface = lowerSurface255To253(surface, *this);
1219
1220 G4_ExecSize exsize = G4_ExecSize(Get_VISA_Exec_Size(executionSize));
1221 G4_InstOpts instOpt = Get_Gen4_Emask(emask, exsize);
1222
1223 unsigned int num_channel = chMask.getNumEnabledChannels();
1224
1225 uint8_t numElt = mapExecSizeToNumElts[executionSize];
1226 uint8_t hdrSize = 0;
1227
1228 unsigned int data_size = numElt * num_channel;
1229 G4_Declare *src_dcl = srcOpnd->asSrcRegRegion()->getBase()->asRegVar()->getDeclare();
1230
1231 int payload_size = numElt + data_size;
1232
1233 bool useSplitSend = useSends();
1234
1235 G4_Declare *header = 0;
1236 G4_Declare *offset = 0;
1237 G4_Declare *data = createSendPayloadDcl(data_size, Type_UD);
1238
1239 if (surface && isStatelessSurface(surface) && needsA32MsgHeader())
1240 {
1241 // Header is required to work around a HW issue on pre-SKL devices.
1242 hdrSize = GENX_DATAPORT_IO_SZ;
1243 offset = createSendPayloadDcl(numElt, Type_UD);
1244 if (useSplitSend) {
1245 // When header is required, we split the message as
1246 // (header, offset + data) if split-send is supported.
1247 header = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
1248 offset = createSendPayloadDcl(payload_size, Type_UD);
1249 data->setAliasDeclare(offset, (numElt/8) * numEltPerGRF<Type_UB>());
1250 } else {
1251 header = createSendPayloadDcl(GENX_DATAPORT_IO_SZ + payload_size, Type_UD);
1252 offset->setAliasDeclare(header, numEltPerGRF<Type_UB>());
1253 data->setAliasDeclare(header, numEltPerGRF<Type_UB>() * ((numElt/8) + 1));
1254 }
1255 } else {
1256 if (useSplitSend) {
1257 // When header is not required, we split the message as (offset, data)
1258 // if split-send is supported.
1259 offset = createSendPayloadDcl(numElt, Type_UD);
1260 } else {
1261 offset = createSendPayloadDcl(payload_size, Type_UD);
1262 data->setAliasDeclare(offset, (numElt/8) * numEltPerGRF<Type_UB>());
1263 }
1264 }
1265
1266 if (header) {
1267 // With 'header' allocated, we need prepare the header for the
1268 // (stateless) surface.
1269 ASSERT_USER(isStatelessSurface(surface),
1270 "With 'header' allocated, stateless surface is expected!");
1271 // Build stateless surface message header.
1272 BuildUntypedStatelessSurfaceMessageHeader(this, header);
1273 }
1274
1275 if (!header && useSplitSend)
1276 {
1277 data = src_dcl;
1278 } else
1279 {
1280 // Copy data from src operand.
1281 for (unsigned i = 0; i != num_channel; ++i)
1282 {
1283 G4_SrcRegRegion* s2_opnd =
1284 createSrc(
1285 src_dcl->getRegVar(), (i * numElt) / 8, 0, getRegionStride1(), src_dcl->getElemType());
1286 createMovSendSrcInst(data, (i * numElt) / 8, 0, numElt, s2_opnd, instOpt);
1287 }
1288 }
1289
1290 // mov VX(0,0)<1>, r0
1291 // createMovR0Inst(header, 0, 0, true);
1292
1293 G4_DstRegRegion* dst1_opnd = createDst(offset->getRegVar(), 0, 0, 1, offset->getElemType());
1294
1295 G4_Declare *tmp_dcl = createTempVar(numElt, Type_UD, GRFALIGN);
1296 G4_DstRegRegion* dst3_opnd = createDst(tmp_dcl->getRegVar(), 0, 0, 1, tmp_dcl->getElemType());
1297
1298 createBinOp(G4_shl, G4_ExecSize(numElt), dst3_opnd, eltOffOpnd, createImm(2, Type_UW), instOpt, true);
1299
1300 G4_SrcRegRegion* src2_opnd =
1301 createSrc(
1302 tmp_dcl->getRegVar(), 0, 0, getRegionStride1(), tmp_dcl->getElemType());
1303
1304 if (gOffOpnd->isImm())
1305 {
1306 if (gOffOpnd->asImm()->getInt() != 0)
1307 {
1308 gOffOpnd = createImm(
1309 gOffOpnd->asImm()->getInt() * 4,
1310 gOffOpnd->getType());
1311 createBinOp(G4_add, G4_ExecSize(numElt), dst1_opnd, src2_opnd, gOffOpnd, instOpt, true);
1312 }
1313 else
1314 {
1315 createMov(G4_ExecSize(numElt), dst1_opnd, src2_opnd, instOpt, true);
1316 }
1317 }
1318 else
1319 {
1320 G4_Declare *tmp_dcl1 = createTempVar(1, gOffOpnd->getType(), Any);
1321 G4_DstRegRegion* dst2_opnd = createDst(tmp_dcl1->getRegVar(), 0, 0, 1, tmp_dcl1->getElemType());
1322
1323 createBinOp(G4_shl, g4::SIMD1, dst2_opnd, gOffOpnd, createImm(2, Type_UW), InstOpt_WriteEnable, true);
1324
1325 G4_SrcRegRegion* src1Opnd = createSrc(tmp_dcl1->getRegVar(), 0, 0,
1326 getRegionScalar(), tmp_dcl1->getElemType());
1327
1328 createBinOp(G4_add, G4_ExecSize(numElt), dst1_opnd, src2_opnd, src1Opnd, instOpt, true);
1329 }
1330
1331 // send's operands preparation
1332 unsigned temp = 0;
1333
1334 // Set bit 13-12 for the message descriptor
1335 if (numElt == 8) {
1336 temp += 2 << 12;
1337 } else {
1338 temp += 1 << 12;
1339 }
1340
1341 SFID tf_id = SFID::DP_DC1;
1342 temp += DC1_UNTYPED_SURFACE_WRITE << 14;
1343 // bits 11-8: channel mask
1344 temp += chMask.getHWEncoding() << 8;
1345
1346 // Set bit 9-8 for the message descriptor
1347
1348 if (surface == NULL)
1349 {
1350 temp |= 0xFF - 1;
1351 }
1352
1353 G4_DstRegRegion *post_dst_opnd = createNullDst(numElt > 8 ? Type_UW : Type_UD);
1354
1355 if (useSplitSend) {
1356 G4_SrcRegRegion *m0 = 0; unsigned m0Len = 0;
1357 G4_SrcRegRegion *m1 = 0; unsigned m1Len = 0;
1358 if (header) {
1359 m0 = createSrcRegRegion(header, getRegionStride1());
1360 m0Len = 1;
1361 m1 = createSrcRegRegion(offset, getRegionStride1());
1362 m1Len = payload_size / GENX_DATAPORT_IO_SZ;
1363 } else {
1364 m0 = createSrcRegRegion(offset, getRegionStride1());
1365 m0Len = numElt / GENX_DATAPORT_IO_SZ;
1366 m1 = createSrcRegRegion(data, getRegionStride1());
1367 m1Len = data_size / GENX_DATAPORT_IO_SZ;
1368 }
1369 createSplitSendInst(NULL, post_dst_opnd,
1370 m0, m0Len, m1, m1Len, 0,
1371 G4_ExecSize(numElt),
1372 temp, tf_id, hdrSize != 0,
1373 SendAccess::WRITE_ONLY,
1374 surface, NULL,
1375 instOpt, false);
1376 }
1377 else
1378 {
1379 G4_SrcRegRegion* payload = createSrcRegRegion(header ? header : offset, getRegionStride1());
1380 createSendInst(
1381 NULL,
1382 post_dst_opnd,
1383 payload,
1384 (numElt * (num_channel + 1) + hdrSize)/GENX_DATAPORT_IO_SZ,
1385 0,
1386 G4_ExecSize(numElt),
1387 temp,
1388 tf_id,
1389 hdrSize != 0,
1390 SendAccess::WRITE_ONLY,
1391 surface,
1392 NULL,
1393 instOpt,
1394 false);
1395 }
1396
1397 return VISA_SUCCESS;
1398 }
1399
IsFloatAtomicOps(VISAAtomicOps op)1400 static bool IsFloatAtomicOps(VISAAtomicOps op)
1401 {
1402 return op == ATOMIC_FMAX || op == ATOMIC_FMIN || op == ATOMIC_FCMPWR ||
1403 op == ATOMIC_FADD || op == ATOMIC_FSUB;
1404 }
1405
BuildMH1_A32_PSM(IR_Builder * IRB,G4_Declare * header)1406 static void BuildMH1_A32_PSM(IR_Builder *IRB, G4_Declare *header) {
1407 // Clear header. Ignore PSM so far.
1408 G4_DstRegRegion *h = IRB->createDst(header->getRegVar(),
1409 0, 0, 1, Type_UD);
1410 IRB->createMov(g4::SIMD8, h, IRB->createImm(0, Type_UD), InstOpt_WriteEnable, true);
1411 // Set PSM to all 1s.
1412 G4_DstRegRegion *h0_7 =
1413 IRB->createDst(header->getRegVar(), 0, 7, 1, Type_UD);
1414 G4_Imm *Mask = IRB->createImm(0xFFFF, Type_UD);
1415 IRB->createMov(g4::SIMD1, h0_7, Mask, InstOpt_WriteEnable, true);
1416 }
1417
1418
BuildMH1_BTS_PSM(IR_Builder * IRB,G4_Declare * header)1419 static void BuildMH1_BTS_PSM(IR_Builder *IRB, G4_Declare *header) {
1420 // Clear header
1421 G4_DstRegRegion* h = IRB->createDst(header->getRegVar(),
1422 0, 0, 1, Type_UD);
1423 IRB->createMov(g4::SIMD8, h, IRB->createImm(0, Type_UD), InstOpt_WriteEnable, true);
1424 // Set PSM to 0xFFFF so far.
1425 G4_Operand* maskImm = IRB->createImm(0xFFFF, Type_UD);
1426 G4_DstRegRegion* pitchDst = IRB->createDst(
1427 header->getRegVar(),
1428 0, 7, 1, Type_UD);
1429 IRB->createMov(g4::SIMD1, pitchDst, maskImm, InstOpt_WriteEnable, true);
1430 }
1431
1432
1433 // This version takes byte offsets and predicates
translateVISADwordAtomicInst(VISAAtomicOps atomicOp,bool is16Bit,G4_Predicate * pred,VISA_Exec_Size execSize,VISA_EMask_Ctrl eMask,G4_Operand * surface,G4_SrcRegRegion * offsets,G4_SrcRegRegion * src0,G4_SrcRegRegion * src1,G4_DstRegRegion * dst)1434 int IR_Builder::translateVISADwordAtomicInst(
1435 VISAAtomicOps atomicOp,
1436 bool is16Bit,
1437 G4_Predicate *pred,
1438 VISA_Exec_Size execSize,
1439 VISA_EMask_Ctrl eMask,
1440 G4_Operand* surface,
1441 G4_SrcRegRegion* offsets,
1442 G4_SrcRegRegion* src0,
1443 G4_SrcRegRegion* src1,
1444 G4_DstRegRegion* dst)
1445 {
1446 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
1447
1448 ASSERT_USER(!IsFloatAtomicOps(atomicOp) || hasFloatAtomics(),
1449 "Float atomic operations are only supported on SKL+ devices");
1450
1451 ASSERT_USER(getPlatform() >= XeHP_SDV || ((atomicOp != ATOMIC_FADD) && (atomicOp != ATOMIC_FSUB)),
1452 "FADD/FSUB atomic operations are only supported on this devices");
1453
1454 surface = lowerSurface255To253(surface, *this);
1455
1456 VISA_Exec_Size instExecSize = execSize;
1457 execSize = roundUpExecSize(execSize);
1458
1459 // always 8 or 16
1460 G4_ExecSize exSize = toExecSize(execSize);
1461 // can be 1 for scalar atomics
1462 G4_ExecSize instExSize = toExecSize(instExecSize);
1463 G4_InstOpts instOpt = Get_Gen4_Emask(eMask, instExSize);
1464 unsigned subOpc = Get_Atomic_Op(atomicOp);
1465
1466 bool useSplitSend = useSends();
1467 bool hasRet = !dst->isNullReg();
1468
1469 if (atomicOp == ATOMIC_CMPXCHG)
1470 {
1471 std::swap(src0, src1);
1472 }
1473
1474 PayloadSource sources[4]; // optional header + offsets + [src0] + [src1]
1475 unsigned len = 0;
1476
1477 bool useHeader = needsA32MsgHeader() && surface && isStatelessSurface(surface);
1478 if (useHeader) {
1479 G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
1480
1481 BuildMH1_A32_PSM(this, dcl);
1482
1483 G4_SrcRegRegion *header
1484 = createSrcRegRegion(dcl, getRegionStride1());
1485 sources[len].opnd = header;
1486 sources[len].execSize = g4::SIMD8;
1487 sources[len].instOpt = InstOpt_WriteEnable;
1488 ++len;
1489 }
1490
1491 sources[len].opnd = offsets;
1492 sources[len].execSize = exSize;
1493 sources[len].instOpt = instOpt;
1494 ++len;
1495
1496 if (src0 && !src0->isNullReg()) {
1497 sources[len].opnd = src0;
1498 sources[len].execSize = exSize;
1499 sources[len].instOpt = instOpt;
1500 ++len;
1501 }
1502
1503 if (src1 && !src1->isNullReg()) {
1504 sources[len].opnd = src1;
1505 sources[len].execSize = exSize;
1506 sources[len].instOpt = instOpt;
1507 ++len;
1508 }
1509
1510 G4_SrcRegRegion *msgs[2] = {0, 0};
1511 unsigned sizes[2] = {0, 0};
1512 preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);
1513
1514 SFID sfid = SFID::DP_DC1;
1515 unsigned MD = 0;
1516 bool IsFloatOp = IsFloatAtomicOps(atomicOp);
1517
1518 // Bit 12 specifies the SIMD mode.
1519 MD |= (execSize == EXEC_SIZE_8 ? MDC_SM2R_SIMD8 : MDC_SM2R_SIMD16) << 12;
1520 if (is16Bit)
1521 {
1522 MD |= (IsFloatOp ? static_cast<unsigned>(DC1_UNTYPED_HALF_FLOAT_ATOMIC)
1523 : static_cast<unsigned>(DC1_UNTYPED_HALF_INTEGER_ATOMIC))
1524 << 14;
1525 }
1526 else
1527 {
1528 MD |= (IsFloatOp ? static_cast<unsigned>(DC1_UNTYPED_FLOAT_ATOMIC)
1529 : static_cast<unsigned>(DC1_UNTYPED_ATOMIC))
1530 << 14;
1531 }
1532 MD |= (hasRet ? 1 : 0) << 13;
1533 MD |= subOpc << 8;
1534
1535 unsigned resLen = hasRet ? (exSize / GENX_DATAPORT_IO_SZ) : 0;
1536 bool forceSplitSend = shouldForceSplitSend(surface);
1537 if (msgs[1] == 0 && !forceSplitSend) {
1538 ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
1539 createSendInst(pred, dst,
1540 msgs[0], sizes[0],
1541 resLen,
1542 instExSize,
1543 MD, sfid,
1544 useHeader,
1545 SendAccess::READ_WRITE,
1546 surface, NULL,
1547 instOpt, false);
1548 } else {
1549 createSplitSendInst(pred, dst,
1550 msgs[0], sizes[0], msgs[1], sizes[1],
1551 resLen,
1552 instExSize,
1553 MD, sfid,
1554 useHeader,
1555 SendAccess::READ_WRITE,
1556 surface, NULL,
1557 instOpt, false);
1558 }
1559
1560 return VISA_SUCCESS;
1561 }
1562
1563
1564 // build the address payload for typed messages (read/write/atomic)
1565 // sources stores the address payload, and its length len is also updated
buildTypedSurfaceAddressPayload(G4_SrcRegRegion * uOffsetOpnd,G4_SrcRegRegion * vOffsetOpnd,G4_SrcRegRegion * rOffsetOpnd,G4_SrcRegRegion * lodOpnd,G4_ExecSize exSize,G4_InstOpts instOpt,PayloadSource sources[],uint32_t & len)1566 void IR_Builder::buildTypedSurfaceAddressPayload(
1567 G4_SrcRegRegion* uOffsetOpnd,
1568 G4_SrcRegRegion* vOffsetOpnd,
1569 G4_SrcRegRegion* rOffsetOpnd,
1570 G4_SrcRegRegion* lodOpnd,
1571 G4_ExecSize exSize,
1572 G4_InstOpts instOpt,
1573 PayloadSource sources[],
1574 uint32_t& len)
1575 {
1576 // Valid address payload pattern are listed below:
1577 // (* means the parameter is ignored by HW but must be included in payload)
1578 // U
1579 // U, V
1580 // U, V, R
1581 // U, *, *, LOD
1582 // U, V, *, LOD
1583 // U, V, R, LOD
1584
1585 // Append U
1586 sources[len].opnd = uOffsetOpnd;
1587 sources[len].execSize = exSize;
1588 sources[len].instOpt = instOpt;
1589 ++len;
1590
1591 // Append V if any.
1592 if (!vOffsetOpnd->isNullReg()) {
1593 sources[len].opnd = vOffsetOpnd;
1594 sources[len].execSize = exSize;
1595 sources[len].instOpt = instOpt;
1596 ++len;
1597 }
1598 else if (!lodOpnd->isNullReg()) {
1599 G4_SrcRegRegion *nullVOffset = createNullSrc(Type_UD);
1600 sources[len].opnd = nullVOffset;
1601 sources[len].execSize = exSize;
1602 sources[len].instOpt = instOpt;
1603 ++len;
1604 }
1605
1606 // Append R if any.
1607 if (!rOffsetOpnd->isNullReg()) {
1608 ASSERT_USER(!vOffsetOpnd->isNullReg(),
1609 "r offset must be NULL if v offset is NULL");
1610 sources[len].opnd = rOffsetOpnd;
1611 sources[len].execSize = exSize;
1612 sources[len].instOpt = instOpt;
1613 ++len;
1614 }
1615 else if (!lodOpnd->isNullReg()) {
1616 G4_SrcRegRegion *nullROffset = createNullSrc(Type_UD);
1617 sources[len].opnd = nullROffset;
1618 sources[len].execSize = exSize;
1619 sources[len].instOpt = instOpt;
1620 ++len;
1621 }
1622
1623 // Append LOD if any.
1624 if (!lodOpnd->isNullReg()) {
1625 sources[len].opnd = lodOpnd;
1626 sources[len].execSize = exSize;
1627 sources[len].instOpt = instOpt;
1628 ++len;
1629 }
1630 }
1631
1632
1633 // u must not be V0. v and r are allowed to be V0, in which case they will be
1634 // skipped in payload.
translateVISAGather4TypedInst(G4_Predicate * pred,VISA_EMask_Ctrl emask,ChannelMask chMask,G4_Operand * surface,VISA_Exec_Size executionSize,G4_SrcRegRegion * uOffsetOpnd,G4_SrcRegRegion * vOffsetOpnd,G4_SrcRegRegion * rOffsetOpnd,G4_SrcRegRegion * lodOpnd,G4_DstRegRegion * dstOpnd)1635 int IR_Builder::translateVISAGather4TypedInst(
1636 G4_Predicate *pred,
1637 VISA_EMask_Ctrl emask,
1638 ChannelMask chMask,
1639 G4_Operand *surface,
1640 VISA_Exec_Size executionSize,
1641 G4_SrcRegRegion *uOffsetOpnd,
1642 G4_SrcRegRegion *vOffsetOpnd,
1643 G4_SrcRegRegion *rOffsetOpnd,
1644 G4_SrcRegRegion *lodOpnd,
1645 G4_DstRegRegion *dstOpnd)
1646 {
1647 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
1648
1649 G4_ExecSize exSize = executionSize == EXEC_SIZE_16 ? g4::SIMD16 : g4::SIMD8;
1650 assert((exSize == 8 || hasSIMD16TypedRW()) && "only simd8 is supported");
1651 G4_InstOpts instOpt = Get_Gen4_Emask(emask, exSize);
1652 int numEnabledChannels = chMask.getNumEnabledChannels();
1653
1654 bool useSplitSend = useSends();
1655
1656 bool hasHeader = getPlatform() == GENX_BDW;
1657
1658 PayloadSource sources[5]; // (maybe header) + maximal 4 addresses
1659 unsigned len = 0;
1660
1661 if (hasHeader)
1662 {
1663 // Build header
1664 G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
1665 BuildMH1_BTS_PSM(this, dcl);
1666
1667 // Append header
1668 G4_SrcRegRegion *header
1669 = createSrcRegRegion(dcl, getRegionStride1());
1670 sources[len].opnd = header;
1671 sources[len].execSize = g4::SIMD8;
1672 sources[len].instOpt = InstOpt_WriteEnable;
1673 ++len;
1674 }
1675
1676 buildTypedSurfaceAddressPayload(uOffsetOpnd, vOffsetOpnd, rOffsetOpnd, lodOpnd, exSize, instOpt, sources, len);
1677 G4_SrcRegRegion *msgs[2] = {0, 0};
1678 unsigned sizes[2] = {0, 0};
1679 preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);
1680
1681 //bit 8-11: RGBA channel enable
1682 unsigned msgDesc = chMask.getHWEncoding() << 8;
1683 SFID sfId;
1684
1685 // DC1
1686 // bit14-17: 0101 (read), 1101 (write)
1687 msgDesc |= DC1_TYPED_SURFACE_READ << 14;
1688 // bit12-13: 01 (use low 8 slot)
1689 msgDesc |= MDC_SG3_SG8L << 12;
1690 sfId = SFID::DP_DC1;
1691
1692 bool forceSplitSend = shouldForceSplitSend(surface);
1693 if (msgs[1] == 0 && !forceSplitSend) {
1694 ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
1695 createSendInst(pred, dstOpnd,
1696 msgs[0], sizes[0],
1697 numEnabledChannels,
1698 exSize,
1699 msgDesc, sfId,
1700 hasHeader,
1701 SendAccess::READ_ONLY,
1702 surface, nullptr,
1703 instOpt, false);
1704 } else {
1705 createSplitSendInst(pred, dstOpnd,
1706 msgs[0], sizes[0], msgs[1], sizes[1],
1707 numEnabledChannels,
1708 exSize,
1709 msgDesc, sfId,
1710 hasHeader,
1711 SendAccess::READ_ONLY,
1712 surface, nullptr,
1713 instOpt, false);
1714 }
1715
1716 return VISA_SUCCESS;
1717 }
1718
1719 // u must not be V0. v and r are allowed to be V0, in which case they will be
1720 // skipped in payload.
translateVISAScatter4TypedInst(G4_Predicate * pred,VISA_EMask_Ctrl emask,ChannelMask chMask,G4_Operand * surface,VISA_Exec_Size executionSize,G4_SrcRegRegion * uOffsetOpnd,G4_SrcRegRegion * vOffsetOpnd,G4_SrcRegRegion * rOffsetOpnd,G4_SrcRegRegion * lodOpnd,G4_SrcRegRegion * srcOpnd)1721 int IR_Builder::translateVISAScatter4TypedInst(
1722 G4_Predicate *pred,
1723 VISA_EMask_Ctrl emask,
1724 ChannelMask chMask,
1725 G4_Operand *surface,
1726 VISA_Exec_Size executionSize,
1727 G4_SrcRegRegion *uOffsetOpnd,
1728 G4_SrcRegRegion *vOffsetOpnd,
1729 G4_SrcRegRegion *rOffsetOpnd,
1730 G4_SrcRegRegion *lodOpnd,
1731 G4_SrcRegRegion *srcOpnd)
1732 {
1733 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
1734
1735 G4_ExecSize exSize = executionSize == EXEC_SIZE_16 ? g4::SIMD16 : g4::SIMD8;
1736 assert((exSize == g4::SIMD8 || hasSIMD16TypedRW()) && "only simd8 is supported");
1737 G4_InstOpts instOpt = Get_Gen4_Emask(emask, exSize);
1738 int numEnabledChannels = chMask.getNumEnabledChannels();
1739
1740 bool useSplitSend = useSends();
1741
1742 bool hasHeader = getPlatform() == GENX_BDW;
1743
1744 PayloadSource sources[6]; // (maybe header) + maximal 4 addresses + source
1745 unsigned len = 0;
1746
1747 if (hasHeader)
1748 {
1749 // Build header
1750 G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
1751 BuildMH1_BTS_PSM(this, dcl);
1752
1753 // Append header
1754 G4_SrcRegRegion *header
1755 = createSrcRegRegion(dcl, getRegionStride1());
1756 sources[len].opnd = header;
1757 sources[len].execSize = g4::SIMD8;
1758 sources[len].instOpt = InstOpt_WriteEnable;
1759 ++len;
1760 }
1761
1762 buildTypedSurfaceAddressPayload(uOffsetOpnd, vOffsetOpnd, rOffsetOpnd, lodOpnd, exSize, instOpt, sources, len);
1763
1764 // Append source
1765 sources[len].opnd = srcOpnd;
1766 sources[len].execSize = G4_ExecSize(exSize * numEnabledChannels);
1767 sources[len].instOpt = instOpt;
1768 ++len;
1769
1770 G4_SrcRegRegion *msgs[2] = {0, 0};
1771 unsigned sizes[2] = {0, 0};
1772 preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);
1773
1774 //bit 8-11: RGBA channel enable
1775 unsigned msgDesc = 0;
1776 SFID sfId;
1777
1778 // DC1
1779 // bit14-17: 0101 (read), 1101 (write)
1780 msgDesc |= DC1_TYPED_SURFACE_WRITE << 14;
1781 // bit12-13: 01 (use low 8 slot)
1782 msgDesc |= MDC_SG3_SG8L << 12;
1783 sfId = SFID::DP_DC1;
1784
1785 msgDesc |= chMask.getHWEncoding() << 8;
1786
1787 G4_DstRegRegion* dstOpnd = createNullDst(Type_UD);
1788
1789 bool forceSplitSend = shouldForceSplitSend(surface);
1790 if (msgs[1] == 0 && !forceSplitSend) {
1791 ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
1792 createSendInst(
1793 pred, dstOpnd,
1794 msgs[0], sizes[0],
1795 0,
1796 exSize,
1797 msgDesc, sfId,
1798 hasHeader,
1799 SendAccess::WRITE_ONLY,
1800 surface, NULL,
1801 instOpt, false);
1802 } else
1803 {
1804 createSplitSendInst(
1805 pred, dstOpnd,
1806 msgs[0], sizes[0], msgs[1], sizes[1],
1807 0,
1808 exSize,
1809 msgDesc, sfId,
1810 hasHeader,
1811 SendAccess::WRITE_ONLY,
1812 surface, NULL,
1813 instOpt, false);
1814 }
1815
1816 return VISA_SUCCESS;
1817 }
1818
translateVISATypedAtomicInst(VISAAtomicOps atomicOp,bool is16Bit,G4_Predicate * pred,VISA_EMask_Ctrl emask,VISA_Exec_Size execSize,G4_Operand * surface,G4_SrcRegRegion * uOffsetOpnd,G4_SrcRegRegion * vOffsetOpnd,G4_SrcRegRegion * rOffsetOpnd,G4_SrcRegRegion * lodOpnd,G4_SrcRegRegion * src0,G4_SrcRegRegion * src1,G4_DstRegRegion * dst)1819 int IR_Builder::translateVISATypedAtomicInst(
1820 VISAAtomicOps atomicOp,
1821 bool is16Bit,
1822 G4_Predicate *pred,
1823 VISA_EMask_Ctrl emask,
1824 VISA_Exec_Size execSize,
1825 G4_Operand *surface,
1826 G4_SrcRegRegion *uOffsetOpnd,
1827 G4_SrcRegRegion *vOffsetOpnd,
1828 G4_SrcRegRegion *rOffsetOpnd,
1829 G4_SrcRegRegion *lodOpnd,
1830 G4_SrcRegRegion *src0,
1831 G4_SrcRegRegion *src1,
1832 G4_DstRegRegion *dst)
1833 {
1834 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
1835
1836 VISA_Exec_Size instExecSize = execSize;
1837 assert(execSize <= (getNativeExecSize() == g4::SIMD8 ? EXEC_SIZE_8 : EXEC_SIZE_16) &&
1838 "send exec size must not exceed the platform's native execution size");
1839
1840 unsigned op = Get_Atomic_Op(atomicOp);
1841
1842 G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
1843 G4_ExecSize instExSize {Get_VISA_Exec_Size(instExecSize)};
1844 G4_InstOpts instOpt = Get_Gen4_Emask(emask, instExSize);
1845
1846 if (atomicOp == ATOMIC_CMPXCHG)
1847 {
1848 // we have to swap src0 and src1 since vISA has them in different order from HW
1849 G4_SrcRegRegion* tmp = src0;
1850 src0 = src1;
1851 src1 = tmp;
1852 }
1853
1854 bool useSplitSend = useSends();
1855
1856 PayloadSource sources[6]; // u, v, r, lod, src0, src1
1857 unsigned len = 0;
1858
1859 buildTypedSurfaceAddressPayload(uOffsetOpnd, vOffsetOpnd, rOffsetOpnd, lodOpnd, exSize, instOpt, sources, len);
1860
1861 if (src0 != nullptr && !src0->isNullReg())
1862 {
1863 sources[len].opnd = src0;
1864 sources[len].execSize = exSize;
1865 sources[len].instOpt = instOpt;
1866 ++len;
1867 }
1868
1869 if (src1 != nullptr && !src1->isNullReg())
1870 {
1871 sources[len].opnd = src1;
1872 sources[len].execSize = exSize;
1873 sources[len].instOpt = instOpt;
1874 ++len;
1875 }
1876
1877 G4_SrcRegRegion *msgs[2] = {0, 0};
1878 unsigned sizes[2] = {0, 0};
1879 preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);
1880
1881 unsigned dstLength = dst->isNullReg() ? 0 : 1;
1882
1883 unsigned msgDesc = 0;
1884 // BTI is filled later
1885 msgDesc |= op << 8;
1886 msgDesc |= (dstLength != 0 ? 1 : 0) << 13;
1887
1888 if (is16Bit)
1889 {
1890 msgDesc |= DC1_TYPED_HALF_INTEGER_ATOMIC << 14;
1891 }
1892 else
1893 {
1894 msgDesc |= DC1_TYPED_ATOMIC << 14;
1895 }
1896
1897 bool forceSplitSend = shouldForceSplitSend(surface);
1898 if (msgs[1] == 0 && !forceSplitSend)
1899 {
1900 ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
1901 createSendInst(pred, dst,
1902 msgs[0], sizes[0], dstLength, exSize,
1903 msgDesc, SFID::DP_DC1,
1904 false,
1905 SendAccess::READ_WRITE,
1906 surface, nullptr,
1907 instOpt, false);
1908 }
1909 else
1910 {
1911 createSplitSendInst(pred, dst,
1912 msgs[0], sizes[0], msgs[1], sizes[1],
1913 dstLength, exSize,
1914 msgDesc, SFID::DP_DC1,
1915 false,
1916 SendAccess::READ_WRITE,
1917 surface, nullptr,
1918 instOpt, false);
1919 }
1920
1921 return VISA_SUCCESS;
1922 }
1923
BuildMH2_A32_PSM(IR_Builder * IRB,G4_Declare * header,uint16_t scale,G4_Operand * globalOffset)1924 static void BuildMH2_A32_PSM(
1925 IR_Builder *IRB, G4_Declare *header, uint16_t scale, G4_Operand *globalOffset)
1926 {
1927 // Clear header
1928 G4_DstRegRegion *h = IRB->createDst(header->getRegVar(),
1929 0, 0, 1, Type_UD);
1930 IRB->createMov(g4::SIMD8, h, IRB->createImm(0, Type_UD), InstOpt_WriteEnable, true);
1931 // Copy global offset if necessary.
1932 if (!(globalOffset->isImm() && globalOffset->asImm()->isZero())) {
1933 G4_DstRegRegion* gOffDst = IRB->createDst(
1934 header->getRegVar(),
1935 0, 5, 1, Type_UD);
1936 IRB->createMov(g4::SIMD1, gOffDst, globalOffset, InstOpt_WriteEnable, true);
1937 }
1938 // Copy scale pitch if necessary.
1939 if (scale != 0) {
1940 G4_Operand* scaleImm = IRB->createImm(scale, Type_UD);
1941 G4_DstRegRegion* pitchDst = IRB->createDst(
1942 header->getRegVar(),
1943 0, 0, 1, Type_UD);
1944 IRB->createMov(g4::SIMD1, pitchDst, scaleImm, InstOpt_WriteEnable, true);
1945 }
1946 // Copy PSM which is set to 0xFFFF so far.
1947 G4_Operand* maskImm = IRB->createImm(0xFFFF, Type_UD);
1948 G4_DstRegRegion* pitchDst = IRB->createDst(
1949 header->getRegVar(),
1950 0, 7, 1, Type_UD);
1951 IRB->createMov(g4::SIMD1, pitchDst, maskImm, InstOpt_WriteEnable, true);
1952 }
1953
1954 // apply the sideband offset (can be either imm or variable) to the message descriptor
applySideBandOffset(G4_Operand * sideBand,const G4_SendDescRaw * sendMsgDesc)1955 void IR_Builder::applySideBandOffset(
1956 G4_Operand* sideBand, const G4_SendDescRaw* sendMsgDesc)
1957 {
1958 #define SIDEBAND_OFFSET_IN_EXDESC 12
1959
1960 if (sideBand->isImm())
1961 {
1962 // mov (1) a0.0 sideband << 0xC
1963 uint32_t sidebandInDesc = (uint32_t)(sideBand->asImm()->getImm() << SIDEBAND_OFFSET_IN_EXDESC);
1964 G4_DstRegRegion* dst = createDstRegRegion(builtinA0, 1);
1965 createMov(g4::SIMD1, dst, createImm(sidebandInDesc, Type_UD), InstOpt_WriteEnable, true);
1966 }
1967 else
1968 {
1969 MUST_BE_TRUE(sideBand->isSrcRegRegion(), "sideband offset should be a srcRegRegion");
1970 // shl (1) a0.0 sideband 0xC
1971 G4_DstRegRegion* dst = createDstRegRegion(builtinA0, 1);
1972 createBinOp(G4_shl, g4::SIMD1, dst, sideBand,
1973 createImm(SIDEBAND_OFFSET_IN_EXDESC, Type_UW), InstOpt_WriteEnable, true);
1974 }
1975
1976 // add (1) a0.0 a0.0 MD
1977 G4_DstRegRegion* a0Dst = createDstRegRegion(builtinA0, 1);
1978 G4_SrcRegRegion* a0Src = createSrcRegRegion(builtinA0, getRegionScalar());
1979 createBinOp(G4_add, g4::SIMD1, a0Dst, a0Src,
1980 createImm(sendMsgDesc->getExtendedDesc(), Type_UD), InstOpt_WriteEnable, true);
1981 }
1982
BuildMH2_A32(IR_Builder * IRB,G4_Declare * header,uint16_t scale,G4_Operand * globalOffset)1983 static void BuildMH2_A32(IR_Builder *IRB, G4_Declare *header,
1984 uint16_t scale, G4_Operand *globalOffset) {
1985 // Clear header
1986 G4_DstRegRegion *h = IRB->createDst(header->getRegVar(),
1987 0, 0, 1, Type_UD);
1988 IRB->createMov(g4::SIMD8, h, IRB->createImm(0, Type_UD), InstOpt_WriteEnable, true);
1989 // Copy global offset if necessary.
1990 if (!(globalOffset->isImm() && globalOffset->asImm()->isZero())) {
1991 G4_DstRegRegion* gOffDst = IRB->createDst(
1992 header->getRegVar(),
1993 0, 5, 1, Type_UD);
1994 IRB->createMov(g4::SIMD1, gOffDst, globalOffset, InstOpt_WriteEnable, true);
1995 }
1996 // Copy scale pitch if necessary.
1997 if (scale != 0) {
1998 G4_Operand* scaleImm = IRB->createImm(scale, Type_UD);
1999 G4_DstRegRegion* pitchDst = IRB->createDst(
2000 header->getRegVar(),
2001 0, 0, 1, Type_UD);
2002 IRB->createMov(g4::SIMD1, pitchDst, scaleImm, InstOpt_WriteEnable, true);
2003 }
2004 }
2005
translateVISAGather4ScaledInst(G4_Predicate * pred,VISA_Exec_Size execSize,VISA_EMask_Ctrl eMask,ChannelMask chMask,G4_Operand * surface,G4_Operand * globalOffset,G4_SrcRegRegion * offsets,G4_DstRegRegion * dst)2006 int IR_Builder::translateVISAGather4ScaledInst(
2007 G4_Predicate *pred,
2008 VISA_Exec_Size execSize,
2009 VISA_EMask_Ctrl eMask,
2010 ChannelMask chMask,
2011 G4_Operand *surface,
2012 G4_Operand *globalOffset,
2013 G4_SrcRegRegion *offsets,
2014 G4_DstRegRegion *dst)
2015 {
2016 surface = lowerSurface255To253(surface, *this);
2017 return translateGather4Inst(pred, execSize, eMask, chMask, surface,
2018 globalOffset, offsets, dst);
2019 }
2020
translateVISAScatter4ScaledInst(G4_Predicate * pred,VISA_Exec_Size execSize,VISA_EMask_Ctrl eMask,ChannelMask chMask,G4_Operand * surface,G4_Operand * globalOffset,G4_SrcRegRegion * offsets,G4_SrcRegRegion * src)2021 int IR_Builder::translateVISAScatter4ScaledInst(
2022 G4_Predicate *pred,
2023 VISA_Exec_Size execSize,
2024 VISA_EMask_Ctrl eMask,
2025 ChannelMask chMask,
2026 G4_Operand *surface,
2027 G4_Operand *globalOffset,
2028 G4_SrcRegRegion *offsets,
2029 G4_SrcRegRegion *src)
2030 {
2031 surface = lowerSurface255To253(surface, *this);
2032 return translateScatter4Inst(pred, execSize, eMask, chMask, surface,
2033 globalOffset, offsets, src);
2034 }
2035
// Translate a vISA gather4 (DC1 untyped surface read) into a send.
//
// pred         - optional predicate for the send
// execSize     - vISA execution size (SIMD1/2/4/8/16); sub-SIMD8 sizes are
//                rounded up to a legal message width
// eMask        - execution mask control
// chMask       - channel enable mask; drives descriptor bits and response size
// surface      - surface operand; stateless surfaces may require a header
// globalOffset - offset added to every element of 'offsets'
// offsets      - per-lane element offsets
// dst          - destination of the gathered data
// Returns VISA_SUCCESS.
int IR_Builder::translateGather4Inst(
    G4_Predicate *pred,
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    ChannelMask chMask,
    G4_Operand *surface,
    G4_Operand *globalOffset,
    G4_SrcRegRegion *offsets,
    G4_DstRegRegion *dst)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 ||
        execSize == EXEC_SIZE_4 || execSize == EXEC_SIZE_8 ||
        execSize == EXEC_SIZE_16,
        "Only support SIMD1, SIMD2, SIMD4, SIMD8 or SIMD16!");

    // Keep the caller-requested size for the emitted send; the message
    // payload itself is laid out at the rounded-up SIMD width.
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    G4_ExecSize exSize = toExecSize(execSize);
    G4_ExecSize instExSize = toExecSize(instExecSize);
    unsigned instOpt = Get_Gen4_Emask(eMask, exSize);

    bool useSplitSend = useSends();
    // Header is only needed for stateless surfaces on platforms that
    // require the A32 message header (see isMessageHeaderOptional).
    bool useHeader = needsA32MsgHeader() && surface && isStatelessSurface(surface);

    // In case non-zero global offset is specified, we need to recalculate
    // offsets: fold the global offset into the per-lane offsets up front.
    if (!globalOffset->isImm() || globalOffset->asImm()->getImm() != 0) {
        G4_Declare *dcl = createSendPayloadDcl(exSize, offsets->getType());
        G4_DstRegRegion *tmp = createDstRegRegion(dcl, 1);
        createInst(pred, G4_add, 0, g4::NOSAT, instExSize, tmp, offsets, globalOffset, instOpt, true);
        offsets = createSrcRegRegion(dcl, getRegionStride1());
    }

    PayloadSource sources[2]; // Maximal 2 sources, optional header + offsets
    unsigned len = 0;

    if (useHeader) {
        // Allocate and fill the SIMD8 message header.
        G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);

        BuildMH1_A32_PSM(this, dcl);

        G4_SrcRegRegion *header
            = createSrcRegRegion(dcl, getRegionStride1());
        sources[len].opnd = header;
        sources[len].execSize = g4::SIMD8;
        sources[len].instOpt = InstOpt_WriteEnable;
        ++len;
    }

    sources[len].opnd = offsets;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;

    // Pack the sources into one (send) or two (split send) payload chunks.
    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    SFID sfid = SFID::DP_DC1;

    // Assemble the message descriptor: opcode at bit 14, SIMD mode at
    // bit 12, channel mask at bit 8. BTI is patched in later.
    unsigned MD = 0;
    // Leave sidebind scale offset 0 as it is not used now.
    MD |= DC1_UNTYPED_SURFACE_READ << 14;
    MD |= (execSize == EXEC_SIZE_8 ? MDC_SM3_SIMD8 : MDC_SM3_SIMD16) << 12;
    MD |= chMask.getHWEncoding() << 8;

    // Response length: one SIMD8-sized chunk per enabled channel, scaled
    // by the execution width.
    unsigned resLen = (exSize / GENX_DATAPORT_IO_SZ) *
        chMask.getNumEnabledChannels();

    bool forceSplitSend = shouldForceSplitSend(surface);
    if (msgs[1] == 0 && !forceSplitSend) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            resLen,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::READ_ONLY,
            surface, NULL,
            instOpt, false);
    } else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            resLen,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::READ_ONLY,
            surface, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
2134
2135
// Translate a vISA scatter4 (DC1 untyped surface write) into a send.
//
// pred         - optional predicate for the send
// execSize     - vISA execution size (SIMD1/2/4/8/16); sub-SIMD8 sizes are
//                rounded up to a legal message width
// eMask        - execution mask control
// chMask       - channel enable mask; sizes the write payload
// surface      - surface operand; stateless surfaces may require a header
// globalOffset - offset added to every element of 'offsets'
// offsets      - per-lane element offsets
// src          - write data (one block per enabled channel)
// Returns VISA_SUCCESS.
int IR_Builder::translateScatter4Inst(
    G4_Predicate *pred,
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    ChannelMask chMask,
    G4_Operand *surface,
    G4_Operand *globalOffset,
    G4_SrcRegRegion *offsets,
    G4_SrcRegRegion *src)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(
        execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 ||
        execSize == EXEC_SIZE_4 || execSize == EXEC_SIZE_8 ||
        execSize == EXEC_SIZE_16,
        "Only support SIMD1, SIMD2, SIMD4, SIMD8 or SIMD16!");

    // Keep the caller-requested size for the emitted send; the message
    // payload itself is laid out at the rounded-up SIMD width.
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    G4_ExecSize exSize = toExecSize(execSize);
    G4_ExecSize instExSize = toExecSize(instExecSize);
    unsigned instOpt = Get_Gen4_Emask(eMask, exSize);

    bool useSplitSend = useSends();
    // Header is only needed for stateless surfaces on platforms that
    // require the A32 message header (see isMessageHeaderOptional).
    bool useHeader = needsA32MsgHeader() && surface && isStatelessSurface(surface);

    // In case non-zero global offset is specified, we need to recalculate
    // offsets: fold the global offset into the per-lane offsets up front.
    if (!globalOffset->isImm() || globalOffset->asImm()->getImm() != 0) {
        G4_Declare *dcl = createSendPayloadDcl(exSize, offsets->getType());
        G4_DstRegRegion *tmp = createDstRegRegion(dcl, 1);
        createInst(pred, G4_add, 0, g4::NOSAT, instExSize, tmp, offsets, globalOffset, instOpt, true);
        offsets = createSrcRegRegion(dcl, getRegionStride1());
    }

    PayloadSource sources[3]; // Maximal 3 sources, optional header + offsets + src
    unsigned len = 0;

    if (useHeader) {
        // Allocate and fill the SIMD8 message header.
        G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);

        // TODO: Get PSM supported on demand.
        BuildMH1_A32_PSM(this, dcl);

        G4_SrcRegRegion *header
            = createSrcRegRegion(dcl, getRegionStride1());
        sources[len].opnd = header;
        sources[len].execSize = g4::SIMD8;
        sources[len].instOpt = InstOpt_WriteEnable;
        ++len;
    }

    sources[len].opnd = offsets;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;
    // Write data occupies one exSize-wide block per enabled channel.
    sources[len].opnd = src;
    sources[len].execSize = G4_ExecSize(exSize * chMask.getNumEnabledChannels());
    sources[len].instOpt = instOpt;
    ++len;

    // Pack the sources into one (send) or two (split send) payload chunks.
    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    SFID sfid = SFID::DP_DC1;

    // Assemble the message descriptor: opcode at bit 14, SIMD mode at
    // bit 12, channel mask at bit 8. BTI is patched in later.
    unsigned MD = 0;
    // Leave sidebind scale offset 0 as it is not used now.
    MD |= DC1_UNTYPED_SURFACE_WRITE << 14;
    MD |= (execSize == EXEC_SIZE_8 ? MDC_SM3_SIMD8 : MDC_SM3_SIMD16) << 12;
    MD |= chMask.getHWEncoding() << 8;

    // Writes produce no response; send targets the null register.
    G4_DstRegRegion *dst = createNullDst(Type_UD);
    bool forceSplitSend = shouldForceSplitSend(surface);
    if (msgs[1] == 0 && !forceSplitSend) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            0,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::WRITE_ONLY,
            surface, NULL,
            instOpt, false);
    }
    else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            0,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::WRITE_ONLY,
            surface, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
2239
2240 /// GetNumBatch() - return the number of batches required to copy the raw
2241 /// operand to message payload
GetNumBatch(VISA_SVM_Block_Type blockSize,VISA_SVM_Block_Num numBlocks)2242 static unsigned GetNumBatch(
2243 VISA_SVM_Block_Type blockSize, VISA_SVM_Block_Num numBlocks)
2244 {
2245 switch (blockSize) {
2246 case SVM_BLOCK_TYPE_BYTE:
2247 switch (numBlocks) {
2248 case SVM_BLOCK_NUM_1:
2249 case SVM_BLOCK_NUM_2:
2250 case SVM_BLOCK_NUM_4:
2251 return 1;
2252 case SVM_BLOCK_NUM_8:
2253 return 2;
2254 }
2255 break;
2256 case SVM_BLOCK_TYPE_DWORD:
2257 return Get_Common_ISA_SVM_Block_Num(numBlocks);
2258 case SVM_BLOCK_TYPE_QWORD:
2259 return Get_Common_ISA_SVM_Block_Num(numBlocks);
2260 }
2261 ASSERT_USER(false, "Unhandled sizes/numbers of block/element!");
2262 return 0;
2263 }
2264
translateVISAGatherScaledInst(G4_Predicate * pred,VISA_Exec_Size execSize,VISA_EMask_Ctrl eMask,VISA_SVM_Block_Num numBlocks,G4_Operand * surface,G4_Operand * globalOffset,G4_SrcRegRegion * offsets,G4_DstRegRegion * dst)2265 int IR_Builder::translateVISAGatherScaledInst(
2266 G4_Predicate *pred,
2267 VISA_Exec_Size execSize,
2268 VISA_EMask_Ctrl eMask,
2269 VISA_SVM_Block_Num numBlocks,
2270 G4_Operand *surface,
2271 G4_Operand *globalOffset,
2272 G4_SrcRegRegion *offsets,
2273 G4_DstRegRegion *dst)
2274 {
2275 surface = lowerSurface255To253(surface, *this);
2276
2277 return translateByteGatherInst(pred, execSize, eMask, numBlocks,
2278 surface, globalOffset, offsets, dst);
2279 }
2280
translateVISAScatterScaledInst(G4_Predicate * pred,VISA_Exec_Size execSize,VISA_EMask_Ctrl eMask,VISA_SVM_Block_Num numBlocks,G4_Operand * surface,G4_Operand * globalOffset,G4_SrcRegRegion * offsets,G4_SrcRegRegion * src)2281 int IR_Builder::translateVISAScatterScaledInst(
2282 G4_Predicate *pred,
2283 VISA_Exec_Size execSize,
2284 VISA_EMask_Ctrl eMask,
2285 VISA_SVM_Block_Num numBlocks,
2286 G4_Operand *surface,
2287 G4_Operand *globalOffset,
2288 G4_SrcRegRegion *offsets,
2289 G4_SrcRegRegion *src)
2290 {
2291
2292 surface = lowerSurface255To253(surface, *this);
2293 return translateByteScatterInst(pred, execSize, eMask, numBlocks,
2294 surface, globalOffset, offsets, src);
2295 }
2296
BuildMH_A32_GO(IR_Builder * IRB,G4_Declare * header,G4_Operand * globalOffset=0)2297 static void BuildMH_A32_GO(
2298 IR_Builder *IRB, G4_Declare *header, G4_Operand *globalOffset = 0)
2299 {
2300 // Clear header
2301 G4_DstRegRegion *h = IRB->createDst(header->getRegVar(),
2302 0, 0, 1, Type_UD);
2303 IRB->createMov(g4::SIMD8, h, IRB->createImm(0, Type_UD), InstOpt_WriteEnable, true);
2304 // Copy global offset if necessary.
2305 if (globalOffset &&
2306 !(globalOffset->isImm() &&
2307 globalOffset->asImm()->isZero())) {
2308 G4_DstRegRegion* gOffDst = IRB->createDst(
2309 header->getRegVar(),
2310 0, 2, 1, Type_UD);
2311 IRB->createMov(g4::SIMD1, gOffDst, globalOffset, InstOpt_WriteEnable, true);
2312 }
2313 }
2314
// Translate a vISA byte gather (DC0 byte scattered read) into a send.
//
// pred         - optional predicate for the send
// execSize     - vISA execution size (SIMD1/2/4/8/16); sub-SIMD8 sizes are
//                rounded up to a legal message width
// eMask        - execution mask control
// numBlocks    - elements per slot; only 1, 2, or 4 are legal here
// surface      - surface operand; SLM surfaces never take a header
// globalOffset - offset added to every element of 'offsets'
// offsets      - per-lane byte offsets
// dst          - destination of the gathered data
// Returns VISA_SUCCESS.
int IR_Builder::translateByteGatherInst(
    G4_Predicate *pred,
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    VISA_SVM_Block_Num numBlocks,
    G4_Operand *surface,
    G4_Operand *globalOffset,
    G4_SrcRegRegion *offsets,
    G4_DstRegRegion *dst)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 ||
        execSize == EXEC_SIZE_4 || execSize == EXEC_SIZE_8 ||
        execSize == EXEC_SIZE_16,
        "Only support SIMD1, SIMD2, SIMD4, SIMD8 or SIMD16!");
    ASSERT_USER(numBlocks == SVM_BLOCK_NUM_1 ||
        numBlocks == SVM_BLOCK_NUM_2 ||
        numBlocks == SVM_BLOCK_NUM_4,
        "Byte gather ONLY supports 1, 2, and 4 elements per slot!");

    // Keep the caller-requested size for the emitted send; the message
    // payload itself is laid out at the rounded-up SIMD width.
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
    G4_ExecSize instExSize {Get_VISA_Exec_Size(instExecSize)};
    G4_InstOpts instOpt = Get_Gen4_Emask(eMask, instExSize);
    unsigned numBatch = GetNumBatch(SVM_BLOCK_TYPE_BYTE, numBlocks);

    bool isSLM = IsSLMSurface(surface);
    // SLM forbids header. Header is optional in A32 when both scale and global
    // offset are 0s.
    bool useHeader = !isSLM && needsA32MsgHeader();
    bool useSplitSend = useSends();

    // In case non-zero global offset is specified, we need to recalculate
    // offsets.
    //
    // NOTE: Even though pre-SKL devices require header, eliminating global
    // offset by adjusting offsets will simplify the header generation.
    if (!globalOffset->isImm() || globalOffset->asImm()->getImm() != 0)
    {
        G4_Declare *dcl = createSendPayloadDcl(exSize, offsets->getType());
        G4_DstRegRegion *tmp = createDstRegRegion(dcl, 1);
        createBinOp(G4_add, instExSize, tmp, offsets, globalOffset, instOpt, true);
        offsets = createSrcRegRegion(dcl, getRegionStride1());
    }

    PayloadSource sources[2]; // Maximal 2 sources, optional header + offsets
    unsigned len = 0;

    if (useHeader) {
        // Allocate and fill the SIMD8 message header (global offset is
        // already folded into 'offsets', so the header offset stays 0).
        G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);

        // TODO: Get BTS supported on demand.
        BuildMH_A32_GO(this, dcl);

        G4_SrcRegRegion *header
            = createSrcRegRegion(dcl, getRegionStride1());
        sources[len].opnd = header;
        sources[len].execSize = g4::SIMD8;
        sources[len].instOpt = InstOpt_WriteEnable;
        ++len;
    }

    sources[len].opnd = offsets;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;

    // Pack the sources into one (send) or two (split send) payload chunks.
    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    SFID sfid = SFID::DP_DC0;

    // Assemble the message descriptor: opcode at bit 14, element count at
    // bit 10, SIMD mode at bit 8. BTI is patched in later.
    unsigned MD = 0;
    MD |= DC_BYTE_SCATTERED_READ << 14;
    MD |= numBlocks << 10;
    MD |= (execSize == EXEC_SIZE_8 ? MDC_SM2_SIMD8 : MDC_SM2_SIMD16) << 8;

    // Response length scales with execution width and batches per slot.
    unsigned resLen = (exSize / GENX_DATAPORT_IO_SZ) * numBatch;
    bool forceSplitSend = shouldForceSplitSend(surface);
    if (msgs[1] == 0 && !forceSplitSend) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            resLen,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::READ_ONLY,
            surface, NULL,
            instOpt, false);
    }
    else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            resLen,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::READ_ONLY,
            surface, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
2424
// Translate a vISA byte scatter (DC0 byte scattered write) into a send.
//
// pred         - optional predicate for the send
// execSize     - vISA execution size (SIMD1/2/4/8/16); sub-SIMD8 sizes are
//                rounded up to a legal message width
// eMask        - execution mask control
// numBlocks    - elements per slot; only 1, 2, or 4 are legal here
// surface      - surface operand; SLM surfaces never take a header
// globalOffset - offset added to every element of 'offsets'
// offsets      - per-lane byte offsets
// src          - write data
// Returns VISA_SUCCESS.
int IR_Builder::translateByteScatterInst(
    G4_Predicate *pred,
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    VISA_SVM_Block_Num numBlocks,
    G4_Operand *surface,
    G4_Operand *globalOffset,
    G4_SrcRegRegion *offsets,
    G4_SrcRegRegion *src)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 ||
        execSize == EXEC_SIZE_4 || execSize == EXEC_SIZE_8 ||
        execSize == EXEC_SIZE_16,
        "Only support SIMD1, SIMD2, SIMD4, SIMD8 or SIMD16!");
    ASSERT_USER(numBlocks == SVM_BLOCK_NUM_1 ||
        numBlocks == SVM_BLOCK_NUM_2 ||
        numBlocks == SVM_BLOCK_NUM_4,
        "Byte scatter ONLY supports 1, 2, and 4 elements per slot!");

    // Keep the caller-requested size for the emitted send; the message
    // payload itself is laid out at the rounded-up SIMD width.
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
    G4_ExecSize instExSize {Get_VISA_Exec_Size(instExecSize)};
    G4_InstOpts instOpt = Get_Gen4_Emask(eMask, exSize);
    unsigned numBatch = GetNumBatch(SVM_BLOCK_TYPE_BYTE, numBlocks);

    bool isSLM = IsSLMSurface(surface);
    // SLM forbids header. Header is optional in A32 when both scale and global
    // offset are 0s.
    bool useHeader = !isSLM && needsA32MsgHeader();
    bool useSplitSend = useSends();

    // In case non-zero global offset is specified, we need to recalculate
    // offsets.
    //
    // NOTE: Even though pre-SKL devices require header, eliminating global
    // offset by adjusting offsets will simplify the header generation.
    if (!globalOffset->isImm() || globalOffset->asImm()->getImm() != 0)
    {
        G4_Declare *dcl = createSendPayloadDcl(exSize, offsets->getType());
        G4_DstRegRegion *tmp = createDstRegRegion(dcl, 1);
        createBinOp(G4_add, instExSize, tmp, offsets, globalOffset, instOpt, true);
        offsets = createSrcRegRegion(dcl, getRegionStride1());
    }

    PayloadSource sources[3]; // Maximal 3 sources, optional header + offsets + src
    unsigned len = 0;

    if (useHeader) {
        // Allocate and fill the SIMD8 message header (global offset is
        // already folded into 'offsets', so the header offset stays 0).
        G4_Declare *dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);

        // TODO: Get BTS supported on demand.
        BuildMH_A32_GO(this, dcl);

        G4_SrcRegRegion *header
            = createSrcRegRegion(dcl, getRegionStride1());
        sources[len].opnd = header;
        sources[len].execSize = g4::SIMD8;
        sources[len].instOpt = InstOpt_WriteEnable;
        ++len;
    }

    sources[len].opnd = offsets;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;
    // Write data spans numBatch exSize-wide blocks.
    sources[len].opnd = src;
    sources[len].execSize = G4_ExecSize(exSize * numBatch);
    sources[len].instOpt = instOpt;
    ++len;

    // Pack the sources into one (send) or two (split send) payload chunks.
    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    SFID sfid = SFID::DP_DC0;

    // Assemble the message descriptor: opcode at bit 14, element count at
    // bit 10, SIMD mode at bit 8. BTI is patched in later.
    unsigned MD = 0;
    // Leave sidebind scale offset 0 as it is not used now.
    MD |= DC_BYTE_SCATTERED_WRITE << 14;
    MD |= numBlocks << 10;
    MD |= (execSize == EXEC_SIZE_8 ? MDC_SM2_SIMD8 : MDC_SM2_SIMD16) << 8;

    // Writes produce no response; send targets the null register.
    G4_DstRegRegion *dst = createNullDst(Type_UD);
    bool forceSplitSend = shouldForceSplitSend(surface);
    if (msgs[1] == 0 && !forceSplitSend) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            0,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::WRITE_ONLY,
            surface, NULL,
            instOpt, false);
    } else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            0,
            instExSize,
            MD, sfid,
            useHeader,
            SendAccess::WRITE_ONLY,
            surface, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
2538
2539
///
/// Bits 31-29: Reserved
/// Bits 28-25: Message Length: Total 256bit registers expected to be sent.
/// Bits 24-20: Response Length: Total 256bit registers expected in response.
/// Bit 19: Does this Message Descriptor have a header? 1 Yes, 0 No.
/// Bits 18-14: Message Type: 10100: A64 Block Read, 10101: A64 Block Write
/// Bit 13: Ignore
/// Bits 12-11: Message sub-type (00 for OWord Block Read/Write, 01 for Unaligned OWord Block Read/Write)
/// Bits 10-8: Block Size, 000 for 1 OWord, 001 for 2 OWords, 010 for 4 OWords, 100 for 8 OWords.
/// Bits 7-0: Binding Table Index: Set to 0xFF for stateless memory space used by A64 SVM Data Port.
// Translate an SVM (A64 stateless) oword block read. See the descriptor
// bit layout documented above.
//
// size      - number of owords to read
// unaligned - select the unaligned oword-block-read sub-type
// address   - 64-bit base address operand
// dst       - destination; retyped to UD for the send
// Returns VISA_SUCCESS.
int IR_Builder::translateVISASVMBlockReadInst(
    VISA_Oword_Num size,
    bool unaligned,
    G4_Operand* address,
    G4_DstRegRegion* dst)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    unsigned numOword = Get_VISA_Oword_Num(size);
    // One-GRF header payload carrying the 64-bit address.
    G4_Declare* dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
    if (noInt64())
    {
        // Platform has no 64-bit moves: copy the address as two UD halves
        // into header DW0/DW1.
        G4_SrcRegRegion *region = address->asSrcRegRegion();
        G4_SrcRegRegion *tmp;
        tmp = createSrcRegRegion(Mod_src_undef,
            region->getRegAccess(),
            region->getBase(),
            region->getRegOff(),
            region->getSubRegOff() * 2,
            region->getRegion(), Type_UD);
        createMovInst(dcl, 0, 0, g4::SIMD1, NULL, NULL, tmp, true);
        tmp = createSrcRegRegion(Mod_src_undef,
            region->getRegAccess(),
            region->getBase(),
            region->getRegOff(),
            region->getSubRegOff() * 2 + 1,
            region->getRegion(), Type_UD);
        createMovInst(dcl, 0, 1, g4::SIMD1, NULL, NULL, tmp, true);
    }
    else
    {
        // Copy the address with a single QW move through a UQ alias of
        // the header declare.
        G4_Declare* dclAsUQ = createSendPayloadDcl(GENX_DATAPORT_IO_SZ / 2, Type_UQ);
        dclAsUQ->setAliasDeclare(dcl, 0);
        createMovInst(dclAsUQ, 0, 0, g4::SIMD1, NULL, NULL, address, true);
    }

    G4_SrcRegRegion* src = createSrcRegRegion(dcl, getRegionStride1());

    DATA_CACHE1_MESSAGES msgSubOpcode = DC1_A64_BLOCK_READ;
    // Response length in GRFs: 16 bytes per oword, rounded up.
    unsigned rspLength = ((numOword * 16 - 1) / getGRFSize() + 1);

    // Descriptor: A64 BTI, sub-type (aligned vs unaligned), and opcode.
    unsigned desc = getA64BTI() |
        (unaligned ? A64_BLOCK_MSG_OWORD_UNALIGNED_READ : A64_BLOCK_MSG_OWORD_RW) << A64_BLOCK_MSG_SUBTYPE_OFFSET |
        msgSubOpcode << SEND_GT_MSG_TYPE_BIT;

    // Encode the oword count into the descriptor's block-size field.
    desc = setOwordForDesc(desc, numOword);

    G4_ExecSize sendExecSize {FIX_OWORD_SEND_EXEC_SIZE(numOword)};
    dst->setType(Type_UD);

    createSendInst(
        NULL, dst, src, 1, rspLength, sendExecSize, desc,
        SFID::DP_DC1, true, SendAccess::READ_ONLY, NULL, NULL, InstOpt_WriteEnable, false);

    return VISA_SUCCESS;
}
2606
// Translate an SVM (A64 stateless) oword block write.
//
// size    - number of owords to write
// address - 64-bit base address operand
// src     - write data; retyped to UD when narrower for size computation
// Returns VISA_SUCCESS.
int IR_Builder::translateVISASVMBlockWriteInst(
    VISA_Oword_Num size,
    G4_Operand* address,
    G4_SrcRegRegion* src)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    unsigned numOword = Get_VISA_Oword_Num(size);
    // Number of GRFs the write data occupies (16 bytes per oword).
    unsigned srcNumGRF = (numOword * 16 + getGRFSize() - 1) / getGRFSize();
    G4_ExecSize sendExecSize {FIX_OWORD_SEND_EXEC_SIZE(numOword)};

    // FIXME: may want to apply this to FIX_OWORD_SEND_EXEC_SIZE instead
    if (sendExecSize < g4::SIMD8)
    {
        sendExecSize = g4::SIMD8;
    }

    // One-GRF header payload carrying the 64-bit address.
    G4_Declare* dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
    if (noInt64())
    {
        // Platform has no 64-bit moves: copy the address as two UD halves
        // into header DW0/DW1.
        G4_SrcRegRegion *region = address->asSrcRegRegion();
        G4_SrcRegRegion *tmp;
        tmp = createSrcRegRegion(Mod_src_undef,
            region->getRegAccess(),
            region->getBase(),
            region->getRegOff(),
            region->getSubRegOff() * 2,
            region->getRegion(), Type_UD);
        createMovInst(dcl, 0, 0, g4::SIMD1, NULL, NULL, tmp, true);
        tmp = createSrcRegRegion(Mod_src_undef,
            region->getRegAccess(),
            region->getBase(),
            region->getRegOff(),
            region->getSubRegOff() * 2 + 1,
            region->getRegion(), Type_UD);
        createMovInst(dcl, 0, 1, g4::SIMD1, NULL, NULL, tmp, true);
    } else {
        // Copy the address with a single QW move through a UQ alias of
        // the header declare.
        G4_Declare* dclAsUQ = createSendPayloadDcl(GENX_DATAPORT_IO_SZ / 2, Type_UQ);
        dclAsUQ->setAliasDeclare(dcl, 0);
        createMovInst(dclAsUQ, 0, 0, g4::SIMD1, NULL, NULL, address, true);
    }

    bool useSplitSend = useSends();
    PayloadSource sources[2]; // header + write data
    unsigned len = 0;

    sources[len].opnd = createSrcRegRegion(dcl, getRegionStride1());
    sources[len].execSize = g4::SIMD8;
    sources[len].instOpt = InstOpt_WriteEnable;
    ++len;

    if (src->getElemSize() < TypeSize(Type_UD))
    {
        // use D for size computation. Src is guaranteed to be GRF-aligend per vISA spec
        src->setType(Type_UD);
    }
    sources[len].opnd = src;

    G4_ExecSize movExecSize {0};

    // 'scale' = elements per GRF for the (possibly retyped) element size;
    // the data source therefore covers scale * srcNumGRF elements.
    // NOTE: both switch cases compute the same values; the switch only
    // restricts handling to 4- and 8-byte elements.
    auto scale = getGRFSize() / src->getElemSize();
    switch (src->getElemSize())
    {
    case 4:
        sources[len].execSize = G4_ExecSize(scale * srcNumGRF);
        movExecSize = G4_ExecSize(scale);
        break;
    case 8:
        sources[len].execSize = G4_ExecSize(scale * srcNumGRF);
        movExecSize = G4_ExecSize(scale);
        break;
    }

    sources[len].instOpt = InstOpt_WriteEnable;
    ++len;

    // Pack header + data into one (send) or two (split send) chunks.
    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, movExecSize, useSplitSend, sources, len);

    DATA_CACHE1_MESSAGES msgSubOpcode = DC1_A64_BLOCK_WRITE;

    // Descriptor: A64 BTI, oword-block sub-type, and opcode.
    unsigned desc = getA64BTI() |
        A64_BLOCK_MSG_OWORD_RW << A64_BLOCK_MSG_SUBTYPE_OFFSET |
        msgSubOpcode << SEND_GT_MSG_TYPE_BIT;

    // Encode the oword count into the descriptor's block-size field.
    desc = setOwordForDesc(desc, numOword);

    // Writes produce no response; send targets the null register.
    G4_DstRegRegion* sendDst = createNullDst(Type_UD);

    if (msgs[1] == 0)
    {
        createSendInst(NULL, sendDst,
            msgs[0], sizes[0],
            0, sendExecSize,
            desc, SFID::DP_DC1,
            true,
            SendAccess::WRITE_ONLY,
            NULL, NULL,
            InstOpt_WriteEnable, false);
    }
    else
    {
        createSplitSendInst(NULL, sendDst,
            msgs[0], sizes[0],
            msgs[1], sizes[1],
            0, sendExecSize,
            desc, SFID::DP_DC1,
            true,
            SendAccess::WRITE_ONLY,
            NULL, NULL,
            InstOpt_WriteEnable, false);
    }

    return VISA_SUCCESS;
}
2723
// Translate an SVM (A64 stateless) scattered read into a DC1 send.
//
// execSize  - vISA execution size (SIMD1/2/4/8/16); sub-SIMD8 sizes are
//             rounded up to a legal message width
// eMask     - execution mask control
// pred      - optional predicate for the send
// blockSize - element size (byte/dword/qword)
// numBlocks - elements per address
// addresses - per-lane 64-bit addresses (8 bytes per lane in the payload)
// dst       - destination of the gathered data
// Returns VISA_SUCCESS.
int IR_Builder::translateVISASVMScatterReadInst(
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    G4_Predicate* pred,
    VISA_SVM_Block_Type blockSize,
    VISA_SVM_Block_Num numBlocks,
    G4_SrcRegRegion* addresses,
    G4_DstRegRegion* dst)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 ||
        execSize == EXEC_SIZE_4 || execSize == EXEC_SIZE_8 ||
        execSize == EXEC_SIZE_16,
        "Only support SIMD1, SIMD2, SIMD4, SIMD8 or SIMD16!");

    // Keep the caller-requested size for the emitted send; the message
    // payload itself is laid out at the rounded-up SIMD width.
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    // byte x8 messages are only available on platforms with 8-byte A64 gather.
    bool is8ByteMsg = blockSize == SVM_BLOCK_TYPE_BYTE && numBlocks == SVM_BLOCK_NUM_8;
    assert((!is8ByteMsg || has8ByteA64Gather()) && "A64 8-byte scatter not supported on this platform");

    G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
    G4_ExecSize instExSize {Get_VISA_Exec_Size(instExecSize)};
    G4_InstOpts instOpt = Get_Gen4_Emask(eMask, instExSize);

    // Address payload: 8 bytes per lane.
    uint32_t messageLength = (8 * exSize) / getGRFSize();
    uint32_t numDWperLane = 0;

    // ToDo: remove this as it should be done in HWConformity
    if (instExSize < 8 && WaDisableSendSrcDstOverlap())
    {
        // as message length is set to 2 (HW requirements),
        // we have to even align both src/dst to satisfy the WA
        G4_Declare* srcDcl = addresses->getTopDcl()->getRootDeclare();
        if (srcDcl->getByteSize() <= numEltPerGRF<Type_UB>())
        {
            srcDcl->setEvenAlign();
        }
        G4_Declare* dstDcl = dst->getTopDcl()->getRootDeclare();
        if (dstDcl->getByteSize() <= numEltPerGRF<Type_UB>())
        {
            dstDcl->setEvenAlign();
        }
    }

    // Response size in DWs per lane, by element size and count.
    switch (blockSize)
    {
    case SVM_BLOCK_TYPE_BYTE:
        numDWperLane = (numBlocks == SVM_BLOCK_NUM_8) ? 2 : 1;
        break;
    case SVM_BLOCK_TYPE_DWORD:
        numDWperLane = Get_Common_ISA_SVM_Block_Num(numBlocks);
        break;
    case SVM_BLOCK_TYPE_QWORD:
        numDWperLane = Get_Common_ISA_SVM_Block_Num(numBlocks) * 2;
        break;
    default:
        MUST_BE_TRUE(false, "Illegal SVM block type");
    }
    uint32_t responseLength = (numDWperLane * 4 * exSize) / getGRFSize();

    // Descriptor: A64 BTI, block size at bit 8, block count at bit 10,
    // SIMD mode at bit 12, opcode at bit 14.
    unsigned desc = 0;
    desc |= getA64BTI();
    desc |= blockSize << 8;
    desc |= numBlocks << 10;
    desc |= (exSize == 8 ? 0 : 1) << 12;
    desc |= DC1_A64_SCATTERED_READ << 14;

    createSendInst(pred, dst, addresses, messageLength, responseLength, instExSize, desc,
        SFID::DP_DC1, false, SendAccess::READ_ONLY, NULL, NULL, instOpt, false);

    return VISA_SUCCESS;
}
2798
// Translate a vISA SVM scatter write into an A64 scattered-write send
// message on SFID DP_DC1. The message payload is the per-lane 64-bit
// addresses followed by the data to be written; a split send is used when
// the platform supports it.
//
// execSize  - vISA SIMD width (1/2/4/8/16). Widths below 8 are rounded up
//             for the message encoding while the instruction itself keeps
//             the original width.
// eMask     - execution mask controlling which channels participate.
// pred      - optional predicate applied to the send.
// blockSize - element size per address (byte/dword/qword).
// numBlocks - number of elements per address.
// addresses - per-lane A64 addresses (first payload part).
// src       - data to scatter (second payload part).
int IR_Builder::translateVISASVMScatterWriteInst(
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    G4_Predicate* pred,
    VISA_SVM_Block_Type blockSize,
    VISA_SVM_Block_Num numBlocks,
    G4_SrcRegRegion* addresses,
    G4_SrcRegRegion* src)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 ||
        execSize == EXEC_SIZE_4 || execSize == EXEC_SIZE_8 ||
        execSize == EXEC_SIZE_16,
        "Only support SIMD1, SIMD2, SIMD4, SIMD8 or SIMD16!");

    // BYTE x 8 blocks is an 8-byte-per-lane message; only legal on
    // platforms with 8-byte A64 gather/scatter support.
    bool is8ByteMsg = blockSize == SVM_BLOCK_TYPE_BYTE && numBlocks == SVM_BLOCK_NUM_8;
    assert((!is8ByteMsg || has8ByteA64Gather()) && "A64 8-byte scatter not supported on this platform");
    VISA_Exec_Size instExecSize = execSize;
    execSize = roundUpExecSize(execSize);

    G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
    G4_ExecSize instExSize {Get_VISA_Exec_Size(instExecSize)};
    G4_InstOpts instOpt = Get_Gen4_Emask(eMask, instExSize);

    bool useSplitSend = useSends();

    PayloadSource sources[2]; // Maximal 2 sources, optional header + offsets
    unsigned len = 0;

    // Payload part 1: the per-lane 64-bit addresses.
    sources[len].opnd = addresses;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;

    unsigned numElems = 1;
    // NOTE that BYTE scatter always has numElems set to 1 as
    // - when the number of data elements is 1, 2, or 4, the writeback payload
    //   is always 1 MDP_DW_SIMD8/_SIMD16.
    // - when the number of data elements is 8, the write payload is always 1
    //   MDP_QW_SIMD8/_SIMD16.
    // This ALSO implies the RAW operand should be in type of UQ when the
    // number of data elements is 8.
    if (blockSize != SVM_BLOCK_TYPE_BYTE)
        numElems = Get_Common_ISA_SVM_Block_Num(numBlocks);

    // Payload part 2: the data to write (numElems elements per lane).
    sources[len].opnd = src;
    sources[len].execSize = G4_ExecSize(exSize * numElems);
    sources[len].instOpt = instOpt;
    ++len;

    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};

    // adjust src type
    // PreparePayload takes src type to calculate src1 size. The src type has
    // to be DW for byte scatter write.
    G4_Type srcType = src->getType();
    if ((blockSize == SVM_BLOCK_TYPE_BYTE) &&
        (numBlocks == SVM_BLOCK_NUM_1 || numBlocks == SVM_BLOCK_NUM_2) &&
        (TypeSize(srcType) != 4))
        src->setType(Type_UD);

    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    // set the type back in case we changed it for preparePayload
    src->setType(srcType);

    // Message descriptor for an A64 scattered write.
    unsigned desc = 0;
    desc |= getA64BTI();
    desc |= blockSize << 8;              // data element size
    desc |= numBlocks << 10;             // elements per address
    desc |= (exSize == 8 ? 0 : 1) << 12; // SIMD mode: 0 = SIMD8, 1 = SIMD16
    desc |= DC1_A64_SCATTERED_WRITE << 14;

    // Writes have no response payload; destination is the null register.
    G4_DstRegRegion* dst = createNullDst(Type_UD);
    if (msgs[1] == 0) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            0, instExSize,
            desc, SFID::DP_DC1,
            false,
            SendAccess::WRITE_ONLY,
            NULL, NULL,
            instOpt, false);
    }
    else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0],
            msgs[1], sizes[1],
            0, instExSize,
            desc, SFID::DP_DC1,
            false,
            SendAccess::WRITE_ONLY,
            NULL, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
2900
2901
2902
2903
2904 // is16Bit indicates if this is a 16bit atomic op. The input source (if
2905 // any) and the writeback (if any) have the same datalayout as dword messages.
2906 // Only the lower 16 bits of each dword is used.
2907 //
FillSVMAtomicMsgDesc(bool is16Bit,bool isFloatOp,uint32_t & msgDesc)2908 static void FillSVMAtomicMsgDesc(bool is16Bit, bool isFloatOp, uint32_t &msgDesc)
2909 {
2910 if (is16Bit)
2911 {
2912 if (isFloatOp)
2913 {
2914 msgDesc |= DC1_A64_UNTYPED_HALF_FLOAT_ATOMIC << 14;
2915 }
2916 else
2917 {
2918 msgDesc |= DC1_A64_UNTYPED_HALF_INTEGER_ATOMIC << 14;
2919 }
2920 }
2921 else
2922 {
2923 if (isFloatOp)
2924 {
2925 msgDesc |= DC1_A64_UNTYPED_FLOAT_ATOMIC << 14;
2926 }
2927 else
2928 {
2929 msgDesc |= DC1_A64_ATOMIC << 14;
2930 }
2931 }
2932 }
2933
2934
translateVISASVMAtomicInst(VISAAtomicOps atomicOp,unsigned short bitwidth,VISA_Exec_Size execSize,VISA_EMask_Ctrl emask,G4_Predicate * pred,G4_SrcRegRegion * addresses,G4_SrcRegRegion * src0,G4_SrcRegRegion * src1,G4_DstRegRegion * dst)2935 int IR_Builder::translateVISASVMAtomicInst(
2936 VISAAtomicOps atomicOp,
2937 unsigned short bitwidth,
2938 VISA_Exec_Size execSize,
2939 VISA_EMask_Ctrl emask,
2940 G4_Predicate* pred,
2941 G4_SrcRegRegion* addresses,
2942 G4_SrcRegRegion* src0,
2943 G4_SrcRegRegion* src1,
2944 G4_DstRegRegion* dst)
2945 {
2946 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
2947
2948 MUST_BE_TRUE(bitwidth == 16 || bitwidth == 32 || bitwidth == 64,
2949 "bitwidth must be 16/32/64");
2950
2951 ASSERT_USER(getPlatform() >= XeHP_SDV || ((atomicOp != ATOMIC_FADD) && (atomicOp != ATOMIC_FSUB)),
2952 "FADD/FSUB atomic operations are only supported on this devices");
2953
2954 VISA_Exec_Size instExecSize = execSize;
2955 execSize = roundUpExecSize(execSize);
2956
2957 unsigned op = Get_Atomic_Op(atomicOp);
2958
2959 G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
2960 G4_ExecSize instExSize {Get_VISA_Exec_Size(instExecSize)};
2961 G4_InstOpts instOpt = Get_Gen4_Emask(emask, instExSize);
2962
2963 if (atomicOp == ATOMIC_CMPXCHG)
2964 {
2965 // we have to swap src0 and src1 since vISA has them in different order from HW
2966 G4_SrcRegRegion* tmp = src0;
2967 src0 = src1;
2968 src1 = tmp;
2969 }
2970
2971 bool useSplitSend = useSends();
2972
2973 PayloadSource sources[3]; // addresses, src0, and src1
2974 unsigned len = 0;
2975
2976 sources[len].opnd = addresses;
2977 sources[len].execSize = exSize;
2978 sources[len].instOpt = instOpt;
2979 ++len;
2980
2981 if (src0 != NULL && !src0->isNullReg())
2982 {
2983 sources[len].opnd = src0;
2984 sources[len].execSize = exSize;
2985 sources[len].instOpt = instOpt;
2986 ++len;
2987 }
2988
2989 if (src1 != NULL && !src1->isNullReg())
2990 {
2991 sources[len].opnd = src1;
2992 sources[len].execSize = exSize;
2993 sources[len].instOpt = instOpt;
2994 ++len;
2995 }
2996
2997 G4_SrcRegRegion *msgs[2] = {0, 0};
2998 unsigned sizes[2] = {0, 0};
2999 preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);
3000 unsigned dstLength = dst->isNullReg() ? 0 : ((bitwidth == 16 || bitwidth == 32) ? 1 : 2);
3001 unsigned msgDesc = 0;
3002 msgDesc |= getA64BTI();
3003 msgDesc |= op << 8;
3004 #define A64_ATOMIC_RETURN_DATA_CONTROL_BIT 13
3005 msgDesc |= (dstLength ? 1 : 0) << A64_ATOMIC_RETURN_DATA_CONTROL_BIT;
3006 msgDesc |= ((bitwidth == 16 || bitwidth == 32) ? 0 : 1) << 12;
3007
3008 // Fill remaining bits.
3009 FillSVMAtomicMsgDesc(bitwidth == 16, IsFloatAtomicOps(atomicOp), msgDesc);
3010
3011 if (msgs[1] == 0) {
3012 createSendInst(pred, dst,
3013 msgs[0], sizes[0], dstLength,
3014 instExSize,
3015 msgDesc, SFID::DP_DC1,
3016 false,
3017 SendAccess::READ_WRITE,
3018 NULL, NULL,
3019 instOpt, false);
3020 }
3021 else {
3022 createSplitSendInst(pred, dst,
3023 msgs[0], sizes[0],
3024 msgs[1], sizes[1],
3025 dstLength,
3026 instExSize,
3027 msgDesc, SFID::DP_DC1,
3028 false,
3029 SendAccess::READ_WRITE,
3030 NULL, NULL,
3031 instOpt, false);
3032 }
3033
3034 return VISA_SUCCESS;
3035 }
3036
// Compute per-lane SVM offsets by adding the scalar globalOffset to each
// element of the 64-bit 'offsets' vector, returning a stride-1 source
// region over the freshly built payload.
//
// The add is emitted at SIMD8: eight 64-bit lanes occupy
// (8 * 8) / getGRFSize() GRF rows, so for exSize == 16 a second SIMD8 add
// (under the high-half execution mask) covers lanes 8..15.
G4_SrcRegRegion* IR_Builder::getSVMOffset(
    G4_Operand* globalOffset, G4_SrcRegRegion* offsets, uint16_t exSize,
    G4_Predicate* pred, uint32_t mask)
{
    G4_Declare* dcl = createSendPayloadDcl(exSize, offsets->getType());
    G4_DstRegRegion* tmp = createDstRegRegion(dcl, 1);
    // Lanes 0..7: tmp = offsets + globalOffset.
    createInst(pred, G4_add, 0, g4::NOSAT, g4::SIMD8, tmp, offsets, globalOffset, mask, true);
    if (exSize == 16)
    {
        // do second half of the 64-bit add
        // Register-row offset (in GRFs) of lanes 8..15 within the payload.
        int offset = (8 * sizeof(uint64_t)) / getGRFSize();
        auto dst = createDst(dcl->getRegVar(), offset, 0, 1, offsets->getType());
        auto src = createSrc(offsets->getBase(),
            offsets->getRegOff() + offset, offsets->getSubRegOff(), getRegionStride1(), offsets->getType());
        // Predicate and globalOffset are duplicated so each instruction owns
        // its operands.
        createInst(duplicateOperand(pred), G4_add, 0, g4::NOSAT, g4::SIMD8, dst, src,
            duplicateOperand(globalOffset), getSplitHiEMask(16, mask), true);
    }
    return createSrcRegRegion(dcl, getRegionStride1());
}
3056
// Translate an SVM gather4 into an A64 untyped surface read send on DC1:
// reads the channels enabled in chMask, per lane, from 64-bit addresses.
//
// globalOffset - scalar offset added to every lane's address; when it is
//                not the immediate zero, offsets are recomputed up front
//                via getSVMOffset().
// offsets      - per-lane 64-bit addresses (the sole payload source).
// dst          - receives one response unit per SIMD8 group per enabled
//                channel.
int IR_Builder::translateSVMGather4Inst(
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    ChannelMask chMask,
    G4_Predicate *pred,
    G4_Operand *globalOffset,
    G4_SrcRegRegion *offsets,
    G4_DstRegRegion *dst)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_8 || execSize == EXEC_SIZE_16,
        "Only support SIMD8 or SIMD16!");

    G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
    G4_InstOpts instOpt = Get_Gen4_Emask(eMask, exSize);

    bool useSplitSend = useSends();

    // In case non-zero global offset is specified, we need to recalculate
    // offsets.
    if (!globalOffset->isImm() || globalOffset->asImm()->getImm() != 0)
    {
        offsets = getSVMOffset(globalOffset, offsets, exSize, pred, instOpt);
    }

    PayloadSource sources[1]; // Maximal 1 sources, offsets
    unsigned len = 0;

    sources[len].opnd = offsets;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;

    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    SFID sfid = SFID::DP_DC1;

    // Function-control bits of the message descriptor.
    unsigned FC = 0;
    // Leave sidebind scaled offset 0 as it is not used now.
    FC |= DC1_A64_UNTYPED_SURFACE_READ << 14;
    FC |= (execSize == EXEC_SIZE_8 ? MDC_SM3_SIMD8 : MDC_SM3_SIMD16) << 12;
    FC |= chMask.getHWEncoding() << 8;
    FC |= getA64BTI();

    // One response unit per SIMD8 group for each enabled channel.
    unsigned resLen = (exSize / GENX_DATAPORT_IO_SZ) *
        chMask.getNumEnabledChannels();
    if (msgs[1] == 0) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            resLen,
            exSize,
            FC, sfid,
            false,
            SendAccess::READ_ONLY,
            NULL, NULL,
            instOpt, false);
    }
    else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            resLen,
            exSize,
            FC, sfid,
            false,
            SendAccess::READ_ONLY,
            NULL, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
3132
// Translate an SVM scatter4 into an A64 untyped surface write send on DC1:
// writes the channels enabled in chMask, per lane, to 64-bit addresses.
//
// globalOffset - scalar offset added to every lane's address; when it is
//                not the immediate zero, offsets are recomputed up front
//                via getSVMOffset().
// offsets      - per-lane 64-bit addresses (first payload part).
// src          - channel data to write (second payload part; one unit per
//                enabled channel).
int IR_Builder::translateSVMScatter4Inst(
    VISA_Exec_Size execSize,
    VISA_EMask_Ctrl eMask,
    ChannelMask chMask,
    G4_Predicate *pred,
    G4_Operand *globalOffset,
    G4_SrcRegRegion *offsets,
    G4_SrcRegRegion *src)
{
    TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);

    ASSERT_USER(execSize == EXEC_SIZE_8 || execSize == EXEC_SIZE_16,
        "Only support SIMD8 or SIMD16!");

    G4_ExecSize exSize {Get_VISA_Exec_Size(execSize)};
    G4_InstOpts instOpt = Get_Gen4_Emask(eMask, exSize);
    bool useSplitSend = useSends();

    // In case non-zero global offset is specified, we need to recalculate
    // offsets.
    if (!globalOffset->isImm() || globalOffset->asImm()->getImm() != 0)
    {
        offsets = getSVMOffset(globalOffset, offsets, exSize, pred, instOpt);
    }

    PayloadSource sources[2]; // Maximal 2 sources, offsets + src
    unsigned len = 0;

    sources[len].opnd = offsets;
    sources[len].execSize = exSize;
    sources[len].instOpt = instOpt;
    ++len;
    sources[len].opnd = src;
    sources[len].execSize = G4_ExecSize(exSize * chMask.getNumEnabledChannels());
    sources[len].instOpt = instOpt;
    ++len;

    G4_SrcRegRegion *msgs[2] = {0, 0};
    unsigned sizes[2] = {0, 0};
    preparePayload(msgs, sizes, exSize, useSplitSend, sources, len);

    SFID sfid = SFID::DP_DC1;

    // Function-control bits of the message descriptor.
    unsigned FC = 0;
    // Leave sidebind scaled offset 0 as it is not used now.
    FC |= DC1_A64_UNTYPED_SURFACE_WRITE << 14;
    FC |= (execSize == EXEC_SIZE_8 ? MDC_SM3_SIMD8 : MDC_SM3_SIMD16) << 12;
    FC |= chMask.getHWEncoding() << 8;
    FC |= getA64BTI();

    // Writes have no response payload; destination is the null register.
    G4_DstRegRegion *dst = createNullDst(Type_UD);
    if (msgs[1] == 0) {
        ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
        createSendInst(pred, dst,
            msgs[0], sizes[0],
            0,
            exSize,
            FC, sfid,
            false,
            SendAccess::WRITE_ONLY,
            NULL, NULL,
            instOpt, false);
    }
    else {
        createSplitSendInst(pred, dst,
            msgs[0], sizes[0], msgs[1], sizes[1],
            0,
            exSize,
            FC, sfid,
            false,
            SendAccess::WRITE_ONLY,
            NULL, NULL,
            instOpt, false);
    }

    return VISA_SUCCESS;
}
3210
translateVISASVMGather4ScaledInst(VISA_Exec_Size execSize,VISA_EMask_Ctrl eMask,ChannelMask chMask,G4_Predicate * pred,G4_Operand * globalOffset,G4_SrcRegRegion * offsets,G4_DstRegRegion * dst)3211 int IR_Builder::translateVISASVMGather4ScaledInst(
3212 VISA_Exec_Size execSize,
3213 VISA_EMask_Ctrl eMask,
3214 ChannelMask chMask,
3215 G4_Predicate *pred,
3216 G4_Operand *globalOffset,
3217 G4_SrcRegRegion *offsets,
3218 G4_DstRegRegion *dst)
3219 {
3220 return translateSVMGather4Inst(execSize, eMask, chMask, pred,
3221 globalOffset, offsets, dst);
3222 }
3223
translateVISASVMScatter4ScaledInst(VISA_Exec_Size execSize,VISA_EMask_Ctrl eMask,ChannelMask chMask,G4_Predicate * pred,G4_Operand * globalOffset,G4_SrcRegRegion * offsets,G4_SrcRegRegion * src)3224 int IR_Builder::translateVISASVMScatter4ScaledInst(
3225 VISA_Exec_Size execSize,
3226 VISA_EMask_Ctrl eMask,
3227 ChannelMask chMask,
3228 G4_Predicate *pred,
3229 G4_Operand *globalOffset,
3230 G4_SrcRegRegion *offsets,
3231 G4_SrcRegRegion *src)
3232 {
3233 return translateSVMScatter4Inst(execSize, eMask, chMask, pred,
3234 globalOffset, offsets, src);
3235 }
3236
3237