1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2020-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "BuildIR.h"
10 #include "../Timer.h"
11
12 using namespace vISA;
13
// Bit positions inside the 32-bit send message descriptor used by the
// translators in this file.
// Bit 29: third SIMD-mode bit of a sampler message (SIMD Mode[2]).
static constexpr unsigned SIMD_MODE_2_OFFSET = 29;
// Bit 30: return format (sampler) / message precision subtype (RT write).
static constexpr unsigned MESSAGE_PRECISION_SUBTYPE_OFFSET = 30;
16
createSamplerMsgDesc(VISASampler3DSubOpCode samplerOp,bool isNativeSIMDSize,bool isFP16Return,bool isFP16Input)17 static uint32_t createSamplerMsgDesc(
18 VISASampler3DSubOpCode samplerOp,
19 bool isNativeSIMDSize,
20 bool isFP16Return,
21 bool isFP16Input)
22 {
23 // Now create message descriptor
24 // 7:0 - BTI
25 // 11:8 - Sampler Index
26 // 16:12 - Message Type
27 // 18:17 - SIMD Mode[0:1]
28 // 19 - Header Present
29 // 24:20 - Response Length
30 // 28:25 - Message Length
31 // 29 - SIMD Mode[2]
32 // 30 - Return Format
33 // 31 - CPS Message LOD Compensation Enable
34 // We only set message type, SIMD mode, and return format here. The other fields
35 // are set in createSendInst as they are common with other send messages
36 uint32_t fc = 0;
37
38 fc |= ((uint32_t)samplerOp & 0x1f) << 12;
39
40 if (isNativeSIMDSize)
41 {
42 fc |= (1 << 17);
43 }
44 else
45 {
46 fc |= (2 << 17);
47 }
48
49 if (isFP16Return)
50 {
51 // 16-bit return type. Note that this doesn't change the return length
52 fc |= (1 << MESSAGE_PRECISION_SUBTYPE_OFFSET);
53 }
54
55 if (isFP16Input)
56 {
57 fc |= (1 << SIMD_MODE_2_OFFSET);
58 }
59
60 return fc;
61 }
62
63
translateVISASampleInfoInst(VISA_Exec_Size executionSize,VISA_EMask_Ctrl emask,ChannelMask chMask,G4_Operand * surface,G4_DstRegRegion * dst)64 int IR_Builder::translateVISASampleInfoInst(
65 VISA_Exec_Size executionSize,
66 VISA_EMask_Ctrl emask,
67 ChannelMask chMask,
68 G4_Operand* surface,
69 G4_DstRegRegion* dst)
70 {
71 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
72
73 G4_ExecSize execSize {Get_VISA_Exec_Size(executionSize)};
74 G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
75 VISAChannelMask channels = chMask.getAPI();
76 bool useFakeHeader = (getPlatform() < GENX_SKL) ? false :
77 (channels == CHANNEL_MASK_R);
78 bool preEmption = forceSamplerHeader();
79 bool forceSplitSend = shouldForceSplitSend(surface);
80 bool useHeader = true;
81 // SAMPLEINFO has 0 parameters so its only header
82
83 unsigned int numRows = 1;
84
85 G4_Declare *msg = NULL;
86 G4_SrcRegRegion *m0 = NULL;
87
88 if (!useFakeHeader || forceSplitSend || preEmption)
89 {
90 msg = getSamplerHeader(false /*isBindlessSampler*/, false /*samperIndexGE16*/);
91
92 unsigned int secondDword = chMask.getHWEncoding() << 12;
93
94 G4_Imm* immOpndSecondDword = createImm(secondDword, Type_UD);
95
96 // mov (1) msg(0,2) immOpndSecondDword
97 auto payloadDstRgn = createDst(msg->getRegVar(), 0, 2, 1, Type_UD);
98
99 G4_INST* movInst = createMov(g4::SIMD1, payloadDstRgn, immOpndSecondDword, InstOpt_NoOpt, true);
100 movInst->setOptionOn(InstOpt_WriteEnable);
101
102 m0 = createSrcRegRegion(msg, getRegionStride1());
103 }
104 else
105 {
106 useHeader = false;
107 msg = createTempVar(getNativeExecSize(), Type_UD, GRFALIGN);
108 G4_DstRegRegion *dst = createDst(msg->getRegVar(), 0, 0, 1, Type_UD);
109 G4_Imm* src0Imm = createImm(0, Type_UD);
110 (void) createMov(getNativeExecSize(), dst, src0Imm, InstOpt_WriteEnable, true);
111 m0 = createSrc(msg->getRegVar(), 0, 0, getRegionStride1(), Type_UD);
112 }
113 // Now create message descriptor
114 // 7:0 - BTI
115 // 11:8 - Sampler Index
116 // 16:12 - Message Type
117 // 18:17 - SIMD Mode
118 // 19 - Header Present
119 // 24:20 - Response Length
120 // 28:25 - Message Length
121 // 29 - SIMD Mode
122 // 30 - Return Format
123 // 31 - CPS Message LOD Compensation Enable
124 unsigned int fc = 0;
125
126 fc |= ((unsigned int) VISA_3D_SAMPLEINFO & 0x1f) << 12;
127
128 if (execSize == getNativeExecSize())
129 {
130 fc |= (1 << 17);
131 }
132 else
133 {
134 fc |= (2 << 17);
135 }
136
137 uint32_t retSize = (execSize == getNativeExecSize() ? chMask.getNumEnabledChannels() : chMask.getNumEnabledChannels() * 2);
138
139 if (forceSplitSend)
140 {
141 createSplitSendInst(NULL, dst, m0, numRows,
142 createNullSrc(Type_UD), 0, retSize,
143 execSize, fc, SFID::SAMPLER, useHeader, SendAccess::READ_ONLY, surface, NULL, instOpt, false);
144 }
145 else
146 {
147 createSendInst(NULL, dst, m0, numRows, retSize,
148 execSize, fc, SFID::SAMPLER, useHeader, SendAccess::READ_ONLY, surface, NULL, instOpt, false);
149 }
150
151 return VISA_SUCCESS;
152 }
153
translateVISAResInfoInst(VISA_Exec_Size executionSize,VISA_EMask_Ctrl emask,ChannelMask chMask,G4_Operand * surface,G4_SrcRegRegion * lod,G4_DstRegRegion * dst)154 int IR_Builder::translateVISAResInfoInst(
155 VISA_Exec_Size executionSize,
156 VISA_EMask_Ctrl emask,
157 ChannelMask chMask,
158 G4_Operand* surface,
159 G4_SrcRegRegion* lod,
160 G4_DstRegRegion* dst)
161 {
162 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
163
164 G4_ExecSize execSize {Get_VISA_Exec_Size(executionSize)};
165 G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
166 //For SKL if channels are continuous don't need header
167
168 VISAChannelMask channels = chMask.getAPI();
169 bool preEmption = forceSamplerHeader();
170 bool useHeader = preEmption || (getPlatform() < GENX_SKL) ? channels != CHANNEL_MASK_RGBA :
171 (channels != CHANNEL_MASK_R && channels != CHANNEL_MASK_RG && channels != CHANNEL_MASK_RGB && channels != CHANNEL_MASK_RGBA);
172
173 // Setup number of rows = (header + lod) by default
174 unsigned int numRows = (execSize == getNativeExecSize() ? 1 : 2);
175 if (useHeader)
176 {
177 numRows++;
178 }
179 unsigned int regOff = 0;
180 uint32_t returnLength = (execSize == getNativeExecSize() ? chMask.getNumEnabledChannels() : chMask.getNumEnabledChannels() * 2);
181
182 bool useSplitSend = useSends();
183
184 G4_Declare *msg = NULL;
185 G4_Declare *payloadUD = NULL;
186 if (useSplitSend)
187 {
188 if (useHeader)
189 {
190 --numRows;
191 }
192 unsigned int numElts = numRows * numEltPerGRF<Type_UB>()/TypeSize(Type_F);
193 msg = getSamplerHeader(false /*isBindlessSampler*/, false /*samperIndexGE16*/);
194 payloadUD = createSendPayloadDcl(numElts, Type_UD);
195 }
196 else
197 {
198 unsigned int numElts = numRows * numEltPerGRF<Type_UB>()/TypeSize(Type_F);
199 msg = createSendPayloadDcl(numElts, Type_UD);
200 payloadUD = createSendPayloadDcl(numElts - (useHeader ? GENX_SAMPLER_IO_SZ : 0), Type_UD);
201 payloadUD->setAliasDeclare(msg, useHeader ? numEltPerGRF<Type_UB>() : 0);
202
203 if (useHeader)
204 {
205 // Both SAMPLEINFO and RESINFO use header
206 createMovR0Inst(msg, 0, 0, true);
207 }
208 }
209
210 if (useHeader)
211 {
212 unsigned int secondDword = 0;
213 secondDword |= (chMask.getHWEncoding() << 12);
214
215 G4_Imm* immOpndSecondDword = createImm(secondDword, Type_UD);
216
217 // mov (1) msg(0,2) immOpndSecondDword
218 auto payloadDstRgn = createDst(msg->getRegVar(), 0, 2, 1, Type_UD);
219
220 G4_INST* movInst = createMov(g4::SIMD1, payloadDstRgn, immOpndSecondDword, InstOpt_NoOpt, true);
221 movInst->setOptionOn(InstOpt_WriteEnable);
222 }
223
224 // Copy over lod vector operand to payload's 1st row
225 Copy_SrcRegRegion_To_Payload(payloadUD, regOff, lod, execSize, instOpt | InstOpt_BreakPoint);
226
227 // Now create message descriptor
228 // 7:0 - BTI
229 // 11:8 - Sampler Index
230 // 16:12 - Message Type
231 // 18:17 - SIMD Mode
232 // 19 - Header Present
233 // 24:20 - Response Length
234 // 28:25 - Message Length
235 // 29 - SIMD Mode
236 // 30 - Return Format
237 // 31 - CPS Message LOD Compensation Enable
238 unsigned int fc = 0;
239
240 fc |= ((unsigned int) VISA_3D_RESINFO & 0x1f) << 12;
241
242 if (execSize == getNativeExecSize())
243 {
244 fc |= (1 << 17);
245 }
246 else
247 {
248 fc |= (2 << 17);
249 }
250
251 if (useSplitSend)
252 {
253 G4_SrcRegRegion *m0 = nullptr;
254 G4_SrcRegRegion *m1 = nullptr;
255 unsigned int src0Size = 0;
256 unsigned int src1Size = 0;
257
258 if (useHeader)
259 {
260 m0 = createSrcRegRegion(msg, getRegionStride1());
261 m1 = createSrcRegRegion(payloadUD, getRegionStride1());
262 src0Size = 1;
263 src1Size = numRows;
264 }
265 else
266 {
267 m0 = createSrcRegRegion(payloadUD, getRegionStride1());
268 m1 = createNullSrc(Type_UD);
269 src0Size = numRows;
270 src1Size = 0;
271 }
272 createSplitSendInst(NULL, dst, m0, src0Size, m1, src1Size, returnLength,
273 execSize, fc, SFID::SAMPLER, useHeader, SendAccess::READ_ONLY, surface, NULL, instOpt, false);
274 }
275 else
276 {
277 G4_SrcRegRegion *m = createSrcRegRegion(msg, getRegionStride1());
278 createSendInst(NULL, dst, m, numRows, returnLength,
279 execSize, fc, SFID::SAMPLER, useHeader, SendAccess::READ_ONLY, surface, NULL, instOpt, false);
280 }
281
282 return VISA_SUCCESS;
283 }
284
285
286
287 // generate a URB_SIMD8* message
288 // urbHandle -- 1 GRF holding 8 URB handles. This is the header of the message
289 // perSlotOffset -- 1 GRF holding 8 DWord offsets. If present, it must be immediately after the header
290 // channelMask -- 1 GRF holding 8 8-bit masks. In vISA spec they have constant values and must be
291 // identical. If present, occurs after the per slot message phase if the per slot
292 // message phase exists else it occurs after the header.
293
translateVISAURBWrite3DInst(G4_Predicate * pred,VISA_Exec_Size executionSize,VISA_EMask_Ctrl emask,uint8_t numOut,uint16_t globalOffset,G4_SrcRegRegion * channelMask,G4_SrcRegRegion * urbHandle,G4_SrcRegRegion * perSlotOffset,G4_SrcRegRegion * vertexData)294 int IR_Builder::translateVISAURBWrite3DInst(
295 G4_Predicate* pred,
296 VISA_Exec_Size executionSize,
297 VISA_EMask_Ctrl emask,
298 uint8_t numOut,
299 uint16_t globalOffset,
300 G4_SrcRegRegion* channelMask,
301 G4_SrcRegRegion* urbHandle,
302 G4_SrcRegRegion* perSlotOffset,
303 G4_SrcRegRegion* vertexData)
304 {
305 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
306
307 G4_ExecSize execSize {Get_VISA_Exec_Size(executionSize)};
308 G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
309
310 if (numOut == 0)
311 {
312 MUST_BE_TRUE(vertexData->isNullReg(), "vertex payload must be null ARF when numOut is 0");
313 }
314
315 // header + channelMask + numOut
316 unsigned int numRows = 2 + numOut;
317 const bool useHeader = true;
318 bool usePerSlotIndex = false;
319 bool useChannelMask = true;
320
321 if (!perSlotOffset->isNullReg())
322 {
323 usePerSlotIndex = true;
324 numRows++;
325 }
326
327 if (channelMask->isNullReg())
328 {
329 useChannelMask = false;
330 numRows--;
331 }
332
333 bool useSplitSend = useSends();
334 // So far, we don't have a obvious cut except for header. As the result,
335 // split-send is disabled once there's no header in the message.
336 if (!useHeader)
337 useSplitSend = false;
338
339 if (numOut == 0)
340 {
341 // no split send if payload is null
342 useSplitSend = false;
343 }
344
345 // msg is the header for split send, or the entire payload for regular send
346 G4_Declare *msg = NULL;
347 G4_Declare* payloadF = NULL;
348 G4_Declare* payloadD = NULL;
349 G4_Declare* payloadUD = NULL;
350 if (useSplitSend)
351 {
352 ASSERT_USER(useHeader, "So far, split-send is only used when header is present!");
353 --numRows;
354 if (numRows > 0)
355 {
356 unsigned int numElts = numRows * numEltPerGRF<Type_UB>()/TypeSize(Type_F);
357 // we can use the urb handle directly since URB write will not modify its header
358 //msg = createSendPayloadDcl(GENX_SAMPLER_IO_SZ, Type_UD);
359 payloadUD = createSendPayloadDcl(numElts, Type_UD);
360 payloadF = createSendPayloadDcl(numElts, Type_F);
361 payloadD = createSendPayloadDcl(numElts, Type_D);
362 payloadF->setAliasDeclare(payloadUD, 0);
363 payloadD->setAliasDeclare(payloadUD, 0);
364 }
365 }
366 else
367 {
368 unsigned int numElts = numRows * numEltPerGRF<Type_UB>()/TypeSize(Type_F);
369 msg = createSendPayloadDcl(numElts, Type_UD);
370 if (numRows > 1)
371 {
372 payloadUD = createSendPayloadDcl(numElts - (useHeader ? GENX_SAMPLER_IO_SZ : 0), Type_UD);
373 payloadF = createSendPayloadDcl(numElts - (useHeader ? GENX_SAMPLER_IO_SZ : 0), Type_F);
374 payloadD = createSendPayloadDcl(numElts - (useHeader ? GENX_SAMPLER_IO_SZ : 0), Type_D);
375 payloadUD->setAliasDeclare(msg, useHeader ? numEltPerGRF<Type_UB>() : 0);
376 payloadF->setAliasDeclare(msg, useHeader ? numEltPerGRF<Type_UB>() : 0);
377 payloadD->setAliasDeclare(msg, useHeader ? numEltPerGRF<Type_UB>() : 0);
378 }
379 }
380
381 unsigned int regOff = 0;
382 // Setup header
383 if (useHeader && msg != NULL)
384 {
385 unsigned ignoredOff = 0;
386 Copy_SrcRegRegion_To_Payload(msg, ignoredOff, urbHandle, g4::SIMD8, instOpt);
387 }
388
389 if (usePerSlotIndex)
390 {
391 Copy_SrcRegRegion_To_Payload(payloadUD, regOff, perSlotOffset, g4::SIMD8, instOpt);
392 }
393
394 if (useChannelMask)
395 {
396
397 // shl (8) M2.0<1>:ud cmask<8;8,1>:ud 0x10:uw
398 auto payloadUDRegRgnRow2 = createDst(payloadUD->getRegVar(), regOff++, 0, 1, Type_UD);
399
400 createBinOp(G4_shl, g4::SIMD8, payloadUDRegRgnRow2, channelMask, createImm(16, Type_UW),
401 instOpt, true);
402 }
403
404 G4_Declare* vertexDataDcl = numOut == 0 ? NULL : vertexData->getBase()->asRegVar()->getDeclare();
405
406 bool needsDataMove = (!useSplitSend || usePerSlotIndex || useChannelMask);
407 if (needsDataMove)
408 {
409 // we have to insert moves to make payload contiguous
410 unsigned int startSrcRow = vertexData->getRegOff();
411
412 for (int i = 0; i < numOut; i++)
413 {
414 G4_DstRegRegion payloadTypedRegRowi(Direct, payloadF->getRegVar(), regOff++, 0, 1, Type_F);
415 G4_DstRegRegion* payloadTypedRegRowRgni = createDstRegRegion(payloadTypedRegRowi);
416
417 G4_SrcRegRegion* vertexSrcRegRgnRowi = createSrc(vertexDataDcl->getRegVar(), startSrcRow++, 0, getRegionStride1(), Type_F);
418
419 createMov(g4::SIMD8, payloadTypedRegRowRgni, vertexSrcRegRgnRowi, instOpt, true);
420 }
421 }
422 else
423 {
424 payloadUD = vertexDataDcl;
425 }
426
427 // Msg descriptor
428 unsigned int fc = 0;
429
430 fc |= 0x7;
431
432 fc |= (globalOffset << 4);
433
434 if (useChannelMask)
435 {
436 fc |= (0x1 << 15);
437 }
438
439 if (usePerSlotIndex)
440 {
441 fc |= (0x1 << 17);
442 }
443
444 if (useSplitSend)
445 {
446 G4_SrcRegRegion *m0 = urbHandle;
447 G4_SrcRegRegion *m1 = nullptr;
448
449 if (needsDataMove)
450 {
451 m1 = createSrcRegRegion(payloadUD, getRegionStride1());
452 }
453 else
454 {
455 ASSERT_USER(payloadUD == vertexDataDcl,
456 "If there is no need for data move then payloadUD == vertexDataDcl must hold!");
457
458 m1 = createSrc(
459 payloadUD->getRegVar(),
460 vertexData->getRegOff(),
461 vertexData->getSubRegOff(),
462 getRegionStride1(),
463 payloadUD->getElemType());
464 }
465
466 createSplitSendInst(pred, createNullDst(Type_UD), m0, 1, m1, numRows, 0,
467 execSize, fc, SFID::URB, useHeader, SendAccess::WRITE_ONLY, NULL, NULL, instOpt, false);
468 } else {
469 G4_SrcRegRegion *m = createSrcRegRegion(msg, getRegionStride1());
470 createSendInst(pred, createNullDst(Type_UD), m, numRows, 0,
471 execSize, fc, SFID::URB, useHeader, SendAccess::WRITE_ONLY, nullptr, nullptr, instOpt, false);
472 }
473 return VISA_SUCCESS;
474 }
475
/*****************************************************************************\
ENUM: EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL
Encodings for the render target message subtype field of the render target
write message descriptor.
\*****************************************************************************/
enum EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL
{
    EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD16_SINGLE_SOURCE            = 0,
    EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD16_SINGLE_SOURCE_REPLICATED = 1,
    EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_DUAL_SOURCE_LOW           = 2,
    EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_DUAL_SOURCE_HIGH          = 3,
    EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_SINGLE_SOURCE_LOW         = 4,
    EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_IMAGE_WRITE               = 5
};
488
translateVISARTWrite3DInst(G4_Predicate * pred,VISA_Exec_Size executionSize,VISA_EMask_Ctrl emask,G4_Operand * surface,G4_SrcRegRegion * r1HeaderOpnd,G4_Operand * rtIndex,vISA_RT_CONTROLS cntrls,G4_SrcRegRegion * sampleIndexOpnd,G4_Operand * cpsCounter,unsigned int numParms,G4_SrcRegRegion ** msgOpnds)489 int IR_Builder::translateVISARTWrite3DInst(
490 G4_Predicate* pred,
491 VISA_Exec_Size executionSize,
492 VISA_EMask_Ctrl emask,
493 G4_Operand *surface,
494 G4_SrcRegRegion *r1HeaderOpnd,
495 G4_Operand *rtIndex,
496 vISA_RT_CONTROLS cntrls,
497 G4_SrcRegRegion *sampleIndexOpnd,
498 G4_Operand *cpsCounter,
499 unsigned int numParms,
500 G4_SrcRegRegion ** msgOpnds)
501 {
502 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
503
504 G4_ExecSize execSize = toExecSize(executionSize);
505 G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
506 bool useHeader = false;
507
508 uint8_t varOffset = 0;
509 G4_SrcRegRegion * s0a = NULL;
510 //oMask
511 G4_SrcRegRegion * oM = NULL;
512 if (cntrls.s0aPresent)
513 {
514 s0a = msgOpnds[varOffset];
515 ++varOffset;
516 }
517 if (cntrls.oMPresent)
518 {
519 oM = msgOpnds[varOffset];
520 ++varOffset;
521 }
522
523 G4_SrcRegRegion * R = msgOpnds[varOffset++];
524 G4_SrcRegRegion * G = msgOpnds[varOffset++];
525 G4_SrcRegRegion * B = msgOpnds[varOffset++];
526 G4_SrcRegRegion * A = msgOpnds[varOffset++];
527 //depth
528 G4_SrcRegRegion * Z = NULL;
529
530 if (cntrls.zPresent)
531 Z = msgOpnds[varOffset++];
532
533 //stencil
534 G4_SrcRegRegion * S = NULL;
535 if (cntrls.isStencil)
536 {
537 S = msgOpnds[varOffset++];
538 }
539
540 if (varOffset != numParms)
541 {
542 assert(0);
543 return VISA_FAILURE;
544 }
545
546 bool FP16Data = R->getType() == Type_HF;
547 if (FP16Data)
548 {
549 MUST_BE_TRUE((G->isNullReg() || G->getType() == Type_HF) &&
550 (B->isNullReg() || B->getType() == Type_HF) &&
551 (A->isNullReg() || A->getType() == Type_HF),
552 "R,G,B,A for RT write must have the same type");
553 }
554
555 auto mult = (execSize == getNativeExecSize() ? 1 : 2);
556 mult = (FP16Data)? 1 : mult;
557
558 //RGBA sr0Alpha take up one GRF in SIMD8 and SIMD16 modes.
559 //in SIMD8 upper DWORDs are reserved
560 unsigned int numRows = numParms * mult;
561
562 //Depth is always Float
563 //For SIMD16 it is 2 grfs
564 //For SIMD8 it is 1 grf
565 if (FP16Data && cntrls.zPresent && executionSize == EXEC_SIZE_16)
566 {
567 ++numRows;
568 }
569
570 if (cntrls.oMPresent && mult == 2)
571 {
572 // oM is always 1 row irrespective of execSize
573 numRows--;
574 }
575
576 //although for now HW only supports stencil in SIMD8 mode
577 if (cntrls.isStencil && mult == 2)
578 {
579 // stencil is always 1 row irrespective of execSize
580 numRows--;
581 }
582
583 // header is always 64 byte
584 const int numDWInHeader = 16;
585 const int headerBytes = numDWInHeader * sizeof(int);
586 const int numHeaderGRF = numDWInHeader / getNativeExecSize();
587
588 /*
589 All other values should be set by default.
590 Most of the time when renderTargetIndex != 0, src0Alpha is present also
591 */
592 bool isRTIdxNonzero = cntrls.RTIndexPresent &&
593 (rtIndex->isSrcRegRegion() || (rtIndex->isImm() && rtIndex->asImm()->getImm() != 0));
594 bool isRTIdxDynamic = cntrls.RTIndexPresent && rtIndex->isSrcRegRegion();
595 bool needsHeaderForMRT = isRTIdxDynamic || cntrls.s0aPresent || (!hasHeaderlessMRTWrite() && isRTIdxNonzero);
596 if (needsHeaderForMRT || cntrls.isSampleIndex)
597 {
598 useHeader = true;
599 numRows += numHeaderGRF;
600 }
601
602 bool useSplitSend = useSends();
603 // So far, we don't have a obvious cut except for header. As the result,
604 // split-send is disabled once there's no header in the message.
605
606 G4_SrcRegRegion* srcToUse = NULL;
607 G4_Declare *msg = NULL;
608 G4_Declare *msgF = NULL;
609 G4_Declare *payloadUD = NULL;
610 G4_Declare *payloadUW = NULL;
611 G4_Declare *payloadFOrHF = NULL;
612 G4_Declare *payloadF = NULL;
613
614 if (useSplitSend)
615 {
616 if (useHeader)
617 {
618 //subtracting Header
619 numRows -= numHeaderGRF;
620 //creating header
621 msg = createSendPayloadDcl(numDWInHeader, Type_UD);
622 msgF = createSendPayloadDcl(numDWInHeader, Type_F);
623 msgF->setAliasDeclare(msg, 0);
624 }
625 //creating payload
626 unsigned int numElts = numRows * numEltPerGRF<Type_UB>() / TypeSize(Type_F);
627 payloadUD = createSendPayloadDcl(numElts, Type_UD);
628 payloadFOrHF = createSendPayloadDcl(numElts, FP16Data ? Type_HF : Type_F);
629 payloadUW = createSendPayloadDcl(numElts, Type_UW);
630 payloadF = createSendPayloadDcl(numElts, Type_F);
631
632 payloadFOrHF->setAliasDeclare(payloadUD, 0);
633 payloadUW->setAliasDeclare(payloadUD, 0);
634 payloadF->setAliasDeclare(payloadUD, 0);
635 }
636 else
637 {
638 unsigned int numElts = numRows * numEltPerGRF<Type_UB>()/TypeSize(Type_F);
639 //creating enough space for header + payload
640 msg = createSendPayloadDcl(numElts, Type_UD);
641 msgF = createSendPayloadDcl(GENX_SAMPLER_IO_SZ * 2, Type_F);
642 msgF->setAliasDeclare(msg, 0);
643
644 //creating payload declarations.
645 payloadUD = createSendPayloadDcl(numElts - (useHeader ? numDWInHeader : 0), Type_UD);
646 payloadFOrHF = createSendPayloadDcl(numElts - (useHeader ? numDWInHeader : 0), FP16Data ? Type_HF : Type_F);
647 payloadUW = createSendPayloadDcl(numElts - (useHeader ? numDWInHeader : 0), Type_UW);
648 payloadF = createSendPayloadDcl(numElts, Type_F);
649
650 //setting them to alias a top level decl with offset past the header
651 payloadUD->setAliasDeclare(msg, useHeader ? headerBytes : 0);
652 payloadFOrHF->setAliasDeclare(msg, useHeader ? headerBytes : 0);
653 payloadUW->setAliasDeclare(msg, useHeader ? headerBytes : 0);
654 payloadF->setAliasDeclare(payloadUD, 0);
655 }
656
657 if (useHeader)
658 {
659 ASSERT_USER(r1HeaderOpnd, "Second GRF for header that was passed in is NULL.");
660 G4_DstRegRegion* payloadRegRgn = createDst(msg->getRegVar(), 0, 0, 1, Type_UD);
661
662 G4_Declare* r0 = getBuiltinR0();
663 G4_SrcRegRegion* r0RegRgn = createSrc(r0->getRegVar(), 0, 0, getRegionStride1(), Type_UD);
664
665 //moves data from r0 to header portion of the message
666 G4_INST* movInst = createMov(g4::SIMD8, payloadRegRgn, r0RegRgn, InstOpt_NoOpt, true);
667 movInst->setOptionOn(InstOpt_WriteEnable);
668
669 payloadRegRgn = createDst(msg->getRegVar(), 1, 0, 1, Type_UD);
670 r1HeaderOpnd->setType(Type_UD);
671 movInst = createMov(g4::SIMD8, payloadRegRgn, r1HeaderOpnd, InstOpt_NoOpt, true);
672 movInst->setOptionOn(InstOpt_WriteEnable);
673
674 #define SAMPLE_INDEX_OFFSET 6
675 if (cntrls.isSampleIndex)
676 {
677 G4_Declare* tmpDcl = createTempVar(2, Type_UD, Any);
678 G4_DstRegRegion* tmpDst = createDst(tmpDcl->getRegVar(), 0, 0, 1, Type_UD);
679
680 createBinOp(G4_shl, g4::SIMD1, tmpDst, sampleIndexOpnd, createImm(SAMPLE_INDEX_OFFSET, Type_UD), InstOpt_WriteEnable, true);
681
682 G4_DstRegRegion* payloadUDRegRgn = createDst(msg->getRegVar(), 0, 0, 1, Type_UD);
683 G4_SrcRegRegion* tmpSrc = createSrc(tmpDcl->getRegVar(), 0, 0, getRegionScalar(), Type_UD);
684 G4_SrcRegRegion* payloadSrc = createSrc(msg->getRegVar(), 0, 0, getRegionScalar(), Type_UD);
685 createBinOp(G4_or, g4::SIMD1, payloadUDRegRgn, payloadSrc, tmpSrc, InstOpt_WriteEnable, true);
686 }
687
688 if (isRTIdxNonzero)
689 {
690 G4_DstRegRegion* dstRTIRgn = createDst(msg->getRegVar(), 0, 2, 1, Type_UD);
691
692 G4_INST* rtiMovInst = createMov(g4::SIMD1, dstRTIRgn, rtIndex, InstOpt_NoOpt, true);
693 rtiMovInst->setOptionOn(InstOpt_WriteEnable);
694 }
695
696 //if header is used, then predication value will need to be stored
697 //in the header
698 if (useHeader && (pred || cntrls.isHeaderMaskfromCe0))
699 {
700 //moving pixelMask in to payload
701 G4_DstRegRegion* dstPixelMaskRgn = createDst(
702 msg->getRegVar(), 1, 14, 1, Type_UW);
703
704 // setPixelMaskRgn when WA ce0 is needed
705 auto setPixelMaskRgn = [=](G4_InstOption Option) -> void
706 {
707 G4_Declare* flagDecl = createTempFlag(2, "WAce0");
708 G4_RegVar* flagVar = flagDecl->getRegVar();
709 G4_DstRegRegion* flag = createDst(
710 flagVar, 0,
711 Option == InstOpt_M16 ? 1 : 0,
712 1, Type_UW);
713
714 // (1) (W) mov (1|M0) WAce0.[0|1]:uw, 0
715 // M0 : WAce0.0; M16 : WAce0.1
716 // (2) cmp (16|[M0|M16]) (eq)WAce0.0 r0:uw r0:uw
717 // (3) (W) mov(1|M0) dstPixelMaskRgn:uw WAce0.[0|1]:uw
718 // M0 : WAce0.0; M16 : WAce0.1
719 createMov(g4::SIMD1, flag, createImm(0, Type_UW), InstOpt_WriteEnable, true);
720
721 G4_SrcRegRegion* r0_0 = createSrc(
722 getRealR0()->getRegVar(), 0, 0,
723 getRegionStride1(), Type_UW);
724 G4_SrcRegRegion* r0_1 = createSrc(
725 getRealR0()->getRegVar(), 0, 0,
726 getRegionStride1(), Type_UW);
727 G4_DstRegRegion* nullDst = createNullDst(Type_UW);
728 G4_CondMod* flagCM = createCondMod(Mod_e, flagVar, 0);
729 createInst(NULL, G4_cmp, flagCM, g4::NOSAT, g4::SIMD16, nullDst,
730 r0_0, r0_1, Option, true);
731
732 G4_SrcRegRegion* flagSrc = createSrc(
733 flagVar, 0,
734 Option == InstOpt_M16 ? 1 : 0,
735 getRegionScalar(), Type_UW);
736
737 // move to dstPixelMaskRgn
738 createMov(g4::SIMD1, dstPixelMaskRgn, flagSrc, InstOpt_WriteEnable, true);
739 };
740
741 G4_SrcRegRegion* pixelMask = NULL;
742 if (emask == vISA_EMASK_M5_NM || emask == vISA_EMASK_M5)
743 {
744 if (pred)
745 {
746 //this is a Second half of a SIMD32 RT write. We need to get second half of flag register.
747 //mov whole register in to GRF, move second word of it in to payload.
748
749 G4_SrcRegRegion* pixelMaskTmp = createSrc(
750 pred->getBase()->asRegVar(), 0, 0,
751 getRegionScalar(), Type_UD);
752 G4_Declare* tmpDcl = createTempVar(1, Type_UD, Any);
753 G4_DstRegRegion* tmpDst = createDst(tmpDcl->getRegVar(), 0, 0, 1, Type_UD);
754 createMov(g4::SIMD1, tmpDst, pixelMaskTmp, InstOpt_WriteEnable, true);
755
756 pixelMask = createSrc(
757 tmpDcl->getRegVar(), 0, 1, getRegionScalar(), Type_UW);
758
759 // move from temp register to header
760 createMov(g4::SIMD1, dstPixelMaskRgn, pixelMask, InstOpt_WriteEnable, true);
761 }
762 else
763 {
764 if (VISA_WA_CHECK(getPWaTable(), Wa_1406950495))
765 {
766 setPixelMaskRgn(InstOpt_M16);
767 }
768 else
769 {
770 G4_SrcRegRegion* ce0 = createSrc(
771 phyregpool.getMask0Reg(), 0, 0,
772 getRegionScalar(), Type_UD);
773
774 // shr .14<1>:uw ce0:ud 16:uw
775 createBinOp(G4_shr, g4::SIMD1, dstPixelMaskRgn,
776 ce0, createImm(16, Type_UW), InstOpt_WriteEnable, true);
777 }
778 }
779 }
780 else
781 {
782 if (pred)
783 {
784 pixelMask = createSrc(
785 pred->getBase()->asRegVar(), 0, 0,
786 getRegionScalar(), Type_UW);
787
788 //clearing lower 15 bits
789 createMov(g4::SIMD1, dstPixelMaskRgn, pixelMask, InstOpt_WriteEnable, true);
790 }
791 else
792 {
793 if (VISA_WA_CHECK(getPWaTable(), Wa_1406950495))
794 {
795 setPixelMaskRgn(InstOpt_M0);
796 }
797 else
798 {
799 G4_SrcRegRegion* ce0 = createSrc(
800 phyregpool.getMask0Reg(), 0, 0,
801 getRegionScalar(), Type_UD);
802
803 // mov .14<1>:uw ce0:ud. clearing lower 15 bits
804 createMov(g4::SIMD1, dstPixelMaskRgn, ce0, InstOpt_WriteEnable, true);
805 }
806 }
807 }
808
809 pred = NULL;
810
811 }
812 unsigned int orImmVal = 0;
813
814 //setting first DWORD of MHC_RT_C0 - Render Target Message Header Control
815
816 if (cntrls.isStencil)
817 {
818 orImmVal = (0x1 << 14);
819 }
820
821 if (cntrls.zPresent)
822 {
823 orImmVal = (0x1 << 13);
824 }
825
826 if (cntrls.oMPresent)
827 {
828 orImmVal |= (0x1 << 12);
829 }
830
831 if (cntrls.s0aPresent)
832 {
833 orImmVal |= (0x1 << 11);
834 }
835
836 if (orImmVal != 0)
837 {
838 G4_SrcRegRegion* immSrcRegRgn = createSrc(msg->getRegVar(), 0, 0, getRegionScalar(), Type_UD);
839
840 G4_DstRegRegion* immDstRegRgn = createDst(msg->getRegVar(), 0, 0, 1, Type_UD);
841
842 G4_INST* immOrInst = createBinOp(G4_or, g4::SIMD1, immDstRegRgn, immSrcRegRgn, createImm(orImmVal, Type_UD), InstOpt_WriteEnable, true);
843 immOrInst->setOptionOn(InstOpt_WriteEnable);
844 }
845 }
846
847 // Check whether coalescing is possible
848 #define UNINITIALIZED_DWORD 0xffffffff
849 unsigned int offset = UNINITIALIZED_DWORD;
    // If the header is not present or split-send is available, we will try to
    // coalesce the payload by checking whether the source is already prepared
    // in a contiguous region. If so, we can reuse the source region directly
    // instead of copying it again.
854 bool canCoalesce = !useHeader || useSplitSend;
855 G4_SrcRegRegion* prevRawOpnd = NULL;
856
857 if (R->isNullReg() ||
858 G->isNullReg() ||
859 B->isNullReg() ||
860 A->isNullReg())
861 canCoalesce = false;
862
863 if (canCoalesce && cntrls.s0aPresent)
864 {
865 prevRawOpnd = s0a;
866 offset = getByteOffsetSrcRegion(s0a);
867 }
868
869 if (canCoalesce && cntrls.oMPresent)
870 {
871 //by default it will check based on first opnd type, but that can be HF, F, we need second operand type
872 //according to spec oM is UW
873 canCoalesce = checkIfRegionsAreConsecutive(prevRawOpnd, oM, execSize, oM->getType());
874 prevRawOpnd = oM;
875 if (offset == UNINITIALIZED_DWORD)
876 {
877 offset = getByteOffsetSrcRegion(oM);
878 }
879 }
880
881 if (canCoalesce)
882 {
883 if (execSize == 16 && cntrls.oMPresent)
884 {
885 // oM is 1 GRF for SIMD16 since it is UW type
886 canCoalesce = checkIfRegionsAreConsecutive(oM, R, execSize, Type_UW);
887 prevRawOpnd = R;
888 }
889 else
890 {
891 canCoalesce = checkIfRegionsAreConsecutive(prevRawOpnd, R, execSize);
892 prevRawOpnd = R;
893 }
894
895 if (offset == UNINITIALIZED_DWORD)
896 {
897 offset = getByteOffsetSrcRegion(prevRawOpnd);
898 }
899
900 if (canCoalesce)
901 {
902 auto tempExecSize = execSize;
903 if (FP16Data && execSize == 8)
904 tempExecSize = g4::SIMD16;
905 canCoalesce = checkIfRegionsAreConsecutive(prevRawOpnd, G, tempExecSize) &&
906 checkIfRegionsAreConsecutive(G, B, tempExecSize) &&
907 checkIfRegionsAreConsecutive(B, A, tempExecSize);
908 prevRawOpnd = A;
909 if (offset == UNINITIALIZED_DWORD)
910 {
911 offset = getByteOffsetSrcRegion(A);
912 if (FP16Data && execSize == g4::SIMD8)
913 offset += 8;
914 }
915 }
916 }
917
918 if (canCoalesce && cntrls.zPresent)
919 {
920 canCoalesce = checkIfRegionsAreConsecutive(prevRawOpnd, Z, execSize);
921 prevRawOpnd = Z;
922 }
923
924 if (canCoalesce && cntrls.isStencil)
925 {
926 canCoalesce = checkIfRegionsAreConsecutive(prevRawOpnd, S, execSize);
927 prevRawOpnd = S;
928 }
929
930 if (canCoalesce == false)
931 {
932 // Copy parms to payload
933 unsigned regOff = 0;
934
935 if (cntrls.s0aPresent)
936 {
937
938 Copy_SrcRegRegion_To_Payload(payloadFOrHF, regOff, s0a, execSize, instOpt);
939 }
940
941 if (cntrls.oMPresent)
942 {
943 Copy_SrcRegRegion_To_Payload(payloadUW, regOff, oM, execSize, instOpt);
944 //Copy_SrcRegRegion_To_Payload increments regOff by 1 if byteSize ==2
945 //works for oM since in SIMD16 it occupies one GRF
946 }
947
948
        // When the RT write is HF, s0a, R, G, B and A are allowed to be HF.
        // In SIMD8 the upper DWORDs are reserved.
        // In SIMD16 the upper DWORDs contain the second GRF's worth of values if the type was F.
        //
        // Output can be only Depth, so V0 is passed in if RGBA don't need to be outputted
954 auto offIncrement = 2;
955 if (execSize == 8 || FP16Data)
956 offIncrement = 1;
957
958 if (!R->isNullReg())
959 Copy_SrcRegRegion_To_Payload(payloadFOrHF, regOff, R, execSize, instOpt);
960 else
961 regOff+= offIncrement;
962
963 if (!G->isNullReg())
964 Copy_SrcRegRegion_To_Payload(payloadFOrHF, regOff, G, execSize, instOpt);
965 else
966 regOff+= offIncrement;
967
968 if (!B->isNullReg())
969 Copy_SrcRegRegion_To_Payload(payloadFOrHF, regOff, B, execSize, instOpt);
970 else
971 regOff+= offIncrement;
972
973 if (!A->isNullReg())
974 Copy_SrcRegRegion_To_Payload(payloadFOrHF, regOff, A, execSize, instOpt);
975 else
976 regOff += offIncrement;
977
978 if (cntrls.zPresent)
979 {
980 Copy_SrcRegRegion_To_Payload(payloadF, regOff, Z, execSize, instOpt);
981 }
982
983 if (cntrls.isStencil)
984 {
985 Copy_SrcRegRegion_To_Payload(payloadFOrHF, regOff, S, execSize, InstOpt_WriteEnable);
986 }
987
988 srcToUse = createSrcRegRegion(payloadUD, getRegionStride1());
989 }
990 else
991 {
992 // Coalesce and directly use original raw operand
993 G4_Declare *dcl = R->getBase()->asRegVar()->getDeclare();
994 srcToUse = createSrc(dcl->getRegVar(), offset / 32, 0, getRegionStride1(), R->getType());
995 }
996
    // Now create the message descriptor
998 // 7:0 - BTI
999 // 10:8 - Render Target Message Subtype
1000 // 11 - Slot Group Select
1001 // 12 - Last Render Target Select
1002 // 13 - Reserved (DevBDW)
1003 // 13 - Per-Sample PS Outputs Enable (DevSKL+)
1004 // 17:14 - Message Type
1005 // 18 - Reserved
1006 // 19 - Header Present
1007 // 24:20 - Response Length
1008 // 28:25 - Message Length
1009 // 29 - Reserved
1010 // 30 - Message Precision Subtype (DevBDW+)
1011 // 31 - Reserved (MBZ)
1012 unsigned int fc = 0;
1013
1014 //making explicit
1015 EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL messageType =
1016 (executionSize == EXEC_SIZE_8)
1017 ? EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_SINGLE_SOURCE_LOW
1018 : EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD16_SINGLE_SOURCE;
1019
1020 #define RENDER_TARGET_MESSAGE_SUBTYPE_OFFSET 8
1021 fc |= (messageType << RENDER_TARGET_MESSAGE_SUBTYPE_OFFSET);
1022
1023 #define SLOT_GROUP_SELECT_OFFSET 11
1024 //for SIMD32 for second RT Write setting this bit
1025 if (emask == vISA_EMASK_M5_NM || emask == vISA_EMASK_M5)
1026 fc |= (0x1 << SLOT_GROUP_SELECT_OFFSET);
1027
1028 if (cntrls.isLastWrite)
1029 {
1030 #define LAST_RENDER_TARGET_SELECT_OFFSET 12
1031 fc |= (0x1 << LAST_RENDER_TARGET_SELECT_OFFSET);
1032 }
1033
1034 if (cntrls.isPerSample)
1035 {
1036 #define PER_SAMPLE_PS_ENABLE_OFFSET 13
1037 fc += (0x1 << PER_SAMPLE_PS_ENABLE_OFFSET);
1038 }
1039
1040 if (FP16Data)
1041 {
1042 fc |= 0x1 << MESSAGE_PRECISION_SUBTYPE_OFFSET;
1043 }
1044
1045 #define MESSAGE_TYPE 14
1046 fc |= (0xc << MESSAGE_TYPE);
1047
1048 #define COARSE_PIXEL_OUTPUT_ENABLE 18
1049 if (cntrls.isCoarseMode)
1050 fc |= 0x1 << COARSE_PIXEL_OUTPUT_ENABLE;
1051 #define CPS_COUNTER_EXT_MSG_DESC_OFFSET 16
1052
1053 uint16_t extFuncCtrl = 0;
1054 if (cntrls.isNullRT && getPlatform() >= GENX_TGLLP)
1055 {
1056 // extFuncCtrl is the 16:31 bits of extDesc. NullRT is the bit 20 of extDesc.
1057 // That says NullRT is the bit 4 of extFuncCtrl.
1058 #define NULL_RENDER_TARGET 4
1059 extFuncCtrl |= 0x1 << NULL_RENDER_TARGET;
1060 }
1061
1062 if (useSplitSend || cpsCounter)
1063 {
1064 G4_SendDescRaw *msgDesc = NULL;
1065 G4_SrcRegRegion *m0 = NULL;
1066 bool indirectExDesc = false;
1067 if (useHeader)
1068 {
1069 m0 = createSrcRegRegion(msg, getRegionStride1());
1070 msgDesc = createSendMsgDesc(fc, 0, numHeaderGRF, SFID::DP_WRITE, numRows,
1071 extFuncCtrl, SendAccess::WRITE_ONLY, surface);
1072 msgDesc->setHeaderPresent(useHeader);
1073 }
1074 else
1075 {
1076 if (!isRTIdxNonzero && !cntrls.s0aPresent)
1077 {
1078 // direct imm is a-ok for ext desc
1079 msgDesc = createSendMsgDesc(fc, 0, numRows, SFID::DP_WRITE, 0,
1080 extFuncCtrl, SendAccess::WRITE_ONLY, surface);
1081 }
1082 else
1083 {
1084 assert(rtIndex->isImm() && "RTIndex must be imm at this point");
1085 uint8_t RTIndex = (uint8_t)rtIndex->asImm()->getImm() & 0x7;
1086 uint32_t desc = G4_SendDescRaw::createDesc(fc, false, numRows, 0);
1087 uint32_t extDesc = G4_SendDescRaw::createMRTExtDesc(cntrls.s0aPresent, RTIndex,
1088 false, 0, extFuncCtrl);
1089 msgDesc = createGeneralMsgDesc(desc, extDesc, SendAccess::WRITE_ONLY, surface);
1090
1091 if (!canEncodeFullExtDesc())
1092 {
1093 // we must use a0 for extended msg desc in this case as there aren't enough bits to encode
1094 // the full ext desc
1095 // mov (1) a0.2:ud extDesc
1096 G4_DstRegRegion* dst = createDstRegRegion(getBuiltinA0Dot2(), 1);
1097 createMov(g4::SIMD1, dst, createImm(extDesc, Type_UD), InstOpt_WriteEnable, true);
1098 indirectExDesc = true;
1099 }
1100 }
1101 }
1102
1103 /*
1104 If we need to set cps counter then ext_message descriptor
1105 needs to be a register.
1106 */
1107 if (cpsCounter)
1108 {
1109 ASSERT_USER(hasCPS(), "CPS counter is not supported");
1110 unsigned msgDescValue = msgDesc->getExtendedDesc();
1111
1112 //shifting CPS counter by appropriate number of bits and storing in ext_descriptor operand
1113 G4_DstRegRegion *dstMove2 = createDstRegRegion(getBuiltinA0Dot2(), 1);
1114 G4_Imm *immedOpnd = createImm(msgDescValue, Type_UD);
1115
1116 ///setting lower bits
1117 createBinOp(G4_or, g4::SIMD1, dstMove2, cpsCounter, immedOpnd, InstOpt_WriteEnable, true);
1118 indirectExDesc = true;
1119 }
1120
1121 if (!useHeader)
1122 {
1123 m0 = srcToUse;
1124 srcToUse = createNullSrc(Type_UD);
1125 }
1126
1127 createSplitSendToRenderTarget(
1128 pred,
1129 createNullDst(Type_UD),
1130 m0,
1131 srcToUse,
1132 indirectExDesc ? createSrcRegRegion(getBuiltinA0Dot2(), getRegionScalar()) : nullptr,
1133 execSize,
1134 msgDesc,
1135 instOpt);
1136 }
1137 else
1138 {
1139 G4_SrcRegRegion *m = srcToUse;
1140 if (useHeader)
1141 m = createSrcRegRegion(msg, getRegionStride1());
1142 createSendInst(pred, createNullDst(Type_UD), m, numRows, 0,
1143 execSize, fc, SFID::DP_WRITE, useHeader, SendAccess::WRITE_ONLY, surface, NULL, instOpt, true);
1144 }
1145 return VISA_SUCCESS;
1146
1147 }
1148
1149
1150 // Bit 15 of aoffimmi is set in messages with sampler index >= 16.
IsSamplerIndexGE16(G4_Operand * aoffimmi)1151 static bool IsSamplerIndexGE16(G4_Operand* aoffimmi)
1152 {
1153 bool ret = false;
1154 if (aoffimmi && aoffimmi->isImm())
1155 {
1156 const uint16_t aoffimmiVal = (uint16_t)aoffimmi->asImm()->getInt();
1157 ret = (aoffimmiVal & 0x8000) != 0;
1158 }
1159 return ret;
1160 }
1161
1162
1163 // return the contents of M0.2 for sampler messages. It must be an immediate value
createSampleHeader0Dot2(VISASampler3DSubOpCode op,bool pixelNullMask,uint16_t aoffimmi,ChannelMask channels,IR_Builder * builder)1164 static uint32_t createSampleHeader0Dot2(VISASampler3DSubOpCode op,
1165 bool pixelNullMask,
1166 uint16_t aoffimmi,
1167 ChannelMask channels,
1168 IR_Builder* builder)
1169 {
1170 uint32_t secondDword = aoffimmi & 0xfff;
1171 switch (op)
1172 {
1173 case VISA_3D_GATHER4:
1174 //gather4 source channel select
1175 secondDword |= (channels.getSingleChannel() << 16);
1176 break;
1177 case VISA_3D_GATHER4_PO:
1178 if (builder->hasGather4PO())
1179 {
1180 secondDword |= (channels.getSingleChannel() << 16);
1181 }
1182 break;
1183 case VISA_3D_GATHER4_PO_C:
1184 break;
1185 case VISA_3D_GATHER4_C:
1186 // do nothing as channle must be Red (0)
1187 break;
1188 default:
1189 // RGBA write channel mask
1190 secondDword |= (channels.getHWEncoding() << 12);
1191 break;
1192 }
1193
1194 // M0.2:23, Pixel Null Mask Enable.
1195 // Only valid for SKL+, and ignored otherwise.
1196 if (builder->hasPixelNullMask() && pixelNullMask)
1197 {
1198 secondDword |= 1 << 23;
1199 }
1200
1201 return secondDword;
1202 }
1203
1204 //
1205 // Coarse Pixel Shading(CPS) LOD compensation enable.
1206 //
1207 // - must be disabled if the response length of the message is zero;
1208 // - must be disabled if the messages is from a 32-pixel dispatch thread;
1209 // - must be disabled unless SIMD Mode is SIMD8* or SIMD16*;
1210 // - only available for sample, sample_b, sample_bc, sample_c, and LOD.
1211 //
checkCPSEnable(VISASampler3DSubOpCode op,unsigned reponseLength,unsigned execSize)1212 static void checkCPSEnable(VISASampler3DSubOpCode op,
1213 unsigned reponseLength,
1214 unsigned execSize)
1215 {
1216
1217 ASSERT_USER(reponseLength > 0,
1218 "CPS LOD Compensation Enable must be disabled if the "
1219 "response length is zero");
1220
1221 ASSERT_USER(execSize == 8 || execSize == 16,
1222 "CPS LOD Compensation Enable only valid for SIMD8* or SIMD16*");
1223
1224 ASSERT_USER(op == VISA_3D_SAMPLE ||
1225 op == VISA_3D_SAMPLE_B ||
1226 op == VISA_3D_SAMPLE_C ||
1227 op == VISA_3D_SAMPLE_B_C ||
1228 op == VISA_3D_LOD,
1229 "CPD LOD Compensation Enable only available for "
1230 "sample, sample_b, sample_bc, sample_c and LOD");
1231 }
1232
createSampleHeader(IR_Builder * builder,G4_Declare * header,VISASampler3DSubOpCode actualop,bool pixelNullMask,G4_Operand * aoffimmi,ChannelMask srcChannel,G4_Operand * sampler)1233 static G4_Operand* createSampleHeader(
1234 IR_Builder* builder, G4_Declare* header, VISASampler3DSubOpCode actualop,
1235 bool pixelNullMask, G4_Operand* aoffimmi, ChannelMask srcChannel,
1236 G4_Operand* sampler)
1237 {
1238 G4_Operand* retSampler = sampler;
1239 uint16_t aoffimmiVal = aoffimmi->isImm() ? (uint16_t)aoffimmi->asImm()->getInt() : 0;
1240
1241 unsigned int secondDword = createSampleHeader0Dot2(actualop, pixelNullMask, aoffimmiVal, srcChannel, builder);
1242
1243 G4_Imm* immOpndSecondDword = builder->createImm(secondDword, Type_UD);
1244 G4_DstRegRegion* payloadDstRgn = builder->createDst(header->getRegVar(), 0, 2, 1, Type_UD);
1245 if (aoffimmi->isImm())
1246 {
1247 // mov (1) payload(0,2) immOpndSecondDword
1248 builder->createMov(g4::SIMD1, payloadDstRgn, immOpndSecondDword, InstOpt_WriteEnable, true);
1249 }
1250 else
1251 {
1252 // or (1) payload(0,2) aoffimmi<0;1,0>:uw immOpndSeconDword
1253 builder->createBinOp(G4_or, g4::SIMD1, payloadDstRgn,
1254 aoffimmi, immOpndSecondDword, InstOpt_WriteEnable, true);
1255 }
1256
1257 if (sampler != nullptr)
1258 {
1259 builder->doSamplerHeaderMove(header, sampler);
1260
1261 // Use bit 15 of aoffimmi to tell VISA the sample index could be greater
1262 // than 15. In this case, we need to use msg header, and setup M0.3
1263 // to point to next 16 sampler state.
1264 if (IsSamplerIndexGE16(aoffimmi))
1265 {
1266 retSampler = builder->emitSampleIndexGE16(sampler, header);
1267 }
1268 }
1269
1270 return retSampler;
1271 }
1272
1273
needsNoMaskCoordinates(VISASampler3DSubOpCode opcode)1274 static bool needsNoMaskCoordinates(VISASampler3DSubOpCode opcode)
1275 {
1276 return opcode == VISA_3D_SAMPLE || opcode == VISA_3D_SAMPLE_B || opcode == VISA_3D_SAMPLE_C ||
1277 opcode == VISA_3D_SAMPLE_B_C || opcode == VISA_3D_LOD || opcode == VISA_3D_SAMPLE_KILLPIX;
1278 }
1279
getUPosition(VISASampler3DSubOpCode opcode)1280 static uint8_t getUPosition(VISASampler3DSubOpCode opcode)
1281 {
1282 uint8_t position = 0;
1283 switch (opcode)
1284 {
1285 case VISA_3D_SAMPLE:
1286 case VISA_3D_LOD:
1287 case VISA_3D_SAMPLE_D:
1288 case VISA_3D_SAMPLE_LZ:
1289 case VISA_3D_SAMPLE_KILLPIX:
1290 position = 0;
1291 break;
1292 case VISA_3D_SAMPLE_B:
1293 case VISA_3D_SAMPLE_L:
1294 case VISA_3D_SAMPLE_C:
1295 case VISA_3D_SAMPLE_D_C:
1296 case VISA_3D_SAMPLE_C_LZ:
1297 position = 1;
1298 break;
1299 case VISA_3D_SAMPLE_B_C:
1300 case VISA_3D_SAMPLE_L_C:
1301 position = 2;
1302 break;
1303 default:
1304 MUST_BE_TRUE(false, "unexpected sampler operation");
1305 return 0;
1306 }
1307 return position;
1308 }
1309
setUniformSampler(G4_InstSend * sendInst,bool uniformSampler)1310 static void setUniformSampler(G4_InstSend* sendInst, bool uniformSampler)
1311 {
1312 if (!uniformSampler)
1313 {
1314 sendInst->setSerialize();
1315 }
1316 }
1317
/*
Need to split sample_d and sample_dc into two simd8 sends since HW doesn't support it.
Also need to split any sample instruction that has more than 5 parameters, since there is a limit on msg length.
*/
// Running counter used by splitSampleInst to number the temporary sampler
// destinations it names ("TmpSmplDst_<n>" / "TmpSmplDst2_<n>").
static unsigned TmpSmplDstID = 0;
1323
1324 // TODO: use IR_Builder::getNameString....
getNameString(Mem_Manager & mem,size_t size,const char * format,...)1325 const char* getNameString(
1326 Mem_Manager& mem, size_t size, const char* format, ...)
1327 {
1328 #ifdef _DEBUG
1329 char* name = (char*) mem.alloc(size);
1330 va_list args;
1331 va_start(args, format);
1332 std::vsnprintf(name, size, format, args);
1333 va_end(args);
1334 return name;
1335 #else
1336 const char* name = "";
1337 return const_cast<char*>(name);
1338 #endif
1339 }
1340
// split simd32/16 sampler messages into simd16/8 messages due to HW limitation.
//
// Two sends of the native execution size are emitted. The first uses the
// incoming emask, the second the emask returned by Get_Next_EMask. Each send
// gets its own payload declare and (when dst is not null) its own temporary
// destination; afterwards the temporaries are copied into the original
// destination declare.
//
//   numRows  - message length of each of the two messages
//   numParms - number of entries in params
//   params   - source operands; params[0]'s type size decides "half input"
//
// The return value is always VISA_SUCCESS (status is never reassigned).
int IR_Builder::splitSampleInst(
    VISASampler3DSubOpCode actualop,
    bool pixelNullMask,
    bool cpsEnable,
    G4_Predicate* pred,
    ChannelMask srcChannel,
    int numChannels,
    G4_Operand *aoffimmi,
    G4_Operand *sampler,
    G4_Operand *surface,
    G4_DstRegRegion* dst,
    VISA_EMask_Ctrl emask,
    bool useHeader,
    unsigned numRows, // msg length for each simd8
    unsigned int numParms,
    G4_SrcRegRegion ** params,
    bool uniformSampler)
{
    int status = VISA_SUCCESS;
    // Operands for the second send, filled for indices [0, numParms).
    // NOTE(review): fixed capacity of 12 with no bound check against numParms
    // in this function - confirm callers never pass more than 12 parameters.
    G4_SrcRegRegion *secondHalf[12];

    bool isHalfReturn = dst->getTypeSize() == 2;
    const bool halfInput = params[0]->getTypeSize() == 2;

    // Now, depending on message type emit out parms to payload
    // Row 0 of the payload is taken by the header when one is used.
    unsigned regOff = (useHeader ? 1 : 0);
    G4_SrcRegRegion* temp = nullptr;
    G4_ExecSize execSize = getNativeExecSize();
    uint16_t numElts = numRows * numEltPerGRF<Type_F>();
    // payloadUD and payloadHF below are aliases of payloadF at offset 0.
    G4_Declare* payloadF = createSendPayloadDcl(numElts, Type_F);
    G4_Declare* payloadUD = createTempVar(numElts, Type_UD, GRFALIGN);
    payloadUD->setAliasDeclare(payloadF, 0);
    G4_SrcRegRegion* srcToUse = createSrc(payloadUD->getRegVar(), 0, 0, getRegionStride1(), Type_UD);

    // even though we only use lower half of the GRF, we have to allocate full GRF
    G4_Declare* payloadHF = createTempVar(numElts * 2, Type_HF, Any);
    payloadHF->setAliasDeclare(payloadF, 0);

    /********* Creating temp destination, since results are interleaved **************/
    G4_DstRegRegion *dst1 = createNullDst(dst->getType());
    G4_Declare * originalDstDcl = nullptr;
    G4_Declare* tempDstDcl = nullptr;
    bool pixelNullMaskEnable = false;
    // Stays 0 for a null dst, which makes both copy-back loops below no-ops.
    unsigned tmpDstRows = 0;
    if (!dst->isNullReg())
    {
        originalDstDcl = dst->getBase()->asRegVar()->getDeclare();
        tmpDstRows = numChannels;

        // If Pixel Null Mask is enabled, then one extra GRF is needed for the
        // write back message.
        pixelNullMaskEnable = hasPixelNullMask() && pixelNullMask;
        if (pixelNullMaskEnable) {
            ASSERT_USER(useHeader, "pixel null mask requires a header");
            ++tmpDstRows;
        }

        const char *name = getNameString(mem, 20, "%s%d", "TmpSmplDst_", TmpSmplDstID++);

        tempDstDcl = createDeclareNoLookup(name,
            originalDstDcl->getRegFile(),
            originalDstDcl->getNumElems(),
            (uint16_t)tmpDstRows,
            originalDstDcl->getElemType());

        dst1 = createDstRegRegion(dst->getRegAccess(),
            tempDstDcl->getRegVar(),
            0,
            0,
            1,
            dst->getType());
    }
    /********* End creating temp destination ***********************/

    G4_Declare* header = nullptr;

    if (useHeader)
    {
        const bool samplerIndexGE16 = IsSamplerIndexGE16(aoffimmi);
        bool bindlessSampler = sampler ? isBindlessSampler(sampler) : false;
        header = getSamplerHeader(bindlessSampler, samplerIndexGE16);
        // createSampleHeader may hand back a different sampler operand
        // (sampler index >= 16 case); that operand is used from here on.
        sampler = createSampleHeader(this, header, actualop, pixelNullMask, aoffimmi, srcChannel,
            sampler);
        // Copy the header into row 0 of the first payload.
        createMovInst(payloadUD, 0, 0, g4::SIMD8, nullptr, nullptr,
            createSrcRegRegion(header, getRegionStride1()), true);
    }

    G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
    // First half: copy every parameter into the first payload.
    for (unsigned paramCounter = 0; paramCounter < numParms; ++paramCounter)
    {
        temp = params[paramCounter];
        uint32_t MovInstOpt = InstOpt_WriteEnable;
        if (temp->getTypeSize() == 2)
        {
            // we should generate
            // mov (8) dst<1>:hf src.0<8;8,1>:hf
            // regOff is advanced here by hand (one payload row per 16-bit parameter).
            G4_DstRegRegion* dstHF = createDst(
                payloadHF->getRegVar(), regOff++, 0, 1, temp->getType());
            temp->setRegion(getRegionStride1());
            createMov(g4::SIMD8, dstHF, temp, MovInstOpt, true);
        }
        else
        {
            // regOff is passed as an lvalue and not advanced by hand here.
            Copy_SrcRegRegion_To_Payload(payloadF, regOff, temp, execSize, MovInstOpt);
        }
    }

    uint32_t responseLength = getSamplerResponseLength(numChannels, isHalfReturn, execSize,
        pixelNullMaskEnable, dst->isNullReg());

    // The same descriptor value is used for both sends.
    uint32_t fc = createSamplerMsgDesc(actualop, execSize == getNativeExecSize(), isHalfReturn, halfInput);
    uint32_t desc = G4_SendDescRaw::createDesc(fc, useHeader, numRows, responseLength);

    if (cpsEnable)
    {
        checkCPSEnable(actualop, responseLength, 8);
    }
    G4_SendDescRaw *msgDesc = createSampleMsgDesc(desc, cpsEnable, 0, surface, sampler);

    G4_InstSend* sendInst = nullptr;
    bool forceSplitSend = shouldForceSplitSend(surface);

    /**************** FIRST HALF OF THE SEND *********************/
    if (forceSplitSend)
    {
        sendInst = createSplitSendInst(
            pred, dst1, srcToUse, createNullSrc(Type_UD), execSize, msgDesc, instOpt, false);
    }
    else
    {
        sendInst = createSendInst(
            pred, dst1, srcToUse, execSize, msgDesc, instOpt, false);
    }
    setUniformSampler(sendInst, uniformSampler);

    // SKL+
    // For SIMD8
    //
    // W4.7:1 Reserved (not written): This W4 is only delivered when Pixel Null
    //        Mask Enable is enabled.
    //
    // W4.0  32:8 Reserved: always written as 0xffffff
    //        7:0 Pixel Null Mask: This field has the bit for all pixels set
    //            to 1 except those pixels in which a null page was source for
    //            at least one texel.
    //
    // Need to combine the results from the above two writeback messages.
    // Denote by U0[W4:0] the last row of the first writeback message, and
    // by U1[W4:0] the last row of the second writeback message. Then the last
    // row of the whole writeback message is to take the bitwise OR of
    // U0[W4:0] and U1[W4:0].
    // UD-typed aliases of the two temporaries and of the original dst; only
    // created when the pixel null mask is enabled.
    G4_Declare *tempDstUD = 0;
    G4_Declare *tempDst2UD = 0;
    G4_Declare *origDstUD = 0;

    // temp dst for the second send
    G4_DstRegRegion *dst2 = createNullDst(dst->getType());
    G4_Declare* tempDstDcl2 = nullptr;
    if (!dst->isNullReg())
    {
        const char *name = getNameString(mem, 20, "%s%d", "TmpSmplDst2_", TmpSmplDstID++);

        tempDstDcl2 = createDeclareNoLookup(name,
            originalDstDcl->getRegFile(),
            originalDstDcl->getNumElems(),
            (uint16_t)tmpDstRows,
            originalDstDcl->getElemType());

        if (pixelNullMaskEnable)
        {
            unsigned int numElts = tempDstDcl->getNumElems() * tempDstDcl->getNumRows();
            tempDstUD = createTempVar(numElts, Type_UD, GRFALIGN);
            tempDstUD->setAliasDeclare(tempDstDcl, 0);

            numElts = tempDstDcl2->getNumElems() * tempDstDcl2->getNumRows();
            tempDst2UD = createTempVar(numElts, Type_UD, GRFALIGN);
            tempDst2UD->setAliasDeclare(tempDstDcl2, 0);

            numElts = originalDstDcl->getNumElems() * originalDstDcl->getNumRows();
            origDstUD = createTempVar(numElts, Type_UD, GRFALIGN);
            origDstUD->setAliasDeclare(originalDstDcl, 0);
        }

        dst2 = createDstRegRegion(dst->getRegAccess(),
            tempDstDcl2->getRegVar(),
            0,
            0,
            1,
            dst->getType());
    }
    // update emask
    emask = Get_Next_EMask(emask, execSize);
    G4_InstOpts instOpt2 = Get_Gen4_Emask(emask, execSize);

    // Builds a fresh predicate with the same state and base as `pred`
    // (nullptr when there is no predicate), so each emitted instruction gets
    // its own predicate operand.
    auto dupPredicate = [this](G4_Predicate* pred)
    {
        G4_Predicate* pred2 = nullptr;
        if (pred)
        {
            pred2 = createPredicate(
                pred->getState(),
                pred->getBase(),
                0);
        }

        return pred2;
    };

    {
        /**************** SECOND HALF OF THE SEND *********************/
        // re-create payload declare so the two sends may be issued independently
        // (these locals intentionally shadow the first-half payload declares)
        G4_Declare* payloadF = createSendPayloadDcl(numElts, Type_F);
        G4_Declare* payloadUD = createTempVar(numElts, Type_UD, GRFALIGN);
        payloadUD->setAliasDeclare(payloadF, 0);

        // even though we only use lower half of the GRF, we have to allocate full GRF
        G4_Declare* payloadHF = createTempVar(numElts * 2, Type_HF, Any);
        payloadHF->setAliasDeclare(payloadF, 0);

        G4_SrcRegRegion *srcToUse2 = createSrc(payloadUD->getRegVar(), 0, 0, getRegionStride1(), Type_UD);

        if (useHeader)
        {
            createMovInst(payloadUD, 0, 0, g4::SIMD8, nullptr, nullptr,
                createSrcRegRegion(header, getRegionStride1()), true);
        }

        // Derive the second-half source operand for every parameter.
        for (unsigned int i = 0; i < numParms; i++)
        {
            if (params[i]->isNullReg())
            {
                secondHalf[i] = params[i];
            }
            else if (params[i]->getTypeSize() == 2)
            {
                // V1(0,8)<8;8,1>
                secondHalf[i] = createSrcWithNewSubRegOff(params[i], execSize);
            }
            else
            {
                // V1(1,0)<8;8,1>
                secondHalf[i] = createSrcWithNewRegOff(params[i], params[i]->getRegOff() + 1);
            }
        }

        regOff = (useHeader ? 1 : 0);
        for (unsigned paramCounter = 0; paramCounter < numParms; ++paramCounter)
        {
            temp = secondHalf[paramCounter];
            uint32_t MovInstOpt = InstOpt_WriteEnable;

            if (temp->getTypeSize() == 2)
            {
                // we should generate
                // mov (8) dst<1>:hf src.8<8;8,1>:hf
                G4_DstRegRegion* dstHF = createDst(
                    payloadHF->getRegVar(), regOff++, 0, 1, temp->getType());
                createMov(execSize, dstHF, temp, MovInstOpt, true);
            }
            else
            {
                Copy_SrcRegRegion_To_Payload(payloadF, regOff, temp, execSize, MovInstOpt);
            }
        }

        // The second send works on duplicates of the surface, sampler and
        // predicate operands rather than sharing the first send's.
        G4_Operand *surface2 = duplicateOperand(surface);

        // sampler may be null for 3d load (specifically ld2dms_w)
        G4_Operand* sampler2 = sampler == nullptr ? nullptr : duplicateOperand(sampler);

        G4_Predicate* pred2 = dupPredicate(pred);

        G4_SendDescRaw *msgDesc2 = createSampleMsgDesc(desc, cpsEnable, 0, surface2, sampler2);
        msgDesc2->setHeaderPresent(useHeader);

        if (forceSplitSend)
        {
            sendInst = createSplitSendInst(
                pred2, dst2, srcToUse2, createNullSrc(Type_UD), execSize, msgDesc2, instOpt2, false);
        }
        else
        {
            sendInst = createSendInst(
                pred2, dst2, srcToUse2, execSize, msgDesc2, instOpt2, false);
        }
        setUniformSampler(sendInst, uniformSampler);
    }

    {

        /**************** MOVING FROM TEMP TO DST, 1st half *********************/
        // regOff is the destination row in the original dst; i is the row in
        // the temporary. regOff gets +1 from the loop header on every
        // iteration, and is additionally passed as an lvalue to
        // Copy_SrcRegRegion_To_Payload in the non-half-return branch.
        regOff = 0;
        for (unsigned i = 0; i < tmpDstRows; i++, regOff += 1)
        {
            // If Pixel Null Mask is enabled, then only copy the last double word.
            if (pixelNullMaskEnable && i == tmpDstRows - 1)
            {
                G4_DstRegRegion *origDstPtr = createDst(origDstUD->getRegVar(), short(regOff), 0, 1, Type_UD);
                G4_SrcRegRegion *src0Ptr = createSrc(tempDstUD->getRegVar(),
                    short(i), 0, getRegionScalar(), Type_UD);

                G4_Predicate* pred2 = dupPredicate(pred);

                // Copy the write mask message W4.0 into the dst. (No mask?)
                createInst(pred2, G4_mov, NULL, g4::NOSAT, g4::SIMD1,
                    origDstPtr, src0Ptr, NULL, NULL,
                    InstOpt_WriteEnable, true);
                // Skip the remaining part of the loop.
                break;
            }

            G4_SrcRegRegion *tmpSrcPnt = createSrc(tempDstDcl->getRegVar(), (short)i, 0, getRegionStride1(), tempDstDcl->getElemType());

            uint32_t MovInstOpt = instOpt;
            if (isHalfReturn)
            {
                // mov (8) dst(0,0)<1>:hf tmp(0,0)<8;8,1>:hf {Q1}
                G4_DstRegRegion* dst = createDst(
                    originalDstDcl->getRegVar(), (short)regOff, 0, 1, originalDstDcl->getElemType());
                createMov(execSize, dst, tmpSrcPnt, MovInstOpt, true);
            }
            else
            {
                Copy_SrcRegRegion_To_Payload(originalDstDcl, regOff, tmpSrcPnt, execSize, MovInstOpt);
            }
        }
    }

    {
        /**************** MOVING FROM TEMP TO DST, 2nd half *********************/
        // Half return: both halves share a row, the second half landing at
        // sub-register offset execSize. Otherwise the second half starts one
        // row after the first.
        regOff = isHalfReturn ? 0 : 1;
        for (unsigned i = 0; i < tmpDstRows; i++, regOff += 1)
        {
            // If Pixel Null Mask is enabled, copy the second half to the original dst
            if (pixelNullMaskEnable && i == tmpDstRows - 1) {
                G4_Type secondHalfType = execSize == g4::SIMD8 ? Type_UB : Type_UW;
                // NOTE(review): with isHalfReturn, regOff equals i in both
                // copy-back loops, so `regOff - 1` here names the row before
                // the one the first half wrote its mask to - confirm this
                // combination (half return + pixel null mask) is intended.
                G4_DstRegRegion* origDstPtr = createDst(origDstUD->getRegVar(), regOff - 1, 1, 1, secondHalfType);
                G4_SrcRegRegion* src0Ptr = createSrc(tempDst2UD->getRegVar(),
                    short(i), 0, getRegionScalar(), secondHalfType);

                G4_Predicate* pred2 = dupPredicate(pred);
                // write to dst.0[8:15]
                createInst(pred2, G4_mov, NULL, g4::NOSAT, g4::SIMD1,
                    origDstPtr, src0Ptr, NULL, InstOpt_WriteEnable, true);

                // Skip the remaining part of the loop.
                break;
            }

            G4_SrcRegRegion *tmpSrcPnt = createSrc(tempDstDcl2->getRegVar(), (short)i, 0, getRegionStride1(), tempDstDcl->getElemType());

            uint32_t MovInstOpt = instOpt2;
            if (isHalfReturn)
            {
                // mov (8) dst(0,8)<1>:hf tmp(0,0)<8;8,1>:hf {Q2}
                G4_DstRegRegion* dst = createDst(
                    originalDstDcl->getRegVar(), (short)regOff, execSize, 1, originalDstDcl->getElemType());
                createMov(execSize, dst, tmpSrcPnt, MovInstOpt, true);
            }
            else
            {
                Copy_SrcRegRegion_To_Payload(originalDstDcl, regOff, tmpSrcPnt, execSize, MovInstOpt);
            }
        }
    }
    return status;
}
1708
doSamplerHeaderMove(G4_Declare * headerDcl,G4_Operand * sampler)1709 void IR_Builder::doSamplerHeaderMove(G4_Declare* headerDcl, G4_Operand* sampler)
1710 {
1711 if (isBindlessSampler(sampler))
1712 {
1713 // sampler index in msg desc will be 0, manipulate the sampler offset instead
1714 // mov (1) M0.3<1>:ud sampler<0;1,0>:ud the driver will send the handle with bit 0 already set
1715 G4_DstRegRegion* dst = createDst(headerDcl->getRegVar(), 0, 3, 1, Type_UD);
1716 createMov(g4::SIMD1, dst, sampler, InstOpt_WriteEnable, true);
1717 }
1718 }
1719
1720 //
1721 // generate the r0 move for the sampler message header, and return the dcl
1722 // for CNL+, also set SSP to dynamic if message is not bindless
1723 //
getSamplerHeader(bool isBindlessSampler,bool samplerIndexGE16)1724 G4_Declare* IR_Builder::getSamplerHeader(bool isBindlessSampler, bool samplerIndexGE16)
1725 {
1726 G4_Declare* dcl = nullptr;
1727
1728 G4_InstOpts dbgOpt = m_options->getOption(vISA_markSamplerMoves) ? InstOpt_BreakPoint : InstOpt_NoOpt;
1729 if (m_options->getOption(vISA_cacheSamplerHeader) && !isBindlessSampler)
1730 {
1731 dcl = builtinSamplerHeader;
1732 if (!builtinSamplerHeaderInitialized)
1733 {
1734 builtinSamplerHeaderInitialized = true;
1735 if (hasBindlessSampler())
1736 {
1737 // make sure we set bit 0 of M0.3:ud to be 0
1738 // and (1) M0.6<1>:uw M0.6<1>:uw 0xFFFE
1739 G4_DstRegRegion* dst = createDst(dcl->getRegVar(), 0, 6, 1, Type_UW);
1740 G4_SrcRegRegion* src0 = createSrc(dcl->getRegVar(), 0, 6, getRegionScalar(), Type_UW);
1741 G4_INST* SSPMove = createBinOp(G4_and, g4::SIMD1, dst, src0, createImm(0xFFFE, Type_UW), InstOpt_WriteEnable, false);
1742 instList.push_front(SSPMove);
1743 }
1744 G4_INST* r0Move = createMov(g4::SIMD8,
1745 createDstRegRegion(dcl, 1),
1746 createSrcRegRegion(builtinR0, getRegionStride1()),
1747 InstOpt_WriteEnable | dbgOpt, false);
1748 instList.push_front(r0Move);
1749 }
1750 if (samplerIndexGE16)
1751 {
1752 // When sampler index is greater or equal 16 then the
1753 // createSamplerHeader() message overwrites the sampler states
1754 // pointer in the header -> cannot use the cached value in this
1755 // case.
1756 dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
1757 dcl->setCapableOfReuse();
1758 G4_SrcRegRegion* src = createSrc(builtinSamplerHeader->getRegVar(), 0, 0, getRegionStride1(), Type_UD);
1759 createMovInst(dcl, 0, 0, g4::SIMD8, NULL, NULL, src, false, dbgOpt);
1760 }
1761 }
1762 else
1763 {
1764 dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
1765 dcl->setCapableOfReuse();
1766 createMovR0Inst(dcl, 0, 0, true, dbgOpt);
1767 if (hasBindlessSampler() && !isBindlessSampler)
1768 {
1769 // make sure we set bit 0 of M0.3:ud to be 0
1770 // and (1) M0.6<1>:uw M0.6<1>:uw 0xFFFE
1771 G4_DstRegRegion* dst = createDst(dcl->getRegVar(), 0, 6, 1, Type_UW);
1772 G4_SrcRegRegion* src0 = createSrc(dcl->getRegVar(), 0, 6, getRegionScalar(), Type_UW);
1773 createBinOp(G4_and, g4::SIMD1, dst, src0, createImm(0xFFFE, Type_UW), InstOpt_WriteEnable, true);
1774 }
1775 }
1776
1777 return dcl;
1778 }
1779
1780 // get the number of GRFs occupied by a sampler message's operand
getNumGRF(bool isFP16,int execSize)1781 static uint32_t getNumGRF(bool isFP16, int execSize)
1782 {
1783 int numBytes = (isFP16 ? 2 : 4) * execSize;
1784 return (numBytes + getGRFSize() - 1) / getGRFSize();
1785 }
1786
getSamplerResponseLength(int numChannels,bool isFP16,int execSize,bool pixelNullMask,bool nullDst)1787 uint32_t IR_Builder::getSamplerResponseLength(
1788 int numChannels, bool isFP16, int execSize, bool pixelNullMask, bool nullDst)
1789 {
1790 if (nullDst)
1791 {
1792 hasNullReturnSampler = true;
1793 return 0;
1794 }
1795 uint32_t responseLength = numChannels * getNumGRF(isFP16, execSize);
1796
1797 if (pixelNullMask)
1798 {
1799 ++responseLength;
1800 }
1801 return responseLength;
1802 }
1803
needSamplerHeader(IR_Builder * builder,bool pixelNullMask,bool nonZeroAoffImmi,bool needHeaderForChannels,bool bindlessSampler,bool simd16HFReturn)1804 static bool needSamplerHeader(
1805 IR_Builder* builder, bool pixelNullMask, bool nonZeroAoffImmi,
1806 bool needHeaderForChannels, bool bindlessSampler,
1807 bool simd16HFReturn)
1808 {
1809 return builder->forceSamplerHeader() ||
1810 (pixelNullMask && builder->hasPixelNullMask()) ||
1811 nonZeroAoffImmi || needHeaderForChannels || bindlessSampler ||
1812 (simd16HFReturn && VISA_WA_CHECK(builder->getPWaTable(), WaHeaderRequiredOnSimd16Sample16bit));
1813 }
1814
1815 // This function assumes there are no gaps in parameter array. e.g. NULL pointers
1816 // If there is a gap it must be RawOperand with value 0.
translateVISASampler3DInst(VISASampler3DSubOpCode actualop,bool pixelNullMask,bool cpsEnable,bool uniformSampler,G4_Predicate * pred,VISA_Exec_Size executionSize,VISA_EMask_Ctrl emask,ChannelMask chMask,G4_Operand * aoffimmi,G4_Operand * sampler,G4_Operand * surface,G4_DstRegRegion * dst,unsigned int numParms,G4_SrcRegRegion ** params)1817 int IR_Builder::translateVISASampler3DInst(
1818 VISASampler3DSubOpCode actualop,
1819 bool pixelNullMask,
1820 bool cpsEnable,
1821 bool uniformSampler,
1822 G4_Predicate* pred,
1823 VISA_Exec_Size executionSize,
1824 VISA_EMask_Ctrl emask,
1825 ChannelMask chMask,
1826 G4_Operand *aoffimmi,
1827 G4_Operand *sampler,
1828 G4_Operand *surface,
1829 G4_DstRegRegion* dst,
1830 unsigned int numParms,
1831 G4_SrcRegRegion ** params)
1832 {
1833 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
1834
1835 G4_ExecSize execSize = toExecSize(executionSize);
1836 G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
1837
1838 // First setup message header and message payload
1839
1840 // Message header and payload size is numParms GRFs
1841
1842 const bool FP16Return = dst->getTypeSize() == 2;
1843 const bool FP16Input = params[0]->getType() == Type_HF;
1844
1845 bool useHeader = false;
1846
1847 unsigned int numRows = numParms * getNumGRF(FP16Input, execSize);
1848
1849 VISAChannelMask channels = chMask.getAPI();
1850 // For SKL+ channel mask R, RG, RGB, and RGBA may be derived from response length
1851 bool needHeaderForChannels = (getPlatform() < GENX_SKL) ? channels != CHANNEL_MASK_RGBA :
1852 (channels != CHANNEL_MASK_R && channels != CHANNEL_MASK_RG && channels != CHANNEL_MASK_RGB && channels != CHANNEL_MASK_RGBA);
1853
1854 bool nonZeroAoffImmi = !(aoffimmi->isImm() && aoffimmi->asImm()->getInt() == 0);
1855 bool simd16HFReturn = FP16Return && execSize == 16;
1856 if (needSamplerHeader(this, pixelNullMask, nonZeroAoffImmi, needHeaderForChannels,
1857 isBindlessSampler(sampler),
1858 simd16HFReturn))
1859 {
1860 useHeader = true;
1861 ++numRows;
1862 }
1863
1864 int numChannels = chMask.getNumEnabledChannels();
1865
1866 if (execSize > getNativeExecSize() &&
1867 (numRows > 11 || actualop == VISA_3D_SAMPLE_D || actualop == VISA_3D_SAMPLE_D_C || actualop == VISA_3D_SAMPLE_KILLPIX))
1868 {
1869 // decrementing since we will produce SIMD8 code.
1870 // don't do this for SIMD16H since its message length is the same as SIMD8H
1871 if (!FP16Input)
1872 {
1873 numRows -= numParms;
1874 }
1875
1876 return splitSampleInst(actualop, pixelNullMask, cpsEnable, pred, chMask,
1877 numChannels, aoffimmi, sampler, surface,
1878 dst, emask, useHeader, numRows, numParms, params, uniformSampler);
1879 }
1880
1881 bool useSplitSend = useSends();
1882
1883 G4_SrcRegRegion *header = 0;
1884 G4_Operand* samplerIdx = sampler;
1885
1886 if (useHeader)
1887 {
1888 const bool samplerIndexGE16 = IsSamplerIndexGE16(aoffimmi);
1889 G4_Declare *dcl = getSamplerHeader(isBindlessSampler(sampler), samplerIndexGE16);
1890 samplerIdx = createSampleHeader(this, dcl, actualop, pixelNullMask, aoffimmi, chMask,
1891 sampler);
1892 header = createSrcRegRegion(dcl, getRegionStride1());
1893 }
1894
1895 G4_InstOpts dbgOpt = m_options->getOption(vISA_markSamplerMoves) ? InstOpt_BreakPoint : InstOpt_NoOpt;
1896 // Collect payload sources.
1897 unsigned len = numParms + (header ? 1 : 0);
1898 std::vector<PayloadSource> sources(len);
1899 unsigned i = 0;
1900 // Collect header if present.
1901 if (header) {
1902 sources[i].opnd = header;
1903 sources[i].execSize = g4::SIMD8;
1904 sources[i].instOpt = InstOpt_WriteEnable | dbgOpt;
1905 ++i;
1906 }
1907 // Collect all parameters.
1908 bool needNoMask = needsNoMaskCoordinates(actualop);
1909 unsigned uPos = needNoMask ? getUPosition(actualop) : ~0u;
1910 for (unsigned j = 0; j != numParms; ++j) {
1911 sources[i].opnd = params[j];
1912 sources[i].execSize = execSize;
1913 sources[i].instOpt = (needNoMask && (uPos <= j && j < (uPos + 3))) ?
1914 InstOpt_WriteEnable | dbgOpt : instOpt | dbgOpt;
1915 ++i;
1916 }
1917 ASSERT_USER(i == len, "There's mismatching during payload source collecting!");
1918
1919 G4_SrcRegRegion *msgs[2] = {0, 0};
1920 unsigned sizes[2] = {0, 0};
1921 preparePayload(msgs, sizes, execSize, useSplitSend, sources.data(), len);
1922
1923 uint32_t responseLength = getSamplerResponseLength(numChannels, FP16Return, execSize,
1924 hasPixelNullMask() && pixelNullMask, dst->isNullReg());
1925
1926 // Check if CPS LOD Compensation Enable is valid.
1927 if (cpsEnable)
1928 {
1929 checkCPSEnable(actualop, responseLength, execSize);
1930 }
1931
1932 uint32_t fc = createSamplerMsgDesc(actualop, execSize == getNativeExecSize(), FP16Return, FP16Input);
1933 uint32_t desc = G4_SendDescRaw::createDesc(fc, useHeader, sizes[0], responseLength);
1934
1935 G4_InstSend* sendInst = nullptr;
1936 bool forceSplitSend = shouldForceSplitSend(surface);
1937 if (msgs[1] == 0 && !forceSplitSend)
1938 {
1939 ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
1940 G4_SendDescRaw *msgDesc = createSampleMsgDesc(desc, cpsEnable, 0, surface, samplerIdx);
1941
1942 sendInst = createSendInst(pred, dst, msgs[0], execSize,
1943 msgDesc, instOpt, false);
1944 }
1945 else
1946 {
1947 G4_SendDescRaw *msgDesc = createSampleMsgDesc(desc, cpsEnable, sizes[1], surface, samplerIdx);
1948 sendInst = createSplitSendInst(pred, dst, msgs[0], msgs[1],
1949 execSize, msgDesc, instOpt, false);
1950 }
1951 setUniformSampler(sendInst, uniformSampler);
1952 return VISA_SUCCESS;
1953 }
1954
translateVISALoad3DInst(VISASampler3DSubOpCode actualop,bool pixelNullMask,G4_Predicate * pred_opnd,VISA_Exec_Size executionSize,VISA_EMask_Ctrl em,ChannelMask channelMask,G4_Operand * aoffimmi,G4_Operand * surface,G4_DstRegRegion * dst,uint8_t numParms,G4_SrcRegRegion ** opndArray)1955 int IR_Builder::translateVISALoad3DInst(
1956 VISASampler3DSubOpCode actualop,
1957 bool pixelNullMask,
1958 G4_Predicate *pred_opnd,
1959 VISA_Exec_Size executionSize,
1960 VISA_EMask_Ctrl em,
1961 ChannelMask channelMask,
1962 G4_Operand* aoffimmi,
1963 G4_Operand* surface,
1964 G4_DstRegRegion* dst,
1965 uint8_t numParms,
1966 G4_SrcRegRegion ** opndArray)
1967 {
1968 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
1969
1970 bool useHeader = false;
1971
1972 G4_ExecSize execSize = toExecSize(executionSize);
1973 G4_InstOpts instOpt = Get_Gen4_Emask(em, execSize);
1974
1975 const bool halfReturn = dst->getTypeSize() == 2;
1976 const bool halfInput = opndArray[0]->getTypeSize() == 2;
1977
1978 unsigned int numRows = numParms * getNumGRF(halfInput, execSize);
1979
1980 VISAChannelMask channels = channelMask.getAPI();
1981 // For SKL+ channel mask R, RG, RGB, and RGBA may be derived from response length
1982 bool needHeaderForChannels = (getPlatform() < GENX_SKL) ? channels != CHANNEL_MASK_RGBA :
1983 (channels != CHANNEL_MASK_R && channels != CHANNEL_MASK_RG && channels != CHANNEL_MASK_RGB && channels != CHANNEL_MASK_RGBA);
1984
1985 bool nonZeroAoffImmi = !(aoffimmi->isImm() && aoffimmi->asImm()->getInt() == 0);
1986 bool simd16HFReturn = halfReturn && execSize == 16;
1987 if (needSamplerHeader(this, pixelNullMask, nonZeroAoffImmi, needHeaderForChannels, false,
1988 simd16HFReturn))
1989 {
1990 useHeader = true;
1991 ++numRows;
1992 }
1993
1994 int numChannels = channelMask.getNumEnabledChannels();
1995 if (execSize > getNativeExecSize() && numRows > 11)
1996 {
1997 // decrementing since we will produce SIMD8 code.
1998 // don't do this for SIMD16H since its message length is the same as SIMD8H
1999 if (!halfInput)
2000 {
2001 numRows -= numParms;
2002 }
2003 return splitSampleInst(actualop, pixelNullMask, /*cpsEnable*/false,
2004 pred_opnd, channelMask, numChannels, aoffimmi, NULL, surface,
2005 dst, em, useHeader, numRows, numParms, opndArray);
2006 }
2007
2008 bool useSplitSend = useSends();
2009
2010 G4_SrcRegRegion *header = nullptr;
2011 if (useHeader)
2012 {
2013 G4_Declare* dcl = getSamplerHeader(false /*isBindlessSampler*/, false /*samperIndexGE16*/);
2014 {
2015 (void)createSampleHeader(this, dcl, actualop, pixelNullMask, aoffimmi, channelMask,
2016 nullptr);
2017 }
2018 header = createSrcRegRegion(dcl, getRegionStride1());
2019 }
2020
2021 // Collect payload sources.
2022 unsigned len = numParms + (header ? 1 : 0);
2023 std::vector<PayloadSource> sources(len);
2024 unsigned i = 0;
2025 // Collect header if present.
2026 if (header) {
2027 sources[i].opnd = header;
2028 sources[i].execSize = g4::SIMD8;
2029 sources[i].instOpt = InstOpt_WriteEnable;
2030 ++i;
2031 }
2032 // Collect all parameters.
2033 bool needNoMask = needsNoMaskCoordinates(actualop);
2034 unsigned uPos = needNoMask ? getUPosition(actualop) : ~0u;
2035 for (unsigned j = 0; j != numParms; ++j) {
2036 sources[i].opnd = opndArray[j];
2037 sources[i].execSize = execSize;
2038 sources[i].instOpt = (needNoMask && (uPos <= j && j < (uPos + 3))) ?
2039 InstOpt_WriteEnable : instOpt;
2040 ++i;
2041 }
2042 ASSERT_USER(i == len, "There's mismatching during payload source collecting!");
2043
2044 G4_SrcRegRegion *msgs[2] = {0, 0};
2045 unsigned sizes[2] = {0, 0};
2046 preparePayload(msgs, sizes, execSize, useSplitSend, sources.data(), len);
2047
2048 uint32_t fc = createSamplerMsgDesc(actualop, execSize == getNativeExecSize(), halfReturn, halfInput);
2049
2050 uint32_t responseLength = getSamplerResponseLength(numChannels, halfReturn, execSize,
2051 hasPixelNullMask() && pixelNullMask, dst->isNullReg());
2052
2053 bool forceSplitSend = shouldForceSplitSend(surface);
2054 if (msgs[1] == 0 && !forceSplitSend)
2055 {
2056 createSendInst(pred_opnd, dst,
2057 msgs[0], sizes[0],
2058 responseLength,
2059 execSize, fc, SFID::SAMPLER,
2060 useHeader,
2061 SendAccess::READ_ONLY,
2062 surface, NULL,
2063 instOpt, false);
2064 }
2065 else
2066 {
2067 createSplitSendInst(pred_opnd, dst,
2068 msgs[0], sizes[0], msgs[1], sizes[1],
2069 responseLength,
2070 execSize, fc, SFID::SAMPLER,
2071 useHeader,
2072 SendAccess::READ_ONLY,
2073 surface, NULL,
2074 instOpt, false);
2075 }
2076
2077 return VISA_SUCCESS;
2078 }
2079
translateVISAGather3dInst(VISASampler3DSubOpCode actualop,bool pixelNullMask,G4_Predicate * pred,VISA_Exec_Size executionSize,VISA_EMask_Ctrl em,ChannelMask channelMask,G4_Operand * aoffimmi,G4_Operand * sampler,G4_Operand * surface,G4_DstRegRegion * dst,unsigned int numOpnds,G4_SrcRegRegion ** opndArray)2080 int IR_Builder::translateVISAGather3dInst(
2081 VISASampler3DSubOpCode actualop,
2082 bool pixelNullMask,
2083 G4_Predicate* pred,
2084 VISA_Exec_Size executionSize,
2085 VISA_EMask_Ctrl em,
2086 ChannelMask channelMask,
2087 G4_Operand* aoffimmi,
2088 G4_Operand* sampler,
2089 G4_Operand* surface,
2090 G4_DstRegRegion* dst,
2091 unsigned int numOpnds,
2092 G4_SrcRegRegion ** opndArray)
2093 {
2094 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
2095
2096 bool useHeader = false;
2097
2098 G4_ExecSize execSize = toExecSize(executionSize);
2099 G4_InstOpts instOpt = Get_Gen4_Emask(em, execSize);
2100
2101 const bool FP16Return = dst->getTypeSize() == 2;
2102 const bool FP16Input = opndArray[0]->getType() == Type_HF;
2103
2104 unsigned int numRows = numOpnds * getNumGRF(FP16Input, execSize);
2105
2106 bool nonZeroAoffImmi = !(aoffimmi->isImm() && aoffimmi->asImm()->getInt() == 0);
2107 bool needHeaderForChannels = channelMask.getSingleChannel() != VISA_3D_GATHER4_CHANNEL_R;
2108 bool simd16HFReturn = FP16Return && execSize == 16;
2109
2110 if (needSamplerHeader(this, pixelNullMask, nonZeroAoffImmi, needHeaderForChannels,
2111 isBindlessSampler(sampler),
2112 simd16HFReturn))
2113 {
2114 useHeader = true;
2115 ++numRows;
2116 }
2117
2118
2119 if (execSize > getNativeExecSize() && numRows > 11)
2120 {
2121 // decrementing since we will produce SIMD8 code.
2122 // don't do this for SIMD16H since its message length is the same as SIMD8H
2123 if (!FP16Input)
2124 {
2125 numRows -= numOpnds;
2126 }
2127
2128 return splitSampleInst(actualop, pixelNullMask, /*cpsEnable*/false,
2129 pred, channelMask, 4, aoffimmi, sampler, surface,
2130 dst, em, useHeader, numRows, numOpnds, opndArray);
2131 }
2132
2133 bool useSplitSend = useSends();
2134
2135 G4_SrcRegRegion *header = nullptr;
2136 G4_Operand* samplerIdx = sampler;
2137
2138 if (useHeader)
2139 {
2140 const bool samplerIndexGE16 = IsSamplerIndexGE16(aoffimmi);
2141 G4_Declare *dcl = getSamplerHeader(isBindlessSampler(sampler), samplerIndexGE16);
2142 {
2143 samplerIdx = createSampleHeader(this, dcl, actualop, pixelNullMask, aoffimmi, channelMask,
2144 sampler);
2145 }
2146 header = createSrcRegRegion(dcl, getRegionStride1());
2147 }
2148
2149 // Collect payload sources.
2150 unsigned len = numOpnds + (header ? 1 : 0);
2151 std::vector<PayloadSource> sources(len);
2152 unsigned i = 0;
2153 // Collect header if present.
2154 if (header) {
2155 sources[i].opnd = header;
2156 sources[i].execSize = g4::SIMD8;
2157 sources[i].instOpt = InstOpt_WriteEnable;
2158 ++i;
2159 }
2160 // Collect all parameters.
2161 bool needNoMask = needsNoMaskCoordinates(actualop);
2162 unsigned uPos = needNoMask ? getUPosition(actualop) : ~0u;
2163 for (unsigned j = 0; j != numOpnds; ++j) {
2164 sources[i].opnd = opndArray[j];
2165 sources[i].execSize = execSize;
2166 sources[i].instOpt = (needNoMask && (uPos <= j && j < (uPos + 3))) ?
2167 InstOpt_WriteEnable : instOpt;
2168 ++i;
2169 }
2170 ASSERT_USER(i == len, "There's mismatching during payload source collecting!");
2171
2172 G4_SrcRegRegion *msgs[2] = {0, 0};
2173 unsigned sizes[2] = {0, 0};
2174 preparePayload(msgs, sizes, execSize, useSplitSend, sources.data(), len);
2175
2176 uint32_t fc = createSamplerMsgDesc(actualop, execSize == getNativeExecSize(), FP16Return, FP16Input);
2177 uint32_t responseLength = getSamplerResponseLength(4, FP16Return, execSize,
2178 hasPixelNullMask() && pixelNullMask, dst->isNullReg());
2179
2180 bool forceSplitSend = shouldForceSplitSend(surface);
2181 if (msgs[1] == 0 && !forceSplitSend)
2182 {
2183 createSendInst(pred, dst, msgs[0], sizes[0],
2184 responseLength,
2185 execSize, fc, SFID::SAMPLER,
2186 useHeader,
2187 SendAccess::READ_ONLY,
2188 surface, samplerIdx,
2189 instOpt, false);
2190 }
2191 else
2192 {
2193 createSplitSendInst(pred, dst,
2194 msgs[0], sizes[0], msgs[1], sizes[1],
2195 responseLength,
2196 execSize, fc, SFID::SAMPLER,
2197 useHeader,
2198 SendAccess::READ_ONLY,
2199 surface, samplerIdx,
2200 instOpt, false);
2201 }
2202
2203 return VISA_SUCCESS;
2204 }
2205
2206
2207 /*
2208 * Translates Sampler Norm API intrinsic.
2209 *
2210 * Assuming: N = 4, channelMask=ABGR_ENABLE, surfIndex = 0x21, samplerIndex = 0x4,
2211 * then the generated code should look like the following for GT:
2212 *
2213 * .declare VX Base=m ElementSize=4 Type=ud Total=16
2214 * .declare VY Base=r ElementSize=2 Type=uw Total=128
2215 *
2216 * mov (8) VX(0,0)<1>, r0:ud
2217 * mov (1) VX(0,2)<1>, 0
2218 * mov (1) VX(1,4)<1>, deltaU
2219 * mov (1) VX(1,2)<1>, u
2220 * mov (1) VX(1,5)<1>, deltaV
2221 * mov (1) VX(1,3)<1>, v
2222 * send (16) VY(0,0)<1>, VX(0,0), 0x2, 0x048bc421
2223 * mov (128) M(0,0)<1>, VY(0,0)
2224 *
2225 * VX(0,0): message header
2226 *
2227 * VX(1,0): SIMD32 media payload
2228 *
2229 * ex_desc: 0x2 == 0010 (Target Function ID: Sampling Engine)
2230 *
2231 * desc: 0x048bc421 == Bit 31-29: 000 (Reserved)
2232 * Bit 28-25: 0010 (Message Length = 2)
2233 * Bit 24-20: 01000 (Response Message Length = 8)
2234 * Bit 19: 1 (Header present)
2235 * Bit 18: 0 (Reserved)
2236 * Bit 17-16: 11 (SIMD Mode = SIMD32)
2237 * Bit 15-12: 1100 (Message Type = sample_unorm media)
2238 * Bit 11-8: 0000 + samplerIndex (Sampler Index)
2239 * Bit 7-0: 00000000 + surfIndex (Binding Table Index)
2240 *
2241 */
translateVISASamplerNormInst(G4_Operand * surface,G4_Operand * sampler,ChannelMask channel,unsigned numEnabledChannels,G4_Operand * deltaUOpnd,G4_Operand * uOffOpnd,G4_Operand * deltaVOpnd,G4_Operand * vOffOpnd,G4_DstRegRegion * dst_opnd)2242 int IR_Builder::translateVISASamplerNormInst(
2243 G4_Operand* surface,
2244 G4_Operand* sampler,
2245 ChannelMask channel,
2246 unsigned numEnabledChannels,
2247 G4_Operand* deltaUOpnd,
2248 G4_Operand* uOffOpnd,
2249 G4_Operand* deltaVOpnd,
2250 G4_Operand* vOffOpnd,
2251 G4_DstRegRegion* dst_opnd)
2252 {
2253 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
2254
2255 // mov (8) VX(0,0)<1>, r0:ud
2256 // add dcl for VX
2257 G4_Declare *dcl = createSendPayloadDcl(2 * GENX_SAMPLER_IO_SZ, Type_UD);
2258
2259 // mov VX(0,0)<1>, r0
2260 createMovR0Inst(dcl, 0, 0);
2261 /* mov (1) VX(0,2)<1>, 0 */
2262 unsigned cmask = channel.getHWEncoding() << 12;
2263 createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, createImm(cmask, Type_UD));
2264
2265 G4_Declare *dcl1 = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_F);
2266 dcl1->setAliasDeclare(dcl, numEltPerGRF<Type_UB>());
2267
2268 // mov (1) VX(1,4)<1>, deltaU
2269 createMovInst(dcl1, 0, 4, g4::SIMD1, NULL, NULL, deltaUOpnd);
2270 // mov (1) VX(1,2)<1>, u
2271 createMovInst(dcl1, 0, 2, g4::SIMD1, NULL, NULL, uOffOpnd);
2272 // mov (1) VX(1,5)<1>, deltaV
2273 createMovInst(dcl1, 0, 5, g4::SIMD1, NULL, NULL, deltaVOpnd);
2274 // mov (1) VX(1,3)<1>, v
2275 createMovInst(dcl1, 0, 3, g4::SIMD1, NULL, NULL, vOffOpnd);
2276
2277 // send's operands preparation
2278 // create a currDst for VX
2279 G4_SrcRegRegion* payload = createSrcRegRegion(dcl, getRegionStride1());
2280
2281 G4_DstRegRegion* d = checkSendDst(dst_opnd->asDstRegRegion());
2282
2283 // Set bit 12-17 for the message descriptor
2284 unsigned temp = 0;
2285 temp += 0xc << 12; // Bit 16-12 = 1100 for Sampler Message Type
2286 temp += 0x3 << 17; // Bit 18-17 = 11 for SIMD32 mode
2287
2288 createSendInst(
2289 NULL,
2290 d,
2291 payload,
2292 2,
2293 32*numEnabledChannels*TypeSize(Type_UW)/numEltPerGRF<Type_UB>(),
2294 g4::SIMD32,
2295 temp,
2296 SFID::SAMPLER,
2297 1,
2298 SendAccess::READ_ONLY,
2299 surface,
2300 sampler,
2301 0,
2302 false);
2303
2304 return VISA_SUCCESS;
2305 }
2306
2307
2308 /*
2309 * Translates Sampler intrinsic.
2310 *
2311 * Assuming: N = 4, channelMask=ABGR_ENABLE, surfIndex = 0x21, samplerIndex = 0x4,
2312 * then the generated code should look like the following for GT:
2313 *
2314 * .declare VX Base=m ElementSize=4 Type=f Total=72
2315 * .declare VY Base=r ElementSize=4 Type=f Total=64
2316 * .declare VZ Base=r ElementSize=2 Type=w Total=128 ALIAS(VY,0)
2317 *
2318 * mov (8) VX(0,0)<1>, r0:ud
2319 * mov (1) VX(0,2)<1>, 0
2320 * mov (16) VX(1,0)<1>, u
2321 * mov (16) VX(3,0)<1>, v
2322 * mov (16) VX(5,0)<1>, r
2323 * mov (16) VX(7,0)<1>, 0
2324 * send (16) VY(0,0)<1>, VX(0,0), 0x2, 0x128a0421
2325 * mov (64) M(0,0)<1>, VY(0,0)
2326 *
2327 * ex_desc: 0x2 == 0010 (Target Function ID: Sampling Engine)
2328 *
2329 * desc: 0x128a0421 == Bit 31-29: 000 (Reserved)
2330 * Bit 28-25: 1001 (Message Length = 9 (1+2*4 for SIMD16))
2331 * Bit 24-20: 01000 (Response Message Length = 8)
2332 * Bit 19: 1 (Header present)
2333 * Bit 18: 0 (Reserved)
2334 * Bit 17-16: 10 (SIMD Mode = SIMD16)
2335 * Bit 15-12: 0000 (Message Type = Sample)
2336 * Bit 11-8: 0000 + samplerIndex (Sampler Index)
2337 * Bit 7-0: 00000000 + surfIndex (Binding Table Index)
2338 *
2339 */
translateVISASamplerInst(unsigned simdMode,G4_Operand * surface,G4_Operand * sampler,ChannelMask channel,unsigned numEnabledChannels,G4_Operand * uOffOpnd,G4_Operand * vOffOpnd,G4_Operand * rOffOpnd,G4_DstRegRegion * dstOpnd)2340 int IR_Builder::translateVISASamplerInst(
2341 unsigned simdMode,
2342 G4_Operand* surface,
2343 G4_Operand* sampler,
2344 ChannelMask channel,
2345 unsigned numEnabledChannels,
2346 G4_Operand* uOffOpnd,
2347 G4_Operand* vOffOpnd,
2348 G4_Operand* rOffOpnd,
2349 G4_DstRegRegion* dstOpnd)
2350 {
2351 TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
2352
2353 // mov (8) VX(0,0)<1>, r0:ud
2354 // add dcl for VX
2355 unsigned num_payload_elt = simdMode/2 * numEltPerGRF<Type_UB>()/TypeSize(Type_UD);
2356 G4_Declare *dcl = createSendPayloadDcl(num_payload_elt + GENX_SAMPLER_IO_SZ, Type_UD);
2357
2358 // mov VX(0,0)<1>, r0
2359 createMovR0Inst(dcl, 0, 0);
2360 unsigned cmask = channel.getHWEncoding() << 12;
2361 /* mov (1) VX(0,2)<1>, 0 */
2362 createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, createImm(cmask, Type_UD));
2363
2364 // set up the message payload
2365 // lod is always uninitialized for us as we don't support it.
2366 G4_Declare *dcl1 = createSendPayloadDcl(num_payload_elt, Type_UD);
2367 dcl1->setAliasDeclare(dcl, numEltPerGRF<Type_UB>());
2368 /* mov (sample_mode) VX(0,0)<1>, u */
2369 createMovSendSrcInst(dcl1, 0, 0, simdMode, uOffOpnd, 0);
2370 if (sampler == NULL)
2371 {
2372 // ld
2373 if (getPlatform() < GENX_SKL)
2374 {
2375 // the order of paramters is
2376 // u lod v r
2377 /* mov (sample_mode) VX(sample_mode/8, 0)<1>, lod */
2378 createMovSendSrcInst(dcl1, simdMode/8, 0, simdMode, createImm(0, Type_UD), 0);
2379 /* mov (sample_mode) VX(2*sample_mode/8, 0)<1>, v */
2380 createMovSendSrcInst(dcl1, 2*simdMode/8, 0, simdMode, vOffOpnd, 0);
2381 /* mov (sample_mode) VX(3*sampler_mode/8, 0)<1>, r */
2382 createMovSendSrcInst(dcl1, 3*simdMode/8, 0, simdMode, rOffOpnd, 0);
2383 }
2384 else
2385 {
2386 // SKL+: the order of paramters is
2387 // u v lod r
2388 /* mov (sample_mode) VX(sample_mode/8, 0)<1>, v */
2389 createMovSendSrcInst(dcl1, simdMode/8, 0, simdMode, vOffOpnd, 0);
2390 /* mov (sample_mode) VX(2*sample_mode/8, 0)<1>, lod */
2391 createMovSendSrcInst(dcl1, 2*simdMode/8, 0, simdMode, createImm(0, Type_UD), 0);
2392 /* mov (sample_mode) VX(3*sampler_mode/8, 0)<1>, r */
2393 createMovSendSrcInst(dcl1, 3*simdMode/8, 0, simdMode, rOffOpnd, 0);
2394 }
2395 }
2396 else
2397 {
2398 // sample
2399 /* mov (sample_mode) VX(1 + sample_mode/8, 0)<1>, v */
2400 createMovSendSrcInst(dcl1, simdMode/8, 0, simdMode, vOffOpnd, 0);
2401 /* mov (sample_mode) VX(3,0)<1>, r */
2402 createMovSendSrcInst(dcl1, 2*simdMode/8, 0, simdMode, rOffOpnd, 0);
2403 /* mov (sample_mode) VX(5,0)<1>, 0 */
2404 createMovSendSrcInst(dcl1, 3*simdMode/8, 0, simdMode, createImm(0, Type_UD), 0);
2405 }
2406 // send's operands preparation
2407 // create a currDst for VX
2408 G4_SrcRegRegion* payload = createSrcRegRegion(dcl, getRegionStride1());
2409
2410 G4_DstRegRegion* d = checkSendDst(dstOpnd->asDstRegRegion());
2411
2412 // Set bit 9-8 for the message descriptor
2413 unsigned temp = 0;
2414
2415 //Bit 17-18 = 10 for SIMD mode
2416 if (simdMode == 8)
2417 {
2418 temp += 0x1 << 17;
2419 }
2420 else
2421 {
2422 temp += 0x2 << 17;
2423 }
2424
2425 if (sampler == NULL)
2426 {
2427 #define SAMPLER_MESSAGE_TYPE_OFFSET 12
2428 //LD message
2429 temp += VISASampler3DSubOpCode::VISA_3D_LD << SAMPLER_MESSAGE_TYPE_OFFSET;
2430 }
2431
2432 if (simdMode == 16) {
2433 // redefine the type and offset of post dst.
2434 if ((d->getType() != Type_W) &&
2435 (d->getType() != Type_UW)) {
2436 short new_SubRegOff = dstOpnd->asDstRegRegion()->getSubRegOff();
2437 if (dstOpnd->getRegAccess() == Direct) {
2438 new_SubRegOff = (dstOpnd->asDstRegRegion()->getSubRegOff() * dstOpnd->getTypeSize()) / TypeSize(Type_W);
2439 }
2440 G4_DstRegRegion new_dst(
2441 dstOpnd->getRegAccess(),
2442 dstOpnd->asDstRegRegion()->getBase(),
2443 dstOpnd->asDstRegRegion()->getRegOff(),
2444 new_SubRegOff,
2445 1,
2446 Type_W);
2447 d = createDstRegRegion(new_dst);
2448 }
2449 }
2450
2451 createSendInst(
2452 NULL,
2453 d,
2454 payload,
2455 1 + simdMode/2,
2456 ((simdMode == 8) ? 32 : (numEnabledChannels*16))*TypeSize(Type_F)/numEltPerGRF<Type_UB>(),
2457 G4_ExecSize(simdMode),
2458 temp,
2459 SFID::SAMPLER,
2460 1,
2461 SendAccess::READ_ONLY,
2462 surface,
2463 sampler,
2464 0,
2465 false);
2466 return VISA_SUCCESS;
2467 }
2468