1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2020-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "BuildIR.h"
10 #include "../Timer.h"
11 
12 using namespace vISA;
13 
// Bit positions inside the sampler message descriptor used by createSamplerMsgDesc().
// Bit 30: return format (set for a 16-bit return type).
static constexpr unsigned MESSAGE_PRECISION_SUBTYPE_OFFSET = 30;
// Bit 29: SIMD Mode[2] (set for 16-bit input).
static constexpr unsigned SIMD_MODE_2_OFFSET = 29;
16 
createSamplerMsgDesc(VISASampler3DSubOpCode samplerOp,bool isNativeSIMDSize,bool isFP16Return,bool isFP16Input)17 static uint32_t createSamplerMsgDesc(
18     VISASampler3DSubOpCode samplerOp,
19     bool isNativeSIMDSize,
20     bool isFP16Return,
21     bool isFP16Input)
22 {
23     // Now create message descriptor
24     // 7:0 - BTI
25     // 11:8 - Sampler Index
26     // 16:12 - Message Type
27     // 18:17 - SIMD Mode[0:1]
28     // 19 - Header Present
29     // 24:20 - Response Length
30     // 28:25 - Message Length
31     // 29 - SIMD Mode[2]
32     // 30 - Return Format
33     // 31 - CPS Message LOD Compensation Enable
34     // We only set message type, SIMD mode, and return format here.  The other fields
35     // are set in createSendInst as they are common with other send messages
36     uint32_t fc = 0;
37 
38     fc |= ((uint32_t)samplerOp & 0x1f) << 12;
39 
40     if (isNativeSIMDSize)
41     {
42         fc |= (1 << 17);
43     }
44     else
45     {
46         fc |= (2 << 17);
47     }
48 
49     if (isFP16Return)
50     {
51         // 16-bit return type.  Note that this doesn't change the return length
52         fc |= (1 << MESSAGE_PRECISION_SUBTYPE_OFFSET);
53     }
54 
55     if (isFP16Input)
56     {
57         fc |= (1 << SIMD_MODE_2_OFFSET);
58     }
59 
60     return fc;
61 }
62 
63 
translateVISASampleInfoInst(VISA_Exec_Size executionSize,VISA_EMask_Ctrl emask,ChannelMask chMask,G4_Operand * surface,G4_DstRegRegion * dst)64 int IR_Builder::translateVISASampleInfoInst(
65     VISA_Exec_Size executionSize,
66     VISA_EMask_Ctrl emask,
67     ChannelMask chMask,
68     G4_Operand* surface,
69     G4_DstRegRegion* dst)
70 {
71     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
72 
73     G4_ExecSize execSize {Get_VISA_Exec_Size(executionSize)};
74     G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
75     VISAChannelMask channels = chMask.getAPI();
76     bool useFakeHeader = (getPlatform() < GENX_SKL) ? false :
77         (channels == CHANNEL_MASK_R);
78     bool preEmption = forceSamplerHeader();
79     bool forceSplitSend = shouldForceSplitSend(surface);
80     bool useHeader = true;
81     // SAMPLEINFO has 0 parameters so its only header
82 
83     unsigned int numRows = 1;
84 
85     G4_Declare *msg = NULL;
86     G4_SrcRegRegion *m0 = NULL;
87 
88     if (!useFakeHeader || forceSplitSend || preEmption)
89     {
90         msg = getSamplerHeader(false /*isBindlessSampler*/, false /*samperIndexGE16*/);
91 
92         unsigned int secondDword = chMask.getHWEncoding() << 12;
93 
94         G4_Imm* immOpndSecondDword = createImm(secondDword, Type_UD);
95 
96         // mov (1) msg(0,2) immOpndSecondDword
97         auto payloadDstRgn = createDst(msg->getRegVar(), 0, 2, 1, Type_UD);
98 
99         G4_INST* movInst = createMov(g4::SIMD1, payloadDstRgn, immOpndSecondDword, InstOpt_NoOpt, true);
100         movInst->setOptionOn(InstOpt_WriteEnable);
101 
102         m0 = createSrcRegRegion(msg, getRegionStride1());
103     }
104     else
105     {
106         useHeader = false;
107         msg = createTempVar(getNativeExecSize(), Type_UD, GRFALIGN);
108         G4_DstRegRegion *dst = createDst(msg->getRegVar(), 0, 0, 1, Type_UD);
109         G4_Imm* src0Imm = createImm(0, Type_UD);
110         (void) createMov(getNativeExecSize(), dst, src0Imm, InstOpt_WriteEnable, true);
111         m0 = createSrc(msg->getRegVar(), 0, 0, getRegionStride1(), Type_UD);
112     }
113     // Now create message descriptor
114     // 7:0 - BTI
115     // 11:8 - Sampler Index
116     // 16:12 - Message Type
117     // 18:17 - SIMD Mode
118     // 19 - Header Present
119     // 24:20 - Response Length
120     // 28:25 - Message Length
121     // 29 - SIMD Mode
122     // 30 - Return Format
123     // 31 - CPS Message LOD Compensation Enable
124     unsigned int fc = 0;
125 
126     fc |= ((unsigned int) VISA_3D_SAMPLEINFO & 0x1f) << 12;
127 
128     if (execSize == getNativeExecSize())
129     {
130         fc |= (1 << 17);
131     }
132     else
133     {
134         fc |= (2 << 17);
135     }
136 
137     uint32_t retSize = (execSize == getNativeExecSize() ? chMask.getNumEnabledChannels() : chMask.getNumEnabledChannels() * 2);
138 
139     if (forceSplitSend)
140     {
141         createSplitSendInst(NULL, dst, m0, numRows,
142             createNullSrc(Type_UD), 0, retSize,
143             execSize, fc, SFID::SAMPLER, useHeader, SendAccess::READ_ONLY, surface, NULL, instOpt, false);
144     }
145     else
146     {
147         createSendInst(NULL, dst, m0, numRows, retSize,
148             execSize, fc, SFID::SAMPLER, useHeader, SendAccess::READ_ONLY, surface, NULL, instOpt, false);
149     }
150 
151     return VISA_SUCCESS;
152 }
153 
translateVISAResInfoInst(VISA_Exec_Size executionSize,VISA_EMask_Ctrl emask,ChannelMask chMask,G4_Operand * surface,G4_SrcRegRegion * lod,G4_DstRegRegion * dst)154 int IR_Builder::translateVISAResInfoInst(
155     VISA_Exec_Size executionSize,
156     VISA_EMask_Ctrl emask,
157     ChannelMask chMask,
158     G4_Operand* surface,
159     G4_SrcRegRegion* lod,
160     G4_DstRegRegion* dst)
161 {
162     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
163 
164     G4_ExecSize execSize {Get_VISA_Exec_Size(executionSize)};
165     G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
166     //For SKL if channels are continuous don't need header
167 
168     VISAChannelMask channels = chMask.getAPI();
169     bool preEmption = forceSamplerHeader();
170     bool useHeader = preEmption || (getPlatform() < GENX_SKL) ? channels != CHANNEL_MASK_RGBA :
171         (channels != CHANNEL_MASK_R && channels != CHANNEL_MASK_RG && channels != CHANNEL_MASK_RGB && channels != CHANNEL_MASK_RGBA);
172 
173     // Setup number of rows = (header + lod) by default
174     unsigned int numRows = (execSize == getNativeExecSize() ? 1 : 2);
175     if (useHeader)
176     {
177         numRows++;
178     }
179     unsigned int regOff = 0;
180     uint32_t returnLength = (execSize == getNativeExecSize() ? chMask.getNumEnabledChannels() : chMask.getNumEnabledChannels() * 2);
181 
182     bool useSplitSend = useSends();
183 
184     G4_Declare *msg = NULL;
185     G4_Declare *payloadUD = NULL;
186     if (useSplitSend)
187     {
188         if (useHeader)
189         {
190             --numRows;
191         }
192         unsigned int numElts = numRows * numEltPerGRF<Type_UB>()/TypeSize(Type_F);
193         msg = getSamplerHeader(false /*isBindlessSampler*/, false /*samperIndexGE16*/);
194         payloadUD = createSendPayloadDcl(numElts, Type_UD);
195     }
196     else
197     {
198         unsigned int numElts = numRows * numEltPerGRF<Type_UB>()/TypeSize(Type_F);
199         msg = createSendPayloadDcl(numElts, Type_UD);
200         payloadUD = createSendPayloadDcl(numElts - (useHeader ? GENX_SAMPLER_IO_SZ : 0), Type_UD);
201         payloadUD->setAliasDeclare(msg, useHeader ? numEltPerGRF<Type_UB>() : 0);
202 
203         if (useHeader)
204         {
205             // Both SAMPLEINFO and RESINFO use header
206             createMovR0Inst(msg, 0, 0, true);
207         }
208     }
209 
210     if (useHeader)
211     {
212         unsigned int secondDword = 0;
213         secondDword |= (chMask.getHWEncoding() << 12);
214 
215         G4_Imm* immOpndSecondDword = createImm(secondDword, Type_UD);
216 
217         // mov (1) msg(0,2) immOpndSecondDword
218         auto payloadDstRgn = createDst(msg->getRegVar(), 0, 2, 1, Type_UD);
219 
220         G4_INST* movInst = createMov(g4::SIMD1, payloadDstRgn, immOpndSecondDword, InstOpt_NoOpt, true);
221         movInst->setOptionOn(InstOpt_WriteEnable);
222     }
223 
224     // Copy over lod vector operand to payload's 1st row
225     Copy_SrcRegRegion_To_Payload(payloadUD, regOff, lod, execSize, instOpt | InstOpt_BreakPoint);
226 
227     // Now create message descriptor
228     // 7:0 - BTI
229     // 11:8 - Sampler Index
230     // 16:12 - Message Type
231     // 18:17 - SIMD Mode
232     // 19 - Header Present
233     // 24:20 - Response Length
234     // 28:25 - Message Length
235     // 29 - SIMD Mode
236     // 30 - Return Format
237     // 31 - CPS Message LOD Compensation Enable
238     unsigned int fc = 0;
239 
240     fc |= ((unsigned int) VISA_3D_RESINFO & 0x1f) << 12;
241 
242     if (execSize == getNativeExecSize())
243     {
244         fc |= (1 << 17);
245     }
246     else
247     {
248         fc |= (2 << 17);
249     }
250 
251     if (useSplitSend)
252     {
253         G4_SrcRegRegion *m0 = nullptr;
254         G4_SrcRegRegion *m1 = nullptr;
255         unsigned int src0Size = 0;
256         unsigned int src1Size = 0;
257 
258         if (useHeader)
259         {
260             m0 = createSrcRegRegion(msg, getRegionStride1());
261             m1 = createSrcRegRegion(payloadUD, getRegionStride1());
262             src0Size = 1;
263             src1Size = numRows;
264         }
265         else
266         {
267             m0 = createSrcRegRegion(payloadUD, getRegionStride1());
268             m1 = createNullSrc(Type_UD);
269             src0Size = numRows;
270             src1Size = 0;
271         }
272         createSplitSendInst(NULL, dst, m0, src0Size, m1, src1Size, returnLength,
273             execSize, fc, SFID::SAMPLER, useHeader, SendAccess::READ_ONLY, surface, NULL, instOpt, false);
274     }
275     else
276     {
277         G4_SrcRegRegion *m = createSrcRegRegion(msg, getRegionStride1());
278         createSendInst(NULL, dst, m, numRows, returnLength,
279             execSize, fc, SFID::SAMPLER, useHeader, SendAccess::READ_ONLY, surface, NULL, instOpt, false);
280     }
281 
282     return VISA_SUCCESS;
283 }
284 
285 
286 
287 // generate a URB_SIMD8* message
288 // urbHandle -- 1 GRF holding 8 URB handles.  This is the header of the message
289 // perSlotOffset -- 1 GRF holding 8 DWord offsets.  If present, it must be immediately after the header
290 // channelMask -- 1 GRF holding 8 8-bit masks.  In vISA spec they have constant values and must be
291 //                identical.  If present,  occurs after the per slot message phase if the per slot
292 //                message phase exists else it occurs after the header.
293 
translateVISAURBWrite3DInst(G4_Predicate * pred,VISA_Exec_Size executionSize,VISA_EMask_Ctrl emask,uint8_t numOut,uint16_t globalOffset,G4_SrcRegRegion * channelMask,G4_SrcRegRegion * urbHandle,G4_SrcRegRegion * perSlotOffset,G4_SrcRegRegion * vertexData)294 int IR_Builder::translateVISAURBWrite3DInst(
295     G4_Predicate* pred,
296     VISA_Exec_Size executionSize,
297     VISA_EMask_Ctrl emask,
298     uint8_t numOut,
299     uint16_t globalOffset,
300     G4_SrcRegRegion* channelMask,
301     G4_SrcRegRegion* urbHandle,
302     G4_SrcRegRegion* perSlotOffset,
303     G4_SrcRegRegion* vertexData)
304 {
305     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
306 
307     G4_ExecSize execSize {Get_VISA_Exec_Size(executionSize)};
308     G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
309 
310     if (numOut == 0)
311     {
312         MUST_BE_TRUE(vertexData->isNullReg(), "vertex payload must be null ARF when numOut is 0");
313     }
314 
315     // header + channelMask + numOut
316     unsigned int numRows = 2 + numOut;
317     const bool useHeader = true;
318     bool usePerSlotIndex = false;
319     bool useChannelMask = true;
320 
321     if (!perSlotOffset->isNullReg())
322     {
323         usePerSlotIndex = true;
324         numRows++;
325     }
326 
327     if (channelMask->isNullReg())
328     {
329         useChannelMask = false;
330         numRows--;
331     }
332 
333     bool useSplitSend = useSends();
334     // So far, we don't have a obvious cut except for header. As the result,
335     // split-send is disabled once there's no header in the message.
336     if (!useHeader)
337         useSplitSend = false;
338 
339     if (numOut == 0)
340     {
341         // no split send if payload is null
342         useSplitSend = false;
343     }
344 
345     // msg is the header for split send, or the entire payload for regular send
346     G4_Declare *msg = NULL;
347     G4_Declare* payloadF = NULL;
348     G4_Declare* payloadD = NULL;
349     G4_Declare* payloadUD = NULL;
350     if (useSplitSend)
351     {
352         ASSERT_USER(useHeader, "So far, split-send is only used when header is present!");
353         --numRows;
354         if (numRows > 0)
355         {
356             unsigned int numElts = numRows * numEltPerGRF<Type_UB>()/TypeSize(Type_F);
357             // we can use the urb handle directly since URB write will not modify its header
358             //msg = createSendPayloadDcl(GENX_SAMPLER_IO_SZ, Type_UD);
359             payloadUD = createSendPayloadDcl(numElts, Type_UD);
360             payloadF = createSendPayloadDcl(numElts, Type_F);
361             payloadD = createSendPayloadDcl(numElts, Type_D);
362             payloadF->setAliasDeclare(payloadUD, 0);
363             payloadD->setAliasDeclare(payloadUD, 0);
364         }
365     }
366     else
367     {
368         unsigned int numElts = numRows * numEltPerGRF<Type_UB>()/TypeSize(Type_F);
369         msg = createSendPayloadDcl(numElts, Type_UD);
370         if (numRows > 1)
371         {
372             payloadUD = createSendPayloadDcl(numElts - (useHeader ? GENX_SAMPLER_IO_SZ : 0), Type_UD);
373             payloadF = createSendPayloadDcl(numElts - (useHeader ? GENX_SAMPLER_IO_SZ : 0), Type_F);
374             payloadD = createSendPayloadDcl(numElts - (useHeader ? GENX_SAMPLER_IO_SZ : 0), Type_D);
375             payloadUD->setAliasDeclare(msg, useHeader ? numEltPerGRF<Type_UB>() : 0);
376             payloadF->setAliasDeclare(msg, useHeader ? numEltPerGRF<Type_UB>() : 0);
377             payloadD->setAliasDeclare(msg, useHeader ? numEltPerGRF<Type_UB>() : 0);
378         }
379     }
380 
381     unsigned int regOff = 0;
382     // Setup header
383     if (useHeader && msg != NULL)
384     {
385         unsigned ignoredOff = 0;
386         Copy_SrcRegRegion_To_Payload(msg, ignoredOff, urbHandle, g4::SIMD8, instOpt);
387     }
388 
389     if (usePerSlotIndex)
390     {
391         Copy_SrcRegRegion_To_Payload(payloadUD, regOff, perSlotOffset, g4::SIMD8, instOpt);
392     }
393 
394     if (useChannelMask)
395     {
396 
397         // shl (8) M2.0<1>:ud cmask<8;8,1>:ud 0x10:uw
398         auto payloadUDRegRgnRow2 = createDst(payloadUD->getRegVar(), regOff++, 0, 1, Type_UD);
399 
400         createBinOp(G4_shl, g4::SIMD8, payloadUDRegRgnRow2, channelMask, createImm(16, Type_UW),
401             instOpt, true);
402     }
403 
404     G4_Declare* vertexDataDcl = numOut == 0 ? NULL : vertexData->getBase()->asRegVar()->getDeclare();
405 
406     bool needsDataMove = (!useSplitSend || usePerSlotIndex || useChannelMask);
407     if (needsDataMove)
408     {
409         // we have to insert moves to make payload contiguous
410         unsigned int startSrcRow = vertexData->getRegOff();
411 
412         for (int i = 0; i < numOut; i++)
413         {
414             G4_DstRegRegion payloadTypedRegRowi(Direct, payloadF->getRegVar(), regOff++, 0, 1, Type_F);
415             G4_DstRegRegion* payloadTypedRegRowRgni = createDstRegRegion(payloadTypedRegRowi);
416 
417             G4_SrcRegRegion* vertexSrcRegRgnRowi = createSrc(vertexDataDcl->getRegVar(), startSrcRow++, 0, getRegionStride1(), Type_F);
418 
419             createMov(g4::SIMD8, payloadTypedRegRowRgni, vertexSrcRegRgnRowi, instOpt, true);
420         }
421     }
422     else
423     {
424         payloadUD = vertexDataDcl;
425     }
426 
427     // Msg descriptor
428     unsigned int fc = 0;
429 
430     fc |= 0x7;
431 
432     fc |= (globalOffset << 4);
433 
434     if (useChannelMask)
435     {
436         fc |= (0x1 << 15);
437     }
438 
439     if (usePerSlotIndex)
440     {
441         fc |= (0x1 << 17);
442     }
443 
444     if (useSplitSend)
445     {
446         G4_SrcRegRegion *m0 = urbHandle;
447         G4_SrcRegRegion *m1 = nullptr;
448 
449         if (needsDataMove)
450         {
451             m1 = createSrcRegRegion(payloadUD, getRegionStride1());
452         }
453         else
454         {
455             ASSERT_USER(payloadUD == vertexDataDcl,
456                 "If there is no need for data move then payloadUD == vertexDataDcl must hold!");
457 
458             m1 = createSrc(
459                 payloadUD->getRegVar(),
460                 vertexData->getRegOff(),
461                 vertexData->getSubRegOff(),
462                 getRegionStride1(),
463                 payloadUD->getElemType());
464         }
465 
466         createSplitSendInst(pred, createNullDst(Type_UD), m0, 1, m1, numRows, 0,
467             execSize, fc, SFID::URB, useHeader, SendAccess::WRITE_ONLY, NULL, NULL, instOpt, false);
468     } else {
469         G4_SrcRegRegion *m = createSrcRegRegion(msg, getRegionStride1());
470         createSendInst(pred, createNullDst(Type_UD), m, numRows, 0,
471             execSize, fc, SFID::URB, useHeader, SendAccess::WRITE_ONLY, nullptr, nullptr, instOpt, false);
472     }
473     return VISA_SUCCESS;
474 }
475 
/*****************************************************************************\
ENUM: EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL

Numeric codes for the render target write control variants named below.
Every value is spelled out explicitly.
NOTE(review): presumably these mirror a hardware message-control encoding --
confirm against the hardware documentation before renumbering.
\*****************************************************************************/
enum EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL
{
    // SIMD16 variants
    EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD16_SINGLE_SOURCE            = 0x0,
    EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD16_SINGLE_SOURCE_REPLICATED = 0x1,

    // SIMD8 variants
    EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_DUAL_SOURCE_LOW           = 0x2,
    EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_DUAL_SOURCE_HIGH          = 0x3,
    EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_SINGLE_SOURCE_LOW         = 0x4,
    EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_IMAGE_WRITE               = 0x5
};
488 
translateVISARTWrite3DInst(G4_Predicate * pred,VISA_Exec_Size executionSize,VISA_EMask_Ctrl emask,G4_Operand * surface,G4_SrcRegRegion * r1HeaderOpnd,G4_Operand * rtIndex,vISA_RT_CONTROLS cntrls,G4_SrcRegRegion * sampleIndexOpnd,G4_Operand * cpsCounter,unsigned int numParms,G4_SrcRegRegion ** msgOpnds)489 int IR_Builder::translateVISARTWrite3DInst(
490     G4_Predicate* pred,
491     VISA_Exec_Size executionSize,
492     VISA_EMask_Ctrl emask,
493     G4_Operand *surface,
494     G4_SrcRegRegion *r1HeaderOpnd,
495     G4_Operand *rtIndex,
496     vISA_RT_CONTROLS cntrls,
497     G4_SrcRegRegion *sampleIndexOpnd,
498     G4_Operand *cpsCounter,
499     unsigned int numParms,
500     G4_SrcRegRegion ** msgOpnds)
501 {
502     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
503 
504     G4_ExecSize execSize = toExecSize(executionSize);
505     G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
506     bool useHeader = false;
507 
508     uint8_t varOffset = 0;
509     G4_SrcRegRegion * s0a = NULL;
510     //oMask
511     G4_SrcRegRegion * oM  = NULL;
512     if (cntrls.s0aPresent)
513     {
514         s0a = msgOpnds[varOffset];
515         ++varOffset;
516     }
517     if (cntrls.oMPresent)
518     {
519         oM = msgOpnds[varOffset];
520         ++varOffset;
521     }
522 
523     G4_SrcRegRegion * R = msgOpnds[varOffset++];
524     G4_SrcRegRegion * G = msgOpnds[varOffset++];
525     G4_SrcRegRegion * B = msgOpnds[varOffset++];
526     G4_SrcRegRegion * A = msgOpnds[varOffset++];
527     //depth
528     G4_SrcRegRegion * Z = NULL;
529 
530     if (cntrls.zPresent)
531         Z = msgOpnds[varOffset++];
532 
533     //stencil
534     G4_SrcRegRegion * S = NULL;
535     if (cntrls.isStencil)
536     {
537         S = msgOpnds[varOffset++];
538     }
539 
540     if (varOffset != numParms)
541     {
542         assert(0);
543         return VISA_FAILURE;
544     }
545 
546     bool FP16Data = R->getType() == Type_HF;
547     if (FP16Data)
548     {
549         MUST_BE_TRUE((G->isNullReg() || G->getType() == Type_HF) &&
550             (B->isNullReg() || B->getType() == Type_HF) &&
551             (A->isNullReg() || A->getType() == Type_HF),
552             "R,G,B,A for RT write must have the same type");
553     }
554 
555     auto mult = (execSize == getNativeExecSize() ? 1 : 2);
556     mult = (FP16Data)? 1 : mult;
557 
558     //RGBA sr0Alpha take up one GRF in SIMD8 and SIMD16 modes.
559     //in SIMD8 upper DWORDs are reserved
560     unsigned int numRows = numParms * mult;
561 
562     //Depth is always Float
563     //For SIMD16 it is 2 grfs
564     //For SIMD8  it is 1 grf
565     if (FP16Data && cntrls.zPresent && executionSize == EXEC_SIZE_16)
566     {
567         ++numRows;
568     }
569 
570     if (cntrls.oMPresent && mult == 2)
571     {
572         // oM is always 1 row irrespective of execSize
573         numRows--;
574     }
575 
576     //although for now HW only supports stencil in SIMD8 mode
577     if (cntrls.isStencil && mult == 2)
578     {
579         // stencil is always 1 row irrespective of execSize
580         numRows--;
581     }
582 
583     // header is always 64 byte
584     const int numDWInHeader = 16;
585     const int headerBytes = numDWInHeader * sizeof(int);
586     const int numHeaderGRF = numDWInHeader / getNativeExecSize();
587 
588     /*
589     All other values should be set by default.
590     Most of the time when renderTargetIndex != 0, src0Alpha is present also
591     */
592     bool isRTIdxNonzero = cntrls.RTIndexPresent &&
593         (rtIndex->isSrcRegRegion() || (rtIndex->isImm() && rtIndex->asImm()->getImm() != 0));
594     bool isRTIdxDynamic = cntrls.RTIndexPresent && rtIndex->isSrcRegRegion();
595     bool needsHeaderForMRT = isRTIdxDynamic || cntrls.s0aPresent || (!hasHeaderlessMRTWrite() && isRTIdxNonzero);
596     if (needsHeaderForMRT || cntrls.isSampleIndex)
597     {
598         useHeader = true;
599         numRows += numHeaderGRF;
600     }
601 
602     bool useSplitSend = useSends();
603     // So far, we don't have a obvious cut except for header. As the result,
604     // split-send is disabled once there's no header in the message.
605 
606     G4_SrcRegRegion* srcToUse   = NULL;
607     G4_Declare *msg             = NULL;
608     G4_Declare *msgF            = NULL;
609     G4_Declare *payloadUD       = NULL;
610     G4_Declare *payloadUW       = NULL;
611     G4_Declare *payloadFOrHF    = NULL;
612     G4_Declare *payloadF        = NULL;
613 
614     if (useSplitSend)
615     {
616         if (useHeader)
617         {
618             //subtracting Header
619             numRows -= numHeaderGRF;
620             //creating header
621             msg = createSendPayloadDcl(numDWInHeader, Type_UD);
622             msgF = createSendPayloadDcl(numDWInHeader, Type_F);
623             msgF->setAliasDeclare(msg, 0);
624         }
625         //creating payload
626         unsigned int numElts = numRows * numEltPerGRF<Type_UB>() / TypeSize(Type_F);
627         payloadUD = createSendPayloadDcl(numElts, Type_UD);
628         payloadFOrHF = createSendPayloadDcl(numElts, FP16Data ? Type_HF : Type_F);
629         payloadUW = createSendPayloadDcl(numElts, Type_UW);
630         payloadF = createSendPayloadDcl(numElts, Type_F);
631 
632         payloadFOrHF->setAliasDeclare(payloadUD, 0);
633         payloadUW->setAliasDeclare(payloadUD, 0);
634         payloadF->setAliasDeclare(payloadUD, 0);
635     }
636     else
637     {
638         unsigned int numElts = numRows * numEltPerGRF<Type_UB>()/TypeSize(Type_F);
639         //creating enough space for header + payload
640         msg = createSendPayloadDcl(numElts, Type_UD);
641         msgF = createSendPayloadDcl(GENX_SAMPLER_IO_SZ * 2, Type_F);
642         msgF->setAliasDeclare(msg, 0);
643 
644         //creating payload declarations.
645         payloadUD = createSendPayloadDcl(numElts - (useHeader ? numDWInHeader : 0), Type_UD);
646         payloadFOrHF = createSendPayloadDcl(numElts - (useHeader ? numDWInHeader : 0), FP16Data ? Type_HF : Type_F);
647         payloadUW = createSendPayloadDcl(numElts - (useHeader ? numDWInHeader : 0), Type_UW);
648         payloadF = createSendPayloadDcl(numElts, Type_F);
649 
650         //setting them to alias a top level decl with offset past the header
651         payloadUD->setAliasDeclare(msg, useHeader ? headerBytes : 0);
652         payloadFOrHF->setAliasDeclare(msg, useHeader ? headerBytes : 0);
653         payloadUW->setAliasDeclare(msg, useHeader ? headerBytes : 0);
654         payloadF->setAliasDeclare(payloadUD, 0);
655     }
656 
657     if (useHeader)
658     {
659         ASSERT_USER(r1HeaderOpnd, "Second GRF for header that was passed in is NULL.");
660         G4_DstRegRegion* payloadRegRgn = createDst(msg->getRegVar(), 0, 0, 1, Type_UD);
661 
662         G4_Declare* r0 = getBuiltinR0();
663         G4_SrcRegRegion* r0RegRgn = createSrc(r0->getRegVar(), 0, 0, getRegionStride1(), Type_UD);
664 
665         //moves data from r0 to header portion of the message
666         G4_INST* movInst = createMov(g4::SIMD8, payloadRegRgn, r0RegRgn, InstOpt_NoOpt, true);
667         movInst->setOptionOn(InstOpt_WriteEnable);
668 
669         payloadRegRgn = createDst(msg->getRegVar(), 1, 0, 1, Type_UD);
670         r1HeaderOpnd->setType(Type_UD);
671         movInst = createMov(g4::SIMD8, payloadRegRgn, r1HeaderOpnd, InstOpt_NoOpt, true);
672         movInst->setOptionOn(InstOpt_WriteEnable);
673 
674 #define SAMPLE_INDEX_OFFSET 6
675         if (cntrls.isSampleIndex)
676         {
677             G4_Declare* tmpDcl = createTempVar(2, Type_UD, Any);
678             G4_DstRegRegion* tmpDst = createDst(tmpDcl->getRegVar(), 0, 0, 1, Type_UD);
679 
680             createBinOp(G4_shl, g4::SIMD1, tmpDst, sampleIndexOpnd, createImm(SAMPLE_INDEX_OFFSET, Type_UD), InstOpt_WriteEnable, true);
681 
682             G4_DstRegRegion* payloadUDRegRgn = createDst(msg->getRegVar(), 0, 0, 1, Type_UD);
683             G4_SrcRegRegion* tmpSrc = createSrc(tmpDcl->getRegVar(), 0, 0, getRegionScalar(), Type_UD);
684             G4_SrcRegRegion* payloadSrc = createSrc(msg->getRegVar(), 0, 0, getRegionScalar(), Type_UD);
685             createBinOp(G4_or, g4::SIMD1, payloadUDRegRgn, payloadSrc, tmpSrc, InstOpt_WriteEnable, true);
686         }
687 
688         if (isRTIdxNonzero)
689         {
690             G4_DstRegRegion* dstRTIRgn = createDst(msg->getRegVar(), 0, 2, 1, Type_UD);
691 
692             G4_INST* rtiMovInst = createMov(g4::SIMD1, dstRTIRgn, rtIndex, InstOpt_NoOpt, true);
693             rtiMovInst->setOptionOn(InstOpt_WriteEnable);
694         }
695 
696         //if header is used, then predication value will need to be stored
697         //in the header
698         if (useHeader && (pred || cntrls.isHeaderMaskfromCe0))
699         {
700             //moving pixelMask in to payload
701             G4_DstRegRegion* dstPixelMaskRgn = createDst(
702                 msg->getRegVar(), 1, 14, 1, Type_UW);
703 
704             // setPixelMaskRgn when WA ce0 is needed
705             auto setPixelMaskRgn = [=](G4_InstOption Option) -> void
706             {
707                 G4_Declare* flagDecl = createTempFlag(2, "WAce0");
708                 G4_RegVar* flagVar = flagDecl->getRegVar();
709                 G4_DstRegRegion* flag = createDst(
710                     flagVar, 0,
711                     Option == InstOpt_M16 ? 1 : 0,
712                     1, Type_UW);
713 
714                 // (1) (W) mov (1|M0) WAce0.[0|1]:uw, 0
715                 //         M0 : WAce0.0; M16 : WAce0.1
716                 // (2)     cmp (16|[M0|M16]) (eq)WAce0.0 r0:uw r0:uw
717                 // (3) (W) mov(1|M0) dstPixelMaskRgn:uw  WAce0.[0|1]:uw
718                 //         M0 : WAce0.0; M16 : WAce0.1
719                 createMov(g4::SIMD1, flag, createImm(0, Type_UW), InstOpt_WriteEnable, true);
720 
721                 G4_SrcRegRegion* r0_0 = createSrc(
722                     getRealR0()->getRegVar(), 0, 0,
723                     getRegionStride1(), Type_UW);
724                 G4_SrcRegRegion* r0_1 = createSrc(
725                     getRealR0()->getRegVar(), 0, 0,
726                     getRegionStride1(), Type_UW);
727                 G4_DstRegRegion* nullDst = createNullDst(Type_UW);
728                 G4_CondMod* flagCM = createCondMod(Mod_e, flagVar, 0);
729                 createInst(NULL, G4_cmp, flagCM, g4::NOSAT, g4::SIMD16, nullDst,
730                     r0_0, r0_1, Option, true);
731 
732                 G4_SrcRegRegion* flagSrc = createSrc(
733                     flagVar, 0,
734                     Option == InstOpt_M16 ? 1 : 0,
735                     getRegionScalar(), Type_UW);
736 
737                 // move to dstPixelMaskRgn
738                 createMov(g4::SIMD1, dstPixelMaskRgn, flagSrc, InstOpt_WriteEnable, true);
739             };
740 
741             G4_SrcRegRegion* pixelMask = NULL;
742             if (emask == vISA_EMASK_M5_NM || emask == vISA_EMASK_M5)
743             {
744                 if (pred)
745                 {
746                     //this is a Second half of a SIMD32 RT write. We need to get second half of flag register.
747                     //mov whole register in to GRF, move second word of it in to payload.
748 
749                     G4_SrcRegRegion* pixelMaskTmp = createSrc(
750                         pred->getBase()->asRegVar(), 0, 0,
751                         getRegionScalar(), Type_UD);
752                     G4_Declare* tmpDcl = createTempVar(1, Type_UD, Any);
753                     G4_DstRegRegion* tmpDst = createDst(tmpDcl->getRegVar(), 0, 0, 1, Type_UD);
754                     createMov(g4::SIMD1, tmpDst, pixelMaskTmp, InstOpt_WriteEnable, true);
755 
756                     pixelMask = createSrc(
757                         tmpDcl->getRegVar(), 0, 1, getRegionScalar(), Type_UW);
758 
759                     // move from temp register to header
760                     createMov(g4::SIMD1, dstPixelMaskRgn, pixelMask, InstOpt_WriteEnable, true);
761                 }
762                 else
763                 {
764                     if (VISA_WA_CHECK(getPWaTable(), Wa_1406950495))
765                     {
766                         setPixelMaskRgn(InstOpt_M16);
767                     }
768                     else
769                     {
770                         G4_SrcRegRegion* ce0 = createSrc(
771                             phyregpool.getMask0Reg(), 0, 0,
772                             getRegionScalar(), Type_UD);
773 
774                         // shr .14<1>:uw ce0:ud 16:uw
775                         createBinOp(G4_shr, g4::SIMD1, dstPixelMaskRgn,
776                             ce0, createImm(16, Type_UW), InstOpt_WriteEnable, true);
777                     }
778                 }
779             }
780             else
781             {
782                 if (pred)
783                 {
784                     pixelMask = createSrc(
785                         pred->getBase()->asRegVar(), 0, 0,
786                         getRegionScalar(), Type_UW);
787 
788                     //clearing lower 15 bits
789                     createMov(g4::SIMD1, dstPixelMaskRgn, pixelMask, InstOpt_WriteEnable, true);
790                 }
791                 else
792                 {
793                     if (VISA_WA_CHECK(getPWaTable(), Wa_1406950495))
794                     {
795                         setPixelMaskRgn(InstOpt_M0);
796                     }
797                     else
798                     {
799                         G4_SrcRegRegion* ce0 = createSrc(
800                             phyregpool.getMask0Reg(), 0, 0,
801                             getRegionScalar(), Type_UD);
802 
803                         // mov .14<1>:uw ce0:ud.  clearing lower 15 bits
804                         createMov(g4::SIMD1, dstPixelMaskRgn, ce0, InstOpt_WriteEnable, true);
805                     }
806                 }
807             }
808 
809             pred = NULL;
810 
811         }
812         unsigned int orImmVal = 0;
813 
814         //setting first DWORD of MHC_RT_C0 - Render Target Message Header Control
815 
816         if (cntrls.isStencil)
817         {
818             orImmVal = (0x1 << 14);
819         }
820 
821         if (cntrls.zPresent)
822         {
823             orImmVal = (0x1 << 13);
824         }
825 
826         if (cntrls.oMPresent)
827         {
828             orImmVal |= (0x1 << 12);
829         }
830 
831         if (cntrls.s0aPresent)
832         {
833             orImmVal |= (0x1 << 11);
834         }
835 
836         if (orImmVal != 0)
837         {
838             G4_SrcRegRegion* immSrcRegRgn = createSrc(msg->getRegVar(), 0, 0, getRegionScalar(), Type_UD);
839 
840             G4_DstRegRegion* immDstRegRgn = createDst(msg->getRegVar(), 0, 0, 1, Type_UD);
841 
842             G4_INST* immOrInst = createBinOp(G4_or, g4::SIMD1, immDstRegRgn, immSrcRegRgn, createImm(orImmVal, Type_UD), InstOpt_WriteEnable, true);
843             immOrInst->setOptionOn(InstOpt_WriteEnable);
844         }
845     }
846 
847     // Check whether coalescing is possible
848 #define UNINITIALIZED_DWORD 0xffffffff
849     unsigned int offset = UNINITIALIZED_DWORD;
850     // If the header is not present or split-send is available, we will try to
851     // coalesc payload by checking whether the source is already prepared in a
852     // continuous region. If so, we could reuse the source region directly
853     // instead of copying it again.
854     bool canCoalesce = !useHeader || useSplitSend;
855     G4_SrcRegRegion* prevRawOpnd = NULL;
856 
857     if (R->isNullReg()  ||
858         G->isNullReg()  ||
859         B->isNullReg()  ||
860         A->isNullReg())
861         canCoalesce = false;
862 
863     if (canCoalesce && cntrls.s0aPresent)
864     {
865         prevRawOpnd = s0a;
866         offset = getByteOffsetSrcRegion(s0a);
867     }
868 
869     if (canCoalesce && cntrls.oMPresent)
870     {
871         //by default it will check based on first opnd type, but that can be HF, F, we need second operand type
872         //according to spec oM is UW
873         canCoalesce = checkIfRegionsAreConsecutive(prevRawOpnd, oM, execSize, oM->getType());
874         prevRawOpnd = oM;
875         if (offset == UNINITIALIZED_DWORD)
876         {
877             offset = getByteOffsetSrcRegion(oM);
878         }
879     }
880 
881     if (canCoalesce)
882     {
883         if (execSize == 16 && cntrls.oMPresent)
884         {
885             // oM is 1 GRF for SIMD16 since it is UW type
886             canCoalesce = checkIfRegionsAreConsecutive(oM, R, execSize, Type_UW);
887             prevRawOpnd = R;
888         }
889         else
890         {
891             canCoalesce = checkIfRegionsAreConsecutive(prevRawOpnd, R, execSize);
892             prevRawOpnd = R;
893         }
894 
895         if (offset == UNINITIALIZED_DWORD)
896         {
897             offset = getByteOffsetSrcRegion(prevRawOpnd);
898         }
899 
900         if (canCoalesce)
901         {
902             auto tempExecSize = execSize;
903             if (FP16Data && execSize == 8)
904                 tempExecSize = g4::SIMD16;
905             canCoalesce = checkIfRegionsAreConsecutive(prevRawOpnd, G, tempExecSize) &&
906                 checkIfRegionsAreConsecutive(G, B, tempExecSize) &&
907                 checkIfRegionsAreConsecutive(B, A, tempExecSize);
908             prevRawOpnd = A;
909             if (offset == UNINITIALIZED_DWORD)
910             {
911                 offset = getByteOffsetSrcRegion(A);
912                 if (FP16Data && execSize == g4::SIMD8)
913                     offset += 8;
914             }
915         }
916     }
917 
918     if (canCoalesce && cntrls.zPresent)
919     {
920         canCoalesce = checkIfRegionsAreConsecutive(prevRawOpnd, Z, execSize);
921         prevRawOpnd = Z;
922     }
923 
924     if (canCoalesce && cntrls.isStencil)
925     {
926         canCoalesce = checkIfRegionsAreConsecutive(prevRawOpnd, S, execSize);
927         prevRawOpnd = S;
928     }
929 
930     if (canCoalesce == false)
931     {
932         // Copy parms to payload
933         unsigned regOff = 0;
934 
935         if (cntrls.s0aPresent)
936         {
937 
938             Copy_SrcRegRegion_To_Payload(payloadFOrHF, regOff, s0a, execSize, instOpt);
939         }
940 
941         if (cntrls.oMPresent)
942         {
943             Copy_SrcRegRegion_To_Payload(payloadUW, regOff, oM, execSize, instOpt);
944             //Copy_SrcRegRegion_To_Payload increments regOff by 1 if byteSize ==2
945             //works for oM since in SIMD16 it occupies one GRF
946         }
947 
948 
949         //   When RT write is HF s0a,R, G, B, A are allowed to be HF.
950         //   In SIMD8 upper DWORDS are reserved.
951         //   In SIMD16 uppder DOWRDS contain second grf worth of values if type was F.
952         //
953         // Output can be only Depth, so V0 is passed in if RGBA don't need to be outputted
954         auto offIncrement = 2;
955         if (execSize == 8 || FP16Data)
956             offIncrement = 1;
957 
958         if (!R->isNullReg())
959             Copy_SrcRegRegion_To_Payload(payloadFOrHF, regOff, R, execSize, instOpt);
960         else
961             regOff+= offIncrement;
962 
963         if (!G->isNullReg())
964             Copy_SrcRegRegion_To_Payload(payloadFOrHF, regOff, G, execSize, instOpt);
965         else
966             regOff+= offIncrement;
967 
968         if (!B->isNullReg())
969             Copy_SrcRegRegion_To_Payload(payloadFOrHF, regOff, B, execSize, instOpt);
970         else
971             regOff+= offIncrement;
972 
973         if (!A->isNullReg())
974             Copy_SrcRegRegion_To_Payload(payloadFOrHF, regOff, A, execSize, instOpt);
975         else
976             regOff += offIncrement;
977 
978         if (cntrls.zPresent)
979         {
980             Copy_SrcRegRegion_To_Payload(payloadF, regOff, Z, execSize, instOpt);
981         }
982 
983         if (cntrls.isStencil)
984         {
985             Copy_SrcRegRegion_To_Payload(payloadFOrHF, regOff, S, execSize, InstOpt_WriteEnable);
986         }
987 
988         srcToUse = createSrcRegRegion(payloadUD, getRegionStride1());
989     }
990     else
991     {
992         // Coalesce and directly use original raw operand
993         G4_Declare *dcl = R->getBase()->asRegVar()->getDeclare();
994         srcToUse = createSrc(dcl->getRegVar(), offset / 32, 0, getRegionStride1(), R->getType());
995     }
996 
997     // Now create message message descriptor
998     // 7:0 - BTI
999     // 10:8 - Render Target Message Subtype
1000     // 11 - Slot Group Select
1001     // 12 - Last Render Target Select
1002     // 13 - Reserved (DevBDW)
1003     // 13 - Per-Sample PS Outputs Enable (DevSKL+)
1004     // 17:14 - Message Type
1005     // 18 - Reserved
1006     // 19 - Header Present
1007     // 24:20 - Response Length
1008     // 28:25 - Message Length
1009     // 29 - Reserved
1010     // 30 - Message Precision Subtype (DevBDW+)
1011     // 31 - Reserved (MBZ)
1012     unsigned int fc = 0;
1013 
1014     //making explicit
1015     EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL messageType =
1016         (executionSize == EXEC_SIZE_8)
1017         ? EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_SINGLE_SOURCE_LOW
1018         : EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD16_SINGLE_SOURCE;
1019 
1020 #define RENDER_TARGET_MESSAGE_SUBTYPE_OFFSET 8
1021     fc |= (messageType << RENDER_TARGET_MESSAGE_SUBTYPE_OFFSET);
1022 
1023 #define SLOT_GROUP_SELECT_OFFSET 11
1024     //for SIMD32 for second RT Write setting this bit
1025     if (emask == vISA_EMASK_M5_NM || emask == vISA_EMASK_M5)
1026         fc |= (0x1 << SLOT_GROUP_SELECT_OFFSET);
1027 
1028     if (cntrls.isLastWrite)
1029     {
1030 #define LAST_RENDER_TARGET_SELECT_OFFSET 12
1031         fc |= (0x1 << LAST_RENDER_TARGET_SELECT_OFFSET);
1032     }
1033 
1034     if (cntrls.isPerSample)
1035     {
1036 #define PER_SAMPLE_PS_ENABLE_OFFSET 13
1037         fc += (0x1 << PER_SAMPLE_PS_ENABLE_OFFSET);
1038     }
1039 
1040     if (FP16Data)
1041     {
1042         fc |= 0x1 << MESSAGE_PRECISION_SUBTYPE_OFFSET;
1043     }
1044 
1045 #define MESSAGE_TYPE 14
1046     fc |= (0xc << MESSAGE_TYPE);
1047 
1048 #define COARSE_PIXEL_OUTPUT_ENABLE 18
1049     if (cntrls.isCoarseMode)
1050         fc |= 0x1 << COARSE_PIXEL_OUTPUT_ENABLE;
1051 #define CPS_COUNTER_EXT_MSG_DESC_OFFSET 16
1052 
1053     uint16_t extFuncCtrl = 0;
1054     if (cntrls.isNullRT && getPlatform() >= GENX_TGLLP)
1055     {
1056         // extFuncCtrl is the 16:31 bits of extDesc. NullRT is the bit 20 of extDesc.
1057         // That says NullRT is the bit 4 of extFuncCtrl.
1058 #define NULL_RENDER_TARGET 4
1059         extFuncCtrl |= 0x1 << NULL_RENDER_TARGET;
1060     }
1061 
1062     if (useSplitSend || cpsCounter)
1063     {
1064         G4_SendDescRaw *msgDesc = NULL;
1065         G4_SrcRegRegion *m0 = NULL;
1066         bool indirectExDesc = false;
1067         if (useHeader)
1068         {
1069             m0 = createSrcRegRegion(msg, getRegionStride1());
1070             msgDesc = createSendMsgDesc(fc, 0, numHeaderGRF, SFID::DP_WRITE, numRows,
1071                 extFuncCtrl, SendAccess::WRITE_ONLY, surface);
1072             msgDesc->setHeaderPresent(useHeader);
1073         }
1074         else
1075         {
1076             if (!isRTIdxNonzero && !cntrls.s0aPresent)
1077             {
1078                 // direct imm is a-ok for ext desc
1079                 msgDesc = createSendMsgDesc(fc, 0, numRows, SFID::DP_WRITE, 0,
1080                     extFuncCtrl, SendAccess::WRITE_ONLY, surface);
1081             }
1082             else
1083             {
1084                 assert(rtIndex->isImm() && "RTIndex must be imm at this point");
1085                 uint8_t RTIndex = (uint8_t)rtIndex->asImm()->getImm() & 0x7;
1086                 uint32_t desc = G4_SendDescRaw::createDesc(fc, false, numRows, 0);
1087                 uint32_t extDesc = G4_SendDescRaw::createMRTExtDesc(cntrls.s0aPresent, RTIndex,
1088                     false, 0, extFuncCtrl);
1089                 msgDesc = createGeneralMsgDesc(desc, extDesc, SendAccess::WRITE_ONLY, surface);
1090 
1091                 if (!canEncodeFullExtDesc())
1092                 {
1093                     // we must use a0 for extended msg desc in this case as there aren't enough bits to encode
1094                     // the full ext desc
1095                     // mov (1) a0.2:ud extDesc
1096                     G4_DstRegRegion* dst = createDstRegRegion(getBuiltinA0Dot2(), 1);
1097                     createMov(g4::SIMD1, dst, createImm(extDesc, Type_UD), InstOpt_WriteEnable, true);
1098                     indirectExDesc = true;
1099                 }
1100             }
1101         }
1102 
1103         /*
1104         If we need to set cps counter then ext_message descriptor
1105         needs to be a register.
1106         */
1107         if (cpsCounter)
1108         {
1109             ASSERT_USER(hasCPS(), "CPS counter is not supported");
1110             unsigned msgDescValue = msgDesc->getExtendedDesc();
1111 
1112             //shifting CPS counter by appropriate number of bits and storing in ext_descriptor operand
1113             G4_DstRegRegion *dstMove2 = createDstRegRegion(getBuiltinA0Dot2(), 1);
1114             G4_Imm *immedOpnd = createImm(msgDescValue, Type_UD);
1115 
1116             ///setting lower bits
1117             createBinOp(G4_or, g4::SIMD1, dstMove2, cpsCounter, immedOpnd, InstOpt_WriteEnable, true);
1118             indirectExDesc = true;
1119         }
1120 
1121         if (!useHeader)
1122         {
1123             m0 = srcToUse;
1124             srcToUse = createNullSrc(Type_UD);
1125         }
1126 
1127         createSplitSendToRenderTarget(
1128             pred,
1129             createNullDst(Type_UD),
1130             m0,
1131             srcToUse,
1132             indirectExDesc ? createSrcRegRegion(getBuiltinA0Dot2(), getRegionScalar()) : nullptr,
1133             execSize,
1134             msgDesc,
1135             instOpt);
1136     }
1137     else
1138     {
1139         G4_SrcRegRegion *m = srcToUse;
1140         if (useHeader)
1141             m = createSrcRegRegion(msg, getRegionStride1());
1142         createSendInst(pred, createNullDst(Type_UD), m, numRows, 0,
1143             execSize, fc, SFID::DP_WRITE, useHeader, SendAccess::WRITE_ONLY, surface, NULL, instOpt, true);
1144     }
1145     return VISA_SUCCESS;
1146 
1147 }
1148 
1149 
1150 // Bit 15 of aoffimmi is set in messages with sampler index >= 16.
IsSamplerIndexGE16(G4_Operand * aoffimmi)1151 static bool IsSamplerIndexGE16(G4_Operand* aoffimmi)
1152 {
1153     bool ret = false;
1154     if (aoffimmi && aoffimmi->isImm())
1155     {
1156         const uint16_t aoffimmiVal = (uint16_t)aoffimmi->asImm()->getInt();
1157         ret = (aoffimmiVal & 0x8000) != 0;
1158     }
1159     return ret;
1160 }
1161 
1162 
1163 // return the contents of M0.2 for sampler messages.  It must be an immediate value
createSampleHeader0Dot2(VISASampler3DSubOpCode op,bool pixelNullMask,uint16_t aoffimmi,ChannelMask channels,IR_Builder * builder)1164 static uint32_t createSampleHeader0Dot2(VISASampler3DSubOpCode op,
1165     bool pixelNullMask,
1166     uint16_t aoffimmi,
1167     ChannelMask channels,
1168     IR_Builder* builder)
1169 {
1170     uint32_t secondDword = aoffimmi & 0xfff;
1171     switch (op)
1172     {
1173     case VISA_3D_GATHER4:
1174         //gather4 source channel select
1175         secondDword |= (channels.getSingleChannel() << 16);
1176         break;
1177     case VISA_3D_GATHER4_PO:
1178         if (builder->hasGather4PO())
1179         {
1180             secondDword |= (channels.getSingleChannel() << 16);
1181         }
1182         break;
1183     case VISA_3D_GATHER4_PO_C:
1184         break;
1185     case VISA_3D_GATHER4_C:
1186         // do nothing as channle must be Red (0)
1187         break;
1188     default:
1189         // RGBA write channel mask
1190         secondDword |= (channels.getHWEncoding() << 12);
1191         break;
1192     }
1193 
1194     // M0.2:23, Pixel Null Mask Enable.
1195     // Only valid for SKL+, and ignored otherwise.
1196     if (builder->hasPixelNullMask() && pixelNullMask)
1197     {
1198         secondDword |= 1 << 23;
1199     }
1200 
1201     return secondDword;
1202 }
1203 
1204 //
1205 // Coarse Pixel Shading(CPS) LOD compensation enable.
1206 //
1207 // - must be disabled if the response length of the message is zero;
1208 // - must be disabled if the messages is from a 32-pixel dispatch thread;
1209 // - must be disabled unless SIMD Mode is SIMD8* or SIMD16*;
1210 // - only available for sample, sample_b, sample_bc, sample_c, and LOD.
1211 //
checkCPSEnable(VISASampler3DSubOpCode op,unsigned reponseLength,unsigned execSize)1212 static void checkCPSEnable(VISASampler3DSubOpCode op,
1213     unsigned reponseLength,
1214     unsigned execSize)
1215 {
1216 
1217     ASSERT_USER(reponseLength > 0,
1218         "CPS LOD Compensation Enable must be disabled if the "
1219         "response length is zero");
1220 
1221     ASSERT_USER(execSize == 8 || execSize == 16,
1222         "CPS LOD Compensation Enable only valid for SIMD8* or SIMD16*");
1223 
1224     ASSERT_USER(op == VISA_3D_SAMPLE ||
1225         op == VISA_3D_SAMPLE_B ||
1226         op == VISA_3D_SAMPLE_C ||
1227         op == VISA_3D_SAMPLE_B_C ||
1228         op == VISA_3D_LOD,
1229         "CPD LOD Compensation Enable only available for "
1230         "sample, sample_b, sample_bc, sample_c and LOD");
1231 }
1232 
createSampleHeader(IR_Builder * builder,G4_Declare * header,VISASampler3DSubOpCode actualop,bool pixelNullMask,G4_Operand * aoffimmi,ChannelMask srcChannel,G4_Operand * sampler)1233 static G4_Operand* createSampleHeader(
1234     IR_Builder* builder, G4_Declare* header, VISASampler3DSubOpCode actualop,
1235     bool pixelNullMask, G4_Operand* aoffimmi, ChannelMask srcChannel,
1236     G4_Operand* sampler)
1237 {
1238     G4_Operand* retSampler = sampler;
1239     uint16_t aoffimmiVal = aoffimmi->isImm() ? (uint16_t)aoffimmi->asImm()->getInt() : 0;
1240 
1241     unsigned int secondDword = createSampleHeader0Dot2(actualop, pixelNullMask, aoffimmiVal, srcChannel, builder);
1242 
1243     G4_Imm* immOpndSecondDword = builder->createImm(secondDword, Type_UD);
1244     G4_DstRegRegion* payloadDstRgn = builder->createDst(header->getRegVar(), 0, 2, 1, Type_UD);
1245     if (aoffimmi->isImm())
1246     {
1247         // mov (1) payload(0,2) immOpndSecondDword
1248         builder->createMov(g4::SIMD1, payloadDstRgn, immOpndSecondDword, InstOpt_WriteEnable, true);
1249     }
1250     else
1251     {
1252         // or (1) payload(0,2) aoffimmi<0;1,0>:uw immOpndSeconDword
1253         builder->createBinOp(G4_or, g4::SIMD1, payloadDstRgn,
1254             aoffimmi, immOpndSecondDword, InstOpt_WriteEnable, true);
1255     }
1256 
1257     if (sampler != nullptr)
1258     {
1259         builder->doSamplerHeaderMove(header, sampler);
1260 
1261         // Use bit 15 of aoffimmi to tell VISA the sample index could be greater
1262         // than 15.  In this case, we need to use msg header, and setup M0.3
1263         // to point to next 16 sampler state.
1264         if (IsSamplerIndexGE16(aoffimmi))
1265         {
1266             retSampler = builder->emitSampleIndexGE16(sampler, header);
1267         }
1268     }
1269 
1270     return retSampler;
1271 }
1272 
1273 
needsNoMaskCoordinates(VISASampler3DSubOpCode opcode)1274 static bool needsNoMaskCoordinates(VISASampler3DSubOpCode opcode)
1275 {
1276     return opcode == VISA_3D_SAMPLE || opcode == VISA_3D_SAMPLE_B || opcode == VISA_3D_SAMPLE_C ||
1277         opcode == VISA_3D_SAMPLE_B_C || opcode == VISA_3D_LOD || opcode == VISA_3D_SAMPLE_KILLPIX;
1278 }
1279 
getUPosition(VISASampler3DSubOpCode opcode)1280 static uint8_t getUPosition(VISASampler3DSubOpCode opcode)
1281 {
1282     uint8_t position = 0;
1283     switch (opcode)
1284     {
1285     case VISA_3D_SAMPLE:
1286     case VISA_3D_LOD:
1287     case VISA_3D_SAMPLE_D:
1288     case VISA_3D_SAMPLE_LZ:
1289     case VISA_3D_SAMPLE_KILLPIX:
1290         position = 0;
1291         break;
1292     case VISA_3D_SAMPLE_B:
1293     case VISA_3D_SAMPLE_L:
1294     case VISA_3D_SAMPLE_C:
1295     case VISA_3D_SAMPLE_D_C:
1296     case VISA_3D_SAMPLE_C_LZ:
1297         position = 1;
1298         break;
1299     case VISA_3D_SAMPLE_B_C:
1300     case VISA_3D_SAMPLE_L_C:
1301         position = 2;
1302         break;
1303     default:
1304         MUST_BE_TRUE(false, "unexpected sampler operation");
1305         return 0;
1306     }
1307     return position;
1308 }
1309 
setUniformSampler(G4_InstSend * sendInst,bool uniformSampler)1310 static void setUniformSampler(G4_InstSend* sendInst, bool uniformSampler)
1311 {
1312     if (!uniformSampler)
1313     {
1314         sendInst->setSerialize();
1315     }
1316 }
1317 
/*
Need to split sample_d and sample_dc into two simd8 sends since HW doesn't support it.
Also need to split any sample instruction that has more than 5 parameters, since there is a limit on msg length.
*/
// Running counter used as the numeric suffix when naming the temporary
// sampler destination declares ("TmpSmplDst_<N>" / "TmpSmplDst2_<N>");
// it is post-incremented at each use.
static unsigned TmpSmplDstID = 0;
1323 
1324 // TODO: use IR_Builder::getNameString....
getNameString(Mem_Manager & mem,size_t size,const char * format,...)1325 const char* getNameString(
1326     Mem_Manager& mem, size_t size, const char* format, ...)
1327 {
1328 #ifdef _DEBUG
1329     char* name = (char*) mem.alloc(size);
1330     va_list args;
1331     va_start(args, format);
1332     std::vsnprintf(name, size, format, args);
1333     va_end(args);
1334     return name;
1335 #else
1336     const char* name = "";
1337     return const_cast<char*>(name);
1338 #endif
1339 }
1340 
1341 // split simd32/16 sampler messages into simd16/8 messages due to HW limitation.
splitSampleInst(VISASampler3DSubOpCode actualop,bool pixelNullMask,bool cpsEnable,G4_Predicate * pred,ChannelMask srcChannel,int numChannels,G4_Operand * aoffimmi,G4_Operand * sampler,G4_Operand * surface,G4_DstRegRegion * dst,VISA_EMask_Ctrl emask,bool useHeader,unsigned numRows,unsigned int numParms,G4_SrcRegRegion ** params,bool uniformSampler)1342 int IR_Builder::splitSampleInst(
1343     VISASampler3DSubOpCode actualop,
1344     bool pixelNullMask,
1345     bool cpsEnable,
1346     G4_Predicate* pred,
1347     ChannelMask srcChannel,
1348     int numChannels,
1349     G4_Operand *aoffimmi,
1350     G4_Operand *sampler,
1351     G4_Operand *surface,
1352     G4_DstRegRegion* dst,
1353     VISA_EMask_Ctrl emask,
1354     bool useHeader,
1355     unsigned numRows, // msg length for each simd8
1356     unsigned int numParms,
1357     G4_SrcRegRegion ** params,
1358     bool uniformSampler)
1359 {
1360     int status = VISA_SUCCESS;
1361     G4_SrcRegRegion *secondHalf[12];
1362 
1363     bool isHalfReturn = dst->getTypeSize() == 2;
1364     const bool halfInput = params[0]->getTypeSize() == 2;
1365 
1366     // Now, depending on message type emit out parms to payload
1367     unsigned regOff = (useHeader ? 1 : 0);
1368     G4_SrcRegRegion* temp = nullptr;
1369     G4_ExecSize execSize = getNativeExecSize();
1370     uint16_t numElts = numRows * numEltPerGRF<Type_F>();
1371     G4_Declare* payloadF = createSendPayloadDcl(numElts, Type_F);
1372     G4_Declare* payloadUD = createTempVar(numElts, Type_UD, GRFALIGN);
1373     payloadUD->setAliasDeclare(payloadF, 0);
1374     G4_SrcRegRegion* srcToUse = createSrc(payloadUD->getRegVar(), 0, 0, getRegionStride1(), Type_UD);
1375 
1376     // even though we only use lower half of the GRF, we have to allocate full GRF
1377     G4_Declare* payloadHF = createTempVar(numElts * 2, Type_HF, Any);
1378     payloadHF->setAliasDeclare(payloadF, 0);
1379 
1380     /********* Creating temp destination, since results are interleaved **************/
1381     G4_DstRegRegion *dst1 = createNullDst(dst->getType());
1382     G4_Declare * originalDstDcl = nullptr;
1383     G4_Declare* tempDstDcl = nullptr;
1384     bool pixelNullMaskEnable = false;
1385     unsigned tmpDstRows = 0;
1386     if (!dst->isNullReg())
1387     {
1388         originalDstDcl = dst->getBase()->asRegVar()->getDeclare();
1389         tmpDstRows = numChannels;
1390 
1391         // If Pixel Null Mask is enabled, then one extra GRF is needed for the
1392         // write back message.
1393         pixelNullMaskEnable = hasPixelNullMask() && pixelNullMask;
1394         if (pixelNullMaskEnable) {
1395             ASSERT_USER(useHeader, "pixel null mask requires a header");
1396             ++tmpDstRows;
1397         }
1398 
1399         const char *name = getNameString(mem, 20, "%s%d", "TmpSmplDst_", TmpSmplDstID++);
1400 
1401         tempDstDcl = createDeclareNoLookup(name,
1402             originalDstDcl->getRegFile(),
1403             originalDstDcl->getNumElems(),
1404             (uint16_t)tmpDstRows,
1405             originalDstDcl->getElemType());
1406 
1407         dst1 = createDstRegRegion(dst->getRegAccess(),
1408             tempDstDcl->getRegVar(),
1409             0,
1410             0,
1411             1,
1412             dst->getType());
1413     }
1414     /********* End creating temp destination ***********************/
1415 
1416     G4_Declare* header = nullptr;
1417 
1418     if (useHeader)
1419     {
1420         const bool samplerIndexGE16 = IsSamplerIndexGE16(aoffimmi);
1421         bool bindlessSampler = sampler ? isBindlessSampler(sampler) : false;
1422         header = getSamplerHeader(bindlessSampler, samplerIndexGE16);
1423         sampler = createSampleHeader(this, header, actualop, pixelNullMask, aoffimmi, srcChannel,
1424             sampler);
1425         createMovInst(payloadUD, 0, 0, g4::SIMD8, nullptr, nullptr,
1426             createSrcRegRegion(header, getRegionStride1()), true);
1427     }
1428 
1429     G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
1430     for (unsigned paramCounter = 0; paramCounter < numParms; ++paramCounter)
1431     {
1432         temp = params[paramCounter];
1433         uint32_t MovInstOpt = InstOpt_WriteEnable;
1434         if (temp->getTypeSize() == 2)
1435         {
1436             // we should generate
1437             // mov (8) dst<1>:hf src.0<8;8,1>:hf
1438             G4_DstRegRegion* dstHF = createDst(
1439                 payloadHF->getRegVar(), regOff++, 0, 1, temp->getType());
1440             temp->setRegion(getRegionStride1());
1441             createMov(g4::SIMD8, dstHF, temp, MovInstOpt, true);
1442         }
1443         else
1444         {
1445             Copy_SrcRegRegion_To_Payload(payloadF, regOff, temp, execSize, MovInstOpt);
1446         }
1447     }
1448 
1449     uint32_t responseLength = getSamplerResponseLength(numChannels, isHalfReturn, execSize,
1450         pixelNullMaskEnable, dst->isNullReg());
1451 
1452     uint32_t fc = createSamplerMsgDesc(actualop, execSize == getNativeExecSize(), isHalfReturn, halfInput);
1453     uint32_t desc = G4_SendDescRaw::createDesc(fc, useHeader, numRows, responseLength);
1454 
1455     if (cpsEnable)
1456     {
1457         checkCPSEnable(actualop, responseLength, 8);
1458     }
1459     G4_SendDescRaw *msgDesc = createSampleMsgDesc(desc, cpsEnable, 0, surface, sampler);
1460 
1461     G4_InstSend* sendInst = nullptr;
1462     bool forceSplitSend = shouldForceSplitSend(surface);
1463 
1464     if (forceSplitSend)
1465     {
1466         sendInst = createSplitSendInst(
1467             pred, dst1, srcToUse, createNullSrc(Type_UD), execSize, msgDesc, instOpt, false);
1468     }
1469     else
1470     {
1471         sendInst = createSendInst(
1472             pred, dst1, srcToUse, execSize, msgDesc, instOpt, false);
1473     }
1474     setUniformSampler(sendInst, uniformSampler);
1475 
1476     // SKL+
1477     // For SIMD8
1478     //
1479     // W4.7:1 Reserved (not written): This W4 is only delivered when Pixel Null
1480     //        Mask Enable is enabled.
1481     //
1482     // W4.0  32:8 Reserved: always written as 0xffffff
1483     //        7:0 Pixel Null Mask: This field has the bit for all pixels set
1484     //            to 1 except those pixels in which a null page was source for
1485     //            at least one texel.
1486     //
1487     // Need to combine the results from the above two writewback messages.
1488     // Denote by U0[W4:0] the last row of the first writeback message, and
1489     // by U1[W4:0] the last row of the second writeback message. Then the last
1490     // row of the whole writeback message is to take the bitwise OR of
1491     // U0[W4:0] and U1[W4:0].
1492     G4_Declare *tempDstUD = 0;
1493     G4_Declare *tempDst2UD = 0;
1494     G4_Declare *origDstUD = 0;
1495 
1496     // temp dst for the second send
1497     G4_DstRegRegion *dst2 = createNullDst(dst->getType());
1498     G4_Declare* tempDstDcl2 = nullptr;
1499     if (!dst->isNullReg())
1500     {
1501         const char *name = getNameString(mem, 20, "%s%d", "TmpSmplDst2_", TmpSmplDstID++);
1502 
1503         tempDstDcl2 = createDeclareNoLookup(name,
1504             originalDstDcl->getRegFile(),
1505             originalDstDcl->getNumElems(),
1506             (uint16_t)tmpDstRows,
1507             originalDstDcl->getElemType());
1508 
1509         if (pixelNullMaskEnable)
1510         {
1511             unsigned int numElts = tempDstDcl->getNumElems() * tempDstDcl->getNumRows();
1512             tempDstUD = createTempVar(numElts, Type_UD, GRFALIGN);
1513             tempDstUD->setAliasDeclare(tempDstDcl, 0);
1514 
1515             numElts = tempDstDcl2->getNumElems() * tempDstDcl2->getNumRows();
1516             tempDst2UD = createTempVar(numElts, Type_UD, GRFALIGN);
1517             tempDst2UD->setAliasDeclare(tempDstDcl2, 0);
1518 
1519             numElts = originalDstDcl->getNumElems() * originalDstDcl->getNumRows();
1520             origDstUD = createTempVar(numElts, Type_UD, GRFALIGN);
1521             origDstUD->setAliasDeclare(originalDstDcl, 0);
1522         }
1523 
1524         dst2 = createDstRegRegion(dst->getRegAccess(),
1525             tempDstDcl2->getRegVar(),
1526             0,
1527             0,
1528             1,
1529             dst->getType());
1530     }
1531     // update emask
1532     emask = Get_Next_EMask(emask, execSize);
1533     G4_InstOpts instOpt2 = Get_Gen4_Emask(emask, execSize);
1534 
1535     auto dupPredicate = [this](G4_Predicate* pred)
1536     {
1537         G4_Predicate* pred2 = nullptr;
1538         if (pred)
1539         {
1540             pred2 = createPredicate(
1541                 pred->getState(),
1542                 pred->getBase(),
1543                 0);
1544         }
1545 
1546         return pred2;
1547     };
1548 
1549     {
1550         /**************** SECOND HALF OF THE SEND *********************/
1551         // re-create payload declare so the two sends may be issued independently
1552         G4_Declare* payloadF = createSendPayloadDcl(numElts, Type_F);
1553         G4_Declare* payloadUD = createTempVar(numElts, Type_UD, GRFALIGN);
1554         payloadUD->setAliasDeclare(payloadF, 0);
1555 
1556         // even though we only use lower half of the GRF, we have to allocate full GRF
1557         G4_Declare* payloadHF = createTempVar(numElts * 2, Type_HF, Any);
1558         payloadHF->setAliasDeclare(payloadF, 0);
1559 
1560         G4_SrcRegRegion *srcToUse2 = createSrc(payloadUD->getRegVar(), 0, 0, getRegionStride1(), Type_UD);
1561 
1562         if (useHeader)
1563         {
1564             createMovInst(payloadUD, 0, 0, g4::SIMD8, nullptr, nullptr,
1565                 createSrcRegRegion(header, getRegionStride1()), true);
1566         }
1567 
1568         for (unsigned int i = 0; i < numParms; i++)
1569         {
1570             if (params[i]->isNullReg())
1571             {
1572                 secondHalf[i] = params[i];
1573             }
1574             else if (params[i]->getTypeSize() == 2)
1575             {
1576                 // V1(0,8)<8;8,1>
1577                 secondHalf[i] = createSrcWithNewSubRegOff(params[i], execSize);
1578             }
1579             else
1580             {
1581                 // V1(1,0)<8;8,1>
1582                 secondHalf[i] = createSrcWithNewRegOff(params[i], params[i]->getRegOff() + 1);
1583             }
1584         }
1585 
1586         regOff = (useHeader ? 1 : 0);
1587         for (unsigned paramCounter = 0; paramCounter < numParms; ++paramCounter)
1588         {
1589             temp = secondHalf[paramCounter];
1590             uint32_t MovInstOpt = InstOpt_WriteEnable;
1591 
1592             if (temp->getTypeSize() == 2)
1593             {
1594                 // we should generate
1595                 // mov (8) dst<1>:hf src.8<8;8,1>:hf
1596                 G4_DstRegRegion* dstHF = createDst(
1597                     payloadHF->getRegVar(), regOff++, 0, 1, temp->getType());
1598                 createMov(execSize, dstHF, temp, MovInstOpt, true);
1599             }
1600             else
1601             {
1602                 Copy_SrcRegRegion_To_Payload(payloadF, regOff, temp, execSize, MovInstOpt);
1603             }
1604         }
1605 
1606         G4_Operand *surface2 = duplicateOperand(surface);
1607 
1608         // sampler may be null for 3d load (specifically ld2dms_w)
1609         G4_Operand* sampler2 = sampler == nullptr ? nullptr : duplicateOperand(sampler);
1610 
1611         G4_Predicate* pred2 = dupPredicate(pred);
1612 
1613         G4_SendDescRaw *msgDesc2 = createSampleMsgDesc(desc, cpsEnable, 0, surface2, sampler2);
1614         msgDesc2->setHeaderPresent(useHeader);
1615 
1616         if (forceSplitSend)
1617         {
1618             sendInst = createSplitSendInst(
1619                 pred2, dst2, srcToUse2, createNullSrc(Type_UD), execSize, msgDesc2, instOpt2, false);
1620         }
1621         else
1622         {
1623             sendInst = createSendInst(
1624                 pred2, dst2, srcToUse2, execSize, msgDesc2, instOpt2, false);
1625         }
1626         setUniformSampler(sendInst, uniformSampler);
1627     }
1628 
1629     {
1630 
1631         /**************** MOVING FROM TEMP TO DST, 1st half *********************/
1632         regOff = 0;
1633         for (unsigned i = 0; i < tmpDstRows; i++, regOff += 1)
1634         {
1635             // If Pixel Null Mask is enabled, then only copy the last double word.
1636             if (pixelNullMaskEnable && i == tmpDstRows - 1)
1637             {
1638                 G4_DstRegRegion *origDstPtr = createDst(origDstUD->getRegVar(), short(regOff), 0, 1, Type_UD);
1639                 G4_SrcRegRegion *src0Ptr = createSrc(tempDstUD->getRegVar(),
1640                     short(i), 0, getRegionScalar(), Type_UD);
1641 
1642                 G4_Predicate* pred2 = dupPredicate(pred);
1643 
1644                 // Copy the write mask message W4.0 into the dst. (No mask?)
1645                 createInst(pred2, G4_mov, NULL, g4::NOSAT, g4::SIMD1,
1646                     origDstPtr, src0Ptr, NULL, NULL,
1647                     InstOpt_WriteEnable, true);
1648                 // Skip the remaining part of the loop.
1649                 break;
1650             }
1651 
1652             G4_SrcRegRegion *tmpSrcPnt = createSrc(tempDstDcl->getRegVar(), (short)i, 0, getRegionStride1(), tempDstDcl->getElemType());
1653 
1654             uint32_t MovInstOpt = instOpt;
1655             if (isHalfReturn)
1656             {
1657                 // mov (8) dst(0,0)<1>:hf tmp(0,0)<8;8,1>:hf {Q1}
1658                 G4_DstRegRegion* dst = createDst(
1659                     originalDstDcl->getRegVar(), (short)regOff, 0, 1, originalDstDcl->getElemType());
1660                 createMov(execSize, dst, tmpSrcPnt, MovInstOpt, true);
1661             }
1662             else
1663             {
1664                 Copy_SrcRegRegion_To_Payload(originalDstDcl, regOff, tmpSrcPnt, execSize, MovInstOpt);
1665             }
1666         }
1667     }
1668 
1669     {
1670         /**************** MOVING FROM TEMP TO DST, 2nd half *********************/
1671         regOff = isHalfReturn ? 0 : 1;
1672         for (unsigned i = 0; i < tmpDstRows; i++, regOff += 1)
1673         {
1674             // If Pixel Null Mask is enabled, copy the second half to the originai dst
1675             if (pixelNullMaskEnable && i == tmpDstRows - 1) {
1676                 G4_Type secondHalfType = execSize == g4::SIMD8 ? Type_UB : Type_UW;
1677                 G4_DstRegRegion* origDstPtr = createDst(origDstUD->getRegVar(), regOff - 1, 1, 1, secondHalfType);
1678                 G4_SrcRegRegion* src0Ptr = createSrc(tempDst2UD->getRegVar(),
1679                     short(i), 0, getRegionScalar(), secondHalfType);
1680 
1681                 G4_Predicate* pred2 = dupPredicate(pred);
1682                 // write to dst.0[8:15]
1683                 createInst(pred2, G4_mov, NULL, g4::NOSAT, g4::SIMD1,
1684                     origDstPtr, src0Ptr, NULL, InstOpt_WriteEnable, true);
1685 
1686                 // Skip the remaining part of the loop.
1687                 break;
1688             }
1689 
1690             G4_SrcRegRegion *tmpSrcPnt = createSrc(tempDstDcl2->getRegVar(), (short)i, 0, getRegionStride1(), tempDstDcl->getElemType());
1691 
1692             uint32_t MovInstOpt = instOpt2;
1693             if (isHalfReturn)
1694             {
1695                 // mov (8) dst(0,8)<1>:hf tmp(0,0)<8;8,1>:hf {Q2}
1696                 G4_DstRegRegion* dst = createDst(
1697                     originalDstDcl->getRegVar(), (short)regOff, execSize, 1, originalDstDcl->getElemType());
1698                 createMov(execSize, dst, tmpSrcPnt, MovInstOpt, true);
1699             }
1700             else
1701             {
1702                 Copy_SrcRegRegion_To_Payload(originalDstDcl, regOff, tmpSrcPnt, execSize, MovInstOpt);
1703             }
1704         }
1705     }
1706     return status;
1707 }
1708 
doSamplerHeaderMove(G4_Declare * headerDcl,G4_Operand * sampler)1709 void IR_Builder::doSamplerHeaderMove(G4_Declare* headerDcl, G4_Operand* sampler)
1710 {
1711     if (isBindlessSampler(sampler))
1712     {
1713         // sampler index in msg desc will be 0, manipulate the sampler offset instead
1714         // mov (1) M0.3<1>:ud sampler<0;1,0>:ud the driver will send the handle with bit 0 already set
1715         G4_DstRegRegion* dst = createDst(headerDcl->getRegVar(), 0, 3, 1, Type_UD);
1716         createMov(g4::SIMD1, dst, sampler, InstOpt_WriteEnable, true);
1717     }
1718 }
1719 
1720 //
1721 // generate the r0 move for the sampler message header, and return the dcl
1722 // for CNL+, also set SSP to dynamic if message is not bindless
1723 //
getSamplerHeader(bool isBindlessSampler,bool samplerIndexGE16)1724 G4_Declare* IR_Builder::getSamplerHeader(bool isBindlessSampler, bool samplerIndexGE16)
1725 {
1726     G4_Declare* dcl = nullptr;
1727 
1728     G4_InstOpts dbgOpt = m_options->getOption(vISA_markSamplerMoves) ? InstOpt_BreakPoint : InstOpt_NoOpt;
1729     if (m_options->getOption(vISA_cacheSamplerHeader) && !isBindlessSampler)
1730     {
1731         dcl = builtinSamplerHeader;
1732         if (!builtinSamplerHeaderInitialized)
1733         {
1734             builtinSamplerHeaderInitialized = true;
1735             if (hasBindlessSampler())
1736             {
1737                 // make sure we set bit 0 of M0.3:ud to be 0
1738                 // and (1) M0.6<1>:uw M0.6<1>:uw 0xFFFE
1739                 G4_DstRegRegion* dst = createDst(dcl->getRegVar(), 0, 6, 1, Type_UW);
1740                 G4_SrcRegRegion* src0 = createSrc(dcl->getRegVar(), 0, 6, getRegionScalar(), Type_UW);
1741                 G4_INST* SSPMove = createBinOp(G4_and, g4::SIMD1, dst, src0, createImm(0xFFFE, Type_UW), InstOpt_WriteEnable, false);
1742                 instList.push_front(SSPMove);
1743             }
1744             G4_INST* r0Move = createMov(g4::SIMD8,
1745                 createDstRegRegion(dcl, 1),
1746                 createSrcRegRegion(builtinR0, getRegionStride1()),
1747                 InstOpt_WriteEnable | dbgOpt, false);
1748             instList.push_front(r0Move);
1749         }
1750         if (samplerIndexGE16)
1751         {
1752             // When sampler index is greater or equal 16 then the
1753             // createSamplerHeader() message overwrites the sampler states
1754             // pointer in the header -> cannot use the cached value in this
1755             // case.
1756             dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
1757             dcl->setCapableOfReuse();
1758             G4_SrcRegRegion* src = createSrc(builtinSamplerHeader->getRegVar(), 0, 0, getRegionStride1(), Type_UD);
1759             createMovInst(dcl, 0, 0, g4::SIMD8, NULL, NULL, src, false, dbgOpt);
1760         }
1761     }
1762     else
1763     {
1764         dcl = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_UD);
1765         dcl->setCapableOfReuse();
1766         createMovR0Inst(dcl, 0, 0, true, dbgOpt);
1767         if (hasBindlessSampler() && !isBindlessSampler)
1768         {
1769             // make sure we set bit 0 of M0.3:ud to be 0
1770             // and (1) M0.6<1>:uw M0.6<1>:uw 0xFFFE
1771             G4_DstRegRegion* dst = createDst(dcl->getRegVar(), 0, 6, 1, Type_UW);
1772             G4_SrcRegRegion* src0 = createSrc(dcl->getRegVar(), 0, 6, getRegionScalar(), Type_UW);
1773             createBinOp(G4_and, g4::SIMD1, dst, src0, createImm(0xFFFE, Type_UW), InstOpt_WriteEnable, true);
1774         }
1775     }
1776 
1777     return dcl;
1778 }
1779 
1780 // get the number of GRFs occupied by a sampler message's operand
getNumGRF(bool isFP16,int execSize)1781 static uint32_t getNumGRF(bool isFP16, int execSize)
1782 {
1783     int numBytes = (isFP16 ? 2 : 4) * execSize;
1784     return (numBytes + getGRFSize() - 1) / getGRFSize();
1785 }
1786 
getSamplerResponseLength(int numChannels,bool isFP16,int execSize,bool pixelNullMask,bool nullDst)1787 uint32_t IR_Builder::getSamplerResponseLength(
1788     int numChannels, bool isFP16, int execSize, bool pixelNullMask, bool nullDst)
1789 {
1790     if (nullDst)
1791     {
1792         hasNullReturnSampler = true;
1793         return 0;
1794     }
1795     uint32_t responseLength = numChannels * getNumGRF(isFP16, execSize);
1796 
1797     if (pixelNullMask)
1798     {
1799         ++responseLength;
1800     }
1801     return responseLength;
1802 }
1803 
needSamplerHeader(IR_Builder * builder,bool pixelNullMask,bool nonZeroAoffImmi,bool needHeaderForChannels,bool bindlessSampler,bool simd16HFReturn)1804 static bool needSamplerHeader(
1805     IR_Builder* builder, bool pixelNullMask, bool nonZeroAoffImmi,
1806     bool needHeaderForChannels, bool bindlessSampler,
1807     bool simd16HFReturn)
1808 {
1809     return builder->forceSamplerHeader() ||
1810         (pixelNullMask && builder->hasPixelNullMask()) ||
1811         nonZeroAoffImmi || needHeaderForChannels || bindlessSampler ||
1812         (simd16HFReturn && VISA_WA_CHECK(builder->getPWaTable(), WaHeaderRequiredOnSimd16Sample16bit));
1813 }
1814 
1815 // This function assumes there are no gaps in parameter array. e.g. NULL pointers
1816 // If there is a gap it must be RawOperand with value 0.
translateVISASampler3DInst(VISASampler3DSubOpCode actualop,bool pixelNullMask,bool cpsEnable,bool uniformSampler,G4_Predicate * pred,VISA_Exec_Size executionSize,VISA_EMask_Ctrl emask,ChannelMask chMask,G4_Operand * aoffimmi,G4_Operand * sampler,G4_Operand * surface,G4_DstRegRegion * dst,unsigned int numParms,G4_SrcRegRegion ** params)1817 int IR_Builder::translateVISASampler3DInst(
1818     VISASampler3DSubOpCode actualop,
1819     bool pixelNullMask,
1820     bool cpsEnable,
1821     bool uniformSampler,
1822     G4_Predicate* pred,
1823     VISA_Exec_Size executionSize,
1824     VISA_EMask_Ctrl emask,
1825     ChannelMask chMask,
1826     G4_Operand *aoffimmi,
1827     G4_Operand *sampler,
1828     G4_Operand *surface,
1829     G4_DstRegRegion* dst,
1830     unsigned int numParms,
1831     G4_SrcRegRegion ** params)
1832 {
1833     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
1834 
1835     G4_ExecSize execSize = toExecSize(executionSize);
1836     G4_InstOpts instOpt = Get_Gen4_Emask(emask, execSize);
1837 
1838     // First setup message header and message payload
1839 
1840     // Message header and payload size is numParms GRFs
1841 
1842     const bool FP16Return = dst->getTypeSize() == 2;
1843     const bool FP16Input = params[0]->getType() == Type_HF;
1844 
1845     bool useHeader = false;
1846 
1847     unsigned int numRows = numParms * getNumGRF(FP16Input, execSize);
1848 
1849     VISAChannelMask channels = chMask.getAPI();
1850     // For SKL+ channel mask R, RG, RGB, and RGBA may be derived from response length
1851     bool needHeaderForChannels = (getPlatform() < GENX_SKL) ? channels != CHANNEL_MASK_RGBA :
1852         (channels != CHANNEL_MASK_R && channels != CHANNEL_MASK_RG && channels != CHANNEL_MASK_RGB && channels != CHANNEL_MASK_RGBA);
1853 
1854     bool nonZeroAoffImmi = !(aoffimmi->isImm() && aoffimmi->asImm()->getInt() == 0);
1855     bool simd16HFReturn = FP16Return && execSize == 16;
1856     if (needSamplerHeader(this, pixelNullMask, nonZeroAoffImmi, needHeaderForChannels,
1857         isBindlessSampler(sampler),
1858         simd16HFReturn))
1859     {
1860         useHeader = true;
1861         ++numRows;
1862     }
1863 
1864     int numChannels = chMask.getNumEnabledChannels();
1865 
1866     if (execSize > getNativeExecSize() &&
1867         (numRows > 11 || actualop == VISA_3D_SAMPLE_D || actualop == VISA_3D_SAMPLE_D_C || actualop == VISA_3D_SAMPLE_KILLPIX))
1868     {
1869         // decrementing since we will produce SIMD8 code.
1870         // don't do this for SIMD16H since its message length is the same as SIMD8H
1871         if (!FP16Input)
1872         {
1873             numRows -= numParms;
1874         }
1875 
1876         return splitSampleInst(actualop, pixelNullMask, cpsEnable, pred, chMask,
1877             numChannels, aoffimmi, sampler, surface,
1878             dst, emask, useHeader, numRows, numParms, params, uniformSampler);
1879     }
1880 
1881     bool useSplitSend = useSends();
1882 
1883     G4_SrcRegRegion *header = 0;
1884     G4_Operand* samplerIdx = sampler;
1885 
1886         if (useHeader)
1887         {
1888             const bool samplerIndexGE16 = IsSamplerIndexGE16(aoffimmi);
1889             G4_Declare *dcl = getSamplerHeader(isBindlessSampler(sampler), samplerIndexGE16);
1890             samplerIdx = createSampleHeader(this, dcl, actualop, pixelNullMask, aoffimmi, chMask,
1891                 sampler);
1892             header = createSrcRegRegion(dcl, getRegionStride1());
1893         }
1894 
1895     G4_InstOpts dbgOpt = m_options->getOption(vISA_markSamplerMoves) ? InstOpt_BreakPoint : InstOpt_NoOpt;
1896     // Collect payload sources.
1897     unsigned len = numParms + (header ? 1 : 0);
1898     std::vector<PayloadSource> sources(len);
1899     unsigned i = 0;
1900     // Collect header if present.
1901     if (header) {
1902         sources[i].opnd = header;
1903         sources[i].execSize = g4::SIMD8;
1904         sources[i].instOpt = InstOpt_WriteEnable | dbgOpt;
1905         ++i;
1906     }
1907     // Collect all parameters.
1908     bool needNoMask = needsNoMaskCoordinates(actualop);
1909     unsigned uPos = needNoMask ? getUPosition(actualop) : ~0u;
1910     for (unsigned j = 0; j != numParms; ++j) {
1911         sources[i].opnd = params[j];
1912         sources[i].execSize = execSize;
1913         sources[i].instOpt = (needNoMask && (uPos <= j && j < (uPos + 3))) ?
1914             InstOpt_WriteEnable | dbgOpt : instOpt | dbgOpt;
1915         ++i;
1916     }
1917     ASSERT_USER(i == len, "There's mismatching during payload source collecting!");
1918 
1919     G4_SrcRegRegion *msgs[2] = {0, 0};
1920     unsigned sizes[2] = {0, 0};
1921     preparePayload(msgs, sizes, execSize, useSplitSend, sources.data(), len);
1922 
1923     uint32_t responseLength = getSamplerResponseLength(numChannels, FP16Return, execSize,
1924         hasPixelNullMask() && pixelNullMask, dst->isNullReg());
1925 
1926     // Check if CPS LOD Compensation Enable is valid.
1927     if (cpsEnable)
1928     {
1929         checkCPSEnable(actualop, responseLength, execSize);
1930     }
1931 
1932     uint32_t fc = createSamplerMsgDesc(actualop, execSize == getNativeExecSize(), FP16Return, FP16Input);
1933     uint32_t desc = G4_SendDescRaw::createDesc(fc, useHeader, sizes[0], responseLength);
1934 
1935     G4_InstSend* sendInst = nullptr;
1936     bool forceSplitSend = shouldForceSplitSend(surface);
1937     if (msgs[1] == 0 && !forceSplitSend)
1938     {
1939         ASSERT_USER(sizes[1] == 0, "Expect the 2nd part of the payload has zero size!");
1940         G4_SendDescRaw *msgDesc = createSampleMsgDesc(desc, cpsEnable, 0, surface, samplerIdx);
1941 
1942         sendInst = createSendInst(pred, dst, msgs[0], execSize,
1943             msgDesc, instOpt, false);
1944     }
1945     else
1946     {
1947         G4_SendDescRaw *msgDesc = createSampleMsgDesc(desc, cpsEnable, sizes[1], surface, samplerIdx);
1948         sendInst = createSplitSendInst(pred, dst, msgs[0], msgs[1],
1949             execSize, msgDesc, instOpt, false);
1950     }
1951     setUniformSampler(sendInst, uniformSampler);
1952     return VISA_SUCCESS;
1953 }
1954 
translateVISALoad3DInst(VISASampler3DSubOpCode actualop,bool pixelNullMask,G4_Predicate * pred_opnd,VISA_Exec_Size executionSize,VISA_EMask_Ctrl em,ChannelMask channelMask,G4_Operand * aoffimmi,G4_Operand * surface,G4_DstRegRegion * dst,uint8_t numParms,G4_SrcRegRegion ** opndArray)1955 int IR_Builder::translateVISALoad3DInst(
1956     VISASampler3DSubOpCode actualop,
1957     bool pixelNullMask,
1958     G4_Predicate *pred_opnd,
1959     VISA_Exec_Size executionSize,
1960     VISA_EMask_Ctrl em,
1961     ChannelMask channelMask,
1962     G4_Operand* aoffimmi,
1963     G4_Operand* surface,
1964     G4_DstRegRegion* dst,
1965     uint8_t numParms,
1966     G4_SrcRegRegion ** opndArray)
1967 {
1968     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
1969 
1970     bool useHeader = false;
1971 
1972     G4_ExecSize execSize = toExecSize(executionSize);
1973     G4_InstOpts instOpt = Get_Gen4_Emask(em, execSize);
1974 
1975     const bool halfReturn = dst->getTypeSize() == 2;
1976     const bool halfInput = opndArray[0]->getTypeSize() == 2;
1977 
1978     unsigned int numRows = numParms * getNumGRF(halfInput, execSize);
1979 
1980     VISAChannelMask channels = channelMask.getAPI();
1981     // For SKL+ channel mask R, RG, RGB, and RGBA may be derived from response length
1982     bool needHeaderForChannels = (getPlatform() < GENX_SKL) ? channels != CHANNEL_MASK_RGBA :
1983         (channels != CHANNEL_MASK_R && channels != CHANNEL_MASK_RG && channels != CHANNEL_MASK_RGB && channels != CHANNEL_MASK_RGBA);
1984 
1985     bool nonZeroAoffImmi = !(aoffimmi->isImm() && aoffimmi->asImm()->getInt() == 0);
1986     bool simd16HFReturn = halfReturn && execSize == 16;
1987     if (needSamplerHeader(this, pixelNullMask, nonZeroAoffImmi, needHeaderForChannels, false,
1988         simd16HFReturn))
1989     {
1990         useHeader = true;
1991         ++numRows;
1992     }
1993 
1994     int numChannels = channelMask.getNumEnabledChannels();
1995     if (execSize > getNativeExecSize() && numRows > 11)
1996     {
1997         // decrementing since we will produce SIMD8 code.
1998         // don't do this for SIMD16H since its message length is the same as SIMD8H
1999         if (!halfInput)
2000         {
2001             numRows -= numParms;
2002         }
2003         return splitSampleInst(actualop, pixelNullMask, /*cpsEnable*/false,
2004             pred_opnd, channelMask, numChannels, aoffimmi, NULL, surface,
2005             dst, em, useHeader, numRows, numParms, opndArray);
2006     }
2007 
2008     bool useSplitSend = useSends();
2009 
2010     G4_SrcRegRegion *header = nullptr;
2011     if (useHeader)
2012     {
2013         G4_Declare* dcl = getSamplerHeader(false /*isBindlessSampler*/, false /*samperIndexGE16*/);
2014         {
2015             (void)createSampleHeader(this, dcl, actualop, pixelNullMask, aoffimmi, channelMask,
2016                 nullptr);
2017         }
2018         header = createSrcRegRegion(dcl, getRegionStride1());
2019     }
2020 
2021     // Collect payload sources.
2022     unsigned len = numParms + (header ? 1 : 0);
2023     std::vector<PayloadSource> sources(len);
2024     unsigned i = 0;
2025     // Collect header if present.
2026     if (header) {
2027         sources[i].opnd = header;
2028         sources[i].execSize = g4::SIMD8;
2029         sources[i].instOpt = InstOpt_WriteEnable;
2030         ++i;
2031     }
2032     // Collect all parameters.
2033     bool needNoMask = needsNoMaskCoordinates(actualop);
2034     unsigned uPos = needNoMask ? getUPosition(actualop) : ~0u;
2035     for (unsigned j = 0; j != numParms; ++j) {
2036         sources[i].opnd = opndArray[j];
2037         sources[i].execSize = execSize;
2038         sources[i].instOpt = (needNoMask && (uPos <= j && j < (uPos + 3))) ?
2039             InstOpt_WriteEnable : instOpt;
2040         ++i;
2041     }
2042     ASSERT_USER(i == len, "There's mismatching during payload source collecting!");
2043 
2044     G4_SrcRegRegion *msgs[2] = {0, 0};
2045     unsigned sizes[2] = {0, 0};
2046     preparePayload(msgs, sizes, execSize, useSplitSend, sources.data(), len);
2047 
2048     uint32_t fc = createSamplerMsgDesc(actualop, execSize == getNativeExecSize(), halfReturn, halfInput);
2049 
2050     uint32_t responseLength = getSamplerResponseLength(numChannels, halfReturn, execSize,
2051         hasPixelNullMask() && pixelNullMask, dst->isNullReg());
2052 
2053     bool forceSplitSend = shouldForceSplitSend(surface);
2054     if (msgs[1] == 0 && !forceSplitSend)
2055     {
2056         createSendInst(pred_opnd, dst,
2057             msgs[0], sizes[0],
2058             responseLength,
2059             execSize, fc, SFID::SAMPLER,
2060             useHeader,
2061             SendAccess::READ_ONLY,
2062             surface, NULL,
2063             instOpt, false);
2064     }
2065     else
2066     {
2067         createSplitSendInst(pred_opnd, dst,
2068             msgs[0], sizes[0], msgs[1], sizes[1],
2069             responseLength,
2070             execSize, fc, SFID::SAMPLER,
2071             useHeader,
2072             SendAccess::READ_ONLY,
2073             surface, NULL,
2074             instOpt, false);
2075     }
2076 
2077     return VISA_SUCCESS;
2078 }
2079 
translateVISAGather3dInst(VISASampler3DSubOpCode actualop,bool pixelNullMask,G4_Predicate * pred,VISA_Exec_Size executionSize,VISA_EMask_Ctrl em,ChannelMask channelMask,G4_Operand * aoffimmi,G4_Operand * sampler,G4_Operand * surface,G4_DstRegRegion * dst,unsigned int numOpnds,G4_SrcRegRegion ** opndArray)2080 int IR_Builder::translateVISAGather3dInst(
2081     VISASampler3DSubOpCode actualop,
2082     bool pixelNullMask,
2083     G4_Predicate* pred,
2084     VISA_Exec_Size executionSize,
2085     VISA_EMask_Ctrl em,
2086     ChannelMask channelMask,
2087     G4_Operand* aoffimmi,
2088     G4_Operand* sampler,
2089     G4_Operand* surface,
2090     G4_DstRegRegion* dst,
2091     unsigned int numOpnds,
2092     G4_SrcRegRegion ** opndArray)
2093 {
2094     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
2095 
2096     bool useHeader = false;
2097 
2098     G4_ExecSize execSize = toExecSize(executionSize);
2099     G4_InstOpts instOpt = Get_Gen4_Emask(em, execSize);
2100 
2101     const bool FP16Return = dst->getTypeSize() == 2;
2102     const bool FP16Input = opndArray[0]->getType() == Type_HF;
2103 
2104     unsigned int numRows = numOpnds * getNumGRF(FP16Input, execSize);
2105 
2106     bool nonZeroAoffImmi = !(aoffimmi->isImm() && aoffimmi->asImm()->getInt() == 0);
2107     bool needHeaderForChannels = channelMask.getSingleChannel() != VISA_3D_GATHER4_CHANNEL_R;
2108     bool simd16HFReturn = FP16Return && execSize == 16;
2109 
2110     if (needSamplerHeader(this, pixelNullMask, nonZeroAoffImmi, needHeaderForChannels,
2111         isBindlessSampler(sampler),
2112         simd16HFReturn))
2113     {
2114         useHeader = true;
2115         ++numRows;
2116     }
2117 
2118 
2119     if (execSize > getNativeExecSize() && numRows > 11)
2120     {
2121         // decrementing since we will produce SIMD8 code.
2122         // don't do this for SIMD16H since its message length is the same as SIMD8H
2123         if (!FP16Input)
2124         {
2125             numRows -= numOpnds;
2126         }
2127 
2128         return splitSampleInst(actualop, pixelNullMask, /*cpsEnable*/false,
2129             pred, channelMask, 4, aoffimmi, sampler, surface,
2130             dst, em, useHeader, numRows, numOpnds, opndArray);
2131     }
2132 
2133     bool useSplitSend = useSends();
2134 
2135     G4_SrcRegRegion *header = nullptr;
2136     G4_Operand* samplerIdx = sampler;
2137 
2138     if (useHeader)
2139     {
2140         const bool samplerIndexGE16 = IsSamplerIndexGE16(aoffimmi);
2141         G4_Declare *dcl = getSamplerHeader(isBindlessSampler(sampler), samplerIndexGE16);
2142         {
2143             samplerIdx = createSampleHeader(this, dcl, actualop, pixelNullMask, aoffimmi, channelMask,
2144                 sampler);
2145         }
2146         header = createSrcRegRegion(dcl, getRegionStride1());
2147     }
2148 
2149     // Collect payload sources.
2150     unsigned len = numOpnds + (header ? 1 : 0);
2151     std::vector<PayloadSource> sources(len);
2152     unsigned i = 0;
2153     // Collect header if present.
2154     if (header) {
2155         sources[i].opnd = header;
2156         sources[i].execSize = g4::SIMD8;
2157         sources[i].instOpt = InstOpt_WriteEnable;
2158         ++i;
2159     }
2160     // Collect all parameters.
2161     bool needNoMask = needsNoMaskCoordinates(actualop);
2162     unsigned uPos = needNoMask ? getUPosition(actualop) : ~0u;
2163     for (unsigned j = 0; j != numOpnds; ++j) {
2164         sources[i].opnd = opndArray[j];
2165         sources[i].execSize = execSize;
2166         sources[i].instOpt = (needNoMask && (uPos <= j && j < (uPos + 3))) ?
2167             InstOpt_WriteEnable : instOpt;
2168         ++i;
2169     }
2170     ASSERT_USER(i == len, "There's mismatching during payload source collecting!");
2171 
2172     G4_SrcRegRegion *msgs[2] = {0, 0};
2173     unsigned sizes[2] = {0, 0};
2174     preparePayload(msgs, sizes, execSize, useSplitSend, sources.data(), len);
2175 
2176     uint32_t fc = createSamplerMsgDesc(actualop, execSize == getNativeExecSize(), FP16Return, FP16Input);
2177     uint32_t responseLength = getSamplerResponseLength(4, FP16Return, execSize,
2178         hasPixelNullMask() && pixelNullMask, dst->isNullReg());
2179 
2180     bool forceSplitSend = shouldForceSplitSend(surface);
2181     if (msgs[1] == 0 && !forceSplitSend)
2182     {
2183         createSendInst(pred, dst, msgs[0], sizes[0],
2184             responseLength,
2185             execSize, fc, SFID::SAMPLER,
2186             useHeader,
2187             SendAccess::READ_ONLY,
2188             surface, samplerIdx,
2189             instOpt, false);
2190     }
2191     else
2192     {
2193         createSplitSendInst(pred, dst,
2194             msgs[0], sizes[0], msgs[1], sizes[1],
2195             responseLength,
2196             execSize, fc, SFID::SAMPLER,
2197             useHeader,
2198             SendAccess::READ_ONLY,
2199             surface, samplerIdx,
2200             instOpt, false);
2201     }
2202 
2203     return VISA_SUCCESS;
2204 }
2205 
2206 
2207 /*
2208 * Translates Sampler Norm API intrinsic.
2209 *
2210 * Assuming: N = 4, channelMask=ABGR_ENABLE, surfIndex = 0x21, samplerIndex = 0x4,
2211 *           then the generated code should look like the following for GT:
2212 *
2213 * .declare  VX Base=m ElementSize=4 Type=ud Total=16
2214 * .declare  VY Base=r ElementSize=2 Type=uw Total=128
2215 *
2216 * mov  (8)     VX(0,0)<1>,  r0:ud
2217 * mov  (1)     VX(0,2)<1>,  0
2218 * mov  (1)     VX(1,1)<1>,  deltaU
2219 * mov  (1)     VX(1,2)<1>,  u
2220 * mov  (1)     VX(1,5)<1>,  deltaV
2221 * mov  (1)     VX(1,6)<1>,  v
2222 * send (16)    VY(0,0)<1>,  VX(0,0),    0x2,   0x048bc421
2223 * mov  (128)   M(0,0)<1>,   VY(0,0)
2224 *
2225 * VX(0,0): message header
2226 *
2227 * VX(1,0): SIMD32 media payload
2228 *
2229 * ex_desc: 0x2 == 0010 (Target Function ID: Sampling Engine)
2230 *
2231 * desc: 0x048bc421 == Bit 31-29: 000 (Reserved)
2232 *                     Bit 28-25: 0010 (Message Length =)
2233 *                     Bit 24-20: 01000 (Response Message Length = 8)
2234 *                     Bit 19:    1 (Header present)
2235 *                     Bit 18:    0 (Reserved)
2236 *                     Bit 17-16: 11 (SIMD Mode = SIMD32)
2237 *                     Bit 15-12: 1100 (Message Type = sample_unorm media)
2238 *                     Bit 11-8:  0000 + samplerIndex  (Sampler Index)
2239 *                     Bit 7-0:   00000000 + surfIndex (Binding Table Index)
2240 *
2241 */
translateVISASamplerNormInst(G4_Operand * surface,G4_Operand * sampler,ChannelMask channel,unsigned numEnabledChannels,G4_Operand * deltaUOpnd,G4_Operand * uOffOpnd,G4_Operand * deltaVOpnd,G4_Operand * vOffOpnd,G4_DstRegRegion * dst_opnd)2242 int IR_Builder::translateVISASamplerNormInst(
2243     G4_Operand* surface,
2244     G4_Operand* sampler,
2245     ChannelMask channel,
2246     unsigned numEnabledChannels,
2247     G4_Operand* deltaUOpnd,
2248     G4_Operand* uOffOpnd,
2249     G4_Operand* deltaVOpnd,
2250     G4_Operand* vOffOpnd,
2251     G4_DstRegRegion* dst_opnd)
2252 {
2253     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
2254 
2255     // mov (8)      VX(0,0)<1>,  r0:ud
2256     // add dcl for VX
2257     G4_Declare *dcl = createSendPayloadDcl(2 * GENX_SAMPLER_IO_SZ, Type_UD);
2258 
2259     // mov  VX(0,0)<1>, r0
2260     createMovR0Inst(dcl, 0, 0);
2261     /* mov (1)     VX(0,2)<1>,   0  */
2262     unsigned cmask = channel.getHWEncoding() << 12;
2263     createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, createImm(cmask, Type_UD));
2264 
2265     G4_Declare *dcl1 = createSendPayloadDcl(GENX_DATAPORT_IO_SZ, Type_F);
2266     dcl1->setAliasDeclare(dcl, numEltPerGRF<Type_UB>());
2267 
2268     // mov  (1)     VX(1,4)<1>,  deltaU
2269     createMovInst(dcl1, 0, 4, g4::SIMD1, NULL, NULL, deltaUOpnd);
2270     // mov  (1)     VX(1,2)<1>,  u
2271     createMovInst(dcl1, 0, 2, g4::SIMD1, NULL, NULL, uOffOpnd);
2272     // mov  (1)     VX(1,5)<1>,  deltaV
2273     createMovInst(dcl1, 0, 5, g4::SIMD1, NULL, NULL, deltaVOpnd);
2274     // mov  (1)     VX(1,3)<1>,  v
2275     createMovInst(dcl1, 0, 3, g4::SIMD1, NULL, NULL, vOffOpnd);
2276 
2277     // send's operands preparation
2278     // create a currDst for VX
2279     G4_SrcRegRegion* payload = createSrcRegRegion(dcl, getRegionStride1());
2280 
2281     G4_DstRegRegion* d = checkSendDst(dst_opnd->asDstRegRegion());
2282 
2283     // Set bit 12-17 for the message descriptor
2284     unsigned temp = 0;
2285     temp += 0xc << 12;   // Bit 16-12 = 1100 for Sampler Message Type
2286     temp += 0x3 << 17;   // Bit 18-17 = 11 for SIMD32 mode
2287 
2288     createSendInst(
2289         NULL,
2290         d,
2291         payload,
2292         2,
2293         32*numEnabledChannels*TypeSize(Type_UW)/numEltPerGRF<Type_UB>(),
2294         g4::SIMD32,
2295         temp,
2296         SFID::SAMPLER,
2297         1,
2298         SendAccess::READ_ONLY,
2299         surface,
2300         sampler,
2301         0,
2302         false);
2303 
2304     return VISA_SUCCESS;
2305 }
2306 
2307 
2308 /*
2309 * Translates Sampler intrinsic.
2310 *
2311 * Assuming: N = 4, channelMask=ABGR_ENABLE, surfIndex = 0x21, samplerIndex = 0x4,
2312 *           then the generated code should look like the following for GT:
2313 *
2314 * .declare  VX Base=m ElementSize=4 Type=f Total=72
2315 * .declare  VY Base=r ElementSize=4 Type=f Total=64
2316 * .declare  VZ Base=r ElementSize=2 Type=w Total=128 ALIAS(VY,0)
2317 *
2318 * mov  (8)     VX(0,0)<1>,  r0:ud
2319 * mov  (1)     VX(0,2)<1>,  0
2320 * mov  (16)    VX(1,0)<1>,  u
2321 * mov  (16)    VX(3,0)<1>,  v
2322 * mov  (16)    VX(5,0)<1>,  r
2323 * mov  (16)    VX(7,0)<1>,  0
2324 * send (16)    VY(0,0)<1>,  VX(0,0),    0x2,  0x128a0421
2325 * mov  (64)    M(0,0)<1>,   VY(0,0)
2326 *
2327 * ex_desc: 0x2 == 0010 (Target Function ID: Sampling Engine)
2328 *
2329 * desc: 0x128a0421 == Bit 31-29: 000 (Reserved)
2330 *                     Bit 28-25: 1001 (Message Length = 9 (1+2*4 for SIMD16))
2331 *                     Bit 24-20: 01000 (Response Message Length = 8)
2332 *                     Bit 19:    1 (Header present)
2333 *                     Bit 18:    0 (Reserved)
2334 *                     Bit 17-16: 10 (SIMD Mode = SIMD16)
2335 *                     Bit 15-12: 0000 (Message Type = Sample)
2336 *                     Bit 11-8:  0000 + samplerIndex  (Sampler Index)
2337 *                     Bit 7-0:   00000000 + surfIndex (Binding Table Index)
2338 *
2339 */
translateVISASamplerInst(unsigned simdMode,G4_Operand * surface,G4_Operand * sampler,ChannelMask channel,unsigned numEnabledChannels,G4_Operand * uOffOpnd,G4_Operand * vOffOpnd,G4_Operand * rOffOpnd,G4_DstRegRegion * dstOpnd)2340 int IR_Builder::translateVISASamplerInst(
2341     unsigned simdMode,
2342     G4_Operand* surface,
2343     G4_Operand* sampler,
2344     ChannelMask channel,
2345     unsigned numEnabledChannels,
2346     G4_Operand* uOffOpnd,
2347     G4_Operand* vOffOpnd,
2348     G4_Operand* rOffOpnd,
2349     G4_DstRegRegion* dstOpnd)
2350 {
2351     TIME_SCOPE(VISA_BUILDER_IR_CONSTRUCTION);
2352 
2353     // mov (8)      VX(0,0)<1>,  r0:ud
2354     // add dcl for VX
2355     unsigned num_payload_elt = simdMode/2 * numEltPerGRF<Type_UB>()/TypeSize(Type_UD);
2356     G4_Declare *dcl = createSendPayloadDcl(num_payload_elt + GENX_SAMPLER_IO_SZ, Type_UD);
2357 
2358     // mov  VX(0,0)<1>, r0
2359     createMovR0Inst(dcl, 0, 0);
2360     unsigned cmask = channel.getHWEncoding() << 12;
2361     /* mov (1)     VX(0,2)<1>,   0  */
2362     createMovInst(dcl, 0, 2, g4::SIMD1, NULL, NULL, createImm(cmask, Type_UD));
2363 
2364     // set up the message payload
2365     // lod is always uninitialized for us as we don't support it.
2366     G4_Declare *dcl1 = createSendPayloadDcl(num_payload_elt, Type_UD);
2367     dcl1->setAliasDeclare(dcl, numEltPerGRF<Type_UB>());
2368     /* mov  (sample_mode)    VX(0,0)<1>,  u */
2369     createMovSendSrcInst(dcl1, 0, 0, simdMode, uOffOpnd, 0);
2370     if (sampler == NULL)
2371     {
2372         // ld
2373         if (getPlatform() < GENX_SKL)
2374         {
2375             // the order of paramters is
2376             // u    lod        v    r
2377             /* mov  (sample_mode)    VX(sample_mode/8, 0)<1>,  lod */
2378             createMovSendSrcInst(dcl1, simdMode/8, 0, simdMode, createImm(0, Type_UD), 0);
2379             /* mov  (sample_mode)    VX(2*sample_mode/8, 0)<1>,  v */
2380             createMovSendSrcInst(dcl1, 2*simdMode/8, 0, simdMode, vOffOpnd, 0);
2381             /* mov  (sample_mode)    VX(3*sampler_mode/8, 0)<1>,  r */
2382             createMovSendSrcInst(dcl1, 3*simdMode/8, 0, simdMode, rOffOpnd, 0);
2383         }
2384         else
2385         {
2386             // SKL+: the order of paramters is
2387             // u    v   lod r
2388             /* mov  (sample_mode)    VX(sample_mode/8, 0)<1>,  v */
2389             createMovSendSrcInst(dcl1, simdMode/8, 0, simdMode, vOffOpnd, 0);
2390             /* mov  (sample_mode)    VX(2*sample_mode/8, 0)<1>,  lod */
2391             createMovSendSrcInst(dcl1, 2*simdMode/8, 0, simdMode, createImm(0, Type_UD), 0);
2392             /* mov  (sample_mode)    VX(3*sampler_mode/8, 0)<1>,  r */
2393             createMovSendSrcInst(dcl1, 3*simdMode/8, 0, simdMode, rOffOpnd, 0);
2394         }
2395     }
2396     else
2397     {
2398         // sample
2399         /* mov  (sample_mode)    VX(1 + sample_mode/8, 0)<1>,  v */
2400         createMovSendSrcInst(dcl1, simdMode/8, 0, simdMode, vOffOpnd, 0);
2401         /* mov  (sample_mode)    VX(3,0)<1>,  r */
2402         createMovSendSrcInst(dcl1, 2*simdMode/8, 0, simdMode, rOffOpnd, 0);
2403         /* mov  (sample_mode)    VX(5,0)<1>,  0 */
2404         createMovSendSrcInst(dcl1, 3*simdMode/8, 0, simdMode, createImm(0, Type_UD), 0);
2405     }
2406     // send's operands preparation
2407     // create a currDst for VX
2408     G4_SrcRegRegion* payload = createSrcRegRegion(dcl, getRegionStride1());
2409 
2410     G4_DstRegRegion* d = checkSendDst(dstOpnd->asDstRegRegion());
2411 
2412     // Set bit 9-8 for the message descriptor
2413     unsigned temp = 0;
2414 
2415     //Bit 17-18 = 10 for SIMD mode
2416     if (simdMode == 8)
2417     {
2418         temp += 0x1 << 17;
2419     }
2420     else
2421     {
2422         temp += 0x2 << 17;
2423     }
2424 
2425     if (sampler == NULL)
2426     {
2427 #define SAMPLER_MESSAGE_TYPE_OFFSET    12
2428         //LD message
2429         temp += VISASampler3DSubOpCode::VISA_3D_LD << SAMPLER_MESSAGE_TYPE_OFFSET;
2430     }
2431 
2432     if (simdMode == 16) {
2433         // redefine the type and offset of post dst.
2434         if ((d->getType() != Type_W) &&
2435             (d->getType() != Type_UW)) {
2436             short new_SubRegOff = dstOpnd->asDstRegRegion()->getSubRegOff();
2437             if (dstOpnd->getRegAccess() == Direct) {
2438                 new_SubRegOff = (dstOpnd->asDstRegRegion()->getSubRegOff() * dstOpnd->getTypeSize()) / TypeSize(Type_W);
2439             }
2440             G4_DstRegRegion new_dst(
2441                 dstOpnd->getRegAccess(),
2442                 dstOpnd->asDstRegRegion()->getBase(),
2443                 dstOpnd->asDstRegRegion()->getRegOff(),
2444                 new_SubRegOff,
2445                 1,
2446                 Type_W);
2447             d = createDstRegRegion(new_dst);
2448         }
2449     }
2450 
2451     createSendInst(
2452         NULL,
2453         d,
2454         payload,
2455         1 + simdMode/2,
2456         ((simdMode == 8) ? 32 : (numEnabledChannels*16))*TypeSize(Type_F)/numEltPerGRF<Type_UB>(),
2457         G4_ExecSize(simdMode),
2458         temp,
2459         SFID::SAMPLER,
2460         1,
2461         SendAccess::READ_ONLY,
2462         surface,
2463         sampler,
2464         0,
2465         false);
2466     return VISA_SUCCESS;
2467 }
2468