/*========================== begin_copyright_notice ============================

Copyright (C) 2020-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "BuildIR.h"

using namespace vISA;

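// Returns true if the given execution mask control is one of the NoMask
// (write-enable) variants.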
bool IR_Builder::isNoMask(VISA_EMask_Ctrl eMask) {
    switch (eMask) {
    case vISA_EMASK_M1_NM:
    case vISA_EMASK_M2_NM:
    case vISA_EMASK_M3_NM:
    case vISA_EMASK_M4_NM:
    case vISA_EMASK_M5_NM:
    case vISA_EMASK_M6_NM:
    case vISA_EMASK_M7_NM:
    case vISA_EMASK_M8_NM:
        return true;
    default:
        return false;
    }
}

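// Converts a vISA-encoded execution size into the equivalent G4_ExecSize.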
G4_ExecSize IR_Builder::toExecSize(VISA_Exec_Size execSize)
{
    switch (execSize) {
    case EXEC_SIZE_1: return g4::SIMD1;
    case EXEC_SIZE_2: return g4::SIMD2;
    case EXEC_SIZE_4: return g4::SIMD4;
    case EXEC_SIZE_8: return g4::SIMD8;
    case EXEC_SIZE_16: return g4::SIMD16;
    case EXEC_SIZE_32: return g4::SIMD32;
    default:
        MUST_BE_TRUE(false, "illegal common ISA execsize (should be 0..5).");
        return G4_ExecSize(0);
    }
}

// Vector scatter messages are either SIMD8 or SIMD16, so we have to round up
// the exec size.
VISA_Exec_Size IR_Builder::roundUpExecSize(VISA_Exec_Size execSize)
{
    // On PVC, legacy messages must be SIMD16.
    if (getNativeExecSize() == g4::SIMD16)
    {
        return EXEC_SIZE_16;
    }
    if (execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 || execSize == EXEC_SIZE_4)
    {
        return EXEC_SIZE_8;
    }
    return execSize;
}

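// Returns a declare initialized to the given immediate vector value. A
// previously materialized declare is reused from the immediate pool when
// available; otherwise the value is broadcast into a new non-spillable temp
// with a NoMask mov.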
G4_Declare* IR_Builder::getImmDcl(G4_Imm* val, int numElt)
{
    auto dcl = immPool.addImmVal(val, numElt);
    if (dcl)
    {
        return dcl;
    }
    dcl = createTempVarWithNoSpill(numElt, val->getType(), Any);
    createMov(G4_ExecSize(numElt), createDstRegRegion(dcl, 1), val,
        InstOpt_WriteEnable, true);
    return dcl;
}
/// CopySrcToMsgPayload() performs a single batch copy of a source into the
/// message payload. If that single batch needs to copy more than 2 GRFs, it
/// is split into 2 parts recursively. This implies that a single batch copy
/// MUST have a size that is a power-of-2 multiple of the GRF size.
static void CopySrcToMsgPayload(
    IR_Builder *IRB,
    G4_ExecSize execSize, uint32_t eMask,
    G4_Declare *msg, unsigned msgRegOff,
    G4_SrcRegRegion *src, unsigned srcRegOff)
{
    uint32_t numRegs = (src->getElemSize() * execSize) /
        COMMON_ISA_GRF_REG_SIZE;
    if (numRegs == 0)
    {
        // always copy at least one GRF
        numRegs = 1;
    }

    ASSERT_USER((numRegs & (numRegs - 1)) == 0,
        "The batch size of a source message copy (i.e., native raw "
        "operand size) MUST be a power-of-2 multiple of GRFs!");
    if (numRegs > 2) {
        // Copying more than 2 GRFs requires splitting. The splitting
        // algorithm is designed to be as general as possible to cover all
        // possible valid cases for message payload copying, i.e.,
        //
        // <32 x i32> -> 2 * <16 x i32>
        // <16 x i64> -> 2 * < 8 x i64>
        // <32 x i64> -> 2 * <16 x i64> -> 4 * < 8 x i64>
        //
        G4_ExecSize newExecSize {execSize / 2};
        unsigned splitOff = numRegs >> 1;
        uint32_t loEMask = IR_Builder::getSplitLoEMask(execSize, eMask);
        uint32_t hiEMask = IR_Builder::getSplitHiEMask(execSize, eMask);
        // Copy Lo
        CopySrcToMsgPayload(IRB, newExecSize, loEMask,
            msg, msgRegOff,
            src, srcRegOff);
        // Copy Hi
        CopySrcToMsgPayload(IRB, newExecSize, hiEMask,
            msg, msgRegOff + splitOff,
            src, srcRegOff + splitOff);
        return;
    }

    G4_DstRegRegion *dstRegion
        = IRB->createDst(msg->getRegVar(),
            (short)msgRegOff, 0, 1,
            src->getType());
    G4_SrcRegRegion *srcRegion
        = IRB->createSrcRegRegion(src->getModifier(),
            src->getRegAccess(),
            src->getBase(),
            src->getRegOff() + srcRegOff,
            src->getSubRegOff(),
            src->getRegion(),
            src->getType());
    IRB->createMov(execSize, dstRegion, srcRegion, eMask, true);
}

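// Copies an entire source region into the message payload in batches of at
// most <batchExSize> elements, advancing <regOff> past the GRFs written for
// each batch.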
static void Copy_Source_To_Payload(
    IR_Builder *IRB, G4_ExecSize batchExSize,
    G4_Declare *msg, unsigned &regOff,
    G4_SrcRegRegion *source, G4_ExecSize execSize,
    uint32_t eMask)
{
    ASSERT_USER(batchExSize == 1 || batchExSize == 2 || batchExSize == 4 ||
        batchExSize == 8 || batchExSize == 16 || batchExSize == 32,
        "Invalid execution size for message payload copy!");

    unsigned srcRegOff = 0;
    G4_ExecSize batchSize = std::min(batchExSize, execSize);
    uint32_t numSrcRegs = (source->getElemSize() * batchSize) /
        COMMON_ISA_GRF_REG_SIZE;
    if (numSrcRegs == 0)
    {
        // always copy at least one GRF
        numSrcRegs = 1;
    }

    for (unsigned i = 0; i < execSize; i += batchSize) {
        if (!source->isNullReg()) {
            CopySrcToMsgPayload(IRB, batchSize, eMask,
                msg, regOff, source, srcRegOff);
        }
        regOff += numSrcRegs;
        srcRegOff += numSrcRegs;
    }
}

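// Prepares the payload operand(s) for a send. If the sources already form
// one consecutive region (or two when split-send is enabled), they are used
// in place; otherwise the sources from the split position onward are copied
// into a newly allocated payload. On return, msgs[] holds the payload
// operand(s) and sizes[] their sizes in GRFs.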
void IR_Builder::preparePayload(
    G4_SrcRegRegion *msgs[2],
    unsigned sizes[2],
    G4_ExecSize batchExSize,
    bool splitSendEnabled,
    PayloadSource srcs[], unsigned len)
{
    const G4_Declare *dcls[2] = {0, 0};
    unsigned msgSizes[2] = {0, 0};
    unsigned current = 0;
    unsigned offset = 0;
    unsigned splitPos = 0;

    // Loop through all source regions to check whether they form one
    // consecutive region, or two consecutive regions when split-send is
    // enabled (splitPos records where the second one starts).
    unsigned i;
    for (i = 0; i != len; ++i) {
        G4_SrcRegRegion *srcReg = srcs[i].opnd;

        if (srcReg->isNullReg()) {
            break;
        }

        const G4_Declare *srcDcl = getDeclare(srcReg);
        ASSERT_USER(srcDcl, "Declaration is missing!");

        unsigned regionSize = srcs[i].execSize * srcReg->getTypeSize();

        if (regionSize < COMMON_ISA_GRF_REG_SIZE) {
            // FIXME: Need a better solution to decouple the value type from
            // the container type to generate better COPY if required.
            // round up to 1 GRF
            regionSize = COMMON_ISA_GRF_REG_SIZE;
        }

        if (srcDcl == dcls[current]) {
            unsigned srcOff = getByteOffsetSrcRegion(srcReg);
            // Same declaration; check whether this region starts at the
            // expected offset.
            if (offset == srcOff) {
                // Advance offset to the next expected one.
                offset += regionSize;
                msgSizes[current] += regionSize;
                continue;
            }
            // Check whether there are overlaps if split-send is enabled.
            if (splitSendEnabled && current == 0 && srcOff < offset) {
                // The source overlaps with the previously prepared sources.
                // Force copying of all sources, starting from this one, for
                // the 2nd part of the split message.
                ++current;

                ASSERT_USER(i > 0, "Split position MUST NOT be at index 0!");
                splitPos = i;
                break;
            }
        }

        if (dcls[current] == 0) {
            // First time checking the current region.
            offset = getByteOffsetSrcRegion(srcReg);
            offset += regionSize;
            msgSizes[current] += regionSize;
            dcls[current] = srcDcl;
            continue;
        }

        // Bail out if more than one consecutive region is needed but
        // split-send is not enabled.
        if (!splitSendEnabled)
            break;

        // Bail out if more than 2 consecutive regions would be needed.
        if (current != 0)
            break;

        // Start checking one more consecutive region.
        ++current;

        ASSERT_USER(i > 0, "Split position MUST NOT be at index 0!");

        // Record the 2nd consecutive region.
        splitPos = i;
        offset = getByteOffsetSrcRegion(srcReg);
        offset += regionSize;
        msgSizes[current] += regionSize;
        dcls[current] = srcDcl;
    }

    if (i == len) {
        // All sources are checked and they fit into one or two consecutive
        // regions.
        msgs[0] = srcs[0].opnd;
        msgs[1] = (splitPos == 0) ? 0 : srcs[splitPos].opnd;
        sizes[0] = msgSizes[0] / numEltPerGRF<Type_UB>();
        sizes[1] = msgSizes[1] / numEltPerGRF<Type_UB>();

        return;
    }

    // Count the remaining message size.
    for (; i != len; ++i) {
        G4_SrcRegRegion *srcReg = srcs[i].opnd;
        unsigned regionSize = srcs[i].execSize * srcReg->getTypeSize();
        if (regionSize < COMMON_ISA_GRF_REG_SIZE) {
            // FIXME: Need a better solution to decouple the value type from
            // the container type to generate better COPY if required.
            // round up to 1 GRF
            regionSize = COMMON_ISA_GRF_REG_SIZE;
        }
        msgSizes[current] += regionSize;
    }

    // Allocate a new payload variable large enough to copy in all remaining
    // sources.
    G4_Declare *msg =
        createSendPayloadDcl(msgSizes[current]/TypeSize(Type_UD), Type_UD);

    // Copy sources.
    unsigned regOff = 0;
    for (i = splitPos; i != len; ++i)
    {
        Copy_Source_To_Payload(this, batchExSize, msg, regOff, srcs[i].opnd,
            srcs[i].execSize, srcs[i].instOpt);
    }

    i = 0;
    if (current > 0) {
        msgs[i] = srcs[0].opnd;
        sizes[i] = msgSizes[0] / numEltPerGRF<Type_UB>();
        ++i;
    }
    msgs[i] = createSrcRegRegion(msg, getRegionStride1());
    sizes[i] = msgSizes[current] / numEltPerGRF<Type_UB>();
}

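// Coalesces the given sources into a single contiguous send payload.
// Trivial cases (all sources null, or only src0 non-null with a suitable
// size/alignment) are returned as-is; otherwise each non-null source is
// copied into a fresh even-aligned payload, one <payloadWidth>-element
// block per source.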
G4_SrcRegRegion *IR_Builder::coalescePayload(
    unsigned sourceAlignment,
    unsigned payloadAlignment,
    uint32_t payloadWidth,   // number of elements for one payload in the send.
    uint32_t srcSize,        // number of elements provided by src
    std::initializer_list<G4_SrcRegRegion *> srcs,
    VISA_EMask_Ctrl emask)
{
    MUST_BE_TRUE(sourceAlignment != 0 && payloadAlignment != 0,
        "alignment mustn't be 0");
    MUST_BE_TRUE(payloadAlignment % 4 == 0, // we could relax this with smarter code below
        "result alignment must be a multiple of 4");
    MUST_BE_TRUE(srcs.size() > 0, "empty initializer list");

    // First check for trivial cases.  If all are null, then we can
    // return null.  This is the case for operations like load's src1 and
    // atomics with no argument (e.g. atomic increment).
    //
    // If the first src is the only non-null register and its alignment fits,
    // then we can just return that register and call it a day.  This is the
    // common case for things like stores or atomics with a single
    // data parameter (e.g. atomic add).
    bool allNull = true;
    bool onlySrc0NonNull = true;
    int ix = 0;
    for (G4_SrcRegRegion *src : srcs) {
        allNull &= src->isNullReg();
        onlySrc0NonNull &= ix++ == 0 || src->isNullReg();
    }
    G4_SrcRegRegion *src0 = *srcs.begin();
    if (allNull) {
        return src0;
    } else if (onlySrc0NonNull) {
        const G4_Declare *src0Dcl = getDeclare(src0);
        MUST_BE_TRUE(src0Dcl, "declaration missing");
        unsigned src0Size = src0Dcl->getTotalElems() * src0Dcl->getElemSize();
        if (src0Size % sourceAlignment == 0 &&
            src0Size % payloadAlignment == 0)
        {
            return src0;
        }
    }

    // Otherwise, we have to do some copying.
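    // alignTo(a, n) rounds n up to the nearest multiple of a.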
    auto alignTo = [](size_t a, size_t n) {
        return (n + a - 1) - ((n + a - 1) % a);
    };

    int numPayloadGRF = 0;
    // precompute the necessary payload size in GRFs
    for (G4_SrcRegRegion *src : srcs) {
        if (src && !src->isNullReg())
        {
            // ToDo: add D16 support later
            auto laneSize = src->getTypeSize() == 8 ? 8 : 4;
            numPayloadGRF += std::max(1u, (payloadWidth * laneSize) / getGRFSize());
        }
    }

    G4_Declare *payloadDeclUD = createSendPayloadDcl(numPayloadGRF * getGRFSize() / 4, Type_UD);
    payloadDeclUD->setEvenAlign();

    unsigned row = 0;
    for (G4_SrcRegRegion *src : srcs) {
        if (src && !src->isNullReg()) {

            // ToDo: add D16 support later
            auto laneSize = src->getTypeSize() == 8 ? 8 : 4;
            auto totalSize = srcSize * laneSize;

            // For each source we copy <srcSize> lanes to its corresponding
            // location in the payload.
            // src must be GRF-aligned per vISA spec requirement.
            // Two moves may be necessary for 64-bit types.
            auto copyRegion = [&](G4_Type type) {
                uint32_t numMoves = std::max(1u, totalSize / (2 * getGRFSize()));
                auto moveMask = emask;
                G4_ExecSize MAX_SIMD {std::min(srcSize,
                    getNativeExecSize() * (laneSize == 8 ? 1u : 2u))};
                for (unsigned i = 0; i < numMoves; i++) {
                    auto rowOffset = i * 2;
                    unsigned int instOpt = Get_Gen4_Emask(moveMask, MAX_SIMD);
                    G4_DstRegRegion* dstRegion = createDst(
                        payloadDeclUD->getRegVar(),
                        row + rowOffset, 0,
                        1, type);
                    G4_SrcRegRegion* srcRegion = createSrc(
                        src->getTopDcl()->getRegVar(),
                        src->getRegOff() + rowOffset, 0,
                        getRegionStride1(),
                        type);
                    createMov(MAX_SIMD, dstRegion, srcRegion, instOpt, true);
                    moveMask = Get_Next_EMask(moveMask, MAX_SIMD);
                }
            };

            copyRegion(src->getType());

            // advance the payload offset by <payloadWidth> elements
            row += std::max(1u, (payloadWidth * laneSize) / getGRFSize());
        }
    }

    return createSrcRegRegion(payloadDeclUD, getRegionStride1());
}

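// Copies a single source region into <payload> starting at GRF row <regOff>,
// reinterpreting the source as the payload's element type, and advances
// <regOff> past the row(s) written.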
void IR_Builder::Copy_SrcRegRegion_To_Payload(
    G4_Declare* payload, unsigned int& regOff, G4_SrcRegRegion* src,
    G4_ExecSize execSize, uint32_t emask)
{
    auto payloadDstRgn = createDst(payload->getRegVar(), (short)regOff, 0, 1, payload->getElemType());

    G4_SrcRegRegion* srcRgn = createSrcRegRegion(*src);
    srcRgn->setType(payload->getElemType());
    createMov(execSize, payloadDstRgn, srcRgn, emask, true);
    if (TypeSize(payload->getElemType()) == 2)
    {
        // for half float, each source occupies 1 GRF regardless of execution size
        regOff++;
    }
    else
    {
        regOff += execSize / getNativeExecSize();
    }
}

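// Computes the byte offset of a source region from the start of its root
// declare, folding in any alias offsets along the alias chain.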
unsigned int IR_Builder::getByteOffsetSrcRegion(G4_SrcRegRegion* srcRegion)
{
    unsigned int offset =
        (srcRegion->getRegOff() * numEltPerGRF<Type_UB>()) +
        (srcRegion->getSubRegOff() * srcRegion->getTypeSize());

    if (srcRegion->getBase() &&
        srcRegion->getBase()->isRegVar())
    {
        G4_Declare* dcl = srcRegion->getBase()->asRegVar()->getDeclare();

        if (dcl != NULL)
        {
            while (dcl->getAliasDeclare() != NULL)
            {
                offset += dcl->getAliasOffset();
                dcl = dcl->getAliasDeclare();
            }
        }
    }

    return offset;
}

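// Returns true if <second> starts exactly where <first> ends, i.e., the two
// regions are back-to-back within the same declare; null regions are
// trivially considered consecutive.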
bool IR_Builder::checkIfRegionsAreConsecutive(
    G4_SrcRegRegion* first, G4_SrcRegRegion* second, G4_ExecSize execSize)
{
    if (first == NULL || second == NULL)
    {
        return true;
    }

    return checkIfRegionsAreConsecutive(first, second, execSize, first->getType());
}

bool IR_Builder::checkIfRegionsAreConsecutive(
    G4_SrcRegRegion* first, G4_SrcRegRegion* second, G4_ExecSize execSize, G4_Type type)
{
    bool isConsecutive = false;

    if (first == NULL || second == NULL)
    {
        isConsecutive = true;
    }
    else
    {
        G4_Declare* firstDcl = getDeclare(first);
        G4_Declare* secondDcl = getDeclare(second);

        unsigned int firstOff = getByteOffsetSrcRegion(first);
        unsigned int secondOff = getByteOffsetSrcRegion(second);

        if (firstDcl == secondDcl)
        {
            if (firstOff + execSize * TypeSize(type) == secondOff)
            {
                isConsecutive = true;
            }
        }
    }

    return isConsecutive;
}

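// Records the current CISA offset as a placeholder that debug info emission
// can refer back to later.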
int IR_Builder::generateDebugInfoPlaceholder()
{
    debugInfoPlaceholder = curCISAOffset;
    return VISA_SUCCESS;
}
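// Lowers a vISA lifetime.start/end instruction: lifetime.start becomes a
// pseudo_kill of the variable, while lifetime.end becomes a pseudo_use; both
// are NoMask SIMD1 intrinsics.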
int IR_Builder::translateVISALifetimeInst(uint8_t properties, G4_Operand* var)
{
    // Lifetime.start/end are two variants of this instruction
    createImm(properties & 0x1, Type_UB);

    if ((properties & 0x1) == LIFETIME_START)
    {
        G4_DstRegRegion* varDstRgn = createDst(var->getBase(), 0, 0, 1, Type_UD);
        createIntrinsicInst(
            nullptr, Intrinsic::PseudoKill, g4::SIMD1,
            varDstRgn, createImm((unsigned int)PseudoKillType::Src),
            nullptr, nullptr, InstOpt_WriteEnable, true);
    }
    else
    {
        G4_SrcRegRegion* varSrcRgn = createSrc(var->getBase(), 0, 0, getRegionScalar(), Type_UD);
        createIntrinsicInst(nullptr, Intrinsic::PseudoUse, g4::SIMD1, nullptr, varSrcRgn,
            nullptr, nullptr, InstOpt_WriteEnable, true);
    }

    // We don't treat lifetime.end specially for now because lifetime.start
    // is expected to halt propagation of liveness upwards. lifetime.start
    // prevents loop-local and subroutine-local variables from being live
    // across the entire loop/subroutine.

    return VISA_SUCCESS;
}