1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2020-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "BuildIR.h"
10
11 using namespace vISA;
12
isNoMask(VISA_EMask_Ctrl eMask)13 bool IR_Builder::isNoMask(VISA_EMask_Ctrl eMask) {
14 switch (eMask) {
15 case vISA_EMASK_M1_NM:
16 case vISA_EMASK_M2_NM:
17 case vISA_EMASK_M3_NM:
18 case vISA_EMASK_M4_NM:
19 case vISA_EMASK_M5_NM:
20 case vISA_EMASK_M6_NM:
21 case vISA_EMASK_M7_NM:
22 case vISA_EMASK_M8_NM:
23 return true;
24 default:
25 return false;
26 }
27 }
28
toExecSize(VISA_Exec_Size execSize)29 G4_ExecSize IR_Builder::toExecSize(VISA_Exec_Size execSize)
30 {
31 switch (execSize) {
32 case EXEC_SIZE_1: return g4::SIMD1;
33 case EXEC_SIZE_2: return g4::SIMD2;
34 case EXEC_SIZE_4: return g4::SIMD4;
35 case EXEC_SIZE_8: return g4::SIMD8;
36 case EXEC_SIZE_16: return g4::SIMD16;
37 case EXEC_SIZE_32: return g4::SIMD32;
38 default:
39 MUST_BE_TRUE(false, "illegal common ISA execsize (should be 0..5).");
40 return G4_ExecSize(0);
41 }
42 }
43
44 // vector scatter messages are either SIMD8/16, so we have to round up
45 // the exec size
roundUpExecSize(VISA_Exec_Size execSize)46 VISA_Exec_Size IR_Builder::roundUpExecSize(VISA_Exec_Size execSize)
47 {
48 // for PVC legacy messages must be SIMD16
49 if (getNativeExecSize() == g4::SIMD16)
50 {
51 return EXEC_SIZE_16;
52 }
53 if (execSize == EXEC_SIZE_1 || execSize == EXEC_SIZE_2 || execSize == EXEC_SIZE_4)
54 {
55 return EXEC_SIZE_8;
56 }
57 return execSize;
58 }
59
getImmDcl(G4_Imm * val,int numElt)60 G4_Declare* IR_Builder::getImmDcl(G4_Imm* val, int numElt)
61 {
62 auto dcl = immPool.addImmVal(val, numElt);
63 if (dcl)
64 {
65 return dcl;
66 }
67 dcl = createTempVarWithNoSpill(numElt, val->getType(), Any);
68 createMov(G4_ExecSize(numElt), createDstRegRegion(dcl, 1), val,
69 InstOpt_WriteEnable, true);
70 return dcl;
71 }
72
73
74
75
/// CopySrcToMsgPayload() performs a single batch of copy source into message
/// payload. If that single batch needs to copy more than 2 GRFs, it will be
/// split into 2 parts recursively. That implies a single batch copy MUST have
/// the size of power-of-2 multiple GRFs.
///
/// msgRegOff/srcRegOff are GRF-granular offsets into the payload declare and
/// the source region, respectively; both advance in lock-step on each split.
static void CopySrcToMsgPayload(
    IR_Builder *IRB,
    G4_ExecSize execSize, uint32_t eMask,
    G4_Declare *msg, unsigned msgRegOff,
    G4_SrcRegRegion *src, unsigned srcRegOff)
{
    // GRFs covered by this batch: element size times lane count.
    uint32_t numRegs = (src->getElemSize() * execSize) /
        COMMON_ISA_GRF_REG_SIZE;
    if (numRegs == 0)
    {
        // always copy at least one GRF
        numRegs = 1;
    }

    ASSERT_USER((numRegs & (numRegs - 1)) == 0,
        "The batch size of a source message copy (i.e., native raw "
        "operand size) MUST be power-of-2 multiple of GRFs!");

    if (numRegs > 2) {
        // Copying of 2+ GRFs needs splitting. The splitting algorithm is
        // designed to be as general as possible to cover all possible valid
        // cases for message payload copying, i.e.,
        //
        // <32 x i32> -> 2 * <16 x i32>
        // <16 x i64> -> 2 * < 8 x i64>
        // <32 x i64> -> 2 * <16 x i64> -> 4 * < 8 x i64>
        //
        G4_ExecSize newExecSize {execSize / 2};
        unsigned splitOff = numRegs >> 1;
        // Derive the execution masks for the lower and upper halves.
        uint32_t loEMask = IR_Builder::getSplitLoEMask(execSize, eMask);
        uint32_t hiEMask = IR_Builder::getSplitHiEMask(execSize, eMask);
        // Copy Lo
        CopySrcToMsgPayload(IRB, newExecSize, loEMask,
            msg, msgRegOff,
            src, srcRegOff);
        // Copy Hi
        CopySrcToMsgPayload(IRB, newExecSize, hiEMask,
            msg, msgRegOff + splitOff,
            src, srcRegOff + splitOff);
        return;
    }

    // Base case: at most 2 GRFs — emit one mov from the (offset) source
    // region into the payload declare at msgRegOff.
    G4_DstRegRegion *dstRegion
        = IRB->createDst(msg->getRegVar(),
            (short)msgRegOff, 0, 1,
            src->getType());
    G4_SrcRegRegion *srcRegion
        = IRB->createSrcRegRegion(src->getModifier(),
            src->getRegAccess(),
            src->getBase(),
            src->getRegOff() + srcRegOff,
            src->getSubRegOff(),
            src->getRegion(),
            src->getType());
    IRB->createMov(execSize, dstRegion, srcRegion, eMask, true);
}
136
Copy_Source_To_Payload(IR_Builder * IRB,G4_ExecSize batchExSize,G4_Declare * msg,unsigned & regOff,G4_SrcRegRegion * source,G4_ExecSize execSize,uint32_t eMask)137 static void Copy_Source_To_Payload(
138 IR_Builder *IRB, G4_ExecSize batchExSize,
139 G4_Declare *msg, unsigned ®Off,
140 G4_SrcRegRegion *source, G4_ExecSize execSize,
141 uint32_t eMask)
142 {
143 ASSERT_USER(batchExSize == 1 || batchExSize == 2 || batchExSize == 4 ||
144 batchExSize == 8 || batchExSize == 16 || batchExSize == 32,
145 "Invalid execution size for message payload copy!");
146
147 unsigned srcRegOff = 0;
148 G4_ExecSize batchSize = std::min(batchExSize, execSize);
149 uint32_t numSrcRegs = (source->getElemSize() * batchSize) /
150 COMMON_ISA_GRF_REG_SIZE;
151 if (numSrcRegs == 0)
152 {
153 // always copy at least one GRF
154 numSrcRegs = 1;
155 }
156
157 for (unsigned i = 0; i < execSize; i += batchSize) {
158 if (!source->isNullReg()) {
159 CopySrcToMsgPayload(IRB, batchSize, eMask,
160 msg, regOff, source, srcRegOff);
161 }
162 regOff += numSrcRegs;
163 srcRegOff += numSrcRegs;
164 }
165 }
166
// Prepares up to two message operands (msgs/sizes, in GRFs) from the given
// source list. If the sources already form one consecutive region (or two,
// when split-send is enabled), they are used in place with no copy;
// otherwise the non-contiguous tail starting at the split position is copied
// into a freshly allocated payload declare.
void IR_Builder::preparePayload(
    G4_SrcRegRegion *msgs[2],
    unsigned sizes[2],
    G4_ExecSize batchExSize,
    bool splitSendEnabled,
    PayloadSource srcs[], unsigned len)
{
    const G4_Declare *dcls[2] = {0, 0};      // declare backing each region
    unsigned msgSizes[2] = {0, 0};           // accumulated size in bytes
    unsigned current = 0;                    // region being grown (0 or 1)
    unsigned offset = 0;                     // next expected byte offset
    unsigned splitPos = 0;                   // source index starting region 1

    // Loop through all source regions to check whether they forms one
    // consecutive regions or one/two consecutive regions if splitIndex is
    // non-zero.
    unsigned i;
    for (i = 0; i != len; ++i) {
        G4_SrcRegRegion *srcReg = srcs[i].opnd;

        if (srcReg->isNullReg()) {
            break;
        }

        const G4_Declare *srcDcl = getDeclare(srcReg);
        ASSERT_USER(srcDcl, "Declaration is missing!");

        unsigned regionSize = srcs[i].execSize * srcReg->getTypeSize();

        if (regionSize < COMMON_ISA_GRF_REG_SIZE) {
            // FIXME: Need a better solution to decouple the value type from
            // the container type to generate better COPY if required.
            // round up to 1 GRF
            regionSize = COMMON_ISA_GRF_REG_SIZE;
        }

        if (srcDcl == dcls[current]) {
            unsigned srcOff = getByteOffsetSrcRegion(srcReg);
            // Check offset if they have the same declaration.
            if (offset == srcOff) {
                // Advance offset to next expected one.
                offset += regionSize;
                msgSizes[current] += regionSize;
                continue;
            }
            // Check whether there are overlaps if split-send is enabled.
            if (splitSendEnabled && current == 0 && srcOff < offset) {
                // The source overlaps with the previous sources prepared.
                // Force to copy all sources from the this source for the 2nd
                // part in the split message.
                ++current;

                ASSERT_USER(i > 0, "Split position MUST NOT be at index 0!");
                splitPos = i;
                break;
            }
        }

        if (dcls[current] == 0) {
            // First time checking the current region.
            offset = getByteOffsetSrcRegion(srcReg);
            offset += regionSize;
            msgSizes[current] += regionSize;
            dcls[current] = srcDcl;
            continue;
        }

        // Bail out if more than 1 consecutive regions are needed but
        // split-send is not enabled.
        if (!splitSendEnabled)
            break;

        // Bail out if more than 2 consecutive regions will be needed.
        if (current != 0)
            break;

        // Check one more consecutive regions.
        ++current;

        ASSERT_USER(i > 0, "Split position MUST NOT be at index 0!");

        // Record the 2nd consecutive region.
        splitPos = i;
        offset = getByteOffsetSrcRegion(srcReg);
        offset += regionSize;
        msgSizes[current] += regionSize;
        dcls[current] = srcDcl;
    }

    if (i == len) {
        // All sources are checked and they are fit into one or two consecutive
        // regions. No copying needed: return the regions in place.
        msgs[0] = srcs[0].opnd;
        msgs[1] = (splitPos == 0) ? 0 : srcs[splitPos].opnd;
        sizes[0] = msgSizes[0] / numEltPerGRF<Type_UB>();
        sizes[1] = msgSizes[1] / numEltPerGRF<Type_UB>();

        return;
    }

    // Count remaining message size.
    for (; i != len; ++i) {
        G4_SrcRegRegion *srcReg = srcs[i].opnd;
        unsigned regionSize = srcs[i].execSize * srcReg->getTypeSize();
        if (regionSize < COMMON_ISA_GRF_REG_SIZE) {
            // FIXME: Need a better solution to decouple the value type from
            // the container type to generate better COPY if required.
            // round up to 1 GRF
            regionSize = COMMON_ISA_GRF_REG_SIZE;
        }
        msgSizes[current] += regionSize;
    }

    // Allocate a new large enough GPR to copy in the payload.
    G4_Declare *msg =
        createSendPayloadDcl(msgSizes[current]/TypeSize(Type_UD), Type_UD);

    // Copy sources (from the split position onward) into the new payload.
    unsigned regOff = 0;
    for (i = splitPos; i != len; ++i)
    {
        Copy_Source_To_Payload(this, batchExSize, msg, regOff, srcs[i].opnd,
            srcs[i].execSize, srcs[i].instOpt);
    }

    // Emit result operands: the untouched first region (if split), then the
    // copied payload.
    i = 0;
    if (current > 0) {
        msgs[i] = srcs[0].opnd;
        sizes[i] = msgSizes[0] / numEltPerGRF<Type_UB>();
        ++i;
    }
    msgs[i] = createSrcRegRegion(msg, getRegionStride1());
    sizes[i] = msgSizes[current] / numEltPerGRF<Type_UB>();
}
301
coalescePayload(unsigned sourceAlignment,unsigned payloadAlignment,uint32_t payloadWidth,uint32_t srcSize,std::initializer_list<G4_SrcRegRegion * > srcs,VISA_EMask_Ctrl emask)302 G4_SrcRegRegion *IR_Builder::coalescePayload(
303 unsigned sourceAlignment,
304 unsigned payloadAlignment,
305 uint32_t payloadWidth, // number of elements for one payload in the send.
306 uint32_t srcSize, // number of elements provided by src
307 std::initializer_list<G4_SrcRegRegion *> srcs,
308 VISA_EMask_Ctrl emask)
309 {
310 MUST_BE_TRUE(sourceAlignment != 0 && payloadAlignment != 0,
311 "alignment mustn't be 0");
312 MUST_BE_TRUE(payloadAlignment % 4 == 0, // we could relax this with smarter code below
313 "result alignment must be multiple of 4");
314 MUST_BE_TRUE(srcs.size() > 0,"empty initializer list");
315
316 // First check for trivial cases. If all are null, then we can
317 // return null. This is the case for operations like load's src1 and
318 // atomics with no argument (e.g. atomic increment).
319 //
320 // If the first src is the only non-null register and it's alignment fits
321 // then we can just return that register and call it a day. This is the
322 // common case for things like stores or atomics with a single
323 // data parameter (e.g. atomic add).
324 bool allNull = true;
325 bool onlySrc0NonNull = true;
326 int ix = 0;
327 for (G4_SrcRegRegion *src : srcs) {
328 allNull &= src->isNullReg();
329 onlySrc0NonNull &= ix++ == 0 || src->isNullReg();
330 }
331 G4_SrcRegRegion *src0 = *srcs.begin();
332 if (allNull) {
333 return src0;
334 } else if (onlySrc0NonNull) {
335 const G4_Declare *src0Dcl = getDeclare(src0);
336 MUST_BE_TRUE(src0Dcl, "declaration missing");
337 unsigned src0Size = src0Dcl->getTotalElems()*src0Dcl->getElemSize();
338 if (src0Size % sourceAlignment == 0 &&
339 src0Size % payloadAlignment == 0)
340 {
341 return src0;
342 }
343 }
344
345 // Otherwise, we have to do some copying
346 auto alignTo = [] (size_t a, size_t n) {
347 return (n + a - 1) - ((n + a - 1)%a);
348 };
349
350 int numPayloadGRF = 0;
351 // precompute the necessary region size
352 for (G4_SrcRegRegion *src : srcs) {
353 if (src && !src->isNullReg())
354 {
355 // ToDo: add D16 support later
356 auto laneSize = src->getTypeSize() == 8 ? 8 : 4;
357 numPayloadGRF += std::max(1u, (payloadWidth * laneSize) / getGRFSize());
358 }
359 }
360
361 G4_Declare *payloadDeclUD = createSendPayloadDcl(numPayloadGRF * getGRFSize() / 4, Type_UD);
362 payloadDeclUD->setEvenAlign();
363
364 unsigned row = 0;
365 for (G4_SrcRegRegion *src : srcs) {
366 if (src && !src->isNullReg()) {
367
368 // ToDo: add D16 support later
369 auto laneSize = src->getTypeSize() == 8 ? 8 : 4;
370 auto totalSize = srcSize * laneSize;
371
372 // for each payload we copy <srcSize> lanes to its corresponding location in payload
373 // src must be GRF-aligned per vISA spec requirement
374 // Two moves may be necessary for 64-bit types
375 auto copyRegion =
376 [&] (G4_Type type) {
377 uint32_t numMoves = std::max(1u, totalSize / (2 * getGRFSize()));
378 auto moveMask = emask;
379 G4_ExecSize MAX_SIMD {std::min(srcSize, getNativeExecSize() * (laneSize == 8 ? 1u : 2u))};
380 for (unsigned i = 0; i < numMoves; i++) {
381 auto rowOffset = i * 2;
382 unsigned int instOpt = Get_Gen4_Emask(moveMask, MAX_SIMD);
383 G4_DstRegRegion* dstRegion =
384 createDst(
385 payloadDeclUD->getRegVar(),
386 row + rowOffset, 0,
387 1, type);
388 G4_SrcRegRegion* srcRegion =
389 createSrc(
390 src->getTopDcl()->getRegVar(), src->getRegOff() + rowOffset, 0,
391 getRegionStride1(),
392 type);
393 createMov(MAX_SIMD,
394 dstRegion, srcRegion, instOpt, true);
395 moveMask = Get_Next_EMask(moveMask, MAX_SIMD);
396 }
397 };
398
399 copyRegion(src->getType());
400
401 // advance the payload offset by <payloadWidth> elements
402 row += std::max(1u, (payloadWidth * laneSize) / getGRFSize());
403 }
404 }
405
406 return createSrcRegRegion(payloadDeclUD, getRegionStride1());
407 }
408
409
Copy_SrcRegRegion_To_Payload(G4_Declare * payload,unsigned int & regOff,G4_SrcRegRegion * src,G4_ExecSize execSize,uint32_t emask)410 void IR_Builder::Copy_SrcRegRegion_To_Payload(
411 G4_Declare* payload, unsigned int& regOff, G4_SrcRegRegion* src,
412 G4_ExecSize execSize, uint32_t emask)
413 {
414 auto payloadDstRgn = createDst(payload->getRegVar(), (short)regOff, 0, 1, payload->getElemType());
415
416 G4_SrcRegRegion* srcRgn = createSrcRegRegion(*src);
417 srcRgn->setType(payload->getElemType());
418 createMov(execSize, payloadDstRgn, srcRgn, emask, true);
419 if (TypeSize(payload->getElemType()) == 2)
420 {
421 // for half float each source occupies 1 GRF regardless of execution size
422 regOff++;
423 }
424 else
425 {
426 regOff += execSize / getNativeExecSize();
427 }
428 }
429
getByteOffsetSrcRegion(G4_SrcRegRegion * srcRegion)430 unsigned int IR_Builder::getByteOffsetSrcRegion(G4_SrcRegRegion* srcRegion)
431 {
432 unsigned int offset =
433 (srcRegion->getRegOff() * numEltPerGRF<Type_UB>()) +
434 (srcRegion->getSubRegOff() * srcRegion->getTypeSize());
435
436 if (srcRegion->getBase() &&
437 srcRegion->getBase()->isRegVar())
438 {
439 G4_Declare* dcl = srcRegion->getBase()->asRegVar()->getDeclare();
440
441 if (dcl != NULL)
442 {
443 while (dcl->getAliasDeclare() != NULL)
444 {
445 offset += dcl->getAliasOffset();
446 dcl = dcl->getAliasDeclare();
447 }
448 }
449 }
450
451 return offset;
452 }
453
checkIfRegionsAreConsecutive(G4_SrcRegRegion * first,G4_SrcRegRegion * second,G4_ExecSize execSize)454 bool IR_Builder::checkIfRegionsAreConsecutive(
455 G4_SrcRegRegion* first, G4_SrcRegRegion* second, G4_ExecSize execSize)
456 {
457 if (first == NULL || second == NULL)
458 {
459 return true;
460 }
461
462 return checkIfRegionsAreConsecutive(first, second, execSize, first->getType());
463 }
464
checkIfRegionsAreConsecutive(G4_SrcRegRegion * first,G4_SrcRegRegion * second,G4_ExecSize execSize,G4_Type type)465 bool IR_Builder::checkIfRegionsAreConsecutive(
466 G4_SrcRegRegion* first, G4_SrcRegRegion* second, G4_ExecSize execSize, G4_Type type)
467 {
468 bool isConsecutive = false;
469
470 if (first == NULL || second == NULL)
471 {
472 isConsecutive = true;
473 }
474 else
475 {
476 G4_Declare* firstDcl = getDeclare(first);
477 G4_Declare* secondDcl = getDeclare(second);
478
479 unsigned int firstOff = getByteOffsetSrcRegion(first);
480 unsigned int secondOff = getByteOffsetSrcRegion(second);
481
482 if (firstDcl == secondDcl)
483 {
484 if (firstOff + execSize * TypeSize(type) == secondOff)
485 {
486 isConsecutive = true;
487 }
488 }
489 }
490
491 return isConsecutive;
492 }
493
// Records the current CISA instruction offset as the debug-info placeholder.
// NOTE(review): assumes curCISAOffset is up to date at the call site —
// confirm against the translation loop that maintains it.
int IR_Builder::generateDebugInfoPlaceholder()
{
    debugInfoPlaceholder = curCISAOffset;
    return VISA_SUCCESS;
}
499
500
translateVISALifetimeInst(uint8_t properties,G4_Operand * var)501 int IR_Builder::translateVISALifetimeInst(uint8_t properties, G4_Operand* var)
502 {
503 // Lifetime.start/end are two variants of this instruction
504 createImm(properties & 0x1, Type_UB);
505
506 if ((properties & 0x1) == LIFETIME_START)
507 {
508 G4_DstRegRegion* varDstRgn = createDst(var->getBase(), 0, 0, 1, Type_UD);
509 createIntrinsicInst(
510 nullptr, Intrinsic::PseudoKill, g4::SIMD1,
511 varDstRgn, createImm((unsigned int)PseudoKillType::Src),
512 nullptr, nullptr, InstOpt_WriteEnable, true);
513 }
514 else
515 {
516 G4_SrcRegRegion* varSrcRgn = createSrc(var->getBase(), 0, 0, getRegionScalar(), Type_UD);
517 createIntrinsicInst(nullptr, Intrinsic::PseudoUse, g4::SIMD1, nullptr, varSrcRgn,
518 nullptr, nullptr, InstOpt_WriteEnable, true);
519 }
520
521 // We dont treat lifetime.end specially for now because lifetime.start
522 // is expected to halt propagation of liveness upwards. lifetime.start
523 // would prevent loop local variables/sub-rooutine local variables
524 // from being live across entire loop/sub-routine.
525
526 return VISA_SUCCESS;
527 }
528