1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "BuildIR.h"
10 #include "DebugInfo.h"
11 #include "G4_Kernel.hpp"
12 #include "G4_BB.hpp"
13 #include "VarSplit.h"
14 // #include "iga/IGALibrary/api/igaEncoderWrapper.hpp"
15 #include "iga/IGALibrary/api/kv.hpp"
16 #include "BinaryEncodingIGA.h"
17
18 #include <list>
19 #include <fstream>
20 #include <functional>
21 #include <iomanip>
22 #include <utility>
23
24 using namespace vISA;
25
markInsts()26 void gtPinData::markInsts()
27 {
28 // Take a snapshot of instructions in kernel.
29 for (auto bb : kernel.fg)
30 {
31 for (auto inst : *bb)
32 {
33 markedInsts.insert(inst);
34 }
35 }
36 }
37
removeUnmarkedInsts()38 void gtPinData::removeUnmarkedInsts()
39 {
40 if (!kernel.fg.getIsStackCallFunc() &&
41 !kernel.fg.getHasStackCalls())
42 {
43 // Marked instructions correspond to caller/callee save
44 // and FP/SP manipulation instructions.
45 return;
46 }
47
48 MUST_BE_TRUE(whichRAPass == ReRAPass,
49 "Unexpectedly removing unmarked instructions in first RA pass");
50 // Instructions not seen in "marked" snapshot will be removed by this function.
51 for (auto bb : kernel.fg)
52 {
53 for (auto it = bb->begin(), itEnd = bb->end();
54 it != itEnd;)
55 {
56 auto inst = (*it);
57
58 if (markedInsts.find(inst) == markedInsts.end())
59 {
60 it = bb->erase(it);
61 continue;
62 }
63 it++;
64 }
65 }
66 }
67
getFreeGRFInfo(unsigned & size)68 void* gtPinData::getFreeGRFInfo(unsigned& size)
69 {
70 // Here is agreed upon format for reporting free GRFs:
71 //struct freeBytes
72 //{
73 // unsigned short startByte;
74 // unsigned short numConsecutiveBytes;
75 //};
76
77 // Added magic 0xDEADD00D at start and
78 // magic 0xDEADBEEF at the end of buffer
79 // on request of gtpin team.
80 //
81 //struct freeGRFInfo
82 //{
83 // unsigned short numItems;
84 //
85 // freeBytes data[numItems];
86 //};
87 struct freeBytes
88 {
89 unsigned short startByte;
90 unsigned short numConsecutiveBytes;
91 };
92
93 struct freeGRFInfo
94 {
95 unsigned int magicStart;
96 unsigned int numItems;
97 };
98
99 // Compute free register information using vector for efficiency,
100 // then convert to POS for passing back to gtpin.
101 std::vector<std::pair<unsigned short, unsigned short>> vecFreeBytes;
102
103 for (auto byte : globalFreeRegs)
104 {
105 if (vecFreeBytes.size() > 0)
106 {
107 auto& lastFree = vecFreeBytes.back();
108 if (byte == (lastFree.first + lastFree.second))
109 {
110 lastFree.second += 1;
111 }
112 else
113 {
114 vecFreeBytes.push_back(std::make_pair(byte, 1));
115 }
116 }
117 else
118 {
119 vecFreeBytes.push_back(std::make_pair(byte, 1));
120 }
121 }
122
123 // Now convert vector to POS
124 unsigned int numItems = (unsigned int)vecFreeBytes.size();
125 freeGRFInfo* buffer = (freeGRFInfo*)malloc(numItems * sizeof(freeBytes) + sizeof(unsigned int)
126 + sizeof(unsigned int) + sizeof(unsigned int));
127 if (buffer)
128 {
129 buffer->numItems = numItems;
130 buffer->magicStart = 0xDEADD00D;
131 memcpy_s((unsigned char*)buffer + sizeof(unsigned int) + sizeof(unsigned int),
132 numItems * sizeof(freeBytes), vecFreeBytes.data(), numItems * sizeof(freeBytes));
133 unsigned int magicEnd = 0xDEADBEEF;
134 memcpy_s((unsigned char*)buffer + sizeof(unsigned int) + sizeof(unsigned int) + (numItems * sizeof(freeBytes)),
135 sizeof(magicEnd), &magicEnd, sizeof(magicEnd));
136
137 // numItems - unsigned int
138 // magicStart - unsigned int
139 // magicEnd - unsigned int
140 // data - numItems * sizeof(freeBytes)
141 size = sizeof(unsigned int) + sizeof(unsigned int) + sizeof(unsigned int) + (numItems * sizeof(freeBytes));
142 }
143
144 return buffer;
145 }
146
setGTPinInit(void * buffer)147 void gtPinData::setGTPinInit(void* buffer)
148 {
149 MUST_BE_TRUE(sizeof(gtpin::igc::igc_init_t) <= 200, "Check size of igc_init_t");
150 gtpin_init = (gtpin::igc::igc_init_t*)buffer;
151
152 if (gtpin_init->re_ra)
153 kernel.getOptions()->setOption(vISA_ReRAPostSchedule, true);
154 if (gtpin_init->grf_info)
155 kernel.getOptions()->setOption(vISA_GetFreeGRFInfo, true);
156 }
157
// Appends the first numBytes raw bytes of the object pointed to by t to
// buffer and advances bufferSize by the same amount.
template<typename T>
static void writeBuffer(
    std::vector<unsigned char>& buffer,
    unsigned& bufferSize,
    const T* t,
    unsigned numBytes)
{
    const unsigned char* first = reinterpret_cast<const unsigned char*>(t);
    buffer.insert(buffer.end(), first, first + numBytes);
    bufferSize += numBytes;
}
172
getGTPinInfoBuffer(unsigned & bufferSize)173 void* gtPinData::getGTPinInfoBuffer(unsigned &bufferSize)
174 {
175 if (!gtpin_init && !gtpinInitFromL0)
176 {
177 bufferSize = 0;
178 return nullptr;
179 }
180 gtpin::igc::igc_init_t t;
181 std::vector<unsigned char> buffer;
182 unsigned numTokens = 0;
183 auto stackABI = kernel.fg.getIsStackCallFunc() || kernel.fg.getHasStackCalls();
184 bufferSize = 0;
185
186 memset(&t, 0, sizeof(t));
187
188 t.version = gtpin::igc::GTPIN_IGC_INTERFACE_VERSION;
189 t.igc_init_size = sizeof(t);
190 if (gtpinInitFromL0)
191 {
192 if (!stackABI)
193 {
194 if (kernel.getOption(vISA_GetFreeGRFInfo))
195 {
196 t.grf_info = 1;
197 numTokens++;
198 }
199
200 if (kernel.getOption(vISA_GTPinReRA))
201 {
202 t.re_ra = 1;
203 }
204 }
205
206 if (kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
207 t.srcline_mapping = 1;
208
209 if (kernel.getOptions()->getuInt32Option(vISA_GTPinScratchAreaSize) > 0)
210 {
211 t.scratch_area_size = getNumBytesScratchUse();
212 numTokens++;
213 }
214 }
215 else
216 {
217 t.version = std::min(gtpin_init->version, gtpin::igc::GTPIN_IGC_INTERFACE_VERSION);
218 if (!stackABI)
219 {
220 if (gtpin_init->grf_info)
221 {
222 t.grf_info = 1;
223 numTokens++;
224 }
225
226 if (gtpin_init->re_ra)
227 {
228 t.re_ra = 1;
229 }
230 }
231
232 if (gtpin_init->srcline_mapping && kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
233 t.srcline_mapping = 1;
234
235 if (gtpin_init->scratch_area_size > 0)
236 {
237 t.scratch_area_size = gtpin_init->scratch_area_size;
238 numTokens++;
239 }
240 }
241
242 // For payload offsets
243 numTokens++;
244
245 // Report #GRFs
246 numTokens++;
247
248 writeBuffer(buffer, bufferSize, &t, sizeof(t));
249 writeBuffer(buffer, bufferSize, &numTokens, sizeof(uint32_t));
250
251 if (t.grf_info)
252 {
253 // create token
254 void* rerabuffer = nullptr;
255 unsigned rerasize = 0;
256
257 rerabuffer = getFreeGRFInfo(rerasize);
258
259 gtpin::igc::igc_token_header_t th;
260 th.token = gtpin::igc::GTPIN_IGC_TOKEN::GTPIN_IGC_TOKEN_GRF_INFO;
261 th.token_size = sizeof(gtpin::igc::igc_token_header_t) + rerasize;
262
263 // write token and data to buffer
264 writeBuffer(buffer, bufferSize, &th, sizeof(th));
265 writeBuffer(buffer, bufferSize, rerabuffer, rerasize);
266
267 free(rerabuffer);
268 }
269
270 if (t.scratch_area_size)
271 {
272 gtpin::igc::igc_token_scratch_area_info_t scratchSlotData;
273 scratchSlotData.scratch_area_size = t.scratch_area_size;
274 scratchSlotData.scratch_area_offset = nextScratchFree;
275
276 // gtpin scratch slots are beyond spill memory
277 scratchSlotData.token = gtpin::igc::GTPIN_IGC_TOKEN_SCRATCH_AREA_INFO;
278 scratchSlotData.token_size = sizeof(scratchSlotData);
279
280 writeBuffer(buffer, bufferSize, &scratchSlotData, sizeof(scratchSlotData));
281 }
282
283 {
284 // Write payload offsets
285 gtpin::igc::igc_token_kernel_start_info_t offsets;
286 offsets.token = gtpin::igc::GTPIN_IGC_TOKEN_KERNEL_START_INFO;
287 offsets.per_thread_prolog_size = kernel.getPerThreadNextOff();
288 offsets.cross_thread_prolog_size = kernel.getCrossThreadNextOff() - offsets.per_thread_prolog_size;
289 offsets.token_size = sizeof(offsets);
290 writeBuffer(buffer, bufferSize, &offsets, sizeof(offsets));
291 }
292
293 {
294 // Report num GRFs
295 gtpin::igc::igc_token_num_grf_regs_t numGRFs;
296 numGRFs.token = gtpin::igc::GTPIN_IGC_TOKEN_NUM_GRF_REGS;
297 numGRFs.token_size = sizeof(numGRFs);
298 numGRFs.num_grf_regs = kernel.getNumRegTotal();
299 writeBuffer(buffer, bufferSize, &numGRFs, sizeof(numGRFs));
300 }
301
302 void* gtpinBuffer = allocCodeBlock(bufferSize);
303
304 memcpy_s(gtpinBuffer, bufferSize, buffer.data(), bufferSize);
305
306 // Dump buffer with shader dumps
307 if (kernel.getOption(vISA_outputToFile))
308 {
309 auto asmName = kernel.getOptions()->getOptionCstr(VISA_AsmFileName);
310 if (asmName)
311 {
312 std::ofstream ofInit;
313 std::stringstream ssInit;
314 ssInit << std::string(asmName) << ".gtpin_igc_init";
315 ofInit.open(ssInit.str(), std::ofstream::binary);
316 if (gtpin_init)
317 {
318 ofInit.write((const char*)gtpin_init, sizeof(*gtpin_init));
319 }
320 ofInit.close();
321
322 std::ofstream ofInfo;
323 std::stringstream ssInfo;
324 ssInfo << std::string(asmName) << ".gtpin_igc_info";
325 ofInfo.open(ssInfo.str(), std::ofstream::binary);
326 if (gtpinBuffer)
327 {
328 ofInfo.write((const char*)gtpinBuffer, bufferSize);
329 }
330 ofInfo.close();
331 }
332 }
333
334 return gtpinBuffer;
335 }
336
getNumBytesScratchUse() const337 uint32_t gtPinData::getNumBytesScratchUse() const
338 {
339 if (gtpin_init)
340 {
341 return gtpin_init->scratch_area_size;
342 }
343 else if (isGTPinInitFromL0())
344 {
345 return kernel.getOptions()->getuInt32Option(vISA_GTPinScratchAreaSize);
346 }
347 return 0;
348 }
349
350
G4_Kernel(INST_LIST_NODE_ALLOCATOR & alloc,Mem_Manager & m,Options * options,Attributes * anAttr,unsigned char major,unsigned char minor)351 G4_Kernel::G4_Kernel(INST_LIST_NODE_ALLOCATOR& alloc,
352 Mem_Manager& m, Options* options, Attributes* anAttr,
353 unsigned char major, unsigned char minor)
354 : m_options(options), m_kernelAttrs(anAttr), RAType(RA_Type::UNKNOWN_RA),
355 asmInstCount(0), kernelID(0), fg(alloc, this, m),
356 major_version(major), minor_version(minor)
357 {
358 ASSERT_USER(
359 major < COMMON_ISA_MAJOR_VER ||
360 (major == COMMON_ISA_MAJOR_VER && minor <= COMMON_ISA_MINOR_VER),
361 "CISA version not supported by this JIT-compiler");
362
363
364 name = NULL;
365 numThreads = 0;
366 hasAddrTaken = false;
367 kernelDbgInfo = nullptr;
368 sharedDebugInfo = false;
369 sharedGTPinInfo = false;
370 if (options->getOption(vISAOptions::vISA_ReRAPostSchedule) ||
371 options->getOption(vISAOptions::vISA_GetFreeGRFInfo) ||
372 options->getuInt32Option(vISAOptions::vISA_GTPinScratchAreaSize))
373 {
374 allocGTPinData();
375 } else {
376 gtPinInfo = nullptr;
377 }
378
379 setKernelParameters();
380 }
381
~G4_Kernel()382 G4_Kernel::~G4_Kernel()
383 {
384 if (kernelDbgInfo && !sharedDebugInfo)
385 {
386 kernelDbgInfo->~KernelDebugInfo();
387 }
388
389 if (gtPinInfo && !sharedGTPinInfo)
390 {
391 gtPinInfo->~gtPinData();
392 }
393
394 if (varSplitPass)
395 {
396 delete varSplitPass;
397 varSplitPass = nullptr;
398 }
399
400 Declares.clear();
401 }
402
setKernelDebugInfo(KernelDebugInfo * k)403 void G4_Kernel::setKernelDebugInfo(KernelDebugInfo* k)
404 {
405 assert(k);
406 if (kernelDbgInfo)
407 {
408 kernelDbgInfo->~KernelDebugInfo();
409 }
410 kernelDbgInfo = k;
411 sharedDebugInfo = true;
412 }
413
setGTPinData(gtPinData * p)414 void G4_Kernel::setGTPinData(gtPinData* p) {
415 assert(p);
416 if (gtPinInfo == nullptr)
417 {
418 gtPinInfo->~gtPinData();
419 }
420 gtPinInfo = p;
421 sharedGTPinInfo = true;
422 }
423
computeChannelSlicing()424 void G4_Kernel::computeChannelSlicing()
425 {
426 G4_ExecSize simdSize = getSimdSize();
427 channelSliced = true;
428
429 if (simdSize == g4::SIMD8 || simdSize == g4::SIMD16)
430 {
431 // SIMD8/16 kernels are not sliced
432 channelSliced = false;
433 return;
434 }
435
436 // .dcl V1 size = 128 bytes
437 // op (16|M0) V1(0,0) ..
438 // op (16|M16) V1(2,0) ..
439 // For above sequence, return 32. Instruction
440 // is broken in to 2 only due to hw restriction.
441 // Allocation of dcl is still as if it were a
442 // SIMD32 kernel.
443
444 // Store emask bits that are ever used to define a variable
445 std::unordered_map<G4_Declare*, std::bitset<32>> emaskRef;
446 for (auto bb : fg)
447 {
448 for (auto inst : *bb)
449 {
450 if (inst->isSend())
451 continue;
452
453 auto dst = inst->getDst();
454 if (!dst || !dst->getTopDcl() ||
455 dst->getHorzStride() != 1)
456 continue;
457
458 if (inst->isWriteEnableInst())
459 continue;
460
461 auto regFileKind = dst->getTopDcl()->getRegFile();
462 if (regFileKind != G4_RegFileKind::G4_GRF && regFileKind != G4_RegFileKind::G4_INPUT)
463 continue;
464
465 if (dst->getTopDcl()->getByteSize() <= dst->getTypeSize() * (unsigned)simdSize)
466 continue;
467
468 auto emaskOffStart = inst->getMaskOffset();
469
470 // Reset all bits on first encounter of dcl
471 if (emaskRef.find(dst->getTopDcl()) == emaskRef.end())
472 emaskRef[dst->getTopDcl()].reset();
473
474 // Set bits based on which EM bits are used in the def
475 for (unsigned i = emaskOffStart; i != (emaskOffStart + inst->getExecSize()); i++)
476 {
477 emaskRef[dst->getTopDcl()].set(i);
478 }
479 }
480 }
481
482 // Check whether any variable's emask usage straddles across lower and upper 16 bits
483 for (auto& emRefs : emaskRef)
484 {
485 auto& bits = emRefs.second;
486 auto num = bits.to_ulong();
487
488 // Check whether any lower 16 and upper 16 bits are set
489 if (((num & 0xffff) != 0) && ((num & 0xffff0000) != 0))
490 {
491 channelSliced = false;
492 return;
493 }
494 }
495
496 return;
497 }
498
calculateSimdSize()499 void G4_Kernel::calculateSimdSize()
500 {
501 // Iterate over all instructions in kernel to check
502 // whether default execution size of kernel is
503 // SIMD8/16. This is required for knowing alignment
504 // to use for GRF candidates.
505
506 // only do it once per kernel, as we should not introduce inst with larger simd size than in the input
507 if (simdSize.value != 0)
508 {
509 return;
510 }
511
512 // First, get simdsize from attribute (0 : not given)
513 // If not 0|8|16|32, wrong value from attribute.
514 simdSize = G4_ExecSize((unsigned)m_kernelAttrs->getInt32KernelAttr(Attributes::ATTR_SimdSize));
515 if (simdSize != g4::SIMD8 && simdSize != g4::SIMD16 && simdSize != g4::SIMD32)
516 {
517 assert(simdSize.value == 0 && "vISA: wrong value for SimdSize attribute");
518 simdSize = g4::SIMD8;
519
520 for (auto bb : fg)
521 {
522 for (auto inst : *bb)
523 {
524 // do not consider send since for certain messages we have to set its execution size
525 // to 16 even in simd8 shaders
526 if (!inst->isLabel() && !inst->isSend())
527 {
528 uint32_t size = inst->getMaskOffset() + inst->getExecSize();
529 if (size > 16)
530 {
531 simdSize = g4::SIMD32;
532 break;
533 }
534 else if (size > 8)
535 {
536 simdSize = g4::SIMD16;
537 }
538 }
539 }
540 if (simdSize == g4::SIMD32)
541 break;
542 }
543 }
544
545 if (GlobalRA::useGenericAugAlign())
546 computeChannelSlicing();
547 }
548
549 //
550 // Updates kernel's related structures based on number of threads.
551 //
updateKernelByNumThreads(int nThreads)552 void G4_Kernel::updateKernelByNumThreads(int nThreads)
553 {
554 if (numThreads == nThreads)
555 return;
556
557 numThreads = nThreads;
558
559 // Scale number of GRFs, Acc, SWSB tokens.
560 setKernelParameters();
561
562 // Update physical register pool
563 fg.builder->rebuildPhyRegPool(getNumRegTotal());
564 }
565
566 //
567 // Evaluate AddrExp/AddrExpList to Imm
568 //
evalAddrExp()569 void G4_Kernel::evalAddrExp()
570 {
571 for (std::list<G4_BB*>::iterator it = fg.begin(), itEnd = fg.end();
572 it != itEnd; ++it)
573 {
574 G4_BB* bb = (*it);
575
576 for (INST_LIST_ITER i = bb->begin(), iEnd = bb->end(); i != iEnd; i++)
577 {
578 G4_INST* inst = (*i);
579
580 //
581 // process each source operand
582 //
583 for (unsigned j = 0; j < G4_MAX_SRCS; j++)
584 {
585 G4_Operand* opnd = inst->getSrc(j);
586
587 if (opnd == NULL) continue;
588
589 if (opnd->isAddrExp())
590 {
591 int val = opnd->asAddrExp()->eval();
592 G4_Type ty = opnd->asAddrExp()->getType();
593
594 G4_Imm* imm = fg.builder->createImm(val, ty);
595 inst->setSrc(imm, j);
596 }
597 }
598 }
599 }
600 }
601
// FIX: this needs to be here because of the above static thread-local variable.
// Thread-local state declared here and defined in another translation unit;
// emitDeviceAsm() resets both before emitting a kernel.
extern _THREAD const char* g4_prevFilename;
extern _THREAD int g4_prevSrcLineNo;
605
// Splits str at any character contained in delimiter and returns the
// non-empty pieces in order. Consecutive, leading and trailing delimiters
// produce no empty strings.
static std::vector<std::string> split(
    const std::string & str, const char * delimiter)
{
    std::vector<std::string> tokens;
    std::string::size_type tokenStart = 0;

    while (true)
    {
        const std::string::size_type delimPos = str.find_first_of(delimiter, tokenStart);
        if (delimPos == std::string::npos)
        {
            break;
        }

        if (delimPos > tokenStart)
        {
            tokens.push_back(str.substr(tokenStart, delimPos - tokenStart));
        }
        tokenStart = delimPos + 1;
    }

    // Whatever follows the last delimiter is the final token.
    if (tokenStart < str.length())
    {
        tokens.push_back(str.substr(tokenStart));
    }
    return tokens;
}
626
getIGAPlatform()627 static iga_gen_t getIGAPlatform()
628 {
629 iga_gen_t platform = IGA_GEN_INVALID;
630 switch (getGenxPlatform())
631 {
632 case GENX_BDW: platform = IGA_GEN8; break;
633 case GENX_CHV: platform = IGA_GEN8lp; break;
634 case GENX_SKL: platform = IGA_GEN9; break;
635 case GENX_BXT: platform = IGA_GEN9lp; break;
636 case GENX_ICLLP: platform = IGA_GEN11; break;
637 case GENX_TGLLP:platform = IGA_GEN12p1; break;
638 case XeHP_SDV: platform = IGA_XE_HP; break;
639 case GENX_DG2:
640 platform = IGA_XE_HPG;
641 break;
642 case GENX_PVC:
643 case GENX_PVCXT:
644 platform = IGA_XE_HPC;
645 break;
646 default:
647 break;
648 }
649
650 return platform;
651 }
652
getKernelDebugInfo()653 KernelDebugInfo* G4_Kernel::getKernelDebugInfo()
654 {
655 if (kernelDbgInfo == nullptr)
656 {
657 kernelDbgInfo = new(fg.mem)KernelDebugInfo();
658 }
659
660 return kernelDbgInfo;
661 }
662
getStackCallStartReg() const663 unsigned G4_Kernel::getStackCallStartReg() const
664 {
665 // Last 3 GRFs to be used as scratch
666 unsigned totalGRFs = getNumRegTotal();
667 unsigned startReg = totalGRFs - numReservedABIGRF();
668 return startReg;
669 }
calleeSaveStart() const670 unsigned G4_Kernel::calleeSaveStart() const
671 {
672 return getCallerSaveLastGRF() + 1;
673 }
getNumCalleeSaveRegs() const674 unsigned G4_Kernel::getNumCalleeSaveRegs() const
675 {
676 unsigned totalGRFs = getNumRegTotal();
677 return totalGRFs - calleeSaveStart() - numReservedABIGRF();
678 }
679
680 //
681 // rename non-root declares to their root decl name to make
682 // it easier to read IR dump
683 //
renameAliasDeclares()684 void G4_Kernel::renameAliasDeclares()
685 {
686 #if _DEBUG
687 for (auto dcl : Declares)
688 {
689 if (dcl->getAliasDeclare())
690 {
691 uint32_t offset = 0;
692 G4_Declare* rootDcl = dcl->getRootDeclare(offset);
693 std::string newName(rootDcl->getName());
694 if (rootDcl->getElemType() != dcl->getElemType())
695 {
696 newName += "_";
697 newName += TypeSymbol(dcl->getElemType());
698 }
699 if (offset != 0)
700 {
701 newName += "_" + std::to_string(offset);
702 }
703 dcl->setName(fg.builder->getNameString(fg.mem, 64, "%s", newName.c_str()));
704 }
705 }
706 #endif
707 }
708
709 //
710 // perform relocation for every entry in the allocation table
711 //
doRelocation(void * binary,uint32_t binarySize)712 void G4_Kernel::doRelocation(void* binary, uint32_t binarySize)
713 {
714 for (auto&& entry : relocationTable)
715 {
716 entry.doRelocation(*this, binary, binarySize);
717 }
718 }
719
getFirstNonLabelInst() const720 G4_INST* G4_Kernel::getFirstNonLabelInst() const
721 {
722 for (auto I = fg.cbegin(), E = fg.cend(); I != E; ++I)
723 {
724 auto bb = *I;
725 G4_INST* firstInst = bb->getFirstInst();
726 if (firstInst)
727 {
728 return firstInst;
729 }
730 }
731 // empty kernel
732 return nullptr;
733 }
734
getDebugSrcLine(const std::string & fileName,int srcLine)735 std::string G4_Kernel::getDebugSrcLine(const std::string& fileName, int srcLine)
736 {
737 auto iter = debugSrcLineMap.find(fileName);
738 if (iter == debugSrcLineMap.end())
739 {
740 std::ifstream ifs(fileName);
741 if (!ifs)
742 {
743 // file doesn't exist
744 debugSrcLineMap[fileName] = std::make_pair<bool, std::vector<std::string>>(false, {});
745 return "";
746 }
747 std::string line;
748 std::vector<std::string> srcLines;
749 while (std::getline(ifs, line))
750 {
751 srcLines.push_back(line);
752 }
753 debugSrcLineMap[fileName] = std::make_pair(true, std::move(srcLines));
754 }
755 iter = debugSrcLineMap.find(fileName);
756 if (iter == debugSrcLineMap.end() ||
757 !iter->second.first)
758 {
759 return "";
760 }
761 auto& lines = iter->second.second;
762 if (srcLine > (int) lines.size() || srcLine <= 0)
763 {
764 return "invalid line number";
765 }
766 return lines[srcLine - 1];
767 }
768
getVarSplitPass()769 VarSplitPass* G4_Kernel::getVarSplitPass()
770 {
771 if (varSplitPass)
772 return varSplitPass;
773
774 varSplitPass = new VarSplitPass(*this);
775
776 return varSplitPass;
777 }
778
setKernelParameters()779 void G4_Kernel::setKernelParameters()
780 {
781 unsigned overrideGRFNum = 0;
782 unsigned overrideNumThreads = 0;
783
784 TARGET_PLATFORM platform = getGenxPlatform();
785 overrideGRFNum = m_options->getuInt32Option(vISA_TotalGRFNum);
786
787 overrideNumThreads = m_options->getuInt32Option(vISA_HWThreadNumberPerEU);
788
789 //
790 // Number of threads/GRF can currently be set by:
791 // 1.- IGC flag (reg key)
792 // 2.- Compiler option entered by user for
793 // 2.1 entire module
794 // 2.2 kernel function
795 // 3.- Compiler heuristics
796 //
797 if (m_options->getuInt32Option(vISA_ForceHWThreadNumberPerEU))
798 {
799 numThreads = m_options->getuInt32Option(vISA_ForceHWThreadNumberPerEU);
800 }
801 regSharingHeuristics = m_options->getOption(vISA_RegSharingHeuristics);
802 if (overrideNumThreads || regSharingHeuristics)
803 {
804 overrideGRFNum = 0;
805 if (numThreads > 0)
806 {
807 overrideNumThreads = numThreads;
808 }
809 }
810
811 // Set the number of GRFs
812 if (overrideGRFNum > 0)
813 {
814 // User-provided number of GRFs
815 unsigned Val = m_options->getuInt32Option(vISA_GRFNumToUse);
816 if (Val > 0)
817 {
818 numRegTotal = std::min(Val, overrideGRFNum);
819 }
820 else
821 {
822 numRegTotal = overrideGRFNum;
823 }
824 callerSaveLastGRF = ((overrideGRFNum - 8) / 2) - 1;
825 }
826 else if (overrideNumThreads > 0)
827 {
828 switch (platform)
829 {
830 case XeHP_SDV:
831 case GENX_DG2:
832 switch (overrideNumThreads)
833 {
834 case 4:
835 numRegTotal = 256;
836 break;
837 default:
838 numRegTotal = 128;
839 }
840 break;
841 case GENX_PVC:
842 case GENX_PVCXT:
843 switch (overrideNumThreads)
844 {
845 case 4:
846 numRegTotal = 256;
847 break;
848 case 5:
849 numRegTotal = 192;
850 break;
851 case 6:
852 numRegTotal = 160;
853 break;
854 case 8:
855 numRegTotal = 128;
856 break;
857 case 10:
858 numRegTotal = 96;
859 break;
860 case 12:
861 numRegTotal = 64;
862 break;
863 default:
864 numRegTotal = 128;
865 }
866 break;
867 default:
868 numRegTotal = 128;
869 }
870 callerSaveLastGRF = ((numRegTotal - 8) / 2) - 1;
871 }
872 else
873 {
874 // Default value for all other platforms
875 unsigned Val = m_options->getuInt32Option(vISA_GRFNumToUse);
876 numRegTotal = Val ? Val : 128;
877 callerSaveLastGRF = ((numRegTotal - 8) / 2) - 1;
878 }
879 // For safety update TotalGRFNum, there may be some uses for this vISA option
880 m_options->setOption(vISA_TotalGRFNum, numRegTotal);
881
882 // Set the number of SWSB tokens
883 unsigned overrideNumSWSB = m_options->getuInt32Option(vISA_SWSBTokenNum);
884 if (overrideNumSWSB > 0)
885 {
886 // User-provided number of SWSB tokens
887 numSWSBTokens = overrideNumSWSB;
888 }
889 else if (overrideNumThreads > 0)
890 {
891 switch (platform)
892 {
893 case GENX_PVC:
894 case GENX_PVCXT:
895 switch (overrideNumThreads)
896 {
897 case 4:
898 numSWSBTokens = 32;
899 break;
900 case 5:
901 numSWSBTokens = 24;
902 break;
903 case 6:
904 numSWSBTokens = 20;
905 break;
906 case 8:
907 numSWSBTokens = 16;
908 break;
909 case 10:
910 numSWSBTokens = 12;
911 break;
912 case 12:
913 numSWSBTokens = 8;
914 break;
915 default:
916 numSWSBTokens = 16;
917 }
918 break;
919 default:
920 numSWSBTokens = 16;
921 }
922 }
923 else
924 {
925 // Default value based on platform
926 switch (platform)
927 {
928 case GENX_PVC:
929 case GENX_PVCXT:
930 numSWSBTokens = 16;
931 if (numRegTotal == 256)
932 {
933 numSWSBTokens *= 2;
934 }
935 break;
936 default:
937 numSWSBTokens = 16;
938 }
939 }
940
941
942 // Set the number of Acc. They are in the unit of GRFs (i.e., 1 accumulator is the same size as 1 GRF)
943 unsigned overrideNumAcc = m_options->getuInt32Option(vISA_numGeneralAcc);
944 if (overrideNumAcc > 0)
945 {
946 // User-provided number of Acc
947 numAcc = overrideNumAcc;
948 }
949 else if (overrideNumThreads > 0)
950 {
951 switch (platform)
952 {
953 case XeHP_SDV:
954 case GENX_DG2:
955 switch (overrideNumThreads)
956 {
957 case 4:
958 numAcc = 8;
959 break;
960 default:
961 numAcc = 4;
962 }
963 break;
964 case GENX_PVC:
965 case GENX_PVCXT:
966 switch (overrideNumThreads)
967 {
968 case 4:
969 numAcc = 8;
970 break;
971 case 5:
972 numAcc = 6;
973 break;
974 case 6:
975 case 8:
976 numAcc = 4;
977 break;
978 case 10:
979 case 12:
980 numAcc = 2;
981 break;
982 default:
983 numAcc = 8;
984 }
985 break;
986 default:
987 numAcc = 4;
988 }
989 }
990 else
991 {
992 // Default value based on platform
993 switch (platform)
994 {
995 case XeHP_SDV:
996 case GENX_DG2:
997 case GENX_PVC:
998 case GENX_PVCXT:
999 numAcc = 4;
1000 if (numRegTotal == 256)
1001 {
1002 numAcc *= 2;
1003 }
1004 break;
1005 default:
1006 numAcc = 2;
1007 }
1008 }
1009
1010 // Set number of threads if it was not defined before
1011 if (numThreads == 0)
1012 {
1013 if (overrideNumThreads > 0)
1014 {
1015 numThreads = overrideNumThreads;
1016 }
1017 else
1018 {
1019 switch (platform)
1020 {
1021 case XeHP_SDV:
1022 case GENX_DG2:
1023 switch (numRegTotal)
1024 {
1025 case 256:
1026 numThreads = 4;
1027 break;
1028 default:
1029 numThreads = 8;
1030 }
1031 break;
1032 case GENX_PVC:
1033 case GENX_PVCXT:
1034 switch (numRegTotal)
1035 {
1036 case 256:
1037 numThreads = 4;
1038 break;
1039 case 192:
1040 numThreads = 5;
1041 break;
1042 case 160:
1043 numThreads = 6;
1044 break;
1045 case 128:
1046 numThreads = 8;
1047 break;
1048 case 96:
1049 numThreads = 10;
1050 break;
1051 case 64:
1052 numThreads = 12;
1053 break;
1054 default:
1055 numThreads = 8;
1056 }
1057 break;
1058 default:
1059 numThreads = 7;
1060 }
1061 }
1062 }
1063
1064 if (m_options->getOption(vISA_hasDoubleAcc))
1065 {
1066 numAcc = 16;
1067 }
1068 }
1069
dump(std::ostream & os) const1070 void G4_Kernel::dump(std::ostream &os) const
1071 {
1072 fg.print(os);
1073 }
1074
dumpToFile(const std::string & suffixIn)1075 void G4_Kernel::dumpToFile(const std::string &suffixIn)
1076 {
1077 bool dumpDot = m_options->getOption(vISA_DumpDot);
1078 bool dumpG4 =
1079 m_options->getOption(vISA_DumpPasses) ||
1080 m_options->getuInt32Option(vISA_DumpPassesSubset) >= 1;
1081 if (!dumpDot && !dumpG4)
1082 return;
1083
1084 // calls to this will produce a sequence of dumps
1085 // [kernel-name].000.[suffix].{dot,g4}
1086 // [kernel-name].001.[suffix].{dot,g4}
1087 // ...
1088 // If vISA_DumpPassesSubset == 1 then we omit any files that don't change
1089 // the string representation of the kernel (i.e. skip passes that don't do anything).
1090 std::stringstream ss;
1091 ss << (name ? name : "UnknownKernel");
1092 ss << "." << std::setfill('0') << std::setw(3) << nextDumpIndex++ << "." << suffixIn;
1093 std::string baseName = sanitizePathString(ss.str());
1094
1095 if (dumpDot)
1096 dumpDotFileInternal(baseName);
1097
1098 if (dumpG4)
1099 dumpG4Internal(baseName);
1100 }
1101
emitDeviceAsm(std::ostream & os,const void * binary,uint32_t binarySize)1102 void G4_Kernel::emitDeviceAsm(
1103 std::ostream& os, const void * binary, uint32_t binarySize)
1104 {
1105 //
1106 // for GTGPU lib release, don't dump out asm
1107 //
1108 #ifdef NDEBUG
1109 #ifdef GTGPU_LIB
1110 return;
1111 #endif
1112 #endif
1113 const bool newAsm =
1114 m_options->getOption(vISA_dumpNewSyntax) && !(binary == NULL || binarySize == 0);
1115
1116 if (!m_options->getOption(vISA_StripComments)) {
1117 emitDeviceAsmHeaderComment(os);
1118 }
1119
1120 // Set this to NULL to always print filename for each kernel
1121 g4_prevFilename = nullptr;
1122 g4_prevSrcLineNo = 0;
1123
1124 if (!newAsm) {
1125 emitDeviceAsmInstructionsOldAsm(os);
1126 return;
1127 }
1128
1129 emitDeviceAsmInstructionsIga(os, binary, binarySize);
1130
1131 if (getPlatformGeneration(getGenxPlatform()) >= PlatformGen::XE) {
1132 os << "\n\n";
1133 os << "//.BankConflicts: " << fg.XeBCStats.BCNum << "\n";
1134 os << "//.BankConflicts.SameBank: " << fg.XeBCStats.sameBankConflicts << "\n";
1135 os << "//.BankConflicts.TwoSrc: " << fg.XeBCStats.twoSrcBC << "\n";
1136 int nativeSimdSize = 8;
1137 if (getGenxPlatform() >= GENX_PVC)
1138 nativeSimdSize = 16;
1139 os << "//.SIMD" << 2*nativeSimdSize << "ReadSuppressions: " << fg.XeBCStats.simd16ReadSuppression << "\n";
1140 os << "//.SIMD" << nativeSimdSize << "s: " << fg.XeBCStats.simd8 << "\n//\n";
1141 os << "//.RMWs: " << fg.numRMWs << "\n//\n";
1142 }
1143 else
1144 {
1145 os << "// Bank Conflict Statistics: \n";
1146 os << "// -- GOOD: " << fg.BCStats.NumOfGoodInsts << "\n";
1147 os << "// -- BAD: " << fg.BCStats.NumOfBadInsts << "\n";
1148 os << "// -- OK: " << fg.BCStats.NumOfOKInsts << "\n";
1149 }
1150 }
1151
emitRegInfo()1152 void G4_Kernel::emitRegInfo()
1153 {
1154 const char* asmName = nullptr;
1155 getOptions()->getOption(VISA_AsmFileName, asmName);
1156 const char* asmNameEmpty = "";
1157 if (!asmName)
1158 {
1159 asmName = asmNameEmpty;
1160 }
1161
1162 std::string dumpFileName = std::string(asmName) + ".reginfo";
1163 std::fstream ofile(dumpFileName, std::ios::out);
1164
1165 emitRegInfoKernel(ofile);
1166
1167 ofile.close();
1168 }
1169
emitRegInfoKernel(std::ostream & output)1170 void G4_Kernel::emitRegInfoKernel(std::ostream& output)
1171 {
1172 output << "//.platform " << getGenxPlatformString(fg.builder->getPlatform());
1173 output << "\n" << "//.kernel ID 0x" << std::hex << getKernelID() << "\n";
1174 output << std::dec << "\n";
1175 int instOffset = 0;
1176
1177 for (BB_LIST_ITER itBB = fg.begin(); itBB != fg.end(); ++itBB)
1178 {
1179 for (INST_LIST_ITER itInst = (*itBB)->begin(); itInst != (*itBB)->end(); ++itInst)
1180 {
1181 G4_INST* inst = (*itInst);
1182 if (inst->isLabel())
1183 {
1184 continue;
1185 }
1186 if (inst->getLexicalId() == -1)
1187 {
1188 continue;
1189 }
1190
1191 (*itBB)->emitRegInfo(output, inst, instOffset);
1192 instOffset += inst->isCompactedInst() ? 8 : 16;
1193 }
1194 }
1195 return;
1196 }
1197
//
// This routine dumps out the dot file of the control flow graph along with instructions.
// dot is a graph-drawing tool from AT&T (part of Graphviz).
//
dumpDotFileInternal(const std::string & baseName)1202 void G4_Kernel::dumpDotFileInternal(const std::string &baseName)
1203 {
1204 std::fstream ofile(baseName + ".dot", std::ios::out);
1205 assert(ofile);
1206 //
1207 // write digraph KernelName {"
1208 // size = "8, 10";
1209 //
1210 const char* asmFileName = NULL;
1211 m_options->getOption(VISA_AsmFileName, asmFileName);
1212 if (asmFileName == NULL)
1213 ofile << "digraph UnknownKernel" << " {" << std::endl;
1214 else
1215 ofile << "digraph " << asmFileName << " {" << std::endl;
1216 //
1217 // keep the graph width 8, estimate a reasonable graph height
1218 //
1219 const unsigned itemPerPage = 64; // 60 instructions per Letter page
1220 unsigned totalItem = (unsigned)Declares.size();
1221 for (std::list<G4_BB*>::iterator it = fg.begin(); it != fg.end(); ++it)
1222 totalItem += ((unsigned)(*it)->size());
1223 totalItem += (unsigned)fg.size();
1224 float graphHeight = (float)totalItem / itemPerPage;
1225 graphHeight = graphHeight < 100.0f ? 100.0f : graphHeight; // minimal size: Letter
1226 ofile << "\n\t// Setup\n";
1227 ofile << "\tsize = \"80.0, " << graphHeight << "\";\n";
1228 ofile << "\tpage= \"80.5, 110\";\n";
1229 ofile << "\tpagedir=\"TL\";\n";
1230 //
1231 // dump out declare information
1232 // Declare [label="
1233 //
1234 //if (name == NULL)
1235 // ofile << "\tDeclares [shape=record, label=\"{kernel:UnknownKernel" << " | ";
1236 //else
1237 // ofile << "\tDeclares [shape=record, label=\"{kernel:" << name << " | ";
1238 //for (std::list<G4_Declare*>::iterator it = Declares.begin(); it != Declares.end(); ++it)
1239 //{
1240 // (*it)->emit(ofile, true, Options::symbolReg); // Solve the DumpDot error on representing <>
1241 //
1242 // ofile << "\\l"; // left adjusted
1243 //}
1244 //ofile << "}\"];" << std::endl;
1245 //
1246 // dump out flow graph
1247 //
1248 for (std::list<G4_BB*>::iterator it = fg.begin(); it != fg.end(); ++it)
1249 {
1250 G4_BB* bb = (*it);
1251 //
1252 // write: BB0 [shape=plaintext, label=<
1253 // <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
1254 // <TR><TD ALIGN="CENTER">BB0: TestRA_Dot</TD></TR>
1255 // <TR><TD>
1256 // <TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0">
1257 // <TR><TD ALIGN="LEFT">TestRA_Dot:</TD></TR>
1258 // <TR><TD ALIGN="LEFT"><FONT color="red">add (8) Region(0,0)[1] Region(0,0)[8;8,1] PAYLOAD(0,0)[8;8,1] [NoMask]</FONT></TD></TR>
1259 // </TABLE>
1260 // </TD></TR>
1261 // </TABLE>>];
1262 // print out label if the first inst is a label inst
1263 //
1264 ofile << "\t";
1265 bb->writeBBId(ofile);
1266 ofile << " [shape=plaintext, label=<" << std::endl;
1267 ofile << "\t\t\t <TABLE BORDER=\"0\" CELLBORDER=\"1\" CELLSPACING=\"0\">" << std::endl;
1268 ofile << "\t\t\t\t<TR><TD ALIGN=\"CENTER\">";
1269 bb->writeBBId(ofile);
1270 ofile << ": ";
1271
1272 if (!bb->empty() && bb->front()->isLabel())
1273 {
1274 bb->front()->getSrc(0)->emit(ofile);
1275 }
1276 ofile << "</TD></TR>" << std::endl;
1277 //emit all instructions within basic block
1278 ofile << "\t\t\t\t<TR><TD>" << std::endl;
1279
1280 if (!bb->empty())
1281 {
1282 ofile << "\t\t\t\t\t <TABLE BORDER=\"0\" CELLBORDER=\"0\" CELLSPACING=\"0\">" << std::endl;
1283 for (INST_LIST_ITER i = bb->begin(); i != bb->end(); i++)
1284 {
1285 //
1286 // detect if there is spill code first, set different color for it
1287 //
1288 std::string fontColor = "black";
1289 //
1290 // emit the instruction
1291 //
1292 ofile << "\t\t\t\t\t\t<TR><TD ALIGN=\"LEFT\"><FONT color=\"" << fontColor << "\">";
1293 std::ostringstream os;
1294 (*i)->emit(os, m_options->getOption(vISA_SymbolReg), true);
1295 std::string dotStr(os.str());
1296 //TODO: dot doesn't like '<', '>', '{', or '}' (and '&') this code below is a hack. need to replace with delimiters.
1297 //std::replace_if(dotStr.begin(), dotStr.end(), bind2nd(equal_to<char>(), '<'), '[');
1298 std::replace_if(dotStr.begin(), dotStr.end(), std::bind(std::equal_to<char>(), std::placeholders::_1, '<'), '[');
1299 std::replace_if(dotStr.begin(), dotStr.end(), std::bind(std::equal_to<char>(), std::placeholders::_1, '>'), ']');
1300 std::replace_if(dotStr.begin(), dotStr.end(), std::bind(std::equal_to<char>(), std::placeholders::_1, '{'), '[');
1301 std::replace_if(dotStr.begin(), dotStr.end(), std::bind(std::equal_to<char>(), std::placeholders::_1, '}'), ']');
1302 std::replace_if(dotStr.begin(), dotStr.end(), std::bind(std::equal_to<char>(), std::placeholders::_1, '&'), '$');
1303 ofile << dotStr;
1304
1305 ofile << "</FONT></TD></TR>" << std::endl;
1306 //ofile << "\\l"; // left adjusted
1307 }
1308 ofile << "\t\t\t\t\t </TABLE>" << std::endl;
1309 }
1310
1311 ofile << "\t\t\t\t</TD></TR>" << std::endl;
1312 ofile << "\t\t\t </TABLE>>];" << std::endl;
1313 //
1314 // dump out succ edges
1315 // BB12 -> BB10
1316 //
1317 for (std::list<G4_BB*>::iterator sit = bb->Succs.begin();
1318 sit != bb->Succs.end(); ++sit)
1319 {
1320 bb->writeBBId(ofile);
1321 ofile << " -> ";
1322 (*sit)->writeBBId(ofile);
1323 ofile << std::endl;
1324 }
1325 }
1326 //
1327 // write "}" to end digraph
1328 //
1329 ofile << std::endl << " }" << std::endl;
1330 //
1331 // close dot file
1332 //
1333 ofile.close();
1334 }
1335
1336 // Dump the instructions into a .g4 file
dumpG4Internal(const std::string & file)1337 void G4_Kernel::dumpG4Internal(const std::string &file)
1338 {
1339 std::stringstream g4asm;
1340 dumpG4InternalTo(g4asm);
1341 std::string g4asms = g4asm.str();
1342 if (m_options->getuInt32Option(vISA_DumpPassesSubset) == 1 && g4asms == lastG4Asm) {
1343 return;
1344 }
1345 lastG4Asm = std::move(g4asms);
1346
1347 std::fstream ofile(file + ".g4", std::ios::out);
1348 assert(ofile);
1349 dumpG4InternalTo(ofile);
1350 }
1351
dumpG4InternalTo(std::ostream & os)1352 void G4_Kernel::dumpG4InternalTo(std::ostream &os)
1353 {
1354 const char* asmFileName = nullptr;
1355 m_options->getOption(VISA_AsmFileName, asmFileName);
1356 os << ".kernel " << name << "\n";
1357
1358 for (const G4_Declare *d : Declares) {
1359 static const int MIN_DECL = 34; // skip the built-in decls
1360 if (d->getDeclId() > MIN_DECL) {
1361 // os << d->getDeclId() << "\n";
1362 d->emit(os);
1363 }
1364 }
1365
1366 for (std::list<G4_BB*>::iterator it = fg.begin();
1367 it != fg.end(); ++it)
1368 {
1369 // Emit BB number
1370 G4_BB* bb = (*it);
1371 bb->writeBBId(os);
1372
1373 // Emit BB type
1374 if (bb->getBBType())
1375 {
1376 os << " [" << bb->getBBTypeStr() << "] ";
1377 }
1378
1379 os << "\tPreds: ";
1380 for (auto pred : bb->Preds)
1381 {
1382 pred->writeBBId(os);
1383 os << " ";
1384 }
1385 os << "\tSuccs: ";
1386 for (auto succ : bb->Succs)
1387 {
1388 succ->writeBBId(os);
1389 os << " ";
1390 }
1391 os << "\n";
1392
1393 bb->emit(os);
1394 os << "\n\n";
1395 } // bbs
1396 }
1397
emitDeviceAsmHeaderComment(std::ostream & os)1398 void G4_Kernel::emitDeviceAsmHeaderComment(std::ostream& os)
1399 {
1400 os << "//.kernel ";
1401 if (name != NULL)
1402 {
1403 // some 3D kernels do not have a name
1404 os << name;
1405 }
1406
1407 os << "\n" << "//.platform " << getGenxPlatformString(getGenxPlatform());
1408 os << "\n" << "//.thread_config " << "numGRF=" << numRegTotal << ", numAcc=" << numAcc;
1409 if (fg.builder->hasSWSB())
1410 {
1411 os << ", numSWSB=" << numSWSBTokens;
1412 }
1413 os << "\n" << "//.options_string \"" << m_options->getUserArgString().str() << "\"";
1414 os << "\n" << "//.full_options \"" << m_options->getFullArgString() << "\"";
1415 os << "\n" << "//.instCount " << asmInstCount;
1416 static const char* const RATypeString[] {
1417 RA_TYPE(STRINGIFY)
1418 };
1419 os << "\n//.RA type\t" << RATypeString[RAType];
1420
1421 if (auto jitInfo = fg.builder->getJitInfo())
1422 {
1423 if (jitInfo->numGRFUsed != 0)
1424 {
1425 os << "\n" << "//.GRF count " << jitInfo->numGRFUsed;
1426 }
1427 if (jitInfo->spillMemUsed > 0)
1428 {
1429 os << "\n" << "//.spill size " << jitInfo->spillMemUsed;
1430 }
1431 if (jitInfo->numGRFSpillFill > 0)
1432 {
1433 os << "\n" << "//.spill GRF est. ref count " << jitInfo->numGRFSpillFill;
1434 }
1435 if (jitInfo->numFlagSpillStore > 0)
1436 {
1437 os << "\n//.spill flag store " << jitInfo->numFlagSpillStore;
1438 os << "\n//.spill flag load " << jitInfo->numFlagSpillLoad;
1439 }
1440 }
1441
1442 auto privateMemSize = getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
1443 if (privateMemSize != 0)
1444 {
1445 os << "\n//.private memory size " << privateMemSize;
1446 }
1447 os << "\n\n";
1448
1449 //Step2: emit declares (as needed)
1450 //
1451 // firstly, emit RA declare as comments or code depends on Options::symbolReg
1452 // we check if the register allocation is successful here
1453 //
1454
1455 for (auto dcl : Declares)
1456 {
1457 dcl->emit(os);
1458 }
1459 os << "\n";
1460
1461 auto fmtHex = [](int i) {
1462 std::stringstream ss;
1463 ss << "0x" << std::hex << std::uppercase << i;
1464 return ss.str();
1465 };
1466
1467 const unsigned inputCount = fg.builder->getInputCount();
1468 std::vector<std::string> argNames;
1469 size_t maxNameLen = 8;
1470 for (unsigned id = 0; id < inputCount; id++)
1471 {
1472 const input_info_t* ii = fg.builder->getInputArg(id);
1473 std::stringstream ss;
1474 if (ii->dcl && ii->dcl->getName()) {
1475 ss << ii->dcl->getName();
1476 } else {
1477 ss << "__unnamed" << (id + 1);
1478 }
1479 argNames.push_back(ss.str());
1480 maxNameLen = std::max(maxNameLen, argNames.back().size());
1481 }
1482
1483 // emit input location and size
1484 os << "// .inputs\n";
1485 const size_t COLW_IDENT = maxNameLen;
1486 static const size_t COLW_TYPE = 8;
1487 static const size_t COLW_SIZE = 6;
1488 static const size_t COLW_AT = 8;
1489 static const size_t COLW_CLASS = 10;
1490
1491 std::stringstream bordss;
1492 bordss << "// ";
1493 bordss << '+'; bordss << std::setfill('-') << std::setw(COLW_IDENT + 2) << "";
1494 bordss << '+'; bordss << std::setfill('-') << std::setw(COLW_TYPE + 2) << "";
1495 bordss << '+'; bordss << std::setfill('-') << std::setw(COLW_SIZE + 2) << "";
1496 bordss << '+'; bordss << std::setfill('-') << std::setw(COLW_AT + 2) << "";
1497 bordss << '+'; bordss << std::setfill('-') << std::setw(COLW_CLASS + 2) << "";
1498 bordss << '+' << "\n";
1499 std::string border = bordss.str();
1500
1501 os << border;
1502 os <<
1503 "//" <<
1504 " | " << std::left << std::setw(COLW_IDENT) << "id" <<
1505 " | " << std::left << std::setw(COLW_TYPE) << "type" <<
1506 " | " << std::right << std::setw(COLW_SIZE) << "bytes" <<
1507 " | " << std::left << std::setw(COLW_AT) << "at" <<
1508 " | " << std::left << std::setw(COLW_CLASS) << "class" <<
1509 " |" << "\n";
1510 os << border;
1511
1512 const unsigned grfSize = getGRFSize();
1513 for (unsigned id = 0; id < inputCount; id++)
1514 {
1515 const input_info_t* input_info = fg.builder->getInputArg(id);
1516 //
1517 os << "//";
1518 //
1519 // id
1520 os <<
1521 " | " << std::left << std::setw(COLW_IDENT) << argNames[id];
1522 //
1523 // type and length
1524 // e.g. :uq x 16
1525 const G4_Declare *dcl = input_info->dcl;
1526 std::stringstream sstype;
1527 if (dcl) {
1528 switch (dcl->getElemType()) {
1529 case Type_B: sstype << ":b"; break;
1530 case Type_W: sstype << ":w"; break;
1531 case Type_D: sstype << ":d"; break;
1532 case Type_Q: sstype << ":q"; break;
1533 case Type_V: sstype << ":v"; break;
1534 case Type_UB: sstype << ":ub"; break;
1535 case Type_UW: sstype << ":uw"; break;
1536 case Type_UD: sstype << ":ud"; break;
1537 case Type_UQ: sstype << ":uq"; break;
1538 case Type_UV: sstype << ":uv"; break;
1539 //
1540 case Type_F: sstype << ":f"; break;
1541 case Type_HF: sstype << ":hf"; break;
1542 case Type_DF: sstype << ":df"; break;
1543 case Type_NF: sstype << ":nf"; break;
1544 case Type_BF: sstype << ":bf"; break;
1545 default:
1546 sstype << fmtHex((int)dcl->getElemType()) << "?";
1547 break;
1548 }
1549 if (dcl->getTotalElems() != 1)
1550 sstype << " x " << dcl->getTotalElems();
1551 } else {
1552 sstype << "?";
1553 }
1554 os << " | " << std::left << std::setw(COLW_TYPE) << sstype.str();
1555 //
1556 // size
1557 os << " | " << std::right << std::setw(COLW_SIZE) << std::dec << input_info->size;
1558
1559 // location
1560 unsigned reg = input_info->offset / grfSize,
1561 subRegBytes = input_info->offset % grfSize;
1562 std::stringstream ssloc;
1563 ssloc << "r" << reg;
1564 if (subRegBytes != 0)
1565 ssloc << "+" << subRegBytes;
1566 os << " | " << std::left << std::setw(COLW_AT) << ssloc.str();
1567
1568 // class
1569 std::string inpcls;
1570 switch (input_info->getInputClass()) {
1571 case INPUT_GENERAL: inpcls = "general"; break;
1572 case INPUT_SAMPLER: inpcls = "sampler"; break;
1573 case INPUT_SURFACE: inpcls = "surface"; break;
1574 default: inpcls = fmtHex((int)input_info->getInputClass()); break;
1575 }
1576 os << " | " << std::left << std::setw(COLW_CLASS) << inpcls;
1577 //
1578 os << " |\n";
1579 }
1580 os << border << "\n";
1581
1582 if (getPlatformGeneration(getGenxPlatform()) < PlatformGen::XE)
1583 {
1584 fg.BCStats.clear();
1585 }
1586 else
1587 {
1588 fg.XeBCStats.clear();
1589 }
1590 fg.numRMWs = 0;
1591 }
1592
1593
parseDecodeErrors(KernelView & kView,const char * errBuf,size_t errBufSize)1594 static std::map<int, std::string> parseDecodeErrors(
1595 KernelView &kView, const char *errBuf, size_t errBufSize)
1596 {
1597 // FIXME: IGA KernelView should be refactored to just return PC's
1598 // paired with diagnostic strings for each
1599 // (automatically allocate in IGA and cleanup when KV is deleted)
1600 bool dissasemblyFailed = !kView.decodeSucceeded();
1601 std::string igaErrMsgs;
1602 std::vector<std::string> igaErrMsgsVector;
1603 std::map<int, std::string> errorToStringMap;
1604 if (dissasemblyFailed)
1605 {
1606 std::cerr << "failed to decode binary for asm output";
1607 igaErrMsgs = errBuf;
1608 igaErrMsgsVector = split(igaErrMsgs, "\n");
1609 for (auto msg : igaErrMsgsVector)
1610 {
1611 auto pos = msg.find("ERROR");
1612 if (pos != std::string::npos)
1613 {
1614 std::cerr << msg << "\n";
1615 std::vector<std::string> aString = split(msg, " ");
1616 for (auto token : aString)
1617 {
1618 if (token.find_first_of("0123456789") != std::string::npos)
1619 {
1620 int errorPC = std::atoi(token.c_str());
1621 errorToStringMap[errorPC] = msg;
1622 break;
1623 }
1624 }
1625 }
1626 }
1627 }
1628
1629 return errorToStringMap;
1630 }
1631
1632 using BlockOffsets = std::map<int32_t,std::vector<std::string>>;
1633
precomputeBlockOffsets(std::ostream & os,G4_Kernel & g4k,const KernelView & kv)1634 static BlockOffsets precomputeBlockOffsets(
1635 std::ostream& os, G4_Kernel &g4k, const KernelView &kv)
1636 {
1637 // pre-compute the PCs of each basic block
1638 int32_t currPc = 0, lastInstSize = -1;
1639 std::map<int32_t,std::vector<std::string>> blockOffsets;
1640 for (BB_LIST_ITER itBB = g4k.fg.begin(); itBB != g4k.fg.end(); ++itBB) {
1641 for (INST_LIST_ITER itInst = (*itBB)->begin(); itInst != (*itBB)->end(); ++itInst) {
1642 if ((*itInst)->isLabel()) {
1643 // G4 treats labels as special instructions
1644 const char *lbl = (*itInst)->getLabelStr();
1645 if (lbl && *lbl) {
1646 blockOffsets[currPc].emplace_back(lbl);
1647 }
1648 } else {
1649 // we are looking at the next G4 instruction,
1650 // but reached the end of the decode stream
1651 if (lastInstSize == 0) {
1652 os << "// ERROR: deducing G4 block PCs "
1653 "(IGA decoded stream ends early); falling back to IGA labels\n";
1654 blockOffsets.clear(); // fallback to IGA default labels
1655 return blockOffsets;
1656 }
1657 lastInstSize = kv.getInstSize(currPc);
1658 currPc += lastInstSize;
1659 }
1660 }
1661 }
1662 if (kv.getInstSize(currPc) != 0) {
1663 // we are looking at the next G4 instruction,
1664 // but reached the end of the decode stream
1665 os << "// ERROR: deducing G4 block PCs "
1666 "(G4_INST stream ends early); falling back to IGA labels\n";
1667 blockOffsets.clear(); // fallback to IGA default labels
1668 }
1669 return blockOffsets;
1670 }
1671
1672
1673 // needs further cleanup (confirm label prefixes are gone, newAsm == true)
emitDeviceAsmInstructionsIga(std::ostream & os,const void * binary,uint32_t binarySize)1674 void G4_Kernel::emitDeviceAsmInstructionsIga(
1675 std::ostream& os, const void * binary, uint32_t binarySize)
1676 {
1677 os << "\n";
1678
1679 const size_t ERROR_STRING_MAX_LENGTH = 16 * 1024;
1680 char* errBuf = new char[ERROR_STRING_MAX_LENGTH];
1681 assert(errBuf);
1682 if (!errBuf)
1683 return;
1684 KernelView kv(
1685 getIGAPlatform(), binary, binarySize,
1686 GetIGASWSBEncodeMode(*fg.builder),
1687 errBuf, ERROR_STRING_MAX_LENGTH);
1688 const auto errorMap =
1689 parseDecodeErrors(kv, errBuf, ERROR_STRING_MAX_LENGTH);
1690 delete [] errBuf;
1691
1692 const auto blockOffsets = precomputeBlockOffsets(os, *this, kv);
1693
1694 //
1695 // Generate a label with uniqueLabel as prefix (required by some tools).
1696 // We do so by using labeler callback. If uniqueLabels is not present, use iga's
1697 // default label. For example,
1698 // Without option -uniqueLabels:
1699 // generating default label, L1234
1700 // With option -uniqueLabels <sth>:
1701 // generating label with <sth> as prefix, <sth>_L1234
1702 //
1703 std::string labelPrefix;
1704 if (m_options->getOption(vISA_UniqueLabels))
1705 {
1706 const char* labelPrefixC = nullptr;
1707 m_options->getOption(vISA_LabelStr, labelPrefixC);
1708 labelPrefix = labelPrefixC;
1709 if (!labelPrefix.empty())
1710 labelPrefix += '_';
1711 }
1712
1713 struct LabelerState {
1714 const KernelView *kv;
1715 const BlockOffsets &blockOffsets;
1716 const std::string labelPrefix;
1717 std::string labelStorage;
1718 LabelerState(
1719 const KernelView *_kv,
1720 const BlockOffsets &offs,
1721 const std::string &lblPfx)
1722 : kv(_kv), blockOffsets(offs), labelPrefix(lblPfx)
1723 {
1724 }
1725 };
1726 LabelerState ls(&kv, blockOffsets, labelPrefix);
1727
1728 // storage for the IGA labeler
1729 auto labeler = [](int32_t pc, void *data) -> const char * {
1730 LabelerState &ls = *(LabelerState *)data;
1731 ls.labelStorage = ls.labelPrefix;
1732 auto itr = ls.blockOffsets.find(pc);
1733 if (itr == ls.blockOffsets.end()) {
1734 // let IGA choose the label name, but we still have to prefix
1735 // our user provided prefix
1736 char igaDefaultLabel[128];
1737 ls.kv->getDefaultLabelName(pc, igaDefaultLabel, sizeof(igaDefaultLabel));
1738 ls.labelStorage += igaDefaultLabel;
1739 return ls.labelStorage.c_str();
1740 }
1741 std::string g4Label = itr->second.front().c_str();
1742 ls.labelStorage += g4Label;
1743 return ls.labelStorage.c_str();
1744 };
1745
1746
1747 // initialize register suppression info
1748 int suppressRegs[5] = {};
1749 int lastRegs[3] = {};
1750 for (int i = 0; i < 3; i++)
1751 {
1752 suppressRegs[i] = -1;
1753 lastRegs[i] = -1;
1754 }
1755
1756 ////////////////////////////////////////
1757 // emit the program text (instructions) iteratively
1758 // this is a little tricky because G4 treats labels as instructions
1759 // thus we need to do a little checking to keep the two streams in sync
1760 int32_t pc = 0;
1761 std::vector<char> igaStringBuffer;
1762 igaStringBuffer.resize(512); // TODO: expand default after testing
1763 for (BB_LIST_ITER itBB = fg.begin(); itBB != fg.end(); ++itBB) {
1764 os << "// "; (*itBB)->emitBbInfo(os); os << "\n";
1765 for (INST_LIST_ITER itInst = (*itBB)->begin();
1766 itInst != (*itBB)->end(); ++itInst)
1767 {
1768 G4_INST *i = (*itInst);
1769
1770 // walk to next non-label in this block;
1771 // return true if we find one, else fails if at end of block
1772 auto findNextNonLabel = [&](bool print) {
1773 while ((*itInst)->isLabel()) {
1774 if (print)
1775 os << "// " << (*itInst)->getLabelStr() << ":\n";
1776 itInst++;
1777 if (itInst == (*itBB)->end())
1778 break;
1779 }
1780 if (itInst == (*itBB)->end())
1781 return false;
1782 i = (*itInst);
1783 return true;
1784 };
1785
1786 bool isInstTarget = kv.isInstTarget(pc);
1787 if (isInstTarget) {
1788 auto itr = ls.blockOffsets.find(pc);
1789 if (itr == ls.blockOffsets.end()) {
1790 os << labeler(pc, &ls) << ":\n";
1791 } else {
1792 // there can be multiple labels per PC
1793 for (const std::string &lbl : itr->second) {
1794 os << ls.labelPrefix << lbl << ":\n";
1795 }
1796 }
1797 if (!findNextNonLabel(false)) {
1798 break; // at end of block
1799 }
1800 } else if (i->isLabel()) {
1801 // IGA doesn't consider this PC to be a label but G4 does
1802 //
1803 // move forward until we find the next non-label
1804 if (!findNextNonLabel(true)) {
1805 break; // at end of block
1806 }
1807 }
1808
1809 ///////////////////////////////////////////////////////////////////
1810 // we are looking at a non-label G4_INST at the next valid IGA PC
1811 // (same instruction)
1812 if (!getOptions()->getOption(vISA_disableInstDebugInfo)) {
1813 (*itBB)->emitInstructionSourceLineMapping(os, itInst);
1814 }
1815
1816 auto eitr = errorMap.find(pc);
1817 if (eitr != errorMap.end()) {
1818 os << "// " << eitr->second << "\n";
1819 os << "// text representation might not be correct";
1820 }
1821
1822 static const uint32_t IGA_FMT_OPTS =
1823 IGA_FORMATTING_OPT_PRINT_LDST
1824 | IGA_FORMATTING_OPT_PRINT_BFNEXPRS;
1825 while (true) {
1826 size_t nw = kv.getInstSyntax(
1827 pc,
1828 igaStringBuffer.data(), igaStringBuffer.size(),
1829 IGA_FMT_OPTS,
1830 labeler, &ls);
1831 if (nw == 0) {
1832 os << "<<error formatting instruction at PC " << pc << ">>\n";
1833 break;
1834 } else if (nw <= igaStringBuffer.size()) {
1835 // print it (pad it out so comments line up on most instructions)
1836 std::string line =igaStringBuffer.data();
1837 while (line.size() < 100)
1838 line += ' ';
1839 os << line;
1840 break;
1841 } else {
1842 igaStringBuffer.resize(igaStringBuffer.size() + 512);
1843 // try again
1844 }
1845 }
1846
1847 (*itBB)->emitBasicInstructionComment(os, itInst, suppressRegs, lastRegs);
1848 os << "\n";
1849
1850 pc += kv.getInstSize(pc);
1851 } // for insts in block
1852 } // for blocks
1853 } // emitDeviceAsmInstructionsIga
1854
1855
1856 // Should be removed once we can confirm no one uses it
1857 // the output comes from G4_INST::... and almost certainly won't be
1858 // parsable by IGA
emitDeviceAsmInstructionsOldAsm(std::ostream & os)1859 void G4_Kernel::emitDeviceAsmInstructionsOldAsm(std::ostream& os)
1860 {
1861 os << std::endl << ".code";
1862 for (BB_LIST_ITER it = fg.begin(); it != fg.end(); ++it)
1863 {
1864 os << "\n";
1865 (*it)->emit(os);
1866 }
1867 //Step4: emit clean-up.
1868 os << std::endl;
1869 os << ".end_code" << std::endl;
1870 os << ".end_kernel" << std::endl;
1871 os << std::endl;
1872 }
1873
getNextBB(G4_BB * bb) const1874 G4_BB* G4_Kernel::getNextBB(G4_BB* bb) const
1875 {
1876 if (!bb)
1877 return nullptr;
1878
1879 // Return the lexically following bb.
1880 G4_BB* nextBB = nullptr;
1881 for (auto it = fg.cbegin(), ie = fg.cend(); it != ie; it++)
1882 {
1883 auto curBB = (*it);
1884 if (curBB == bb)
1885 {
1886 if (it != ie)
1887 {
1888 it++;
1889 nextBB = (*it);
1890 }
1891 break;
1892 }
1893 }
1894
1895 return nextBB;
1896 }
1897
getBinOffsetOfBB(G4_BB * bb) const1898 unsigned G4_Kernel::getBinOffsetOfBB(G4_BB* bb) const {
1899 if (!bb)
1900 return 0;
1901
1902 // Given a bb, return the binary offset of first non-label of instruction.
1903 auto it = std::find_if(bb->begin(), bb->end(), [](G4_INST* inst) { return !inst->isLabel(); });
1904 assert(it != bb->end() && "expect at least one non-label inst in second BB");
1905 return (unsigned)(*it)->getGenOffset();
1906 }
1907
getPerThreadNextOff() const1908 unsigned G4_Kernel::getPerThreadNextOff() const
1909 {
1910 if (!hasPerThreadPayloadBB())
1911 return 0;
1912 G4_BB* next = getNextBB(perThreadPayloadBB);
1913 return getBinOffsetOfBB(next);
1914 }
1915
getCrossThreadNextOff() const1916 unsigned G4_Kernel::getCrossThreadNextOff() const
1917 {
1918 if (!hasCrossThreadPayloadBB())
1919 return 0;
1920 G4_BB* next = getNextBB(crossThreadPayloadBB);
1921 return getBinOffsetOfBB(next);
1922 }
1923
getComputeFFIDGPNextOff() const1924 unsigned G4_Kernel::getComputeFFIDGPNextOff() const
1925 {
1926 if (!hasComputeFFIDProlog())
1927 return 0;
1928 // return the offset of the second entry (GP1)
1929 // the first instruction in the second BB is the start of the second entry
1930 assert(fg.getNumBB() > 1 && "expect at least one prolog BB");
1931 assert(!computeFFIDGP1->empty() && !computeFFIDGP1->front()->isLabel());
1932 return getBinOffsetOfBB(computeFFIDGP1);
1933 }
1934
getComputeFFIDGP1NextOff() const1935 unsigned G4_Kernel::getComputeFFIDGP1NextOff() const
1936 {
1937 if (!hasComputeFFIDProlog())
1938 return 0;
1939 // return the offset of the BB next to GP1
1940 // the first instruction in the second BB is the start of the second entry
1941 assert(fg.getNumBB() > 1 && "expect at least one prolog BB");
1942 G4_BB* next = getNextBB(computeFFIDGP1);
1943 return getBinOffsetOfBB(next);
1944 }
1945