1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "BuildIR.h"
10 #include "DebugInfo.h"
11 #include "G4_Kernel.hpp"
12 #include "G4_BB.hpp"
13 #include "VarSplit.h"
14 // #include "iga/IGALibrary/api/igaEncoderWrapper.hpp"
15 #include "iga/IGALibrary/api/kv.hpp"
16 #include "BinaryEncodingIGA.h"
17 
18 #include <list>
19 #include <fstream>
20 #include <functional>
21 #include <iomanip>
22 #include <utility>
23 
24 using namespace vISA;
25 
markInsts()26 void gtPinData::markInsts()
27 {
28     // Take a snapshot of instructions in kernel.
29     for (auto bb : kernel.fg)
30     {
31         for (auto inst : *bb)
32         {
33             markedInsts.insert(inst);
34         }
35     }
36 }
37 
removeUnmarkedInsts()38 void gtPinData::removeUnmarkedInsts()
39 {
40     if (!kernel.fg.getIsStackCallFunc() &&
41         !kernel.fg.getHasStackCalls())
42     {
43         // Marked instructions correspond to caller/callee save
44         // and FP/SP manipulation instructions.
45         return;
46     }
47 
48     MUST_BE_TRUE(whichRAPass == ReRAPass,
49         "Unexpectedly removing unmarked instructions in first RA pass");
50     // Instructions not seen in "marked" snapshot will be removed by this function.
51     for (auto bb : kernel.fg)
52     {
53         for (auto it = bb->begin(), itEnd = bb->end();
54             it != itEnd;)
55         {
56             auto inst = (*it);
57 
58             if (markedInsts.find(inst) == markedInsts.end())
59             {
60                 it = bb->erase(it);
61                 continue;
62             }
63             it++;
64         }
65     }
66 }
67 
getFreeGRFInfo(unsigned & size)68 void* gtPinData::getFreeGRFInfo(unsigned& size)
69 {
70     // Here is agreed upon format for reporting free GRFs:
71     //struct freeBytes
72     //{
73     //    unsigned short startByte;
74     //    unsigned short numConsecutiveBytes;
75     //};
76 
77     // Added magic 0xDEADD00D at start and
78     // magic 0xDEADBEEF at the end of buffer
79     // on request of gtpin team.
80     //
81     //struct freeGRFInfo
82     //{
83     //    unsigned short numItems;
84     //
85     //    freeBytes data[numItems];
86     //};
87     struct freeBytes
88     {
89         unsigned short startByte;
90         unsigned short numConsecutiveBytes;
91     };
92 
93     struct freeGRFInfo
94     {
95         unsigned int magicStart;
96         unsigned int numItems;
97     };
98 
99     // Compute free register information using vector for efficiency,
100     // then convert to POS for passing back to gtpin.
101     std::vector<std::pair<unsigned short, unsigned short>> vecFreeBytes;
102 
103     for (auto byte : globalFreeRegs)
104     {
105         if (vecFreeBytes.size() > 0)
106         {
107             auto& lastFree = vecFreeBytes.back();
108             if (byte == (lastFree.first + lastFree.second))
109             {
110                 lastFree.second += 1;
111             }
112             else
113             {
114                 vecFreeBytes.push_back(std::make_pair(byte, 1));
115             }
116         }
117         else
118         {
119             vecFreeBytes.push_back(std::make_pair(byte, 1));
120         }
121     }
122 
123     // Now convert vector to POS
124     unsigned int numItems = (unsigned int)vecFreeBytes.size();
125     freeGRFInfo* buffer = (freeGRFInfo*)malloc(numItems * sizeof(freeBytes) + sizeof(unsigned int)
126         + sizeof(unsigned int) + sizeof(unsigned int));
127     if (buffer)
128     {
129         buffer->numItems = numItems;
130         buffer->magicStart = 0xDEADD00D;
131         memcpy_s((unsigned char*)buffer + sizeof(unsigned int) + sizeof(unsigned int),
132             numItems * sizeof(freeBytes), vecFreeBytes.data(), numItems * sizeof(freeBytes));
133         unsigned int magicEnd = 0xDEADBEEF;
134         memcpy_s((unsigned char*)buffer + sizeof(unsigned int) + sizeof(unsigned int) + (numItems * sizeof(freeBytes)),
135             sizeof(magicEnd), &magicEnd, sizeof(magicEnd));
136 
137         // numItems - unsigned int
138         // magicStart - unsigned int
139         // magicEnd - unsigned int
140         // data - numItems * sizeof(freeBytes)
141         size = sizeof(unsigned int) + sizeof(unsigned int) + sizeof(unsigned int) + (numItems * sizeof(freeBytes));
142     }
143 
144     return buffer;
145 }
146 
setGTPinInit(void * buffer)147 void gtPinData::setGTPinInit(void* buffer)
148 {
149     MUST_BE_TRUE(sizeof(gtpin::igc::igc_init_t) <= 200, "Check size of igc_init_t");
150     gtpin_init = (gtpin::igc::igc_init_t*)buffer;
151 
152     if (gtpin_init->re_ra)
153         kernel.getOptions()->setOption(vISA_ReRAPostSchedule, true);
154     if (gtpin_init->grf_info)
155         kernel.getOptions()->setOption(vISA_GetFreeGRFInfo, true);
156 }
157 
// Appends the numBytes raw bytes starting at t to the end of buffer and
// advances the running byte count bufferSize by numBytes.
template<typename T>
static void writeBuffer(
    std::vector<unsigned char>& buffer,
    unsigned& bufferSize,
    const T* t,
    unsigned numBytes)
{
    const unsigned char* const firstByte = (const unsigned char*)t;
    buffer.insert(buffer.end(), firstByte, firstByte + numBytes);
    bufferSize += numBytes;
}
172 
// Builds the binary info blob that is handed back to gtpin.
//
// The blob is assembled in a temporary byte vector in this order:
//   1. gtpin::igc::igc_init_t describing which features are enabled
//   2. uint32_t number of tokens that follow
//   3. optional GRF_INFO token (header + free GRF data) when t.grf_info is set
//   4. optional SCRATCH_AREA_INFO token when t.scratch_area_size is non-zero
//   5. KERNEL_START_INFO token (payload prolog sizes)
//   6. NUM_GRF_REGS token
// and is then copied into memory obtained from allocCodeBlock().
//
// bufferSize [out]: number of bytes in the returned blob; 0 when gtpin was
//                   not initialized either via setGTPinInit or from L0.
// Returns the blob, or nullptr when gtpin was not initialized.
// NOTE(review): ownership/lifetime of the allocCodeBlock() memory is not
// visible here - confirm with the caller.
void* gtPinData::getGTPinInfoBuffer(unsigned &bufferSize)
{
    if (!gtpin_init && !gtpinInitFromL0)
    {
        bufferSize = 0;
        return nullptr;
    }
    gtpin::igc::igc_init_t t;
    std::vector<unsigned char> buffer;
    unsigned numTokens = 0;
    // GRF info and re-RA are only reported for kernels that neither are
    // stack-call functions nor contain stack calls.
    auto stackABI = kernel.fg.getIsStackCallFunc() || kernel.fg.getHasStackCalls();
    bufferSize = 0;

    memset(&t, 0, sizeof(t));

    t.version = gtpin::igc::GTPIN_IGC_INTERFACE_VERSION;
    t.igc_init_size = sizeof(t);
    if (gtpinInitFromL0)
    {
        // L0 path: the requested features come from vISA options.
        if (!stackABI)
        {
            if (kernel.getOption(vISA_GetFreeGRFInfo))
            {
                t.grf_info = 1;
                numTokens++;
            }

            // re_ra is only a flag in igc_init_t; it does not add a token.
            if (kernel.getOption(vISA_GTPinReRA))
            {
                t.re_ra = 1;
            }
        }

        if (kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
            t.srcline_mapping = 1;

        if (kernel.getOptions()->getuInt32Option(vISA_GTPinScratchAreaSize) > 0)
        {
            t.scratch_area_size = getNumBytesScratchUse();
            numTokens++;
        }
    }
    else
    {
        // Non-L0 path: gtpin_init is non-null here (checked by the early
        // return above) and drives which features are reported.
        t.version = std::min(gtpin_init->version, gtpin::igc::GTPIN_IGC_INTERFACE_VERSION);
        if (!stackABI)
        {
            if (gtpin_init->grf_info)
            {
                t.grf_info = 1;
                numTokens++;
            }

            if (gtpin_init->re_ra)
            {
                t.re_ra = 1;
            }
        }

        if (gtpin_init->srcline_mapping && kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
            t.srcline_mapping = 1;

        if (gtpin_init->scratch_area_size > 0)
        {
            t.scratch_area_size = gtpin_init->scratch_area_size;
            numTokens++;
        }
    }

    // For payload offsets
    numTokens++;

    // Report #GRFs
    numTokens++;

    // Header: init struct followed by the token count.
    writeBuffer(buffer, bufferSize, &t, sizeof(t));
    writeBuffer(buffer, bufferSize, &numTokens, sizeof(uint32_t));

    if (t.grf_info)
    {
        // create token
        void* rerabuffer = nullptr;
        unsigned rerasize = 0;

        rerabuffer = getFreeGRFInfo(rerasize);

        gtpin::igc::igc_token_header_t th;
        th.token = gtpin::igc::GTPIN_IGC_TOKEN::GTPIN_IGC_TOKEN_GRF_INFO;
        th.token_size = sizeof(gtpin::igc::igc_token_header_t) + rerasize;

        // write token and data to buffer
        writeBuffer(buffer, bufferSize, &th, sizeof(th));
        writeBuffer(buffer, bufferSize, rerabuffer, rerasize);

        // getFreeGRFInfo allocates with malloc.
        free(rerabuffer);
    }

    if (t.scratch_area_size)
    {
        gtpin::igc::igc_token_scratch_area_info_t scratchSlotData;
        scratchSlotData.scratch_area_size = t.scratch_area_size;
        scratchSlotData.scratch_area_offset = nextScratchFree;

        // gtpin scratch slots are beyond spill memory
        scratchSlotData.token = gtpin::igc::GTPIN_IGC_TOKEN_SCRATCH_AREA_INFO;
        scratchSlotData.token_size = sizeof(scratchSlotData);

        writeBuffer(buffer, bufferSize, &scratchSlotData, sizeof(scratchSlotData));
    }

    {
        // Write payload offsets
        gtpin::igc::igc_token_kernel_start_info_t offsets;
        offsets.token = gtpin::igc::GTPIN_IGC_TOKEN_KERNEL_START_INFO;
        offsets.per_thread_prolog_size = kernel.getPerThreadNextOff();
        // Cross-thread size is reported relative to the per-thread offset.
        offsets.cross_thread_prolog_size = kernel.getCrossThreadNextOff() - offsets.per_thread_prolog_size;
        offsets.token_size = sizeof(offsets);
        writeBuffer(buffer, bufferSize, &offsets, sizeof(offsets));
    }

    {
        // Report num GRFs
        gtpin::igc::igc_token_num_grf_regs_t numGRFs;
        numGRFs.token = gtpin::igc::GTPIN_IGC_TOKEN_NUM_GRF_REGS;
        numGRFs.token_size = sizeof(numGRFs);
        numGRFs.num_grf_regs = kernel.getNumRegTotal();
        writeBuffer(buffer, bufferSize, &numGRFs, sizeof(numGRFs));
    }

    void* gtpinBuffer = allocCodeBlock(bufferSize);

    // NOTE(review): gtpinBuffer is copied into before the null check that
    // guards the dump below - confirm allocCodeBlock() cannot return null.
    memcpy_s(gtpinBuffer, bufferSize, buffer.data(), bufferSize);

    // Dump buffer with shader dumps
    if (kernel.getOption(vISA_outputToFile))
    {
        auto asmName = kernel.getOptions()->getOptionCstr(VISA_AsmFileName);
        if (asmName)
        {
            // <asmName>.gtpin_igc_init: raw copy of the incoming init struct.
            // The file is created (and left empty) even when gtpin_init is null.
            std::ofstream ofInit;
            std::stringstream ssInit;
            ssInit << std::string(asmName) << ".gtpin_igc_init";
            ofInit.open(ssInit.str(), std::ofstream::binary);
            if (gtpin_init)
            {
                ofInit.write((const char*)gtpin_init, sizeof(*gtpin_init));
            }
            ofInit.close();

            // <asmName>.gtpin_igc_info: raw copy of the blob built above.
            std::ofstream ofInfo;
            std::stringstream ssInfo;
            ssInfo << std::string(asmName) << ".gtpin_igc_info";
            ofInfo.open(ssInfo.str(), std::ofstream::binary);
            if (gtpinBuffer)
            {
                ofInfo.write((const char*)gtpinBuffer, bufferSize);
            }
            ofInfo.close();
        }
    }

    return gtpinBuffer;
}
336 
getNumBytesScratchUse() const337 uint32_t gtPinData::getNumBytesScratchUse() const
338 {
339     if (gtpin_init)
340     {
341         return gtpin_init->scratch_area_size;
342     }
343     else if (isGTPinInitFromL0())
344     {
345         return kernel.getOptions()->getuInt32Option(vISA_GTPinScratchAreaSize);
346     }
347     return 0;
348 }
349 
350 
G4_Kernel(INST_LIST_NODE_ALLOCATOR & alloc,Mem_Manager & m,Options * options,Attributes * anAttr,unsigned char major,unsigned char minor)351 G4_Kernel::G4_Kernel(INST_LIST_NODE_ALLOCATOR& alloc,
352     Mem_Manager& m, Options* options, Attributes* anAttr,
353     unsigned char major, unsigned char minor)
354     : m_options(options), m_kernelAttrs(anAttr), RAType(RA_Type::UNKNOWN_RA),
355     asmInstCount(0), kernelID(0), fg(alloc, this, m),
356     major_version(major), minor_version(minor)
357 {
358     ASSERT_USER(
359         major < COMMON_ISA_MAJOR_VER ||
360         (major == COMMON_ISA_MAJOR_VER && minor <= COMMON_ISA_MINOR_VER),
361         "CISA version not supported by this JIT-compiler");
362 
363 
364     name = NULL;
365     numThreads = 0;
366     hasAddrTaken = false;
367     kernelDbgInfo = nullptr;
368     sharedDebugInfo = false;
369     sharedGTPinInfo = false;
370     if (options->getOption(vISAOptions::vISA_ReRAPostSchedule) ||
371         options->getOption(vISAOptions::vISA_GetFreeGRFInfo) ||
372         options->getuInt32Option(vISAOptions::vISA_GTPinScratchAreaSize))
373     {
374         allocGTPinData();
375     } else {
376         gtPinInfo = nullptr;
377     }
378 
379     setKernelParameters();
380 }
381 
~G4_Kernel()382 G4_Kernel::~G4_Kernel()
383 {
384     if (kernelDbgInfo && !sharedDebugInfo)
385     {
386         kernelDbgInfo->~KernelDebugInfo();
387     }
388 
389     if (gtPinInfo && !sharedGTPinInfo)
390     {
391         gtPinInfo->~gtPinData();
392     }
393 
394     if (varSplitPass)
395     {
396         delete varSplitPass;
397         varSplitPass = nullptr;
398     }
399 
400     Declares.clear();
401 }
402 
setKernelDebugInfo(KernelDebugInfo * k)403 void G4_Kernel::setKernelDebugInfo(KernelDebugInfo* k)
404 {
405     assert(k);
406     if (kernelDbgInfo)
407     {
408         kernelDbgInfo->~KernelDebugInfo();
409     }
410     kernelDbgInfo = k;
411     sharedDebugInfo = true;
412 }
413 
setGTPinData(gtPinData * p)414 void G4_Kernel::setGTPinData(gtPinData* p) {
415     assert(p);
416     if (gtPinInfo == nullptr)
417     {
418         gtPinInfo->~gtPinData();
419     }
420     gtPinInfo = p;
421     sharedGTPinInfo = true;
422 }
423 
computeChannelSlicing()424 void G4_Kernel::computeChannelSlicing()
425 {
426     G4_ExecSize simdSize = getSimdSize();
427     channelSliced = true;
428 
429     if (simdSize == g4::SIMD8 || simdSize == g4::SIMD16)
430     {
431         // SIMD8/16 kernels are not sliced
432         channelSliced = false;
433         return;
434     }
435 
436     // .dcl V1 size = 128 bytes
437     // op (16|M0) V1(0,0)     ..
438     // op (16|M16) V1(2,0)    ..
439     // For above sequence, return 32. Instruction
440     // is broken in to 2 only due to hw restriction.
441     // Allocation of dcl is still as if it were a
442     // SIMD32 kernel.
443 
444     // Store emask bits that are ever used to define a variable
445     std::unordered_map<G4_Declare*, std::bitset<32>> emaskRef;
446     for (auto bb : fg)
447     {
448         for (auto inst : *bb)
449         {
450             if (inst->isSend())
451                 continue;
452 
453             auto dst = inst->getDst();
454             if (!dst || !dst->getTopDcl() ||
455                 dst->getHorzStride() != 1)
456                 continue;
457 
458             if (inst->isWriteEnableInst())
459                 continue;
460 
461             auto regFileKind = dst->getTopDcl()->getRegFile();
462             if (regFileKind != G4_RegFileKind::G4_GRF && regFileKind != G4_RegFileKind::G4_INPUT)
463                 continue;
464 
465             if (dst->getTopDcl()->getByteSize() <= dst->getTypeSize() * (unsigned)simdSize)
466                 continue;
467 
468             auto emaskOffStart = inst->getMaskOffset();
469 
470             // Reset all bits on first encounter of dcl
471             if (emaskRef.find(dst->getTopDcl()) == emaskRef.end())
472                 emaskRef[dst->getTopDcl()].reset();
473 
474             // Set bits based on which EM bits are used in the def
475             for (unsigned i = emaskOffStart; i != (emaskOffStart + inst->getExecSize()); i++)
476             {
477                 emaskRef[dst->getTopDcl()].set(i);
478             }
479         }
480     }
481 
482     // Check whether any variable's emask usage straddles across lower and upper 16 bits
483     for (auto& emRefs : emaskRef)
484     {
485         auto& bits = emRefs.second;
486         auto num = bits.to_ulong();
487 
488         // Check whether any lower 16 and upper 16 bits are set
489         if (((num & 0xffff) != 0) && ((num & 0xffff0000) != 0))
490         {
491             channelSliced = false;
492             return;
493         }
494     }
495 
496     return;
497 }
498 
calculateSimdSize()499 void G4_Kernel::calculateSimdSize()
500 {
501     // Iterate over all instructions in kernel to check
502     // whether default execution size of kernel is
503     // SIMD8/16. This is required for knowing alignment
504     // to use for GRF candidates.
505 
506     // only do it once per kernel, as we should not introduce inst with larger simd size than in the input
507     if (simdSize.value != 0)
508     {
509         return;
510     }
511 
512     // First, get simdsize from attribute (0 : not given)
513     // If not 0|8|16|32, wrong value from attribute.
514     simdSize = G4_ExecSize((unsigned)m_kernelAttrs->getInt32KernelAttr(Attributes::ATTR_SimdSize));
515     if (simdSize != g4::SIMD8 && simdSize != g4::SIMD16 && simdSize != g4::SIMD32)
516     {
517         assert(simdSize.value == 0 && "vISA: wrong value for SimdSize attribute");
518         simdSize = g4::SIMD8;
519 
520         for (auto bb : fg)
521         {
522             for (auto inst : *bb)
523             {
524                 // do not consider send since for certain messages we have to set its execution size
525                 // to 16 even in simd8 shaders
526                 if (!inst->isLabel() && !inst->isSend())
527                 {
528                     uint32_t size = inst->getMaskOffset() + inst->getExecSize();
529                     if (size > 16)
530                     {
531                         simdSize = g4::SIMD32;
532                         break;
533                     }
534                     else if (size > 8)
535                     {
536                         simdSize = g4::SIMD16;
537                     }
538                 }
539             }
540             if (simdSize == g4::SIMD32)
541                 break;
542         }
543     }
544 
545     if (GlobalRA::useGenericAugAlign())
546         computeChannelSlicing();
547 }
548 
549 //
550 // Updates kernel's related structures based on number of threads.
551 //
updateKernelByNumThreads(int nThreads)552 void G4_Kernel::updateKernelByNumThreads(int nThreads)
553 {
554     if (numThreads == nThreads)
555         return;
556 
557     numThreads = nThreads;
558 
559     // Scale number of GRFs, Acc, SWSB tokens.
560     setKernelParameters();
561 
562     // Update physical register pool
563     fg.builder->rebuildPhyRegPool(getNumRegTotal());
564 }
565 
566 //
567 // Evaluate AddrExp/AddrExpList to Imm
568 //
evalAddrExp()569 void G4_Kernel::evalAddrExp()
570 {
571     for (std::list<G4_BB*>::iterator it = fg.begin(), itEnd = fg.end();
572         it != itEnd; ++it)
573     {
574         G4_BB* bb = (*it);
575 
576         for (INST_LIST_ITER i = bb->begin(), iEnd = bb->end(); i != iEnd; i++)
577         {
578             G4_INST* inst = (*i);
579 
580             //
581             // process each source operand
582             //
583             for (unsigned j = 0; j < G4_MAX_SRCS; j++)
584             {
585                 G4_Operand* opnd = inst->getSrc(j);
586 
587                 if (opnd == NULL) continue;
588 
589                 if (opnd->isAddrExp())
590                 {
591                     int val = opnd->asAddrExp()->eval();
592                     G4_Type ty = opnd->asAddrExp()->getType();
593 
594                     G4_Imm* imm = fg.builder->createImm(val, ty);
595                     inst->setSrc(imm, j);
596                 }
597             }
598         }
599     }
600 }
601 
// FIX: this needs to be here because of the above static thread-local variable
603 extern _THREAD const char* g4_prevFilename;
604 extern _THREAD int g4_prevSrcLineNo;
605 
// Splits str into the non-empty tokens separated by any of the characters
// in delimiter. Consecutive, leading and trailing delimiters produce no
// empty tokens.
static std::vector<std::string> split(
    const std::string & str, const char * delimiter)
{
    std::vector<std::string> tokens;
    std::string::size_type tokenBegin = 0;

    while (true)
    {
        const std::string::size_type delimPos = str.find_first_of(delimiter, tokenBegin);
        if (delimPos == std::string::npos)
        {
            break;
        }

        // delimPos is never before tokenBegin, so "greater" means "non-empty token".
        if (delimPos > tokenBegin)
        {
            tokens.push_back(str.substr(tokenBegin, delimPos - tokenBegin));
        }
        tokenBegin = delimPos + 1;
    }

    // Whatever follows the last delimiter is the final token.
    if (tokenBegin < str.length())
    {
        tokens.push_back(str.substr(tokenBegin));
    }
    return tokens;
}
626 
getIGAPlatform()627 static iga_gen_t getIGAPlatform()
628 {
629     iga_gen_t platform = IGA_GEN_INVALID;
630     switch (getGenxPlatform())
631     {
632     case GENX_BDW: platform = IGA_GEN8; break;
633     case GENX_CHV: platform = IGA_GEN8lp; break;
634     case GENX_SKL: platform = IGA_GEN9; break;
635     case GENX_BXT: platform = IGA_GEN9lp; break;
636     case GENX_ICLLP: platform = IGA_GEN11; break;
637     case GENX_TGLLP:platform = IGA_GEN12p1; break;
638     case XeHP_SDV: platform = IGA_XE_HP; break;
639     case GENX_DG2:
640         platform = IGA_XE_HPG;
641         break;
642     case GENX_PVC:
643     case GENX_PVCXT:
644         platform = IGA_XE_HPC;
645         break;
646     default:
647         break;
648     }
649 
650     return platform;
651 }
652 
getKernelDebugInfo()653 KernelDebugInfo* G4_Kernel::getKernelDebugInfo()
654 {
655     if (kernelDbgInfo == nullptr)
656     {
657         kernelDbgInfo = new(fg.mem)KernelDebugInfo();
658     }
659 
660     return kernelDbgInfo;
661 }
662 
getStackCallStartReg() const663 unsigned G4_Kernel::getStackCallStartReg() const
664 {
665     // Last 3 GRFs to be used as scratch
666     unsigned totalGRFs = getNumRegTotal();
667     unsigned startReg = totalGRFs - numReservedABIGRF();
668     return startReg;
669 }
calleeSaveStart() const670 unsigned G4_Kernel::calleeSaveStart() const
671 {
672     return getCallerSaveLastGRF() + 1;
673 }
getNumCalleeSaveRegs() const674 unsigned G4_Kernel::getNumCalleeSaveRegs() const
675 {
676     unsigned totalGRFs = getNumRegTotal();
677     return totalGRFs - calleeSaveStart() - numReservedABIGRF();
678 }
679 
680 //
681 // rename non-root declares to their root decl name to make
682 // it easier to read IR dump
683 //
renameAliasDeclares()684 void G4_Kernel::renameAliasDeclares()
685 {
686 #if _DEBUG
687     for (auto dcl : Declares)
688     {
689         if (dcl->getAliasDeclare())
690         {
691             uint32_t offset = 0;
692             G4_Declare* rootDcl = dcl->getRootDeclare(offset);
693             std::string newName(rootDcl->getName());
694             if (rootDcl->getElemType() != dcl->getElemType())
695             {
696                 newName += "_";
697                 newName += TypeSymbol(dcl->getElemType());
698             }
699             if (offset != 0)
700             {
701                 newName += "_" + std::to_string(offset);
702             }
703             dcl->setName(fg.builder->getNameString(fg.mem, 64, "%s", newName.c_str()));
704         }
705     }
706 #endif
707 }
708 
709 //
710 // perform relocation for every entry in the allocation table
711 //
doRelocation(void * binary,uint32_t binarySize)712 void G4_Kernel::doRelocation(void* binary, uint32_t binarySize)
713 {
714     for (auto&& entry : relocationTable)
715     {
716         entry.doRelocation(*this, binary, binarySize);
717     }
718 }
719 
getFirstNonLabelInst() const720 G4_INST* G4_Kernel::getFirstNonLabelInst() const
721 {
722     for (auto I = fg.cbegin(), E = fg.cend(); I != E; ++I)
723     {
724         auto bb = *I;
725         G4_INST* firstInst = bb->getFirstInst();
726         if (firstInst)
727         {
728             return firstInst;
729         }
730     }
731     // empty kernel
732     return nullptr;
733 }
734 
getDebugSrcLine(const std::string & fileName,int srcLine)735 std::string G4_Kernel::getDebugSrcLine(const std::string& fileName, int srcLine)
736 {
737     auto iter = debugSrcLineMap.find(fileName);
738     if (iter == debugSrcLineMap.end())
739     {
740         std::ifstream ifs(fileName);
741         if (!ifs)
742         {
743             // file doesn't exist
744             debugSrcLineMap[fileName] = std::make_pair<bool, std::vector<std::string>>(false, {});
745             return "";
746         }
747         std::string line;
748         std::vector<std::string> srcLines;
749         while (std::getline(ifs, line))
750         {
751             srcLines.push_back(line);
752         }
753         debugSrcLineMap[fileName] = std::make_pair(true, std::move(srcLines));
754     }
755     iter = debugSrcLineMap.find(fileName);
756     if (iter == debugSrcLineMap.end() ||
757         !iter->second.first)
758     {
759         return "";
760     }
761     auto& lines = iter->second.second;
762     if (srcLine > (int) lines.size() || srcLine <= 0)
763     {
764         return "invalid line number";
765     }
766     return lines[srcLine - 1];
767 }
768 
getVarSplitPass()769 VarSplitPass* G4_Kernel::getVarSplitPass()
770 {
771     if (varSplitPass)
772         return varSplitPass;
773 
774     varSplitPass = new VarSplitPass(*this);
775 
776     return varSplitPass;
777 }
778 
// Derives the kernel's resource parameters from the vISA options and the
// target platform:
//   - numRegTotal / callerSaveLastGRF (number of GRFs)
//   - numSWSBTokens
//   - numAcc
//   - numThreads (only when it is still 0 on entry or after the force option)
// Each of the first three follows the same priority: an explicit user
// option, then a value derived from the thread-count override, then a
// platform default. As a side effect vISA_TotalGRFNum is written back with
// the chosen GRF count.
void G4_Kernel::setKernelParameters()
{
    unsigned overrideGRFNum = 0;
    unsigned overrideNumThreads = 0;

    TARGET_PLATFORM platform = getGenxPlatform();
    overrideGRFNum = m_options->getuInt32Option(vISA_TotalGRFNum);

    overrideNumThreads = m_options->getuInt32Option(vISA_HWThreadNumberPerEU);

    //
    // Number of threads/GRF can currently be set by:
    // 1.- IGC flag (reg key)
    // 2.- Compiler option entered by user for
    //      2.1 entire module
    //      2.2 kernel function
    // 3.- Compiler heuristics
    //
    if (m_options->getuInt32Option(vISA_ForceHWThreadNumberPerEU))
    {
        numThreads = m_options->getuInt32Option(vISA_ForceHWThreadNumberPerEU);
    }
    regSharingHeuristics = m_options->getOption(vISA_RegSharingHeuristics);
    if (overrideNumThreads || regSharingHeuristics)
    {
        // A thread-count driven configuration takes precedence over an
        // explicit GRF count; an already-known numThreads replaces the
        // option value.
        overrideGRFNum = 0;
        if (numThreads > 0)
        {
            overrideNumThreads = numThreads;
        }
    }

    // Set the number of GRFs
    if (overrideGRFNum > 0)
    {
        // User-provided number of GRFs
        unsigned Val = m_options->getuInt32Option(vISA_GRFNumToUse);
        if (Val > 0)
        {
            numRegTotal = std::min(Val, overrideGRFNum);
        }
        else
        {
            numRegTotal = overrideGRFNum;
        }
        // NOTE(review): this uses overrideGRFNum while the other two branches
        // use numRegTotal; the results differ when vISA_GRFNumToUse is
        // smaller than overrideGRFNum - confirm this is intended.
        callerSaveLastGRF = ((overrideGRFNum - 8) / 2) - 1;
    }
    else if (overrideNumThreads > 0)
    {
        // GRF count derived from the number of threads per EU.
        switch (platform)
        {
        case XeHP_SDV:
        case GENX_DG2:
            switch (overrideNumThreads)
            {
            case 4:
                numRegTotal = 256;
                break;
            default:
                numRegTotal = 128;
            }
            break;
        case GENX_PVC:
        case GENX_PVCXT:
            switch (overrideNumThreads)
            {
            case 4:
                numRegTotal = 256;
                break;
            case 5:
                numRegTotal = 192;
                break;
            case 6:
                numRegTotal = 160;
                break;
            case 8:
                numRegTotal = 128;
                break;
            case 10:
                numRegTotal = 96;
                break;
            case 12:
                numRegTotal = 64;
                break;
            default:
                numRegTotal = 128;
            }
            break;
        default:
            numRegTotal = 128;
        }
        callerSaveLastGRF = ((numRegTotal - 8) / 2) - 1;
    }
    else
    {
        // Default value for all other platforms
        unsigned Val = m_options->getuInt32Option(vISA_GRFNumToUse);
        numRegTotal = Val ? Val : 128;
        callerSaveLastGRF = ((numRegTotal - 8) / 2) - 1;
    }
    // For safety update TotalGRFNum, there may be some uses for this vISA option
    m_options->setOption(vISA_TotalGRFNum, numRegTotal);

    // Set the number of SWSB tokens
    unsigned overrideNumSWSB = m_options->getuInt32Option(vISA_SWSBTokenNum);
    if (overrideNumSWSB > 0)
    {
        // User-provided number of SWSB tokens
        numSWSBTokens = overrideNumSWSB;
    }
    else if (overrideNumThreads > 0)
    {
        // Token count derived from the number of threads per EU.
        switch (platform)
        {
        case GENX_PVC:
        case GENX_PVCXT:
            switch (overrideNumThreads)
            {
            case 4:
                numSWSBTokens = 32;
                break;
            case 5:
                numSWSBTokens = 24;
                break;
            case 6:
                numSWSBTokens = 20;
                break;
            case 8:
                numSWSBTokens = 16;
                break;
            case 10:
                numSWSBTokens = 12;
                break;
            case 12:
                numSWSBTokens = 8;
                break;
            default:
                numSWSBTokens = 16;
            }
            break;
        default:
            numSWSBTokens = 16;
        }
    }
    else
    {
        // Default value based on platform
        switch (platform)
        {
        case GENX_PVC:
        case GENX_PVCXT:
            numSWSBTokens = 16;
            // Doubled when the kernel uses 256 GRFs.
            if (numRegTotal == 256)
            {
                numSWSBTokens *= 2;
            }
            break;
        default:
            numSWSBTokens = 16;
        }
    }


    // Set the number of Acc. They are in the unit of GRFs (i.e., 1 accumulator is the same size as 1 GRF)
    unsigned overrideNumAcc = m_options->getuInt32Option(vISA_numGeneralAcc);
    if (overrideNumAcc > 0)
    {
        // User-provided number of Acc
        numAcc = overrideNumAcc;
    }
    else if (overrideNumThreads > 0)
    {
        // Accumulator count derived from the number of threads per EU.
        switch (platform)
        {
        case XeHP_SDV:
        case GENX_DG2:
            switch (overrideNumThreads)
            {
            case 4:
                numAcc = 8;
                break;
            default:
                numAcc = 4;
            }
            break;
        case GENX_PVC:
        case GENX_PVCXT:
            switch (overrideNumThreads)
            {
            case 4:
                numAcc = 8;
                break;
            case 5:
                numAcc = 6;
                break;
            case 6:
            case 8:
                numAcc = 4;
                break;
            case 10:
            case 12:
                numAcc = 2;
                break;
            default:
                numAcc = 8;
            }
            break;
        default:
            numAcc = 4;
        }
    }
    else
    {
        // Default value based on platform
        switch (platform)
        {
        case XeHP_SDV:
        case GENX_DG2:
        case GENX_PVC:
        case GENX_PVCXT:
            numAcc = 4;
            // Doubled when the kernel uses 256 GRFs.
            if (numRegTotal == 256)
            {
                numAcc *= 2;
            }
            break;
        default:
            numAcc = 2;
        }
    }

    // Set number of threads if it was not defined before
    if (numThreads == 0)
    {
        if (overrideNumThreads > 0)
        {
            numThreads = overrideNumThreads;
        }
        else
        {
            // Thread count derived from the chosen GRF count.
            switch (platform)
            {
            case XeHP_SDV:
            case GENX_DG2:
                switch (numRegTotal)
                {
                case 256:
                    numThreads = 4;
                    break;
                default:
                    numThreads = 8;
                }
                break;
            case GENX_PVC:
            case GENX_PVCXT:
                switch (numRegTotal)
                {
                case 256:
                    numThreads = 4;
                    break;
                case 192:
                    numThreads = 5;
                    break;
                case 160:
                    numThreads = 6;
                    break;
                case 128:
                    numThreads = 8;
                    break;
                case 96:
                    numThreads = 10;
                    break;
                case 64:
                    numThreads = 12;
                    break;
                default:
                    numThreads = 8;
                }
                break;
            default:
                numThreads = 7;
            }
        }
    }

    // vISA_hasDoubleAcc overrides whatever accumulator count was chosen above.
    if (m_options->getOption(vISA_hasDoubleAcc))
    {
        numAcc = 16;
    }
}
1069 
dump(std::ostream & os) const1070 void G4_Kernel::dump(std::ostream &os) const
1071 {
1072     fg.print(os);
1073 }
1074 
dumpToFile(const std::string & suffixIn)1075 void G4_Kernel::dumpToFile(const std::string &suffixIn)
1076 {
1077     bool dumpDot = m_options->getOption(vISA_DumpDot);
1078     bool dumpG4 =
1079         m_options->getOption(vISA_DumpPasses) ||
1080         m_options->getuInt32Option(vISA_DumpPassesSubset) >= 1;
1081     if (!dumpDot && !dumpG4)
1082         return;
1083 
1084     // calls to this will produce a sequence of dumps
1085     // [kernel-name].000.[suffix].{dot,g4}
1086     // [kernel-name].001.[suffix].{dot,g4}
1087     // ...
1088     // If vISA_DumpPassesSubset == 1 then we omit any files that don't change
1089     // the string representation of the kernel (i.e. skip passes that don't do anything).
1090     std::stringstream ss;
1091     ss << (name ? name : "UnknownKernel");
1092     ss << "." << std::setfill('0') << std::setw(3) << nextDumpIndex++ << "." << suffixIn;
1093     std::string baseName = sanitizePathString(ss.str());
1094 
1095     if (dumpDot)
1096         dumpDotFileInternal(baseName);
1097 
1098     if (dumpG4)
1099         dumpG4Internal(baseName);
1100 }
1101 
emitDeviceAsm(std::ostream & os,const void * binary,uint32_t binarySize)1102 void G4_Kernel::emitDeviceAsm(
1103     std::ostream& os, const void * binary, uint32_t binarySize)
1104 {
1105     //
1106     // for GTGPU lib release, don't dump out asm
1107     //
1108 #ifdef NDEBUG
1109 #ifdef GTGPU_LIB
1110     return;
1111 #endif
1112 #endif
1113     const bool newAsm =
1114         m_options->getOption(vISA_dumpNewSyntax) && !(binary == NULL || binarySize == 0);
1115 
1116     if (!m_options->getOption(vISA_StripComments)) {
1117         emitDeviceAsmHeaderComment(os);
1118     }
1119 
1120     // Set this to NULL to always print filename for each kernel
1121     g4_prevFilename = nullptr;
1122     g4_prevSrcLineNo = 0;
1123 
1124     if (!newAsm) {
1125         emitDeviceAsmInstructionsOldAsm(os);
1126         return;
1127     }
1128 
1129     emitDeviceAsmInstructionsIga(os, binary, binarySize);
1130 
1131     if (getPlatformGeneration(getGenxPlatform()) >= PlatformGen::XE) {
1132         os << "\n\n";
1133         os << "//.BankConflicts: " <<  fg.XeBCStats.BCNum << "\n";
1134         os << "//.BankConflicts.SameBank: " <<  fg.XeBCStats.sameBankConflicts << "\n";
1135         os << "//.BankConflicts.TwoSrc: " <<  fg.XeBCStats.twoSrcBC << "\n";
1136         int nativeSimdSize = 8;
1137         if (getGenxPlatform() >= GENX_PVC)
1138             nativeSimdSize = 16;
1139         os << "//.SIMD" << 2*nativeSimdSize << "ReadSuppressions: " <<  fg.XeBCStats.simd16ReadSuppression << "\n";
1140         os << "//.SIMD" << nativeSimdSize << "s: " <<  fg.XeBCStats.simd8 << "\n//\n";
1141         os << "//.RMWs: " << fg.numRMWs << "\n//\n";
1142     }
1143     else
1144     {
1145         os << "// Bank Conflict Statistics: \n";
1146         os << "// -- GOOD: " << fg.BCStats.NumOfGoodInsts << "\n";
1147         os << "// --  BAD: " << fg.BCStats.NumOfBadInsts << "\n";
1148         os << "// --   OK: " << fg.BCStats.NumOfOKInsts << "\n";
1149     }
1150 }
1151 
emitRegInfo()1152 void G4_Kernel::emitRegInfo()
1153 {
1154     const char* asmName = nullptr;
1155     getOptions()->getOption(VISA_AsmFileName, asmName);
1156     const char* asmNameEmpty = "";
1157     if (!asmName)
1158     {
1159         asmName = asmNameEmpty;
1160     }
1161 
1162     std::string dumpFileName = std::string(asmName) + ".reginfo";
1163     std::fstream ofile(dumpFileName, std::ios::out);
1164 
1165     emitRegInfoKernel(ofile);
1166 
1167     ofile.close();
1168 }
1169 
emitRegInfoKernel(std::ostream & output)1170 void G4_Kernel::emitRegInfoKernel(std::ostream& output)
1171 {
1172     output << "//.platform " << getGenxPlatformString(fg.builder->getPlatform());
1173     output << "\n" << "//.kernel ID 0x" << std::hex << getKernelID() << "\n";
1174     output << std::dec << "\n";
1175     int instOffset = 0;
1176 
1177     for (BB_LIST_ITER itBB = fg.begin(); itBB != fg.end(); ++itBB)
1178     {
1179         for (INST_LIST_ITER itInst = (*itBB)->begin(); itInst != (*itBB)->end(); ++itInst)
1180         {
1181             G4_INST* inst = (*itInst);
1182             if (inst->isLabel())
1183             {
1184                 continue;
1185             }
1186             if (inst->getLexicalId() == -1)
1187             {
1188                 continue;
1189             }
1190 
1191             (*itBB)->emitRegInfo(output, inst, instOffset);
1192             instOffset += inst->isCompactedInst() ? 8 : 16;
1193         }
1194     }
1195     return;
1196 }
1197 
1198 //
1199 // This routine dumps out the dot file of the control flow graph along with instructions.
1200 // dot is drawing graph tool from AT&T.
1201 //
dumpDotFileInternal(const std::string & baseName)1202 void G4_Kernel::dumpDotFileInternal(const std::string &baseName)
1203 {
1204     std::fstream ofile(baseName + ".dot", std::ios::out);
1205     assert(ofile);
1206     //
1207     // write digraph KernelName {"
1208     //          size = "8, 10";
1209     //
1210     const char* asmFileName = NULL;
1211     m_options->getOption(VISA_AsmFileName, asmFileName);
1212     if (asmFileName == NULL)
1213         ofile << "digraph UnknownKernel" << " {" << std::endl;
1214     else
1215         ofile << "digraph " << asmFileName << " {" << std::endl;
1216     //
1217     // keep the graph width 8, estimate a reasonable graph height
1218     //
1219     const unsigned itemPerPage = 64;                                        // 60 instructions per Letter page
1220     unsigned totalItem = (unsigned)Declares.size();
1221     for (std::list<G4_BB*>::iterator it = fg.begin(); it != fg.end(); ++it)
1222         totalItem += ((unsigned)(*it)->size());
1223     totalItem += (unsigned)fg.size();
1224     float graphHeight = (float)totalItem / itemPerPage;
1225     graphHeight = graphHeight < 100.0f ? 100.0f : graphHeight;    // minimal size: Letter
1226     ofile << "\n\t// Setup\n";
1227     ofile << "\tsize = \"80.0, " << graphHeight << "\";\n";
1228     ofile << "\tpage= \"80.5, 110\";\n";
1229     ofile << "\tpagedir=\"TL\";\n";
1230     //
1231     // dump out declare information
1232     //     Declare [label="
1233     //
1234     //if (name == NULL)
1235     //  ofile << "\tDeclares [shape=record, label=\"{kernel:UnknownKernel" << " | ";
1236     //else
1237     //  ofile << "\tDeclares [shape=record, label=\"{kernel:" << name << " | ";
1238     //for (std::list<G4_Declare*>::iterator it = Declares.begin(); it != Declares.end(); ++it)
1239     //{
1240     //  (*it)->emit(ofile, true, Options::symbolReg);   // Solve the DumpDot error on representing <>
1241     //
1242     //  ofile << "\\l";  // left adjusted
1243     //}
1244     //ofile << "}\"];" << std::endl;
1245     //
1246     // dump out flow graph
1247     //
1248     for (std::list<G4_BB*>::iterator it = fg.begin(); it != fg.end(); ++it)
1249     {
1250         G4_BB* bb = (*it);
1251         //
1252         // write:   BB0 [shape=plaintext, label=<
1253         //                      <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
1254         //                          <TR><TD ALIGN="CENTER">BB0: TestRA_Dot</TD></TR>
1255         //                          <TR><TD>
1256         //                              <TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0">
1257         //                                  <TR><TD ALIGN="LEFT">TestRA_Dot:</TD></TR>
1258         //                                  <TR><TD ALIGN="LEFT"><FONT color="red">add (8) Region(0,0)[1] Region(0,0)[8;8,1] PAYLOAD(0,0)[8;8,1] [NoMask]</FONT></TD></TR>
1259         //                              </TABLE>
1260         //                          </TD></TR>
1261         //                      </TABLE>>];
1262         // print out label if the first inst is a label inst
1263         //
1264         ofile << "\t";
1265         bb->writeBBId(ofile);
1266         ofile << " [shape=plaintext, label=<" << std::endl;
1267         ofile << "\t\t\t    <TABLE BORDER=\"0\" CELLBORDER=\"1\" CELLSPACING=\"0\">" << std::endl;
1268         ofile << "\t\t\t\t<TR><TD ALIGN=\"CENTER\">";
1269         bb->writeBBId(ofile);
1270         ofile << ": ";
1271 
1272         if (!bb->empty() && bb->front()->isLabel())
1273         {
1274             bb->front()->getSrc(0)->emit(ofile);
1275         }
1276         ofile << "</TD></TR>" << std::endl;
1277         //emit all instructions within basic block
1278         ofile << "\t\t\t\t<TR><TD>" << std::endl;
1279 
1280         if (!bb->empty())
1281         {
1282             ofile << "\t\t\t\t\t    <TABLE BORDER=\"0\" CELLBORDER=\"0\" CELLSPACING=\"0\">" << std::endl;
1283             for (INST_LIST_ITER i = bb->begin(); i != bb->end(); i++)
1284             {
1285                 //
1286                 // detect if there is spill code first, set different color for it
1287                 //
1288                 std::string fontColor = "black";
1289                 //
1290                 // emit the instruction
1291                 //
1292                 ofile << "\t\t\t\t\t\t<TR><TD ALIGN=\"LEFT\"><FONT color=\"" << fontColor << "\">";
1293                 std::ostringstream os;
1294                 (*i)->emit(os, m_options->getOption(vISA_SymbolReg), true);
1295                 std::string dotStr(os.str());
1296                 //TODO: dot doesn't like '<', '>', '{', or '}' (and '&') this code below is a hack. need to replace with delimiters.
1297                 //std::replace_if(dotStr.begin(), dotStr.end(), bind2nd(equal_to<char>(), '<'), '[');
1298                 std::replace_if(dotStr.begin(), dotStr.end(), std::bind(std::equal_to<char>(), std::placeholders::_1, '<'), '[');
1299                 std::replace_if(dotStr.begin(), dotStr.end(), std::bind(std::equal_to<char>(), std::placeholders::_1, '>'), ']');
1300                 std::replace_if(dotStr.begin(), dotStr.end(), std::bind(std::equal_to<char>(), std::placeholders::_1, '{'), '[');
1301                 std::replace_if(dotStr.begin(), dotStr.end(), std::bind(std::equal_to<char>(), std::placeholders::_1, '}'), ']');
1302                 std::replace_if(dotStr.begin(), dotStr.end(), std::bind(std::equal_to<char>(), std::placeholders::_1, '&'), '$');
1303                 ofile << dotStr;
1304 
1305                 ofile << "</FONT></TD></TR>" << std::endl;
1306                 //ofile << "\\l"; // left adjusted
1307             }
1308             ofile << "\t\t\t\t\t    </TABLE>" << std::endl;
1309         }
1310 
1311         ofile << "\t\t\t\t</TD></TR>" << std::endl;
1312         ofile << "\t\t\t    </TABLE>>];" << std::endl;
1313         //
1314         // dump out succ edges
1315         // BB12 -> BB10
1316         //
1317         for (std::list<G4_BB*>::iterator sit = bb->Succs.begin();
1318             sit != bb->Succs.end(); ++sit)
1319         {
1320             bb->writeBBId(ofile);
1321             ofile << " -> ";
1322             (*sit)->writeBBId(ofile);
1323             ofile << std::endl;
1324         }
1325     }
1326     //
1327     // write "}" to end digraph
1328     //
1329     ofile << std::endl << " }" << std::endl;
1330     //
1331     // close dot file
1332     //
1333     ofile.close();
1334 }
1335 
1336 // Dump the instructions into a .g4 file
dumpG4Internal(const std::string & file)1337 void G4_Kernel::dumpG4Internal(const std::string &file)
1338 {
1339     std::stringstream g4asm;
1340     dumpG4InternalTo(g4asm);
1341     std::string g4asms = g4asm.str();
1342     if (m_options->getuInt32Option(vISA_DumpPassesSubset) == 1 && g4asms == lastG4Asm) {
1343         return;
1344     }
1345     lastG4Asm = std::move(g4asms);
1346 
1347     std::fstream ofile(file + ".g4", std::ios::out);
1348     assert(ofile);
1349     dumpG4InternalTo(ofile);
1350 }
1351 
dumpG4InternalTo(std::ostream & os)1352 void G4_Kernel::dumpG4InternalTo(std::ostream &os)
1353 {
1354     const char* asmFileName = nullptr;
1355     m_options->getOption(VISA_AsmFileName, asmFileName);
1356     os << ".kernel " << name << "\n";
1357 
1358     for (const G4_Declare *d : Declares) {
1359         static const int MIN_DECL = 34; // skip the built-in decls
1360         if (d->getDeclId() > MIN_DECL) {
1361             // os << d->getDeclId() << "\n";
1362             d->emit(os);
1363         }
1364     }
1365 
1366     for (std::list<G4_BB*>::iterator it = fg.begin();
1367         it != fg.end(); ++it)
1368     {
1369         // Emit BB number
1370         G4_BB* bb = (*it);
1371         bb->writeBBId(os);
1372 
1373         // Emit BB type
1374         if (bb->getBBType())
1375         {
1376             os << " [" << bb->getBBTypeStr() << "] ";
1377         }
1378 
1379         os << "\tPreds: ";
1380         for (auto pred : bb->Preds)
1381         {
1382             pred->writeBBId(os);
1383             os << " ";
1384         }
1385         os << "\tSuccs: ";
1386         for (auto succ : bb->Succs)
1387         {
1388             succ->writeBBId(os);
1389             os << " ";
1390         }
1391         os << "\n";
1392 
1393         bb->emit(os);
1394         os << "\n\n";
1395     } // bbs
1396 }
1397 
emitDeviceAsmHeaderComment(std::ostream & os)1398 void G4_Kernel::emitDeviceAsmHeaderComment(std::ostream& os)
1399 {
1400     os << "//.kernel ";
1401     if (name != NULL)
1402     {
1403         // some 3D kernels do not have a name
1404         os << name;
1405     }
1406 
1407     os << "\n" << "//.platform " << getGenxPlatformString(getGenxPlatform());
1408     os << "\n" << "//.thread_config " << "numGRF=" << numRegTotal << ", numAcc=" << numAcc;
1409     if (fg.builder->hasSWSB())
1410     {
1411         os << ", numSWSB=" << numSWSBTokens;
1412     }
1413     os << "\n" << "//.options_string \"" << m_options->getUserArgString().str() << "\"";
1414     os << "\n" << "//.full_options \"" << m_options->getFullArgString() << "\"";
1415     os << "\n" << "//.instCount " << asmInstCount;
1416     static const char* const RATypeString[] {
1417         RA_TYPE(STRINGIFY)
1418     };
1419     os << "\n//.RA type\t" << RATypeString[RAType];
1420 
1421     if (auto jitInfo = fg.builder->getJitInfo())
1422     {
1423         if (jitInfo->numGRFUsed != 0)
1424         {
1425             os << "\n" << "//.GRF count " << jitInfo->numGRFUsed;
1426         }
1427         if (jitInfo->spillMemUsed > 0)
1428         {
1429             os << "\n" << "//.spill size " << jitInfo->spillMemUsed;
1430         }
1431         if (jitInfo->numGRFSpillFill > 0)
1432         {
1433             os << "\n" << "//.spill GRF est. ref count " << jitInfo->numGRFSpillFill;
1434         }
1435         if (jitInfo->numFlagSpillStore > 0)
1436         {
1437             os << "\n//.spill flag store " << jitInfo->numFlagSpillStore;
1438             os << "\n//.spill flag load " << jitInfo->numFlagSpillLoad;
1439         }
1440     }
1441 
1442     auto privateMemSize = getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
1443     if (privateMemSize != 0)
1444     {
1445         os << "\n//.private memory size " << privateMemSize;
1446     }
1447     os << "\n\n";
1448 
1449     //Step2: emit declares (as needed)
1450     //
1451     // firstly, emit RA declare as comments or code depends on Options::symbolReg
1452     // we check if the register allocation is successful here
1453     //
1454 
1455     for (auto dcl : Declares)
1456     {
1457         dcl->emit(os);
1458     }
1459     os << "\n";
1460 
1461     auto fmtHex = [](int i) {
1462         std::stringstream ss;
1463         ss << "0x" << std::hex << std::uppercase << i;
1464         return ss.str();
1465     };
1466 
1467     const unsigned inputCount = fg.builder->getInputCount();
1468     std::vector<std::string> argNames;
1469     size_t maxNameLen = 8;
1470     for (unsigned id = 0; id < inputCount; id++)
1471     {
1472         const input_info_t* ii = fg.builder->getInputArg(id);
1473         std::stringstream ss;
1474         if (ii->dcl && ii->dcl->getName()) {
1475             ss << ii->dcl->getName();
1476         } else {
1477             ss << "__unnamed" << (id + 1);
1478         }
1479         argNames.push_back(ss.str());
1480         maxNameLen = std::max(maxNameLen, argNames.back().size());
1481     }
1482 
1483     // emit input location and size
1484     os << "// .inputs\n";
1485     const size_t COLW_IDENT = maxNameLen;
1486     static const size_t COLW_TYPE = 8;
1487     static const size_t COLW_SIZE = 6;
1488     static const size_t COLW_AT = 8;
1489     static const size_t COLW_CLASS = 10;
1490 
1491     std::stringstream bordss;
1492     bordss << "// ";
1493     bordss << '+'; bordss << std::setfill('-') << std::setw(COLW_IDENT + 2) << "";
1494     bordss << '+'; bordss << std::setfill('-') << std::setw(COLW_TYPE + 2) << "";
1495     bordss << '+'; bordss << std::setfill('-') << std::setw(COLW_SIZE + 2) << "";
1496     bordss << '+'; bordss << std::setfill('-') << std::setw(COLW_AT + 2) << "";
1497     bordss << '+'; bordss << std::setfill('-') << std::setw(COLW_CLASS + 2) << "";
1498     bordss << '+' << "\n";
1499     std::string border = bordss.str();
1500 
1501     os << border;
1502     os <<
1503         "//" <<
1504         " | " << std::left << std::setw(COLW_IDENT) << "id" <<
1505         " | " << std::left << std::setw(COLW_TYPE) << "type" <<
1506         " | " << std::right << std::setw(COLW_SIZE) << "bytes" <<
1507         " | " << std::left << std::setw(COLW_AT) << "at" <<
1508         " | " << std::left << std::setw(COLW_CLASS) << "class" <<
1509         " |" << "\n";
1510     os << border;
1511 
1512     const unsigned grfSize = getGRFSize();
1513     for (unsigned id = 0; id < inputCount; id++)
1514     {
1515         const input_info_t* input_info = fg.builder->getInputArg(id);
1516         //
1517         os << "//";
1518         //
1519         // id
1520         os <<
1521             " | " << std::left << std::setw(COLW_IDENT) << argNames[id];
1522         //
1523         // type and length
1524         //   e.g. :uq x 16
1525         const G4_Declare *dcl = input_info->dcl;
1526         std::stringstream sstype;
1527         if (dcl) {
1528             switch (dcl->getElemType()) {
1529             case Type_B: sstype << ":b"; break;
1530             case Type_W: sstype << ":w"; break;
1531             case Type_D: sstype << ":d"; break;
1532             case Type_Q: sstype << ":q"; break;
1533             case Type_V: sstype << ":v"; break;
1534             case Type_UB: sstype << ":ub"; break;
1535             case Type_UW: sstype << ":uw"; break;
1536             case Type_UD: sstype << ":ud"; break;
1537             case Type_UQ: sstype << ":uq"; break;
1538             case Type_UV: sstype << ":uv"; break;
1539                 //
1540             case Type_F:  sstype << ":f"; break;
1541             case Type_HF: sstype << ":hf"; break;
1542             case Type_DF: sstype << ":df"; break;
1543             case Type_NF: sstype << ":nf"; break;
1544             case Type_BF: sstype << ":bf"; break;
1545             default:
1546                 sstype << fmtHex((int)dcl->getElemType()) << "?";
1547                 break;
1548             }
1549             if (dcl->getTotalElems() != 1)
1550                 sstype << " x " << dcl->getTotalElems();
1551         } else {
1552             sstype << "?";
1553         }
1554         os << " | " << std::left << std::setw(COLW_TYPE) << sstype.str();
1555         //
1556         // size
1557         os << " | " << std::right << std::setw(COLW_SIZE) << std::dec << input_info->size;
1558 
1559         // location
1560         unsigned reg = input_info->offset / grfSize,
1561             subRegBytes = input_info->offset % grfSize;
1562         std::stringstream ssloc;
1563         ssloc << "r" << reg;
1564         if (subRegBytes != 0)
1565             ssloc << "+" << subRegBytes;
1566         os << " | " << std::left << std::setw(COLW_AT) << ssloc.str();
1567 
1568         // class
1569         std::string inpcls;
1570         switch (input_info->getInputClass()) {
1571         case INPUT_GENERAL: inpcls = "general"; break;
1572         case INPUT_SAMPLER: inpcls = "sampler"; break;
1573         case INPUT_SURFACE: inpcls = "surface"; break;
1574         default: inpcls = fmtHex((int)input_info->getInputClass()); break;
1575         }
1576         os << " | " << std::left << std::setw(COLW_CLASS) << inpcls;
1577         //
1578         os << " |\n";
1579     }
1580     os << border << "\n";
1581 
1582     if (getPlatformGeneration(getGenxPlatform()) < PlatformGen::XE)
1583     {
1584         fg.BCStats.clear();
1585     }
1586     else
1587     {
1588         fg.XeBCStats.clear();
1589     }
1590     fg.numRMWs = 0;
1591 }
1592 
1593 
parseDecodeErrors(KernelView & kView,const char * errBuf,size_t errBufSize)1594 static std::map<int, std::string> parseDecodeErrors(
1595     KernelView &kView, const char *errBuf, size_t errBufSize)
1596 {
1597     // FIXME: IGA KernelView should be refactored to just return PC's
1598     // paired with diagnostic strings for each
1599     // (automatically allocate in IGA and cleanup when KV is deleted)
1600     bool dissasemblyFailed = !kView.decodeSucceeded();
1601     std::string igaErrMsgs;
1602     std::vector<std::string> igaErrMsgsVector;
1603     std::map<int, std::string> errorToStringMap;
1604     if (dissasemblyFailed)
1605     {
1606         std::cerr << "failed to decode binary for asm output";
1607         igaErrMsgs = errBuf;
1608         igaErrMsgsVector = split(igaErrMsgs, "\n");
1609         for (auto msg : igaErrMsgsVector)
1610         {
1611             auto pos = msg.find("ERROR");
1612             if (pos != std::string::npos)
1613             {
1614                 std::cerr << msg << "\n";
1615                 std::vector<std::string> aString = split(msg, " ");
1616                 for (auto token : aString)
1617                 {
1618                     if (token.find_first_of("0123456789") != std::string::npos)
1619                     {
1620                         int errorPC = std::atoi(token.c_str());
1621                         errorToStringMap[errorPC] = msg;
1622                         break;
1623                     }
1624                 }
1625             }
1626         }
1627     }
1628 
1629     return errorToStringMap;
1630 }
1631 
1632 using BlockOffsets = std::map<int32_t,std::vector<std::string>>;
1633 
precomputeBlockOffsets(std::ostream & os,G4_Kernel & g4k,const KernelView & kv)1634 static BlockOffsets precomputeBlockOffsets(
1635     std::ostream& os, G4_Kernel &g4k, const KernelView &kv)
1636 {
1637     // pre-compute the PCs of each basic block
1638     int32_t currPc = 0, lastInstSize = -1;
1639     std::map<int32_t,std::vector<std::string>> blockOffsets;
1640     for (BB_LIST_ITER itBB = g4k.fg.begin(); itBB != g4k.fg.end(); ++itBB) {
1641         for (INST_LIST_ITER itInst = (*itBB)->begin(); itInst != (*itBB)->end(); ++itInst) {
1642             if ((*itInst)->isLabel()) {
1643                 // G4 treats labels as special instructions
1644                 const char *lbl = (*itInst)->getLabelStr();
1645                 if (lbl && *lbl) {
1646                     blockOffsets[currPc].emplace_back(lbl);
1647                 }
1648             } else {
1649                 // we are looking at the next G4 instruction,
1650                 // but reached the end of the decode stream
1651                 if (lastInstSize == 0) {
1652                     os << "// ERROR: deducing G4 block PCs "
1653                         "(IGA decoded stream ends early); falling back to IGA labels\n";
1654                     blockOffsets.clear(); // fallback to IGA default labels
1655                     return blockOffsets;
1656                 }
1657                 lastInstSize = kv.getInstSize(currPc);
1658                 currPc += lastInstSize;
1659             }
1660         }
1661     }
1662     if (kv.getInstSize(currPc) != 0) {
1663         // we are looking at the next G4 instruction,
1664         // but reached the end of the decode stream
1665         os << "// ERROR: deducing G4 block PCs "
1666             "(G4_INST stream ends early); falling back to IGA labels\n";
1667         blockOffsets.clear(); // fallback to IGA default labels
1668     }
1669     return blockOffsets;
1670 }
1671 
1672 
1673 // needs further cleanup (confirm label prefixes are gone, newAsm == true)
emitDeviceAsmInstructionsIga(std::ostream & os,const void * binary,uint32_t binarySize)1674 void G4_Kernel::emitDeviceAsmInstructionsIga(
1675     std::ostream& os, const void * binary, uint32_t binarySize)
1676 {
1677     os << "\n";
1678 
1679     const size_t ERROR_STRING_MAX_LENGTH = 16 * 1024;
1680     char* errBuf = new char[ERROR_STRING_MAX_LENGTH];
1681     assert(errBuf);
1682     if (!errBuf)
1683         return;
1684     KernelView kv(
1685         getIGAPlatform(), binary, binarySize,
1686         GetIGASWSBEncodeMode(*fg.builder),
1687         errBuf, ERROR_STRING_MAX_LENGTH);
1688     const auto errorMap =
1689         parseDecodeErrors(kv, errBuf, ERROR_STRING_MAX_LENGTH);
1690     delete [] errBuf;
1691 
1692     const auto blockOffsets = precomputeBlockOffsets(os, *this, kv);
1693 
1694     //
1695     // Generate a label with uniqueLabel as prefix (required by some tools).
1696     // We do so by using labeler callback.  If uniqueLabels is not present, use iga's
1697     // default label.  For example,
1698     //   Without option -uniqueLabels:
1699     //      generating default label,   L1234
1700     //   With option -uniqueLabels <sth>:
1701     //      generating label with <sth> as prefix, <sth>_L1234
1702     //
1703     std::string labelPrefix;
1704     if (m_options->getOption(vISA_UniqueLabels))
1705     {
1706         const char* labelPrefixC = nullptr;
1707         m_options->getOption(vISA_LabelStr, labelPrefixC);
1708         labelPrefix = labelPrefixC;
1709         if (!labelPrefix.empty())
1710             labelPrefix += '_';
1711     }
1712 
1713     struct LabelerState {
1714         const KernelView *kv;
1715         const BlockOffsets &blockOffsets;
1716         const std::string labelPrefix;
1717         std::string labelStorage;
1718         LabelerState(
1719             const KernelView *_kv,
1720             const BlockOffsets &offs,
1721             const std::string &lblPfx)
1722             : kv(_kv), blockOffsets(offs), labelPrefix(lblPfx)
1723         {
1724         }
1725     };
1726     LabelerState ls(&kv, blockOffsets, labelPrefix);
1727 
1728     // storage for the IGA labeler
1729     auto labeler = [](int32_t pc, void *data) -> const char * {
1730         LabelerState &ls = *(LabelerState *)data;
1731         ls.labelStorage = ls.labelPrefix;
1732         auto itr = ls.blockOffsets.find(pc);
1733         if (itr == ls.blockOffsets.end()) {
1734             // let IGA choose the label name, but we still have to prefix
1735             // our user provided prefix
1736             char igaDefaultLabel[128];
1737             ls.kv->getDefaultLabelName(pc, igaDefaultLabel, sizeof(igaDefaultLabel));
1738             ls.labelStorage += igaDefaultLabel;
1739             return ls.labelStorage.c_str();
1740         }
1741         std::string g4Label = itr->second.front().c_str();
1742         ls.labelStorage += g4Label;
1743         return ls.labelStorage.c_str();
1744     };
1745 
1746 
1747     // initialize register suppression info
1748     int suppressRegs[5] = {};
1749     int lastRegs[3] = {};
1750     for (int i = 0; i < 3; i++)
1751     {
1752         suppressRegs[i] = -1;
1753         lastRegs[i] = -1;
1754     }
1755 
1756     ////////////////////////////////////////
1757     // emit the program text (instructions) iteratively
1758     // this is a little tricky because G4 treats labels as instructions
1759     // thus we need to do a little checking to keep the two streams in sync
1760     int32_t pc = 0;
1761     std::vector<char> igaStringBuffer;
1762     igaStringBuffer.resize(512); // TODO: expand default after testing
1763     for (BB_LIST_ITER itBB = fg.begin(); itBB != fg.end(); ++itBB) {
1764         os << "// "; (*itBB)->emitBbInfo(os); os << "\n";
1765         for (INST_LIST_ITER itInst = (*itBB)->begin();
1766             itInst != (*itBB)->end(); ++itInst)
1767         {
1768             G4_INST *i = (*itInst);
1769 
1770             // walk to next non-label in this block;
1771             // return true if we find one, else fails if at end of block
1772             auto findNextNonLabel = [&](bool print) {
1773                 while ((*itInst)->isLabel()) {
1774                     if (print)
1775                         os << "// " << (*itInst)->getLabelStr() << ":\n";
1776                     itInst++;
1777                     if (itInst == (*itBB)->end())
1778                         break;
1779                 }
1780                 if (itInst == (*itBB)->end())
1781                     return false;
1782                 i = (*itInst);
1783                 return true;
1784             };
1785 
1786             bool isInstTarget = kv.isInstTarget(pc);
1787             if (isInstTarget) {
1788                 auto itr = ls.blockOffsets.find(pc);
1789                 if (itr == ls.blockOffsets.end()) {
1790                     os << labeler(pc, &ls) << ":\n";
1791                 } else {
1792                     // there can be multiple labels per PC
1793                     for (const std::string &lbl : itr->second) {
1794                         os << ls.labelPrefix << lbl << ":\n";
1795                     }
1796                 }
1797                 if (!findNextNonLabel(false)) {
1798                     break; // at end of block
1799                 }
1800             } else if (i->isLabel()) {
1801                 // IGA doesn't consider this PC to be a label but G4 does
1802                 //
1803                 // move forward until we find the next non-label
1804                 if (!findNextNonLabel(true)) {
1805                     break; // at end of block
1806                 }
1807             }
1808 
1809             ///////////////////////////////////////////////////////////////////
1810             // we are looking at a non-label G4_INST at the next valid IGA PC
1811             // (same instruction)
1812             if (!getOptions()->getOption(vISA_disableInstDebugInfo)) {
1813                 (*itBB)->emitInstructionSourceLineMapping(os, itInst);
1814             }
1815 
1816             auto eitr = errorMap.find(pc);
1817             if (eitr != errorMap.end()) {
1818                 os << "// " << eitr->second << "\n";
1819                 os << "// text representation might not be correct";
1820             }
1821 
1822             static const uint32_t IGA_FMT_OPTS =
1823                 IGA_FORMATTING_OPT_PRINT_LDST
1824                 | IGA_FORMATTING_OPT_PRINT_BFNEXPRS;
1825             while (true) {
1826                 size_t nw = kv.getInstSyntax(
1827                     pc,
1828                     igaStringBuffer.data(), igaStringBuffer.size(),
1829                     IGA_FMT_OPTS,
1830                     labeler, &ls);
1831                 if (nw == 0) {
1832                     os << "<<error formatting instruction at PC " << pc << ">>\n";
1833                     break;
1834                 } else if (nw <= igaStringBuffer.size()) {
1835                     // print it (pad it out so comments line up on most instructions)
1836                     std::string line =igaStringBuffer.data();
1837                     while (line.size() < 100)
1838                         line += ' ';
1839                     os << line;
1840                     break;
1841                 } else {
1842                     igaStringBuffer.resize(igaStringBuffer.size() + 512);
1843                     // try again
1844                 }
1845             }
1846 
1847             (*itBB)->emitBasicInstructionComment(os, itInst, suppressRegs, lastRegs);
1848             os << "\n";
1849 
1850             pc += kv.getInstSize(pc);
1851         } // for insts in block
1852     } // for blocks
1853 } // emitDeviceAsmInstructionsIga
1854 
1855 
1856 // Should be removed once we can confirm no one uses it
1857 // the output comes from G4_INST::... and almost certainly won't be
1858 // parsable by IGA
emitDeviceAsmInstructionsOldAsm(std::ostream & os)1859 void G4_Kernel::emitDeviceAsmInstructionsOldAsm(std::ostream& os)
1860 {
1861     os << std::endl << ".code";
1862     for (BB_LIST_ITER it = fg.begin(); it != fg.end(); ++it)
1863     {
1864         os << "\n";
1865         (*it)->emit(os);
1866     }
1867     //Step4: emit clean-up.
1868     os << std::endl;
1869     os << ".end_code" << std::endl;
1870     os << ".end_kernel" << std::endl;
1871     os << std::endl;
1872 }
1873 
getNextBB(G4_BB * bb) const1874 G4_BB* G4_Kernel::getNextBB(G4_BB* bb) const
1875 {
1876     if (!bb)
1877         return nullptr;
1878 
1879     // Return the lexically following bb.
1880     G4_BB* nextBB = nullptr;
1881     for (auto it = fg.cbegin(), ie = fg.cend(); it != ie; it++)
1882     {
1883         auto curBB = (*it);
1884         if (curBB == bb)
1885         {
1886             if (it != ie)
1887             {
1888                 it++;
1889                 nextBB = (*it);
1890             }
1891             break;
1892         }
1893     }
1894 
1895     return nextBB;
1896 }
1897 
getBinOffsetOfBB(G4_BB * bb) const1898 unsigned G4_Kernel::getBinOffsetOfBB(G4_BB* bb) const {
1899     if (!bb)
1900         return 0;
1901 
1902     // Given a bb, return the binary offset of first non-label of instruction.
1903     auto it = std::find_if(bb->begin(), bb->end(), [](G4_INST* inst) { return !inst->isLabel(); });
1904     assert(it != bb->end() && "expect at least one non-label inst in second BB");
1905     return (unsigned)(*it)->getGenOffset();
1906 }
1907 
getPerThreadNextOff() const1908 unsigned G4_Kernel::getPerThreadNextOff() const
1909 {
1910     if (!hasPerThreadPayloadBB())
1911         return 0;
1912     G4_BB* next = getNextBB(perThreadPayloadBB);
1913     return getBinOffsetOfBB(next);
1914 }
1915 
getCrossThreadNextOff() const1916 unsigned G4_Kernel::getCrossThreadNextOff() const
1917 {
1918     if (!hasCrossThreadPayloadBB())
1919         return 0;
1920     G4_BB* next = getNextBB(crossThreadPayloadBB);
1921     return getBinOffsetOfBB(next);
1922 }
1923 
getComputeFFIDGPNextOff() const1924 unsigned G4_Kernel::getComputeFFIDGPNextOff() const
1925 {
1926     if (!hasComputeFFIDProlog())
1927         return 0;
1928     // return the offset of the second entry (GP1)
1929     // the first instruction in the second BB is the start of the second entry
1930     assert(fg.getNumBB() > 1 && "expect at least one prolog BB");
1931     assert(!computeFFIDGP1->empty() && !computeFFIDGP1->front()->isLabel());
1932     return getBinOffsetOfBB(computeFFIDGP1);
1933 }
1934 
getComputeFFIDGP1NextOff() const1935 unsigned G4_Kernel::getComputeFFIDGP1NextOff() const
1936 {
1937     if (!hasComputeFFIDProlog())
1938         return 0;
1939     // return the offset of the BB next to GP1
1940     // the first instruction in the second BB is the start of the second entry
1941     assert(fg.getNumBB() > 1 && "expect at least one prolog BB");
1942     G4_BB* next = getNextBB(computeFFIDGP1);
1943     return getBinOffsetOfBB(next);
1944 }
1945