1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "Compiler/CISACodeGen/EstimateFunctionSize.h"
10 #include "Compiler/CodeGenContextWrapper.hpp"
11 #include "Compiler/MetaDataUtilsWrapper.h"
12 #include "Compiler/CodeGenPublic.h"
13 #include "Compiler/IGCPassSupport.h"
14 #include "common/igc_regkeys.hpp"
15 #include "common/LLVMWarningsPush.hpp"
16 #include "llvm/IR/Module.h"
17 #include "llvm/IR/Function.h"
18 #include "llvm/IR/Instructions.h"
19 #include "llvm/Support/raw_ostream.h"
20 #include "common/LLVMWarningsPop.hpp"
21 #include "Probe/Assertion.h"
22 #include <deque>
23 #include <iostream>
24 
25 using namespace llvm;
26 using namespace IGC;
27 
// Pass identification token used by LLVM's pass infrastructure.
char EstimateFunctionSize::ID = 0;

IGC_INITIALIZE_PASS_BEGIN(EstimateFunctionSize, "EstimateFunctionSize", "EstimateFunctionSize", false, true)
IGC_INITIALIZE_PASS_END(EstimateFunctionSize, "EstimateFunctionSize", "EstimateFunctionSize", false, true)
32 
createEstimateFunctionSizePass()33 llvm::ModulePass* IGC::createEstimateFunctionSizePass() {
34     initializeEstimateFunctionSizePass(*PassRegistry::getPassRegistry());
35     return new EstimateFunctionSize;
36 }
37 
38 llvm::ModulePass*
createEstimateFunctionSizePass(EstimateFunctionSize::AnalysisLevel AL)39 IGC::createEstimateFunctionSizePass(EstimateFunctionSize::AnalysisLevel AL) {
40     initializeEstimateFunctionSizePass(*PassRegistry::getPassRegistry());
41     return new EstimateFunctionSize(AL);
42 }
43 
/// \brief Construct the pass for a given analysis level; the module pointer
/// is filled in later by runOnModule.
EstimateFunctionSize::EstimateFunctionSize(AnalysisLevel AL)
    : ModulePass(ID), M(nullptr), AL(AL), HasRecursion(false), EnableSubroutine(false) {}
46 
~EstimateFunctionSize()47 EstimateFunctionSize::~EstimateFunctionSize() { clear(); }
48 
// This is a pure analysis: it transforms nothing, so all other analyses
// remain valid.
void EstimateFunctionSize::getAnalysisUsage(AnalysisUsage& AU) const {
    AU.setPreservesAll();
}
52 
bool EstimateFunctionSize::runOnModule(Module& Mod) {
    clear();          // drop state from any previous run
    M = &Mod;
    analyze();        // build the expanded call graph and size estimates
    checkSubroutine();
    return false;     // analysis only; the module is never modified
}
60 
61 // Given a module, estimate the maximal function size with complete inlining.
62 /*
63    A ----> B ----> C ---> D ---> F
64     \       \       \
65      \       \       \---> E
66       \       \
67        \       \---> C ---> D --> F
68         \             \
69          \----> F      \---> E
70 */
71 // ExpandedSize(A) = size(A) + size(B) + 2 * size(C) + 2 * size(D)
72 //                   + 2 * size(E) + 3 * size(F)
73 //
74 // We compute the size as follows:
75 //
76 // (1) Initialize the data structure
77 //
78 // A --> {size(A), [B, F], [] }
79 // B --> {size(B), [C, C], [A] }
80 // C --> {size(C), [D, E], [B] }
81 // D --> {size(D), [F],    [C] }
82 // E --> {size(E), [],     [C] }
83 // F --> {size(F), [],     [A, D] }
84 //
85 // where the first list consists of functions to be expanded and the second list
86 // consists of its caller functions.
87 //
88 // (2) Expand all leaf nodes until all become leaf nodes. We update the data
89 // structure to the following after expanding functions E and F:
90 //
91 // A --> {size(A) + size(F), [B],    [] }
92 // B --> {size(B),           [C, C], [A] }
93 // C --> {size(C) + size(E), [D],    [B] }
94 // D --> {size(D) + size(F), [],     [C] }
95 // E --> {size(E),           [],     [C] }
96 // F --> {size(F),           [],     [A, D] }
97 //
98 namespace {
99 
100     /// Associate each function with a partially expanded size and remaining
101     /// unexpanded function list, etc.
102     struct FunctionNode {
FunctionNode__anonc8e5d44b0111::FunctionNode103         FunctionNode(Function* F, std::size_t Size)
104             : F(F), Size(Size), InitialSize(Size), Processed(0), KernelNum(0),
105               CallingSubroutine(false), HasImplicitArg(false) {}
106 
107         Function* F;
108 
109         /// \brief Partially expanded size, or completely expanded size for a
110         /// leaf node.
111         std::size_t Size;
112 
113         /// \brief Initial size before expansion.
114         std::size_t InitialSize;
115 
116         /// \brief A number to indicate whether this node has been processed in current traversal.
117         uint32_t Processed;
118 
119         /// \brief A number to indicate whether this node belongs to the current kernel.
120         uint32_t KernelNum;
121 
122         /// \brief A flag to indicate whether this node has a subroutine call before
123         /// expanding.
124         bool CallingSubroutine;
125 
126         /// \brief A flag to indicate whether this node should be always inlined.
127         bool ToBeInlined;
128 
129         bool HasImplicitArg;
130 
131         /// \brief All functions directly called in this function.
132         std::vector<Function*> CalleeList;
133 
134         /// \brief All functions that call this function F.
135         std::vector<Function*> CallerList;
136 
137         /// \brief A node becomes a leaf when all called functions are expanded.
isLeaf__anonc8e5d44b0111::FunctionNode138         bool isLeaf() const { return CalleeList.empty(); }
139 
140         /// \brief Add a caller or callee.
141         // A caller may call the same callee multiple times, e.g. A->{B,B,B}: A->CalleeList(B,B,B), B->CallerList(A,A,A)
addCallee__anonc8e5d44b0111::FunctionNode142         void addCallee(Function* G) {
143             IGC_ASSERT(!G->empty());
144             CalleeList.push_back(G);
145             CallingSubroutine = true;
146         }
addCaller__anonc8e5d44b0111::FunctionNode147         void addCaller(Function* G) {
148             IGC_ASSERT(!G->empty());
149             CallerList.push_back(G);
150         }
151 
152         /// \brief A single step to expand F: accumulate the size and remove it
153         /// from the callee list.
expand__anonc8e5d44b0111::FunctionNode154         void expand(FunctionNode* Node) {
155             if( IGC_IS_FLAG_ENABLED(ControlInlineImplicitArgs) && HasImplicitArg == false && Node->HasImplicitArg == true )
156             {
157                 HasImplicitArg = true;
158                 if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x40 ) != 0 )
159                 {
160                     std::cout << "Func " << this->F->getName().str() << " expands to hasimplicitarg due to " << Node->F->getName().str() << std::endl;
161                 }
162             }
163             // Multiple calls to a function is allowed as in the example above.
164             for (auto I = CalleeList.begin(); I != CalleeList.end(); /* empty */) {
165                 if (*I == Node->F) {
166                     Size += Node->Size;
167                     I = CalleeList.erase(I);
168                 }
169                 else
170                     ++I;
171             }
172         }
173 
174         /// \brief A single step to expand F: accumulate the size and DON't remove it
175         /// from the callee list.
expandSpecial__anonc8e5d44b0111::FunctionNode176         void expandSpecial( FunctionNode* Node ) {
177             Size += Node->Size;
178         }
179 
180 #if defined(_DEBUG)
181         void print(raw_ostream& os);
182 
dump__anonc8e5d44b0111::FunctionNode183         void dump() { print(llvm::errs()); }
184 #endif
185     };
186 
187 } // namespace
#if defined(_DEBUG)

// Debug dump: this node's size followed by its callee ("--->>>") and
// caller ("<<<---") edges.
void FunctionNode::print(raw_ostream& os) {
    os << "Function: " << F->getName() << ", " << Size << "\n";
    for (auto G : CalleeList)
        os << "--->>>" << G->getName() << "\n";
    for (auto G : CallerList)
        os << "<<<---" << G->getName() << "\n";
}
#endif
198 
clear()199 void EstimateFunctionSize::clear() {
200     M = nullptr;
201     for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I) {
202         auto Node = (FunctionNode*)I->second;
203         delete Node;
204     }
205     ECG.clear();
206 }
207 
matchImplicitArg(CallInst & CI)208 bool EstimateFunctionSize::matchImplicitArg( CallInst& CI )
209 {
210     bool matched = false;
211     StringRef funcName = CI.getCalledFunction()->getName();
212     if( funcName.equals( GET_LOCAL_ID_X ) ||
213         funcName.equals( GET_LOCAL_ID_Y ) ||
214         funcName.equals( GET_LOCAL_ID_Z ) )
215     {
216         matched = true;
217     }
218     else if( funcName.equals( GET_GROUP_ID ) )
219     {
220         matched = true;
221     }
222     else if( funcName.equals( GET_LOCAL_THREAD_ID ) )
223     {
224         matched = true;
225     }
226     else if( funcName.equals( GET_GLOBAL_OFFSET ) )
227     {
228         matched = true;
229     }
230     else if( funcName.equals( GET_GLOBAL_SIZE ) )
231     {
232         matched = true;
233     }
234     else if( funcName.equals( GET_LOCAL_SIZE ) )
235     {
236         matched = true;
237     }
238     else if( funcName.equals( GET_WORK_DIM ) )
239     {
240         matched = true;
241     }
242     else if( funcName.equals( GET_NUM_GROUPS ) )
243     {
244         matched = true;
245     }
246     else if( funcName.equals( GET_ENQUEUED_LOCAL_SIZE ) )
247     {
248         matched = true;
249     }
250     else if( funcName.equals( GET_STAGE_IN_GRID_ORIGIN ) )
251     {
252         matched = true;
253     }
254     else if( funcName.equals( GET_STAGE_IN_GRID_SIZE ) )
255     {
256         matched = true;
257     }
258     else if( funcName.equals( GET_SYNC_BUFFER ) )
259     {
260         matched = true;
261     }
262     if( matched && ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x40 ) != 0 )
263     {
264         std::cout << "Matched implicit arg " << funcName.str() << std::endl;
265     }
266     return matched;
267 }
268 
269 // visit Call inst to determine if implicit args are used by the caller
visitCallInst(CallInst & CI)270 void EstimateFunctionSize::visitCallInst( CallInst& CI )
271 {
272     if( !CI.getCalledFunction() )
273     {
274         return;
275     }
276     // Check for implicit arg function calls
277     bool matched = matchImplicitArg( CI );
278     tmpHasImplicitArg = matched;
279 }
280 
// Build the expanded call graph (ECG), mark implicit-arg users, expand leaf
// nodes bottom-up to compute fully-inlined sizes, and detect recursion.
void EstimateFunctionSize::analyze() {
    // Instruction count of a single function body, excluding callees.
    auto getSize = [](llvm::Function& F) -> std::size_t {
        std::size_t Size = 0;
        for (auto& BB : F.getBasicBlockList())
            Size += BB.size();
        return Size;
    };

    // Initial the data structure: one node per defined function.
    for (auto& F : M->getFunctionList()) {
        if (F.empty())
            continue;
        ECG[&F] = new FunctionNode(&F, getSize(F));
    }

    // Visit all call instructions and populate CG.
    for (auto& F : M->getFunctionList()) {
        if (F.empty())
            continue;

        FunctionNode* Node = get<FunctionNode>(&F);
        for (auto U : F.users()) {
            // Other users (like bitcast/store) are ignored.
            if (auto * CI = dyn_cast<CallInst>(U)) {
                // G calls F, or G --> F
                Function* G = CI->getParent()->getParent();
                FunctionNode* GN = get<FunctionNode>( G );
                GN->addCallee(&F);
                Node->addCaller(G);
            }
        }
    }
    // Check functions and mark those that use implicit args; expand() later
    // propagates the flag from callee to caller.
    if( IGC_IS_FLAG_ENABLED( ControlInlineImplicitArgs ) )
    {
        for( auto I = ECG.begin(), E = ECG.end(); I != E; ++I )
        {
            auto Node = (FunctionNode*)I->second;
            if( Node /* && Node->isLeaf() a non-leaf function may also call implicit args */ )
            {
                // tmpHasImplicitArg is set by visitCallInst during the visit.
                tmpHasImplicitArg = false;
                visit( Node->F );
                if( tmpHasImplicitArg )
                {
                    Node->HasImplicitArg = true;
                    if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x40 ) != 0 )
                    {
                        static int cnt = 0;
                        const char* Name;
                        if( Node->isLeaf() )
                            Name = "Leaf";
                        else
                            Name = "nonLeaf";
                        std::cout << Name << " Func " << ++cnt << " " << Node->F->getName().str() << " calls implicit args so HasImplicitArg" << std::endl;
                    }
                }
            }
        }
    }

    // Expand leaf nodes until all are expanded (the second list is empty).
    while (true) {
        // Find unexpanded leaf nodes.
        SmallVector<FunctionNode*, 8> LeafNodes;
        for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I) {
            auto Node = (FunctionNode*)I->second;
            if (!Node->Processed && Node->isLeaf())
                LeafNodes.push_back(Node);
        }

        // Done.
        if (LeafNodes.empty())
            break;

        // Expand leaf nodes one by one; expanding removes the callee edge,
        // so callers eventually become leaves themselves.
        for (auto Node : LeafNodes) {
            IGC_ASSERT(Node->CalleeList.empty());
            // Populate to its Callers.
            for (auto Caller : Node->CallerList)
                get<FunctionNode>(Caller)->expand(Node);
            Node->Processed = true;
        }
    }

    // Any node that never became a leaf sits on a call-graph cycle.
    HasRecursion = false;
    for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I) {
        FunctionNode* Node = (FunctionNode*)I->second;
        if (!Node->isLeaf()) {
            HasRecursion = true;
        }
    }
}
373 
374 /// \brief Return the estimated maximal function size after complete inlining.
getMaxExpandedSize() const375 std::size_t EstimateFunctionSize::getMaxExpandedSize() const {
376     std::size_t MaxSize = 0;
377     for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I) {
378         FunctionNode* Node = (FunctionNode*)I->second;
379         // Only functions with subroutine calls count.
380         if (Node->CallingSubroutine)
381             MaxSize = std::max(MaxSize, Node->Size);
382     }
383     return MaxSize;
384 }
385 
// Decide whether subroutine calls are kept (EnableSubroutine) and, when the
// fully-inlined size exceeds the threshold at module analysis level, run the
// kernel-trimming analysis.
void EstimateFunctionSize::checkSubroutine() {
    auto CGW = getAnalysisIfAvailable<CodeGenContextWrapper>();
    if (!CGW) return;

    EnableSubroutine = true;
    CodeGenContext* pContext = CGW->getCodeGenContext();
    // Subroutines are only considered for OpenCL and compute shaders.
    if (pContext->type != ShaderType::OPENCL_SHADER &&
        pContext->type != ShaderType::COMPUTE_SHADER)
        EnableSubroutine = false;

    if (EnableSubroutine)
    {
        std::size_t Threshold = IGC_GET_FLAG_VALUE(SubroutineThreshold);
        std::size_t MaxSize = getMaxExpandedSize();
        // Small enough (and no recursion): inline everything, no subroutines.
        if (MaxSize <= Threshold && !HasRecursion)
        {
            EnableSubroutine = false;
        }
        else if (MaxSize > Threshold && AL == AL_Module)
        {
            // If max threshold is exceeded, do analysis on kernel trimming
            if (IGC_IS_FLAG_ENABLED(ControlKernelTotalSize) &&
                IGC_IS_FLAG_DISABLED(DisableAddingAlwaysAttribute))
            {
                if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x1) != 0)
                {
                    std::cout << "Max size " << MaxSize << " is larger than the threshold (to trim) " << Threshold << std::endl;
                }
                reduceKernelSize();
            }
        }
    }
    // Recursion can never be fully inlined, so subroutines must stay enabled.
    IGC_ASSERT(!HasRecursion || EnableSubroutine);
}
420 
getExpandedSize(const Function * F) const421 std::size_t EstimateFunctionSize::getExpandedSize(const Function* F) const {
422     auto I = ECG.find((Function*)F);
423     if (I != ECG.end()) {
424         FunctionNode* Node = (FunctionNode*)I->second;
425         IGC_ASSERT(F == Node->F);
426         return Node->Size;
427     }
428 
429     // Unknown.
430     return std::numeric_limits<std::size_t>::max();
431 }
432 
// Return true if F is called from exactly one call site, or (OpenCL) called
// at most once by each kernel entry function.
bool EstimateFunctionSize::onlyCalledOnce(const Function* F) {
    auto I = ECG.find((Function*)F);
    if (I != ECG.end()) {
        FunctionNode* Node = (FunctionNode*)I->second;
        IGC_ASSERT(F == Node->F);
        // one call-site and not a recursion
        if (Node->CallerList.size() == 1 &&
            Node->CallerList.front() != F) {
            return true;
        }
        // OpenCL specific, called once by each kernel
        SmallPtrSet<Function*, 8> CallerSet;
        auto MdWrapper = getAnalysisIfAvailable<MetaDataUtilsWrapper>();
        if (MdWrapper) {
            auto pMdUtils = MdWrapper->getMetaDataUtils();
            for (auto Caller : Node->CallerList) {
                // A non-kernel caller disqualifies F.
                if (!isEntryFunc(pMdUtils, Caller)) {
                    return false;
                }
                // The same kernel calling twice disqualifies F.
                if (CallerSet.count(Caller)) {
                    return false;
                }
                CallerSet.insert(Caller);
            }
            return true;
        }
    }
    return false;
}
462 
// Heuristic filter: true if F is a worthwhile candidate for trimming
// (i.e. for NOT inlining) during kernel-size reduction.
bool EstimateFunctionSize::funcIsGoodtoTrim( llvm::Function* F)
{
    FunctionNode* func = get<FunctionNode>( F );
    if ( func->InitialSize < IGC_GET_FLAG_VALUE( ControlInlineTinySize ) )
        return false; /* tiny function */
    if ( func->F->hasFnAttribute( llvm::Attribute::AlwaysInline ) )
        return false; /* user specified alwaysInline */
    if ( isTrimmedFunction( F ) ) /* already trimmed by other kernels */
        return false;
    // Functions reading implicit args must stay inlined when the control
    // flag is on.
    if( IGC_IS_FLAG_ENABLED( ControlInlineImplicitArgs ) && func->HasImplicitArg )
    {
        if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x20 ) != 0 )
        {
            std::cout << "Func not inlined: " << func->F->getName().str() << " has implicit args " << std::endl;
        }
        return false;
    }
    // to allow trimming functions called from other kernels, set the regkey to false
    if( IGC_IS_FLAG_ENABLED( ForceInlineExternalFunctions ) )
    {
        for( auto C : func->CallerList )
        {
            FunctionNode* caller = get<FunctionNode>( C );
            if( caller->KernelNum != func->KernelNum )
                return false; // always inline if there is a caller outside this kernel
        }
    }
    return true;
}
492 
493 /*
494 For all F: F->ToBeInlined = True
495 For each kernel K
496      kernelTotalSize = findKernelTotalSize(K)  // O(C) >= O(N*logN)
497      IF (FullInlinedKernelSize > T)
498     workList= non-tiny-functions sorted by size from large to small // O(N*logN)
499     AdjustInterval = sizeof(worklist) / 20; cnt = 0;
500     WHILE (worklist not empty) // O(N)
501         remove F from worklist
502         F->ToBeInlined = False
503         if (++cnt == AdjustInterval) {  kernelTotalSize = findKernelTotalSize(K); cnt = 0; }  // exact kernelTotalSize
        else { kernelTotalSize -= (F->#callers - 1) * F->expdSize ; }  // approximate update
505         IF (kernelTotalSize <= T) break
506     ENDWHILE
507      Inline functions with ToBeInlined = True
508      Inline functions with single caller // done
509 */
// Kernel trimming: for each oversized kernel, walk its call tree and mark the
// largest non-tiny functions ToBeInlined=false until the kernel's estimated
// total size drops under KernelTotalSizeThreshold (see pseudo-code above).
void EstimateFunctionSize::reduceKernelSize() {
    auto MdWrapper = getAnalysisIfAvailable<MetaDataUtilsWrapper>();
    auto pMdUtils = MdWrapper->getMetaDataUtils();
    std::vector<FunctionNode*> kernels;

    uint32_t uniqKn = 0; // unique kernel number
    uint32_t uniqProc = 0; // unique processed number (Processed already updated in EstimateFunctionSize::analyze()
    // init unique kernel/processed numbers (UK/UP) and identify kernel functions
    for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I) {
        auto Node = (FunctionNode*)I->second;
        Node->KernelNum = uniqKn;
        Node->Processed = uniqProc;
        if (Node->F->hasFnAttribute(llvm::Attribute::NoInline)) { /* user specified noinline */
            Node->ToBeInlined = false;
        } else {
            Node->ToBeInlined = true;
        }
        // Only kernels whose fully-expanded size exceeds the threshold are
        // enqueued for trimming.
        if (isEntryFunc(pMdUtils, Node->F)) {
            if( Node->Size > IGC_GET_FLAG_VALUE( KernelTotalSizeThreshold ) )
            {
                kernels.push_back( Node);
                /* if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x1 ) != 0 )
                {
                    std::cout << "Enqueue kernel " << Node->F->getName().str() << " expanded size=" << Node->Size << std::endl;
                } */
            }
            else
            {
                /* if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x1 ) != 0 )
                {
                    std::cout << "Notenqueue kernel " << Node->F->getName().str() << " expanded size=" << Node->Size << std::endl;
                } */
            }
        }
    }

    // Visit all call instructions and populate call graph (callers and callees in function nodes).
    // The following are already done in EstimateFunctionSize::analyze(), before calling this function.
    // But Callee was destroyed in expand. Redo here, but don't call addCaller() which is still available.
    // Should only build CG once -- to be improved later.
    for (auto& F : M->getFunctionList()) {
        if (F.empty())
            continue;

        // FunctionNode* Node = get<FunctionNode>(&F);
        // need to verify: F.users(), getParent()->getParent(), etc
        for (auto U : F.users()) {
            // Other users (like bitcast/store) are ignored.
            if (auto* CI = dyn_cast<CallInst>(U)) {
                // G calls F, or G --> F
                Function* G = CI->getParent()->getParent();
                get<FunctionNode>(G)->addCallee(&F);
                // Node->addCaller(G);
            }
        }
    }

    // Trim the largest kernels first.
    std::sort( kernels.begin(), kernels.end(),
        [&]( const FunctionNode* LHS, const FunctionNode* RHS ) { return LHS->Size > RHS->Size; });

    // Iterate over kernels
    for (auto Kernel : kernels) {

        uniqKn++; // get a unique number for this kernel

        if ( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x1 ) != 0 ) {
            std::cout << "Trimming Kernel " << Kernel->F->getName().str() << " expSize= " << Kernel->Size << std::endl;
        }

        size_t KernelSize = findKernelTotalSize(Kernel->F, uniqKn, uniqProc );
        if ( (IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize) & 0x2) != 0) {
            std::cout << "findKernelTotalSize  " << KernelSize << std::endl;
        }

        // Earlier trimming (for a previous kernel) may already have brought
        // this kernel under the threshold.
        if (KernelSize <= IGC_GET_FLAG_VALUE(KernelTotalSizeThreshold)) {
            if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x2 ) != 0 )
            {
                std::cout << "Kernel " << Kernel->F->getName().str() << " ok size " << KernelSize << std::endl;
            }
            continue;
        }

        if ( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x2 ) != 0 ) {
            std::cout << "Kernel size is bigger than threshold " << std::endl;
            if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x10 ) != 0 )
            {
                continue; // dump collected kernels only
            }
        }

        std::vector<FunctionNode*> SortedKernelFunctions;

        std::deque<FunctionNode*> Queue;
        Queue.push_back(Kernel);

        uniqProc++;
        // top down traversal to find non-tiny-functions and sort them from large to small
        while (!Queue.empty()) {
            FunctionNode* Node = Queue.front();
            Node->Size = Node->InitialSize;
            Node->Processed = uniqProc;
            Node->KernelNum = uniqKn;
            if ( Node != Kernel && /* not kernel itself */
                 funcIsGoodtoTrim(Node->F)) /* and other criterias */
            {
               SortedKernelFunctions.push_back(Node);
            }
            //std::cout << "Node       " << Node->F->getName().str() << std::endl;
            Queue.pop_front();

            for (auto Callee : Node->CalleeList) {
                FunctionNode* CalleeNode = get<FunctionNode>(Callee);
                if( CalleeNode->Processed != uniqProc )  // Not processed yet
                {
                    Queue.push_back(CalleeNode);
                    CalleeNode->Processed = uniqProc; // prevent this callee from entering queue again
                }
            }
        }
        if( SortedKernelFunctions.empty() )
        {
            if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x4 ) != 0 )
            {
                std::cout << "Kernel " << Kernel->F->getName().str() << " size " << KernelSize << " has no sorted list " << std::endl;
            }
            continue; // all functions are tiny.
        }


        // Largest functions first: trimming them shrinks the kernel fastest.
        auto Cmp = [](const FunctionNode* LHS, const FunctionNode* RHS) {
            return LHS->Size > RHS->Size;
        };
        std::sort(SortedKernelFunctions.begin(), SortedKernelFunctions.end(), Cmp);

        if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x4 ) != 0 )
        {
            std::cout << "Kernel " << Kernel->F->getName().str() << " has " << SortedKernelFunctions.size() << " funcs to consider for trimming" << std::endl;
        }
        uint32_t AdjustInterval = 1; //  SortedKernelFunctions.size() / 20;
        uint32_t cnt = 0;

        while (KernelSize > IGC_GET_FLAG_VALUE(KernelTotalSizeThreshold) && !SortedKernelFunctions.empty() ) {
            FunctionNode* FunctionToRemove = SortedKernelFunctions.front();
            SortedKernelFunctions.erase(SortedKernelFunctions.begin());

            FunctionToRemove->ToBeInlined = false;
            // TrimmingCandidates[FunctionToRemove->F] = true;
            if ( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x4 ) != 0 ) {
                std::cout << "FunctionToRemove " << FunctionToRemove->F->getName().str() <<  " initSize "<< FunctionToRemove->InitialSize << " #callers " << FunctionToRemove->CallerList.size() << std::endl;
            }

            // Every AdjustInterval steps recompute the exact size; otherwise
            // apply a cheap estimated decrement.
            if( ++cnt == AdjustInterval )
            {
                cnt = 0;
                KernelSize = findKernelTotalSize(Kernel->F, uniqKn, uniqProc);
                if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x4 ) != 0 )
                {
                    std::cout << "Precise size after trimming " << FunctionToRemove->F->getName().str() << " is " << KernelSize << std::endl;
                }
            } else {
                KernelSize -= (FunctionToRemove->CallerList.size()- 1) * FunctionToRemove->Size; // #caller is underestimated
                if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x4 ) != 0 )
                {
                    std::cout << "Estimated trimming -=" << ( FunctionToRemove->CallerList.size() - 1 ) << " * " << FunctionToRemove->Size << std::endl;
                }
            }
            if ( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x4 ) != 0 ) {
                std::cout << "Kernel size is " << KernelSize << " after trimming " << FunctionToRemove->F->getName().str() << std::endl;
            }
        }
        if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x1 ) != 0 )
        {
            size_t ks = findKernelTotalSize(Kernel->F, uniqKn, uniqProc);
            std::cout << "Kernel " << Kernel->F->getName().str() << " final size " << KernelSize << " real size "<< ks << std::endl;
        }
    }
}
687 
// Compute the total size of Kernel with the current ToBeInlined decisions
// applied: top-down BFS to reset sizes and find leaves, then bottom-up
// expansion propagating sizes to callers. UK tags nodes as belonging to this
// kernel; UP is a fresh traversal counter (incremented per pass).
size_t EstimateFunctionSize::findKernelTotalSize(llvm::Function* Kernel, uint32_t UK, uint32_t &UP)
{
    FunctionNode* k = get<FunctionNode>( Kernel );
    std::deque<FunctionNode*> TopDownQueue;
    std::deque<FunctionNode*> LeafNodes;

    UP++;  // unique process number for traversals of call graph
    TopDownQueue.push_back(get<FunctionNode>(Kernel));
    get<FunctionNode>( Kernel )->Processed = UP;
    // top down traversal to find leafNodes.  Could be done once outside this routine and pass here
    while (!TopDownQueue.empty()) {
        FunctionNode* Node = TopDownQueue.front();
        TopDownQueue.pop_front();
        Node->Size = Node->InitialSize; // reset to pre-expansion size
        if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x8 ) != 0 )
        {
            std::cout << "tdqueue node " << Node->F->getName().str() << " size " << Node->Size << std::endl;
        }
        Node->Processed = UP; // processed
        Node->KernelNum = UK;

        for (auto Callee : Node->CalleeList) {
            FunctionNode* CalleeNode = get<FunctionNode>(Callee);
            if( CalleeNode->Processed != UP )  // Not processed yet
            {
                if( CalleeNode->isLeaf() ) {
                    CalleeNode->Size = CalleeNode->InitialSize;
                    LeafNodes.push_back( CalleeNode );
                    if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x8 ) != 0 )
                    {
                        std::cout << "found leaf node " << CalleeNode->F->getName().str() << " initSize=Size " << CalleeNode->Size << std::endl;
                    }
                }
                else
                {
                    TopDownQueue.push_back( CalleeNode );
                }
                CalleeNode->Processed = UP; // not to enter into queue again
            }
        }
    }
    IGC_ASSERT(!LeafNodes.empty());
    if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x8 ) != 0 )
    {
        std::cout << "Kernel " << Kernel->getName().str() << " initial size " << k->Size << std::endl;
    }
    // Expand leaf nodes until all are expanded (the second list is empty).
    UP++;
    while (!LeafNodes.empty()) {

        // Expand leaf nodes one by one.
        FunctionNode* Node = LeafNodes.front();
        LeafNodes.pop_front();
        if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x8 ) != 0 )
        {
            std::cout << "Visit leaf node " << Node->F->getName().str() << " size " << Node->Size << std::endl;
            if( Node->Processed == UP )
                std::cout << "  already processed " << std::endl;
        }

        if( Node->Processed == UP )
            continue; // already processed, e.g. the same caller might be enqueued multiple times -- CallerList duplication issue

        Node->Processed = UP; // processed

        if (Node->ToBeInlined) {
            // Populate size to its Callers (callee edges are kept intact).
            for (auto Caller : Node->CallerList) {
                get<FunctionNode>(Caller)->expandSpecial(Node);
                if( ( IGC_GET_FLAG_VALUE( PrintControlKernelTotalSize ) & 0x8 ) != 0 )
                {
                    std::cout << "Expand to caller " << Caller->getName().str() << " from callee " << Node->F->getName().str() << " new size " << get<FunctionNode>( Caller )->Size << std::endl;
                }
            }
        }
        else {
            // Trimmed function: counted once toward the kernel total instead
            // of once per call site.
            get<FunctionNode>(Kernel)->Size += Node->Size;
        }

        // A caller becomes a new "leaf" once all of its callees in this
        // kernel have been processed.
        for (auto C : Node->CallerList) {
            FunctionNode* caller = get<FunctionNode>(C);
            if ( caller == k || caller->KernelNum != UK)
                continue; // Skip kernel entry and callers outside this kernel
            if( caller->Processed == UP )  // Already processed
                continue;
            bool is_new_leaf = true;
            for (auto S : caller->CalleeList) {
                if ( get<FunctionNode>(S)->Processed != UP) {
                    is_new_leaf = false;
                    break;
                }
            }
            if (is_new_leaf) {
                LeafNodes.push_back(caller);
            }
        }
    }

    return k->Size;
}
788 
isTrimmedFunction(llvm::Function * F)789 bool EstimateFunctionSize::isTrimmedFunction( llvm::Function* F) {
790     return get<FunctionNode>(F)->ToBeInlined == false;
791 }
792