/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "Compiler/CISACodeGen/EstimateFunctionSize.h"
#include "Compiler/CodeGenContextWrapper.hpp"
#include "Compiler/MetaDataUtilsWrapper.h"
#include "Compiler/CodeGenPublic.h"
#include "Compiler/IGCPassSupport.h"
#include "common/igc_regkeys.hpp"
#include "common/LLVMWarningsPush.hpp"
#include "llvm/IR/Module.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/raw_ostream.h"
#include "common/LLVMWarningsPop.hpp"
#include "Probe/Assertion.h"
#include <deque>
#include <iostream>

using namespace llvm;
using namespace IGC;

char EstimateFunctionSize::ID = 0;

IGC_INITIALIZE_PASS_BEGIN(EstimateFunctionSize, "EstimateFunctionSize", "EstimateFunctionSize", false, true)
IGC_INITIALIZE_PASS_END(EstimateFunctionSize, "EstimateFunctionSize", "EstimateFunctionSize", false, true)

llvm::ModulePass* IGC::createEstimateFunctionSizePass() {
    initializeEstimateFunctionSizePass(*PassRegistry::getPassRegistry());
    return new EstimateFunctionSize;
}

llvm::ModulePass*
IGC::createEstimateFunctionSizePass(EstimateFunctionSize::AnalysisLevel AL) {
    initializeEstimateFunctionSizePass(*PassRegistry::getPassRegistry());
    return new EstimateFunctionSize(AL);
}
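
// Typical use, as a sketch only (the real pass-manager wiring lives elsewhere
// in IGC; "EFS" below is a hypothetical reference to this analysis):
//   legacy::PassManager PM;
//   PM.add(IGC::createEstimateFunctionSizePass(EstimateFunctionSize::AL_Module));
//   // ... after the pass has run, clients can query it, e.g.:
//   // if (EFS.getExpandedSize(F) > Threshold) { /* keep F as a subroutine */ }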

EstimateFunctionSize::EstimateFunctionSize(AnalysisLevel AL)
    : ModulePass(ID), M(nullptr), AL(AL), HasRecursion(false), EnableSubroutine(false) {}

EstimateFunctionSize::~EstimateFunctionSize() { clear(); }

void EstimateFunctionSize::getAnalysisUsage(AnalysisUsage& AU) const {
    AU.setPreservesAll();
}

bool EstimateFunctionSize::runOnModule(Module& Mod) {
    clear();
    M = &Mod;
    analyze();
    checkSubroutine();
    return false;
}

// Given a module, estimate the maximal function size with complete inlining.
/*
    A ----> B ----> C ---> D ---> F
     \       \       \
      \       \       \---> E
       \       \
        \       \---> C ---> D --> F
         \             \
          \----> F      \---> E
*/
// ExpandedSize(A) = size(A) + size(B) + 2 * size(C) + 2 * size(D)
//                 + 2 * size(E) + 3 * size(F)
//
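// For instance, if every function has size 1, then
// ExpandedSize(A) = 1 + 1 + 2 + 2 + 2 + 3 = 11.
//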
// We compute the size as follows:
//
// (1) Initialize the data structure
//
// A --> {size(A), [B, F], []     }
// B --> {size(B), [C, C], [A]    }
// C --> {size(C), [D, E], [B]    }
// D --> {size(D), [F],    [C]    }
// E --> {size(E), [],     [C]    }
// F --> {size(F), [],     [A, D] }
//
// where the first list consists of functions to be expanded and the second list
// consists of its caller functions.
//
// (2) Repeatedly expand leaf nodes into their callers until every node becomes
// a leaf. After expanding functions E and F, the data structure becomes:
//
// A --> {size(A) + size(F), [B],    []     }
// B --> {size(B),           [C, C], [A]    }
// C --> {size(C) + size(E), [D],    [B]    }
// D --> {size(D) + size(F), [],     [C]    }
// E --> {size(E),           [],     [C]    }
// F --> {size(F),           [],     [A, D] }
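//
// (3) Continuing the same way, expanding D, then C, and then B leaves
//
// A --> {size(A) + size(B) + 2*size(C) + 2*size(D) + 2*size(E) + 3*size(F), [], []}
//
// which is exactly ExpandedSize(A) above.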
//
namespace {

/// Associate each function with a partially expanded size and remaining
/// unexpanded function list, etc.
struct FunctionNode {
    FunctionNode(Function* F, std::size_t Size)
        : F(F), Size(Size), InitialSize(Size), Processed(0), KernelNum(0),
          CallingSubroutine(false), ToBeInlined(true), HasImplicitArg(false) {}

    Function* F;

    /// \brief Partially expanded size, or completely expanded size for a
    /// leaf node.
    std::size_t Size;

    /// \brief Initial size before expansion.
    std::size_t InitialSize;

    /// \brief A number indicating whether this node has been processed in the
    /// current traversal.
    uint32_t Processed;

    /// \brief A number indicating whether this node belongs to the current kernel.
    uint32_t KernelNum;

    /// \brief A flag to indicate whether this node has a subroutine call before
    /// expanding.
    bool CallingSubroutine;

    /// \brief A flag to indicate whether this node should always be inlined.
    bool ToBeInlined;

    bool HasImplicitArg;

    /// \brief All functions directly called in this function.
    std::vector<Function*> CalleeList;

    /// \brief All functions that call this function F.
    std::vector<Function*> CallerList;

    /// \brief A node becomes a leaf when all called functions are expanded.
    bool isLeaf() const { return CalleeList.empty(); }

    /// \brief Add a caller or callee.
    // A caller may call the same callee multiple times, e.g. A->{B,B,B}:
    // A->CalleeList(B,B,B), B->CallerList(A,A,A).
    void addCallee(Function* G) {
        IGC_ASSERT(!G->empty());
        CalleeList.push_back(G);
        CallingSubroutine = true;
    }
    void addCaller(Function* G) {
        IGC_ASSERT(!G->empty());
        CallerList.push_back(G);
    }
    /// \brief A single expansion step for Node: accumulate its size once per
    /// call site and remove it from the callee list.
    void expand(FunctionNode* Node) {
        if (IGC_IS_FLAG_ENABLED(ControlInlineImplicitArgs) && !HasImplicitArg && Node->HasImplicitArg)
        {
            HasImplicitArg = true;
            if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x40) != 0)
            {
                std::cout << "Func " << this->F->getName().str() << " expands to hasimplicitarg due to " << Node->F->getName().str() << std::endl;
            }
        }
        // Multiple calls to a function are allowed, as in the example above.
        for (auto I = CalleeList.begin(); I != CalleeList.end(); /* empty */) {
            if (*I == Node->F) {
                Size += Node->Size;
                I = CalleeList.erase(I);
            }
            else
                ++I;
        }
    }

    /// \brief A single expansion step for Node that accumulates the size but
    /// does NOT remove it from the callee list.
    void expandSpecial(FunctionNode* Node) {
        Size += Node->Size;
    }
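    // Unlike expand(), this keeps CalleeList intact, so the call graph can be
    // re-walked once per kernel by findKernelTotalSize().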

#if defined(_DEBUG)
    void print(raw_ostream& os);

    void dump() { print(llvm::errs()); }
#endif
};

} // namespace
#if defined(_DEBUG)

void FunctionNode::print(raw_ostream& os) {
    os << "Function: " << F->getName() << ", " << Size << "\n";
    for (auto G : CalleeList)
        os << "--->>>" << G->getName() << "\n";
    for (auto G : CallerList)
        os << "<<<---" << G->getName() << "\n";
}
#endif

void EstimateFunctionSize::clear() {
    M = nullptr;
    for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I) {
        auto Node = (FunctionNode*)I->second;
        delete Node;
    }
    ECG.clear();
}

bool EstimateFunctionSize::matchImplicitArg(CallInst& CI)
{
    StringRef funcName = CI.getCalledFunction()->getName();
    bool matched =
        funcName.equals(GET_LOCAL_ID_X) ||
        funcName.equals(GET_LOCAL_ID_Y) ||
        funcName.equals(GET_LOCAL_ID_Z) ||
        funcName.equals(GET_GROUP_ID) ||
        funcName.equals(GET_LOCAL_THREAD_ID) ||
        funcName.equals(GET_GLOBAL_OFFSET) ||
        funcName.equals(GET_GLOBAL_SIZE) ||
        funcName.equals(GET_LOCAL_SIZE) ||
        funcName.equals(GET_WORK_DIM) ||
        funcName.equals(GET_NUM_GROUPS) ||
        funcName.equals(GET_ENQUEUED_LOCAL_SIZE) ||
        funcName.equals(GET_STAGE_IN_GRID_ORIGIN) ||
        funcName.equals(GET_STAGE_IN_GRID_SIZE) ||
        funcName.equals(GET_SYNC_BUFFER);
    if (matched && (IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x40) != 0)
    {
        std::cout << "Matched implicit arg " << funcName.str() << std::endl;
    }
    return matched;
}

// Visit a call instruction to determine whether implicit args are used by the caller.
void EstimateFunctionSize::visitCallInst(CallInst& CI)
{
    if (!CI.getCalledFunction())
    {
        return;
    }
    // Check for an implicit arg function call. Once any call in the function
    // matches, the flag must stay set, so do not overwrite it with false.
    if (matchImplicitArg(CI))
    {
        tmpHasImplicitArg = true;
    }
}

void EstimateFunctionSize::analyze() {
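    // The size metric is the total number of LLVM IR instructions in the function.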
    auto getSize = [](llvm::Function& F) -> std::size_t {
        std::size_t Size = 0;
        for (auto& BB : F.getBasicBlockList())
            Size += BB.size();
        return Size;
    };

    // Initialize the data structure.
    for (auto& F : M->getFunctionList()) {
        if (F.empty())
            continue;
        ECG[&F] = new FunctionNode(&F, getSize(F));
    }

    // Visit all call instructions and populate the call graph.
    for (auto& F : M->getFunctionList()) {
        if (F.empty())
            continue;

        FunctionNode* Node = get<FunctionNode>(&F);
        for (auto U : F.users()) {
            // Other users (like bitcast/store) are ignored.
            if (auto* CI = dyn_cast<CallInst>(U)) {
                // G calls F, or G --> F
                Function* G = CI->getParent()->getParent();
                FunctionNode* GN = get<FunctionNode>(G);
                GN->addCallee(&F);
                Node->addCaller(G);
            }
        }
    }
    // Check functions and mark those that use implicit args.
    if (IGC_IS_FLAG_ENABLED(ControlInlineImplicitArgs))
    {
        for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I)
        {
            auto Node = (FunctionNode*)I->second;
            if (Node /* && Node->isLeaf() a non-leaf function may also call implicit args */)
            {
                tmpHasImplicitArg = false;
                visit(Node->F);
                if (tmpHasImplicitArg)
                {
                    Node->HasImplicitArg = true;
                    if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x40) != 0)
                    {
                        static int cnt = 0;
                        const char* Name;
                        if (Node->isLeaf())
                            Name = "Leaf";
                        else
                            Name = "nonLeaf";
                        std::cout << Name << " Func " << ++cnt << " " << Node->F->getName().str() << " calls implicit args so HasImplicitArg" << std::endl;
                    }
                }
            }
        }
    }

    // Expand leaf nodes until all are expanded (i.e. every callee list is empty).
    while (true) {
        // Find unexpanded leaf nodes.
        SmallVector<FunctionNode*, 8> LeafNodes;
        for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I) {
            auto Node = (FunctionNode*)I->second;
            if (!Node->Processed && Node->isLeaf())
                LeafNodes.push_back(Node);
        }

        // Done.
        if (LeafNodes.empty())
            break;

        // Expand leaf nodes one by one.
        for (auto Node : LeafNodes) {
            IGC_ASSERT(Node->CalleeList.empty());
            // Populate the size to its callers.
            for (auto Caller : Node->CallerList)
                get<FunctionNode>(Caller)->expand(Node);
            Node->Processed = true;
        }
    }

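    // Any node left with unexpanded callees after the loop above lies on a
    // call-graph cycle, i.e. the module contains recursion.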
    HasRecursion = false;
    for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I) {
        FunctionNode* Node = (FunctionNode*)I->second;
        if (!Node->isLeaf()) {
            HasRecursion = true;
        }
    }
}

/// \brief Return the estimated maximal function size after complete inlining.
std::size_t EstimateFunctionSize::getMaxExpandedSize() const {
    std::size_t MaxSize = 0;
    for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I) {
        FunctionNode* Node = (FunctionNode*)I->second;
        // Only functions with subroutine calls count.
        if (Node->CallingSubroutine)
            MaxSize = std::max(MaxSize, Node->Size);
    }
    return MaxSize;
}

void EstimateFunctionSize::checkSubroutine() {
    auto CGW = getAnalysisIfAvailable<CodeGenContextWrapper>();
    if (!CGW) return;

    EnableSubroutine = true;
    CodeGenContext* pContext = CGW->getCodeGenContext();
    if (pContext->type != ShaderType::OPENCL_SHADER &&
        pContext->type != ShaderType::COMPUTE_SHADER)
        EnableSubroutine = false;

    if (EnableSubroutine)
    {
        std::size_t Threshold = IGC_GET_FLAG_VALUE(SubroutineThreshold);
        std::size_t MaxSize = getMaxExpandedSize();
        if (MaxSize <= Threshold && !HasRecursion)
        {
            EnableSubroutine = false;
        }
        else if (MaxSize > Threshold && AL == AL_Module)
        {
            // If the max threshold is exceeded, analyze kernels for trimming.
            if (IGC_IS_FLAG_ENABLED(ControlKernelTotalSize) &&
                IGC_IS_FLAG_DISABLED(DisableAddingAlwaysAttribute))
            {
                if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x1) != 0)
                {
                    std::cout << "Max size " << MaxSize << " is larger than the threshold (to trim) " << Threshold << std::endl;
                }
                reduceKernelSize();
            }
        }
    }
    IGC_ASSERT(!HasRecursion || EnableSubroutine);
}

std::size_t EstimateFunctionSize::getExpandedSize(const Function* F) const {
    auto I = ECG.find((Function*)F);
    if (I != ECG.end()) {
        FunctionNode* Node = (FunctionNode*)I->second;
        IGC_ASSERT(F == Node->F);
        return Node->Size;
    }

    // Unknown.
    return std::numeric_limits<std::size_t>::max();
}

bool EstimateFunctionSize::onlyCalledOnce(const Function* F) {
    auto I = ECG.find((Function*)F);
    if (I != ECG.end()) {
        FunctionNode* Node = (FunctionNode*)I->second;
        IGC_ASSERT(F == Node->F);
        // A single call site that is not a recursive self-call.
        if (Node->CallerList.size() == 1 &&
            Node->CallerList.front() != F) {
            return true;
        }
        // OpenCL specific: called once by each kernel.
        SmallPtrSet<Function*, 8> CallerSet;
        auto MdWrapper = getAnalysisIfAvailable<MetaDataUtilsWrapper>();
        if (MdWrapper) {
            auto pMdUtils = MdWrapper->getMetaDataUtils();
            for (auto Caller : Node->CallerList) {
                if (!isEntryFunc(pMdUtils, Caller)) {
                    return false;
                }
                if (CallerSet.count(Caller)) {
                    return false;
                }
                CallerSet.insert(Caller);
            }
            return true;
        }
    }
    return false;
}

bool EstimateFunctionSize::funcIsGoodtoTrim(llvm::Function* F)
{
    FunctionNode* func = get<FunctionNode>(F);
    if (func->InitialSize < IGC_GET_FLAG_VALUE(ControlInlineTinySize))
        return false; /* tiny function */
    if (func->F->hasFnAttribute(llvm::Attribute::AlwaysInline))
        return false; /* user-specified alwaysinline */
    if (isTrimmedFunction(F)) /* already trimmed by other kernels */
        return false;
    if (IGC_IS_FLAG_ENABLED(ControlInlineImplicitArgs) && func->HasImplicitArg)
    {
        if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x20) != 0)
        {
            std::cout << "Func not trimmed: " << func->F->getName().str() << " has implicit args" << std::endl;
        }
        return false;
    }
    // To allow trimming functions called from other kernels, set the regkey to false.
    if (IGC_IS_FLAG_ENABLED(ForceInlineExternalFunctions))
    {
        for (auto C : func->CallerList)
        {
            FunctionNode* caller = get<FunctionNode>(C);
            if (caller->KernelNum != func->KernelNum)
                return false; // always inline if there is a caller outside this kernel
        }
    }
    return true;
}

/*
   For all F: F->ToBeInlined = True
   For each kernel K
       kernelTotalSize = findKernelTotalSize(K)     // O(C) >= O(N*logN)
       IF (kernelTotalSize > T)
           workList = non-tiny functions sorted by size, from large to small   // O(N*logN)
           AdjustInterval = sizeof(worklist) / 20;  cnt = 0;
           WHILE (worklist not empty)               // O(N)
               remove F from worklist
               F->ToBeInlined = False
               if (++cnt == AdjustInterval) { kernelTotalSize = findKernelTotalSize(K); cnt = 0; }  // exact kernelTotalSize
               else { kernelTotalSize -= (F->#callers - 1) * F->expdSize; }
               IF (kernelTotalSize <= T) break
           ENDWHILE
   Inline functions with ToBeInlined = True
   Inline functions with a single caller   // done
*/
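// Rationale for the estimate: when F stops being inlined, its #callers inlined
// copies collapse into a single subroutine body, saving roughly
// (#callers - 1) * expandedSize(F) from the kernel total.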
void EstimateFunctionSize::reduceKernelSize() {
    auto MdWrapper = getAnalysisIfAvailable<MetaDataUtilsWrapper>();
    auto pMdUtils = MdWrapper->getMetaDataUtils();
    std::vector<FunctionNode*> kernels;

    uint32_t uniqKn = 0;   // unique kernel number
    uint32_t uniqProc = 0; // unique processed number (Processed was already updated in EstimateFunctionSize::analyze())
    // Init unique kernel/processed numbers (UK/UP) and identify kernel functions.
    for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I) {
        auto Node = (FunctionNode*)I->second;
        Node->KernelNum = uniqKn;
        Node->Processed = uniqProc;
        if (Node->F->hasFnAttribute(llvm::Attribute::NoInline)) { /* user-specified noinline */
            Node->ToBeInlined = false;
        } else {
            Node->ToBeInlined = true;
        }
        if (isEntryFunc(pMdUtils, Node->F)) {
            if (Node->Size > IGC_GET_FLAG_VALUE(KernelTotalSizeThreshold))
            {
                kernels.push_back(Node);
                /* if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x1) != 0)
                {
                    std::cout << "Enqueue kernel " << Node->F->getName().str() << " expanded size=" << Node->Size << std::endl;
                } */
            }
            else
            {
                /* if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x1) != 0)
                {
                    std::cout << "Notenqueue kernel " << Node->F->getName().str() << " expanded size=" << Node->Size << std::endl;
                } */
            }
        }
    }

    // Visit all call instructions and rebuild the callee lists of the call graph.
    // This was already done in EstimateFunctionSize::analyze(), but the callee
    // lists were consumed by expand(). Rebuild them here; do NOT call addCaller(),
    // since the caller lists are still intact.
    // Ideally the call graph should be built only once -- to be improved later.
    for (auto& F : M->getFunctionList()) {
        if (F.empty())
            continue;

        // FunctionNode* Node = get<FunctionNode>(&F);
        // need to verify: F.users(), getParent()->getParent(), etc.
        for (auto U : F.users()) {
            // Other users (like bitcast/store) are ignored.
            if (auto* CI = dyn_cast<CallInst>(U)) {
                // G calls F, or G --> F
                Function* G = CI->getParent()->getParent();
                get<FunctionNode>(G)->addCallee(&F);
                // Node->addCaller(G);
            }
        }
    }

    std::sort(kernels.begin(), kernels.end(),
        [&](const FunctionNode* LHS, const FunctionNode* RHS) { return LHS->Size > RHS->Size; });

    // Iterate over kernels, largest first.
    for (auto Kernel : kernels) {

        uniqKn++; // get a unique number for this kernel

        if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x1) != 0) {
            std::cout << "Trimming Kernel " << Kernel->F->getName().str() << " expSize= " << Kernel->Size << std::endl;
        }

        size_t KernelSize = findKernelTotalSize(Kernel->F, uniqKn, uniqProc);
        if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x2) != 0) {
            std::cout << "findKernelTotalSize " << KernelSize << std::endl;
        }

        if (KernelSize <= IGC_GET_FLAG_VALUE(KernelTotalSizeThreshold)) {
            if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x2) != 0)
            {
                std::cout << "Kernel " << Kernel->F->getName().str() << " ok size " << KernelSize << std::endl;
            }
            continue;
        }

        if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x2) != 0) {
            std::cout << "Kernel size is bigger than threshold " << std::endl;
            if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x10) != 0)
            {
                continue; // dump collected kernels only
            }
        }

        std::vector<FunctionNode*> SortedKernelFunctions;

        std::deque<FunctionNode*> Queue;
        Queue.push_back(Kernel);

        uniqProc++;
        // Top-down traversal to find non-tiny functions, then sort them from large to small.
        while (!Queue.empty()) {
            FunctionNode* Node = Queue.front();
            Node->Size = Node->InitialSize;
            Node->Processed = uniqProc;
            Node->KernelNum = uniqKn;
            if (Node != Kernel &&          /* not the kernel itself */
                funcIsGoodtoTrim(Node->F)) /* and meets the other criteria */
            {
                SortedKernelFunctions.push_back(Node);
            }
            // std::cout << "Node " << Node->F->getName().str() << std::endl;
            Queue.pop_front();

            for (auto Callee : Node->CalleeList) {
                FunctionNode* CalleeNode = get<FunctionNode>(Callee);
                if (CalleeNode->Processed != uniqProc) // not processed yet
                {
                    Queue.push_back(CalleeNode);
                    CalleeNode->Processed = uniqProc; // prevent this callee from entering the queue again
                }
            }
        }
        if (SortedKernelFunctions.empty())
        {
            if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x4) != 0)
            {
                std::cout << "Kernel " << Kernel->F->getName().str() << " size " << KernelSize << " has no sorted list " << std::endl;
            }
            continue; // all functions are tiny
        }

        auto Cmp = [](const FunctionNode* LHS, const FunctionNode* RHS) {
            return LHS->Size > RHS->Size;
        };
        std::sort(SortedKernelFunctions.begin(), SortedKernelFunctions.end(), Cmp);

        if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x4) != 0)
        {
            std::cout << "Kernel " << Kernel->F->getName().str() << " has " << SortedKernelFunctions.size() << " funcs to consider for trimming" << std::endl;
        }
        uint32_t AdjustInterval = 1; // SortedKernelFunctions.size() / 20;
        uint32_t cnt = 0;

        while (KernelSize > IGC_GET_FLAG_VALUE(KernelTotalSizeThreshold) && !SortedKernelFunctions.empty()) {
            FunctionNode* FunctionToRemove = SortedKernelFunctions.front();
            SortedKernelFunctions.erase(SortedKernelFunctions.begin());

            FunctionToRemove->ToBeInlined = false;
            // TrimmingCandidates[FunctionToRemove->F] = true;
            if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x4) != 0) {
                std::cout << "FunctionToRemove " << FunctionToRemove->F->getName().str() << " initSize " << FunctionToRemove->InitialSize << " #callers " << FunctionToRemove->CallerList.size() << std::endl;
            }

            if (++cnt == AdjustInterval)
            {
                cnt = 0;
                KernelSize = findKernelTotalSize(Kernel->F, uniqKn, uniqProc);
                if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x4) != 0)
                {
                    std::cout << "Precise size after trimming " << FunctionToRemove->F->getName().str() << " is " << KernelSize << std::endl;
                }
            } else {
                KernelSize -= (FunctionToRemove->CallerList.size() - 1) * FunctionToRemove->Size; // #callers may be underestimated
                if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x4) != 0)
                {
                    std::cout << "Estimated trimming -=" << (FunctionToRemove->CallerList.size() - 1) << " * " << FunctionToRemove->Size << std::endl;
                }
            }
            if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x4) != 0) {
                std::cout << "Kernel size is " << KernelSize << " after trimming " << FunctionToRemove->F->getName().str() << std::endl;
            }
        }
        if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x1) != 0)
        {
            size_t ks = findKernelTotalSize(Kernel->F, uniqKn, uniqProc);
            std::cout << "Kernel " << Kernel->F->getName().str() << " final size " << KernelSize << " real size " << ks << std::endl;
        }
    }
}
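
// findKernelTotalSize(): two phases over the kernel's sub-call-graph.
// Phase 1: a top-down BFS from the kernel resets every reachable node's Size
// to its InitialSize and collects the leaf nodes. Phase 2: bottom-up, leaves
// propagate their sizes into callers (honoring ToBeInlined); a caller becomes
// a new leaf once all of its callees have been processed.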

size_t EstimateFunctionSize::findKernelTotalSize(llvm::Function* Kernel, uint32_t UK, uint32_t& UP)
{
    FunctionNode* k = get<FunctionNode>(Kernel);
    std::deque<FunctionNode*> TopDownQueue;
    std::deque<FunctionNode*> LeafNodes;

    UP++; // unique process number for traversals of the call graph
    TopDownQueue.push_back(get<FunctionNode>(Kernel));
    get<FunctionNode>(Kernel)->Processed = UP;
    // Top-down traversal to find leaf nodes. This could be done once outside
    // this routine and passed in here.
    while (!TopDownQueue.empty()) {
        FunctionNode* Node = TopDownQueue.front();
        TopDownQueue.pop_front();
        Node->Size = Node->InitialSize;
        if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x8) != 0)
        {
            std::cout << "tdqueue node " << Node->F->getName().str() << " size " << Node->Size << std::endl;
        }
        Node->Processed = UP; // processed
        Node->KernelNum = UK;

        for (auto Callee : Node->CalleeList) {
            FunctionNode* CalleeNode = get<FunctionNode>(Callee);
            if (CalleeNode->Processed != UP) // not processed yet
            {
                if (CalleeNode->isLeaf()) {
                    CalleeNode->Size = CalleeNode->InitialSize;
                    LeafNodes.push_back(CalleeNode);
                    if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x8) != 0)
                    {
                        std::cout << "found leaf node " << CalleeNode->F->getName().str() << " initSize=Size " << CalleeNode->Size << std::endl;
                    }
                }
                else
                {
                    TopDownQueue.push_back(CalleeNode);
                }
                CalleeNode->Processed = UP; // so it is not enqueued again
            }
        }
    }
    IGC_ASSERT(!LeafNodes.empty());
    if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x8) != 0)
    {
        std::cout << "Kernel " << Kernel->getName().str() << " initial size " << k->Size << std::endl;
    }
    // Expand leaf nodes until all reachable nodes are processed.
    UP++;
    while (!LeafNodes.empty()) {

        // Expand leaf nodes one by one.
        FunctionNode* Node = LeafNodes.front();
        LeafNodes.pop_front();
        if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x8) != 0)
        {
            std::cout << "Visit leaf node " << Node->F->getName().str() << " size " << Node->Size << std::endl;
            if (Node->Processed == UP)
                std::cout << " already processed " << std::endl;
        }

        if (Node->Processed == UP)
            continue; // already processed, e.g. the same caller might be enqueued multiple times -- CallerList duplication issue

        Node->Processed = UP; // processed

        if (Node->ToBeInlined) {
            // Populate the size to its callers.
            for (auto Caller : Node->CallerList) {
                get<FunctionNode>(Caller)->expandSpecial(Node);
                if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x8) != 0)
                {
                    std::cout << "Expand to caller " << Caller->getName().str() << " from callee " << Node->F->getName().str() << " new size " << get<FunctionNode>(Caller)->Size << std::endl;
                }
            }
        }
        else {
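            // A trimmed function is kept as a single subroutine, so its size
            // is added to the kernel total exactly once.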
            get<FunctionNode>(Kernel)->Size += Node->Size;
        }

        for (auto C : Node->CallerList) {
            FunctionNode* caller = get<FunctionNode>(C);
            if (caller == k || caller->KernelNum != UK)
                continue; // skip the kernel entry and callers outside this kernel
            if (caller->Processed == UP) // already processed
                continue;
            bool is_new_leaf = true;
            for (auto S : caller->CalleeList) {
                if (get<FunctionNode>(S)->Processed != UP) {
                    is_new_leaf = false;
                    break;
                }
            }
            if (is_new_leaf) {
                LeafNodes.push_back(caller);
            }
        }
    }

    return k->Size;
}

bool EstimateFunctionSize::isTrimmedFunction(llvm::Function* F) {
    return get<FunctionNode>(F)->ToBeInlined == false;
}