//===- CoroFrame.cpp - Builds and manipulates coroutine frame -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file contains classes used to discover if, for a particular value,
// there is a path from its definition to a use that crosses a suspend block.
//
// Using the information discovered we form a Coroutine Frame structure to
// contain those values. All uses of those values are replaced with appropriate
// GEP + load from the coroutine frame. At the point of the definition we spill
// the value into the coroutine frame.
//===----------------------------------------------------------------------===//

#include "CoroInternal.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/StackLifetime.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Support/circular_raw_ostream.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
#include <deque>
#include <optional>

using namespace llvm;

// The "coro-suspend-crossing" flag is very noisy. There is another debug type,
// "coro-frame", which results in leaner debug spew.
#define DEBUG_TYPE "coro-suspend-crossing"

enum { SmallVectorThreshold = 32 };

// Provides a two-way mapping between basic blocks and their indices.
namespace {
class BlockToIndexMapping {
  SmallVector<BasicBlock *, SmallVectorThreshold> V;

public:
  size_t size() const { return V.size(); }

  BlockToIndexMapping(Function &F) {
    for (BasicBlock &BB : F)
      V.push_back(&BB);
    llvm::sort(V);
  }

  size_t blockToIndex(BasicBlock *BB) const {
    auto *I = llvm::lower_bound(V, BB);
    assert(I != V.end() && *I == BB && "BasicBlockNumbering: Unknown block");
    return I - V.begin();
  }

  BasicBlock *indexToBlock(unsigned Index) const { return V[Index]; }
};
} // end anonymous namespace

// The SuspendCrossingInfo maintains data that allows us to answer, for any
// two BasicBlocks A and B, whether there is a path from A to B that passes
// through a suspend point.
//
// For every basic block 'i' it maintains a BlockData that consists of:
//   Consumes: a bit vector which contains a set of indices of blocks that can
//             reach block 'i'. A block can trivially reach itself.
//   Kills: a bit vector which contains a set of indices of blocks that can
//          reach block 'i' but there is a path crossing a suspend point
//          not repeating 'i' (path to 'i' without cycles containing 'i').
//   Suspend: a boolean indicating whether block 'i' contains a suspend point.
//   End: a boolean indicating whether block 'i' contains a coro.end intrinsic.
//   KillLoop: There is a path from 'i' to 'i' not otherwise repeating 'i' that
//             crosses a suspend point.
//
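// For illustration (a hypothetical CFG, not taken from the source), consider
// three blocks entry -> susp -> use, where 'susp' contains a suspend point:
//   Consumes[use] = {entry, susp, use}  ; every block that can reach 'use'
//   Kills[use]    = {entry, susp}       ; paths from these blocks to 'use'
//                                       ; cross the suspend in 'susp'
// so hasPathCrossingSuspendPoint(entry, use) is true, and any value defined
// in 'entry' and used in 'use' has to be spilled to the coroutine frame.
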
namespace {
class SuspendCrossingInfo {
  BlockToIndexMapping Mapping;

  struct BlockData {
    BitVector Consumes;
    BitVector Kills;
    bool Suspend = false;
    bool End = false;
    bool KillLoop = false;
    bool Changed = false;
  };
  SmallVector<BlockData, SmallVectorThreshold> Block;

  iterator_range<pred_iterator> predecessors(BlockData const &BD) const {
    BasicBlock *BB = Mapping.indexToBlock(&BD - &Block[0]);
    return llvm::predecessors(BB);
  }

  BlockData &getBlockData(BasicBlock *BB) {
    return Block[Mapping.blockToIndex(BB)];
  }

  /// Compute the BlockData for the current function in one iteration.
  /// Returns whether the BlockData changed in this iteration.
  /// \p Initialize - Whether this is the first iteration; knowing this at
  /// compile time lets us switch loop variants and optimize the initial
  /// pass a little bit.
  template <bool Initialize = false> bool computeBlockData();

public:
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump() const;
  void dump(StringRef Label, BitVector const &BV) const;
#endif

  SuspendCrossingInfo(Function &F, coro::Shape &Shape);

  /// Returns true if there is a path from \p From to \p To crossing a suspend
  /// point without crossing \p From a 2nd time.
  bool hasPathCrossingSuspendPoint(BasicBlock *From, BasicBlock *To) const {
    size_t const FromIndex = Mapping.blockToIndex(From);
    size_t const ToIndex = Mapping.blockToIndex(To);
    bool const Result = Block[ToIndex].Kills[FromIndex];
    LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName()
                      << " answer is " << Result << "\n");
    return Result;
  }

  /// Returns true if there is a path from \p From to \p To crossing a suspend
  /// point without crossing \p From a 2nd time. If \p From is the same as
  /// \p To, this will also check if there is a looping path crossing a
  /// suspend point.
  bool hasPathOrLoopCrossingSuspendPoint(BasicBlock *From,
                                         BasicBlock *To) const {
    size_t const FromIndex = Mapping.blockToIndex(From);
    size_t const ToIndex = Mapping.blockToIndex(To);
    bool Result = Block[ToIndex].Kills[FromIndex] ||
                  (From == To && Block[ToIndex].KillLoop);
    LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName()
                      << " answer is " << Result << " (path or loop)\n");
    return Result;
  }
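  // As a hypothetical illustration of the loop case: in a two-block loop
  // a -> s -> a, where 's' contains a suspend point and 'a' does not, block
  // 'a' removes itself from its own Kills set during propagation (recording
  // that fact in KillLoop), so Kills[a][a] is false, yet
  // hasPathOrLoopCrossingSuspendPoint(a, a) still returns true via KillLoop.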
  bool isDefinitionAcrossSuspend(BasicBlock *DefBB, User *U) const {
    auto *I = cast<Instruction>(U);

    // We rewrote PHINodes, so that only the ones with exactly one incoming
    // value need to be analyzed.
    if (auto *PN = dyn_cast<PHINode>(I))
      if (PN->getNumIncomingValues() > 1)
        return false;

    BasicBlock *UseBB = I->getParent();

    // As a special case, treat uses by an llvm.coro.suspend.retcon or an
    // llvm.coro.suspend.async as if they were uses in the suspend's single
    // predecessor: the uses conceptually occur before the suspend.
    if (isa<CoroSuspendRetconInst>(I) || isa<CoroSuspendAsyncInst>(I)) {
      UseBB = UseBB->getSinglePredecessor();
      assert(UseBB && "should have split coro.suspend into its own block");
    }

    return hasPathCrossingSuspendPoint(DefBB, UseBB);
  }

  bool isDefinitionAcrossSuspend(Argument &A, User *U) const {
    return isDefinitionAcrossSuspend(&A.getParent()->getEntryBlock(), U);
  }

  bool isDefinitionAcrossSuspend(Instruction &I, User *U) const {
    auto *DefBB = I.getParent();

    // As a special case, treat values produced by an llvm.coro.suspend.*
    // as if they were defined in the single successor: the uses
    // conceptually occur after the suspend.
    if (isa<AnyCoroSuspendInst>(I)) {
      DefBB = DefBB->getSingleSuccessor();
      assert(DefBB && "should have split coro.suspend into its own block");
    }

    return isDefinitionAcrossSuspend(DefBB, U);
  }

  bool isDefinitionAcrossSuspend(Value &V, User *U) const {
    if (auto *Arg = dyn_cast<Argument>(&V))
      return isDefinitionAcrossSuspend(*Arg, U);
    if (auto *Inst = dyn_cast<Instruction>(&V))
      return isDefinitionAcrossSuspend(*Inst, U);

    llvm_unreachable(
        "Coroutines can only collect Arguments and Instructions for now.");
  }
};
} // end anonymous namespace

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SuspendCrossingInfo::dump(StringRef Label,
                                                BitVector const &BV) const {
  dbgs() << Label << ":";
  for (size_t I = 0, N = BV.size(); I < N; ++I)
    if (BV[I])
      dbgs() << " " << Mapping.indexToBlock(I)->getName();
  dbgs() << "\n";
}

LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const {
  for (size_t I = 0, N = Block.size(); I < N; ++I) {
    BasicBlock *const B = Mapping.indexToBlock(I);
    dbgs() << B->getName() << ":\n";
    dump("   Consumes", Block[I].Consumes);
    dump("      Kills", Block[I].Kills);
  }
  dbgs() << "\n";
}
#endif

template <bool Initialize> bool SuspendCrossingInfo::computeBlockData() {
  const size_t N = Mapping.size();
  bool Changed = false;

  for (size_t I = 0; I < N; ++I) {
    auto &B = Block[I];

    // We don't need to check the predecessors during initialization.
    if constexpr (!Initialize)
      // If none of the predecessors of the current block changed, the
      // BlockData for the current block cannot change either.
      if (all_of(predecessors(B), [this](BasicBlock *BB) {
            return !Block[Mapping.blockToIndex(BB)].Changed;
          })) {
        B.Changed = false;
        continue;
      }

    // Save the Consumes and Kills bitsets so that it is easy to see
    // if anything changed after propagation.
    auto SavedConsumes = B.Consumes;
    auto SavedKills = B.Kills;

    for (BasicBlock *PI : predecessors(B)) {
      auto PrevNo = Mapping.blockToIndex(PI);
      auto &P = Block[PrevNo];

      // Propagate Kills and Consumes from predecessors into B.
      B.Consumes |= P.Consumes;
      B.Kills |= P.Kills;

      // If block P is a suspend block, it should propagate kills into block
      // B for every block P consumes.
      if (P.Suspend)
        B.Kills |= P.Consumes;
    }

    if (B.Suspend) {
      // If block B is a suspend block, it should kill all of the blocks it
      // consumes.
      B.Kills |= B.Consumes;
    } else if (B.End) {
      // If block B is an end block, it should not propagate kills as the
      // blocks following coro.end() are reached during initial invocation
      // of the coroutine while all the data are still available on the
      // stack or in the registers.
      B.Kills.reset();
    } else {
      // This is reached when block B is neither a suspend nor a coro.end
      // block; we need to make sure that it is not in its own kill set.
      B.KillLoop |= B.Kills[I];
      B.Kills.reset(I);
    }

    if constexpr (!Initialize) {
      B.Changed = (B.Kills != SavedKills) || (B.Consumes != SavedConsumes);
      Changed |= B.Changed;
    }
  }

  if constexpr (Initialize)
    return true;

  return Changed;
}

SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
    : Mapping(F) {
  const size_t N = Mapping.size();
  Block.resize(N);

  // Initialize every block so that it consumes itself.
  for (size_t I = 0; I < N; ++I) {
    auto &B = Block[I];
    B.Consumes.resize(N);
    B.Kills.resize(N);
    B.Consumes.set(I);
    B.Changed = true;
  }

  // Mark all CoroEnd blocks. We do not propagate Kills beyond coro.ends as
  // the code beyond coro.end is reachable during initial invocation of the
  // coroutine.
  for (auto *CE : Shape.CoroEnds)
    getBlockData(CE->getParent()).End = true;

  // Mark all suspend blocks and indicate that they kill everything they
  // consume. Note that crossing coro.save also requires a spill, as any code
  // between coro.save and coro.suspend may resume the coroutine and all of the
  // state needs to be saved by that time.
  auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
    BasicBlock *SuspendBlock = BarrierInst->getParent();
    auto &B = getBlockData(SuspendBlock);
    B.Suspend = true;
    B.Kills |= B.Consumes;
  };
  for (auto *CSI : Shape.CoroSuspends) {
    markSuspendBlock(CSI);
    if (auto *Save = CSI->getCoroSave())
      markSuspendBlock(Save);
  }

  computeBlockData</*Initialize=*/true>();

  while (computeBlockData())
    ;

  LLVM_DEBUG(dump());
}

namespace {

// RematGraph is used to construct a DAG for rematerializable instructions.
// When the constructor is invoked with a candidate instruction (which is
// materializable) it builds a DAG of materializable instructions from that
// point.
// Typically, for each instruction identified as re-materializable across a
// suspend point, a RematGraph will be created.
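//
// As a hypothetical illustration: given
//   %a = add i32 %x, 1
//   %b = mul i32 %a, %a
// where %b is used across a suspend point and both instructions are deemed
// materializable, the graph rooted at %b contains one edge %b -> %a, so %a
// is re-created before %b when the chain is rematerialized.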
struct RematGraph {
  // Each RematNode in the graph contains the edges to instructions providing
  // operands for the current node.
  struct RematNode {
    Instruction *Node;
    SmallVector<RematNode *> Operands;
    RematNode() = default;
    RematNode(Instruction *V) : Node(V) {}
  };

  RematNode *EntryNode;
  using RematNodeMap =
      SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>;
  RematNodeMap Remats;
  const std::function<bool(Instruction &)> &MaterializableCallback;
  SuspendCrossingInfo &Checker;

  RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback,
             Instruction *I, SuspendCrossingInfo &Checker)
      : MaterializableCallback(MaterializableCallback), Checker(Checker) {
    std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I);
    EntryNode = FirstNode.get();
    std::deque<std::unique_ptr<RematNode>> WorkList;
    addNode(std::move(FirstNode), WorkList, cast<User>(I));
    while (WorkList.size()) {
      std::unique_ptr<RematNode> N = std::move(WorkList.front());
      WorkList.pop_front();
      addNode(std::move(N), WorkList, cast<User>(I));
    }
  }

  void addNode(std::unique_ptr<RematNode> NUPtr,
               std::deque<std::unique_ptr<RematNode>> &WorkList,
               User *FirstUse) {
    RematNode *N = NUPtr.get();
    if (Remats.count(N->Node))
      return;

    // We haven't seen this node yet - add it to the list.
    Remats[N->Node] = std::move(NUPtr);
    for (auto &Def : N->Node->operands()) {
      Instruction *D = dyn_cast<Instruction>(Def.get());
      if (!D || !MaterializableCallback(*D) ||
          !Checker.isDefinitionAcrossSuspend(*D, FirstUse))
        continue;

      if (Remats.count(D)) {
        // Already have this in the graph.
        N->Operands.push_back(Remats[D].get());
        continue;
      }

      bool NoMatch = true;
      for (auto &I : WorkList) {
        if (I->Node == D) {
          NoMatch = false;
          N->Operands.push_back(I.get());
          break;
        }
      }
      if (NoMatch) {
        // Create a new node.
        std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D);
        N->Operands.push_back(ChildNode.get());
        WorkList.push_back(std::move(ChildNode));
      }
    }
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump() const {
    dbgs() << "Entry (";
    if (EntryNode->Node->getParent()->hasName())
      dbgs() << EntryNode->Node->getParent()->getName();
    else
      EntryNode->Node->getParent()->printAsOperand(dbgs(), false);
    dbgs() << ") : " << *EntryNode->Node << "\n";
    for (auto &E : Remats) {
      dbgs() << *(E.first) << "\n";
      for (RematNode *U : E.second->Operands)
        dbgs() << "  " << *U->Node << "\n";
    }
  }
#endif
};
} // end anonymous namespace

namespace llvm {

template <> struct GraphTraits<RematGraph *> {
  using NodeRef = RematGraph::RematNode *;
  using ChildIteratorType = RematGraph::RematNode **;

  static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; }
  static ChildIteratorType child_begin(NodeRef N) {
    return N->Operands.begin();
  }
  static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
};

} // end namespace llvm

#undef DEBUG_TYPE // "coro-suspend-crossing"
#define DEBUG_TYPE "coro-frame"

namespace {
class FrameTypeBuilder;
// Mapping from the to-be-spilled value to all the users that need to reload
// it.
using SpillInfo = SmallMapVector<Value *, SmallVector<Instruction *, 2>, 8>;
struct AllocaInfo {
  AllocaInst *Alloca;
  DenseMap<Instruction *, std::optional<APInt>> Aliases;
  bool MayWriteBeforeCoroBegin;
  AllocaInfo(AllocaInst *Alloca,
             DenseMap<Instruction *, std::optional<APInt>> Aliases,
             bool MayWriteBeforeCoroBegin)
      : Alloca(Alloca), Aliases(std::move(Aliases)),
        MayWriteBeforeCoroBegin(MayWriteBeforeCoroBegin) {}
};
struct FrameDataInfo {
  // All the values (that are not allocas) that need to be spilled to the
  // frame.
  SpillInfo Spills;
  // Allocas contains all values defined as allocas that need to live in the
  // frame.
  SmallVector<AllocaInfo, 8> Allocas;

  SmallVector<Value *, 8> getAllDefs() const {
    SmallVector<Value *, 8> Defs;
    for (const auto &P : Spills)
      Defs.push_back(P.first);
    for (const auto &A : Allocas)
      Defs.push_back(A.Alloca);
    return Defs;
  }

  uint32_t getFieldIndex(Value *V) const {
    auto Itr = FieldIndexMap.find(V);
    assert(Itr != FieldIndexMap.end() &&
           "Value does not have a frame field index");
    return Itr->second;
  }

  void setFieldIndex(Value *V, uint32_t Index) {
    assert((LayoutIndexUpdateStarted || FieldIndexMap.count(V) == 0) &&
           "Cannot set the index for the same field twice.");
    FieldIndexMap[V] = Index;
  }

  Align getAlign(Value *V) const {
    auto Iter = FieldAlignMap.find(V);
    assert(Iter != FieldAlignMap.end());
    return Iter->second;
  }

  void setAlign(Value *V, Align AL) {
    assert(FieldAlignMap.count(V) == 0);
    FieldAlignMap.insert({V, AL});
  }

  uint64_t getDynamicAlign(Value *V) const {
    auto Iter = FieldDynamicAlignMap.find(V);
    assert(Iter != FieldDynamicAlignMap.end());
    return Iter->second;
  }

  void setDynamicAlign(Value *V, uint64_t Align) {
    assert(FieldDynamicAlignMap.count(V) == 0);
    FieldDynamicAlignMap.insert({V, Align});
  }

  uint64_t getOffset(Value *V) const {
    auto Iter = FieldOffsetMap.find(V);
    assert(Iter != FieldOffsetMap.end());
    return Iter->second;
  }

  void setOffset(Value *V, uint64_t Offset) {
    assert(FieldOffsetMap.count(V) == 0);
    FieldOffsetMap.insert({V, Offset});
  }

  // Remap the index of every field in the frame, using the final layout index.
  void updateLayoutIndex(FrameTypeBuilder &B);

private:
  // LayoutIndexUpdateStarted is used to avoid updating the index of any field
  // twice by mistake.
  bool LayoutIndexUpdateStarted = false;
  // Map from values to their slot indexes on the frame. They will first be set
  // with their original insertion field index. After the frame is built, their
  // indexes will be updated into the final layout index.
  DenseMap<Value *, uint32_t> FieldIndexMap;
  // Map from values to their alignment on the frame. They are set after
  // the frame is built.
  DenseMap<Value *, Align> FieldAlignMap;
  DenseMap<Value *, uint64_t> FieldDynamicAlignMap;
  // Map from values to their offset on the frame. They are set after
  // the frame is built.
  DenseMap<Value *, uint64_t> FieldOffsetMap;
};
} // namespace

#ifndef NDEBUG
static void dumpSpills(StringRef Title, const SpillInfo &Spills) {
  dbgs() << "------------- " << Title << " --------------\n";
  for (const auto &E : Spills) {
    E.first->dump();
    dbgs() << "   user: ";
    for (auto *I : E.second)
      I->dump();
  }
}
static void dumpRemats(
    StringRef Title,
    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) {
  dbgs() << "------------- " << Title << " --------------\n";
  for (const auto &E : RM) {
    E.second->dump();
    dbgs() << "--\n";
  }
}

static void dumpAllocas(const SmallVectorImpl<AllocaInfo> &Allocas) {
  dbgs() << "------------- Allocas --------------\n";
  for (const auto &A : Allocas) {
    A.Alloca->dump();
  }
}
#endif

namespace {
using FieldIDType = size_t;
// We cannot rely solely on natural alignment of a type when building a
// coroutine frame: if the alignment specified on the Alloca instruction
// differs from the natural alignment of the alloca type, we will need to
// insert padding.
class FrameTypeBuilder {
private:
  struct Field {
    uint64_t Size;
    uint64_t Offset;
    Type *Ty;
    FieldIDType LayoutFieldIndex;
    Align Alignment;
    Align TyAlignment;
    uint64_t DynamicAlignBuffer;
  };

  const DataLayout &DL;
  LLVMContext &Context;
  uint64_t StructSize = 0;
  Align StructAlign;
  bool IsFinished = false;

  std::optional<Align> MaxFrameAlignment;

  SmallVector<Field, 8> Fields;
  DenseMap<Value*, unsigned> FieldIndexByKey;

public:
  FrameTypeBuilder(LLVMContext &Context, const DataLayout &DL,
                   std::optional<Align> MaxFrameAlignment)
      : DL(DL), Context(Context), MaxFrameAlignment(MaxFrameAlignment) {}

  /// Add a field to this structure for the storage of an `alloca`
  /// instruction.
  [[nodiscard]] FieldIDType addFieldForAlloca(AllocaInst *AI,
                                              bool IsHeader = false) {
    Type *Ty = AI->getAllocatedType();

    // Make an array type if this is a static array allocation.
    if (AI->isArrayAllocation()) {
      if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
        Ty = ArrayType::get(Ty, CI->getValue().getZExtValue());
      else
        report_fatal_error("Coroutines cannot handle non static allocas yet");
    }

    return addField(Ty, AI->getAlign(), IsHeader);
  }

  /// We want to put the allocas whose lifetime-ranges are not overlapped
  /// into one slot of the coroutine frame.
  /// Consider the example at: https://bugs.llvm.org/show_bug.cgi?id=45566
  ///
  ///     cppcoro::task<void> alternative_paths(bool cond) {
  ///         if (cond) {
  ///             big_structure a;
  ///             process(a);
  ///             co_await something();
  ///         } else {
  ///             big_structure b;
  ///             process2(b);
  ///             co_await something();
  ///         }
  ///     }
  ///
  /// We want to put variable a and variable b in the same slot to
  /// reduce the size of the coroutine frame.
  ///
  /// This function uses the StackLifetime algorithm to partition the
  /// AllocaInsts in Spills into non-overlapping sets, in order to put allocas
  /// in the same non-overlapping set into the same slot in the coroutine
  /// frame. It then adds a field for each non-overlapping set, using the
  /// largest type in the set as the field type.
  ///
  /// Side Effects: Because we sort the allocas, the order of allocas in the
  /// frame may be different from the order in the source code.
  void addFieldForAllocas(const Function &F, FrameDataInfo &FrameData,
                          coro::Shape &Shape);

  /// Add a field to this structure.
  [[nodiscard]] FieldIDType addField(Type *Ty, MaybeAlign MaybeFieldAlignment,
                                     bool IsHeader = false,
                                     bool IsSpillOfValue = false) {
    assert(!IsFinished && "adding fields to a finished builder");
    assert(Ty && "must provide a type for a field");

    // The field size is always the alloc size of the type.
    uint64_t FieldSize = DL.getTypeAllocSize(Ty);

    // For an alloca with size=0, we don't need to add a field and it
    // can just point to any index in the frame. Use index 0.
    if (FieldSize == 0) {
      return 0;
    }

    // The field alignment might not be the type alignment, but we need
    // to remember the type alignment anyway to build the type.
    // If we are spilling values we don't need to worry about ABI alignment
    // concerns.
    Align ABIAlign = DL.getABITypeAlign(Ty);
    Align TyAlignment = ABIAlign;
    if (IsSpillOfValue && MaxFrameAlignment && *MaxFrameAlignment < ABIAlign)
      TyAlignment = *MaxFrameAlignment;
    Align FieldAlignment = MaybeFieldAlignment.value_or(TyAlignment);

    // The field alignment could be bigger than the max frame alignment. In
    // that case we request additional storage to be able to dynamically
    // align the pointer.
    uint64_t DynamicAlignBuffer = 0;
    if (MaxFrameAlignment && (FieldAlignment > *MaxFrameAlignment)) {
      DynamicAlignBuffer =
          offsetToAlignment(MaxFrameAlignment->value(), FieldAlignment);
      FieldAlignment = *MaxFrameAlignment;
      FieldSize = FieldSize + DynamicAlignBuffer;
    }

    // Lay out header fields immediately.
    uint64_t Offset;
    if (IsHeader) {
      Offset = alignTo(StructSize, FieldAlignment);
      StructSize = Offset + FieldSize;

      // Everything else has a flexible offset.
    } else {
      Offset = OptimizedStructLayoutField::FlexibleOffset;
    }

    Fields.push_back({FieldSize, Offset, Ty, 0, FieldAlignment, TyAlignment,
                      DynamicAlignBuffer});
    return Fields.size() - 1;
  }
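  // As a hypothetical numeric illustration of the dynamic-alignment case
  // above: with MaxFrameAlignment = 8 and a field requesting alignment 32,
  // offsetToAlignment(8, Align(32)) == 24, so 24 bytes of buffer are added to
  // the field size and the field is laid out at alignment 8; the pointer is
  // then realigned to 32 at runtime within that buffer.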
  /// Finish the layout and set the body on the given type.
  void finish(StructType *Ty);

  uint64_t getStructSize() const {
    assert(IsFinished && "not yet finished!");
    return StructSize;
  }

  Align getStructAlign() const {
    assert(IsFinished && "not yet finished!");
    return StructAlign;
  }

  FieldIDType getLayoutFieldIndex(FieldIDType Id) const {
    assert(IsFinished && "not yet finished!");
    return Fields[Id].LayoutFieldIndex;
  }

  Field getLayoutField(FieldIDType Id) const {
    assert(IsFinished && "not yet finished!");
    return Fields[Id];
  }
};
} // namespace

void FrameDataInfo::updateLayoutIndex(FrameTypeBuilder &B) {
  auto Updater = [&](Value *I) {
    auto Field = B.getLayoutField(getFieldIndex(I));
    setFieldIndex(I, Field.LayoutFieldIndex);
    setAlign(I, Field.Alignment);
    uint64_t dynamicAlign =
        Field.DynamicAlignBuffer
            ? Field.DynamicAlignBuffer + Field.Alignment.value()
            : 0;
    setDynamicAlign(I, dynamicAlign);
    setOffset(I, Field.Offset);
  };
  LayoutIndexUpdateStarted = true;
  for (auto &S : Spills)
    Updater(S.first);
  for (const auto &A : Allocas)
    Updater(A.Alloca);
  LayoutIndexUpdateStarted = false;
}

void FrameTypeBuilder::addFieldForAllocas(const Function &F,
                                          FrameDataInfo &FrameData,
                                          coro::Shape &Shape) {
  using AllocaSetType = SmallVector<AllocaInst *, 4>;
  SmallVector<AllocaSetType, 4> NonOverlapedAllocas;

  // We need to add fields for the allocas at the end of this function.
  auto AddFieldForAllocasAtExit = make_scope_exit([&]() {
    for (auto AllocaList : NonOverlapedAllocas) {
      auto *LargestAI = *AllocaList.begin();
      FieldIDType Id = addFieldForAlloca(LargestAI);
      for (auto *Alloca : AllocaList)
        FrameData.setFieldIndex(Alloca, Id);
    }
  });

  if (!Shape.OptimizeFrame) {
    for (const auto &A : FrameData.Allocas) {
      AllocaInst *Alloca = A.Alloca;
      NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca));
    }
    return;
  }

  // Because there are paths from the lifetime.start to coro.end
  // for each alloca, the live ranges of all allocas overlap in the
  // blocks that contain coro.end and their successor blocks.
  // So we choose to skip those blocks when we calculate the live range
  // for each alloca. It should be reasonable since there shouldn't be uses
  // in these blocks and the coroutine frame shouldn't be used outside the
  // coroutine body.
  //
  // Note that the user of coro.suspend may not be a SwitchInst. However, this
  // case seems too complex to handle. And it is harmless to skip these
  // patterns since it just prevents putting the allocas into the same
  // slot.
  DenseMap<SwitchInst *, BasicBlock *> DefaultSuspendDest;
  for (auto *CoroSuspendInst : Shape.CoroSuspends) {
    for (auto *U : CoroSuspendInst->users()) {
      if (auto *ConstSWI = dyn_cast<SwitchInst>(U)) {
        auto *SWI = const_cast<SwitchInst *>(ConstSWI);
        DefaultSuspendDest[SWI] = SWI->getDefaultDest();
        SWI->setDefaultDest(SWI->getSuccessor(1));
      }
    }
  }

  auto ExtractAllocas = [&]() {
    AllocaSetType Allocas;
    Allocas.reserve(FrameData.Allocas.size());
    for (const auto &A : FrameData.Allocas)
      Allocas.push_back(A.Alloca);
    return Allocas;
  };
  StackLifetime StackLifetimeAnalyzer(F, ExtractAllocas(),
                                      StackLifetime::LivenessType::May);
  StackLifetimeAnalyzer.run();
  auto IsAllocaInferenre = [&](const AllocaInst *AI1, const AllocaInst *AI2) {
    return StackLifetimeAnalyzer.getLiveRange(AI1).overlaps(
        StackLifetimeAnalyzer.getLiveRange(AI2));
  };
  auto GetAllocaSize = [&](const AllocaInfo &A) {
    std::optional<TypeSize> RetSize = A.Alloca->getAllocationSize(DL);
    assert(RetSize && "Variable Length Arrays (VLA) are not supported.\n");
    assert(!RetSize->isScalable() && "Scalable vectors are not yet supported");
    return RetSize->getFixedValue();
  };
  // Put larger allocas in the front. That way the larger allocas have higher
  // priority to merge, which can potentially save more space. Also, each
  // AllocaSet is ordered, so we can easily get the largest Alloca in an
  // AllocaSet.
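  // As a hypothetical illustration of the merging below: allocas A (64 bytes),
  // B (16) and C (16), where B and C have disjoint live ranges but both
  // overlap with A, produce the sets {A} and {B, C}; B and C then share one
  // 16-byte field while A gets its own 64-byte field.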
  sort(FrameData.Allocas, [&](const auto &Iter1, const auto &Iter2) {
    return GetAllocaSize(Iter1) > GetAllocaSize(Iter2);
  });
  for (const auto &A : FrameData.Allocas) {
    AllocaInst *Alloca = A.Alloca;
    bool Merged = false;
    // Try to find whether the Alloca does not interfere with any existing
    // NonOverlappedAllocaSet. If so, insert the alloca into that
    // NonOverlappedAllocaSet.
    for (auto &AllocaSet : NonOverlapedAllocas) {
      assert(!AllocaSet.empty() && "Processing Alloca Set is not empty.\n");
      bool NoInference = none_of(AllocaSet, [&](auto Iter) {
        return IsAllocaInferenre(Alloca, Iter);
      });
      // If the alignment of A is a multiple of the alignment of B, the
      // address of A should satisfy the alignment requirement of B.
      //
      // There may be other, more fine-grained strategies to handle the
      // alignment information during the merging process. But it seems hard
      // to handle these strategies for little benefit.
      bool Alignable = [&]() -> bool {
        auto *LargestAlloca = *AllocaSet.begin();
        return LargestAlloca->getAlign().value() % Alloca->getAlign().value() ==
               0;
      }();
      bool CouldMerge = NoInference && Alignable;
      if (!CouldMerge)
        continue;
      AllocaSet.push_back(Alloca);
      Merged = true;
      break;
    }
    if (!Merged) {
      NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca));
    }
  }
  // Recover the default target destination for each Switch statement
  // reserved above.
  for (auto SwitchAndDefaultDest : DefaultSuspendDest) {
    SwitchInst *SWI = SwitchAndDefaultDest.first;
    BasicBlock *DestBB = SwitchAndDefaultDest.second;
    SWI->setDefaultDest(DestBB);
  }
  // This debug output shows which allocas are merged into one slot.
  LLVM_DEBUG(for (auto &AllocaSet
                  : NonOverlapedAllocas) {
    if (AllocaSet.size() > 1) {
      dbgs() << "In Function:" << F.getName() << "\n";
      dbgs() << "Find Union Set "
             << "\n";
      dbgs() << "\tAllocas are \n";
      for (auto Alloca : AllocaSet)
        dbgs() << "\t\t" << *Alloca << "\n";
    }
  });
}

void FrameTypeBuilder::finish(StructType *Ty) {
  assert(!IsFinished && "already finished!");

  // Prepare the optimal-layout field array.
  // The Id in the layout field is a pointer to our Field for it.
  SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
  LayoutFields.reserve(Fields.size());
  for (auto &Field : Fields) {
    LayoutFields.emplace_back(&Field, Field.Size, Field.Alignment,
                              Field.Offset);
  }

  // Perform layout.
  auto SizeAndAlign = performOptimizedStructLayout(LayoutFields);
  StructSize = SizeAndAlign.first;
  StructAlign = SizeAndAlign.second;

  auto getField = [](const OptimizedStructLayoutField &LayoutField) -> Field & {
    return *static_cast<Field *>(const_cast<void*>(LayoutField.Id));
  };

  // We need to produce a packed struct type if there's a field whose
  // assigned offset isn't a multiple of its natural type alignment.
  bool Packed = [&] {
    for (auto &LayoutField : LayoutFields) {
      auto &F = getField(LayoutField);
      if (!isAligned(F.TyAlignment, LayoutField.Offset))
        return true;
    }
    return false;
  }();
  // Build the struct body.
  SmallVector<Type*, 16> FieldTypes;
  FieldTypes.reserve(LayoutFields.size() * 3 / 2);
  uint64_t LastOffset = 0;
  for (auto &LayoutField : LayoutFields) {
    auto &F = getField(LayoutField);

    auto Offset = LayoutField.Offset;

    // Add a padding field if there's a padding gap and we're either
    // building a packed struct or the padding gap is more than we'd
    // get from aligning to the field type's natural alignment.
    assert(Offset >= LastOffset);
    if (Offset != LastOffset) {
      if (Packed || alignTo(LastOffset, F.TyAlignment) != Offset)
        FieldTypes.push_back(ArrayType::get(Type::getInt8Ty(Context),
                                            Offset - LastOffset));
    }

    F.Offset = Offset;
    F.LayoutFieldIndex = FieldTypes.size();

    FieldTypes.push_back(F.Ty);
    if (F.DynamicAlignBuffer) {
      FieldTypes.push_back(
          ArrayType::get(Type::getInt8Ty(Context), F.DynamicAlignBuffer));
    }
    LastOffset = Offset + F.Size;
  }

  Ty->setBody(FieldTypes, Packed);

#ifndef NDEBUG
  // Check that the IR layout matches the offsets we expect.
  auto Layout = DL.getStructLayout(Ty);
  for (auto &F : Fields) {
    assert(Ty->getElementType(F.LayoutFieldIndex) == F.Ty);
    assert(Layout->getElementOffset(F.LayoutFieldIndex) == F.Offset);
  }
#endif

  IsFinished = true;
}

static void cacheDIVar(FrameDataInfo &FrameData,
                       DenseMap<Value *, DILocalVariable *> &DIVarCache) {
  for (auto *V : FrameData.getAllDefs()) {
    if (DIVarCache.contains(V))
      continue;

    auto DDIs = FindDbgDeclareUses(V);
    auto *I = llvm::find_if(DDIs, [](DbgDeclareInst *DDI) {
      return DDI->getExpression()->getNumElements() == 0;
    });
    if (I != DDIs.end())
      DIVarCache.insert({V, (*I)->getVariable()});
  }
}

/// Create a name for Type. It uses MDString to store the newly created string
/// to avoid a memory leak.
static StringRef solveTypeName(Type *Ty) {
  if (Ty->isIntegerTy()) {
    // The longest name in common use may be '__int_128', which has 9
    // characters.
    SmallString<16> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << "__int_" << cast<IntegerType>(Ty)->getBitWidth();
    auto *MDName = MDString::get(Ty->getContext(), OS.str());
    return MDName->getString();
  }

  if (Ty->isFloatingPointTy()) {
    if (Ty->isFloatTy())
      return "__float_";
    if (Ty->isDoubleTy())
      return "__double_";
    return "__floating_type_";
  }

  if (Ty->isPointerTy())
    return "PointerType";

  if (Ty->isStructTy()) {
    if (!cast<StructType>(Ty)->hasName())
      return "__LiteralStructType_";

    auto Name = Ty->getStructName();

    SmallString<16> Buffer(Name);
    for (auto &Iter : Buffer)
      if (Iter == '.' || Iter == ':')
        Iter = '_';
    auto *MDName = MDString::get(Ty->getContext(), Buffer.str());
    return MDName->getString();
  }

  return "UnknownType";
}
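// For illustration (hypothetical inputs): solveTypeName on i32 yields
// "__int_32", an unnamed literal struct yields "__LiteralStructType_", and a
// struct named "struct.task::promise_type" becomes
// "struct_task__promise_type" after '.' and ':' are replaced with '_'.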
static DIType *solveDIType(DIBuilder &Builder, Type *Ty,
                           const DataLayout &Layout, DIScope *Scope,
                           unsigned LineNum,
                           DenseMap<Type *, DIType *> &DITypeCache) {
  if (DIType *DT = DITypeCache.lookup(Ty))
    return DT;

  StringRef Name = solveTypeName(Ty);

  DIType *RetType = nullptr;

  if (Ty->isIntegerTy()) {
    auto BitWidth = cast<IntegerType>(Ty)->getBitWidth();
    RetType = Builder.createBasicType(Name, BitWidth, dwarf::DW_ATE_signed,
                                      llvm::DINode::FlagArtificial);
  } else if (Ty->isFloatingPointTy()) {
    RetType = Builder.createBasicType(Name, Layout.getTypeSizeInBits(Ty),
                                      dwarf::DW_ATE_float,
                                      llvm::DINode::FlagArtificial);
  } else if (Ty->isPointerTy()) {
    // Construct the PointerType as pointing to null (aka void *) instead of
    // exploring the pointee type, to avoid an infinite search problem. For
    // example, we would be in trouble if we traversed recursively:
    //
    //  struct Node {
    //      Node* ptr;
    //  };
    RetType =
        Builder.createPointerType(nullptr, Layout.getTypeSizeInBits(Ty),
                                  Layout.getABITypeAlign(Ty).value() * CHAR_BIT,
                                  /*DWARFAddressSpace=*/std::nullopt, Name);
  } else if (Ty->isStructTy()) {
    auto *DIStruct = Builder.createStructType(
        Scope, Name, Scope->getFile(), LineNum, Layout.getTypeSizeInBits(Ty),
        Layout.getPrefTypeAlign(Ty).value() * CHAR_BIT,
        llvm::DINode::FlagArtificial, nullptr, llvm::DINodeArray());

    auto *StructTy = cast<StructType>(Ty);
    SmallVector<Metadata *, 16> Elements;
    for (unsigned I = 0; I < StructTy->getNumElements(); I++) {
      DIType *DITy = solveDIType(Builder, StructTy->getElementType(I), Layout,
                                 Scope, LineNum, DITypeCache);
      assert(DITy);
      Elements.push_back(Builder.createMemberType(
          Scope, DITy->getName(), Scope->getFile(), LineNum,
          DITy->getSizeInBits(), DITy->getAlignInBits(),
          Layout.getStructLayout(StructTy)->getElementOffsetInBits(I),
          llvm::DINode::FlagArtificial, DITy));
    }

    Builder.replaceArrays(DIStruct, Builder.getOrCreateArray(Elements));

    RetType = DIStruct;
  } else {
    LLVM_DEBUG(dbgs() << "Unresolved Type: " << *Ty << "\n");
    TypeSize Size = Layout.getTypeSizeInBits(Ty);
    auto *CharSizeType = Builder.createBasicType(
        Name, 8, dwarf::DW_ATE_unsigned_char, llvm::DINode::FlagArtificial);

    if (Size <= 8)
      RetType = CharSizeType;
    else {
      if (Size % 8 != 0)
        Size = TypeSize::Fixed(Size + 8 - (Size % 8));

      RetType = Builder.createArrayType(
          Size, Layout.getPrefTypeAlign(Ty).value(), CharSizeType,
          Builder.getOrCreateArray(Builder.getOrCreateSubrange(0, Size / 8)));
    }
  }

  DITypeCache.insert({Ty, RetType});
  return RetType;
}

/// Build artificial debug info for C++ coroutine frames to allow users to
/// inspect the contents of the frame directly.
///
/// Create debug information for the coroutine frame with the debug name
/// "__coro_frame". The debug information for the fields of the coroutine
/// frame is constructed in the following way:
/// 1. For every value in the frame, we search for a dbg.declare use to find
///    the corresponding debug variable for the value. If we can find the
///    debug variable, we get full and accurate debug information.
/// 2. If we can't get debug information in step 1, we can only try to
///    build the DIType from the Type. We do this in solveDIType. We only
///    handle integer, floating-point, pointer, and struct types for now.
static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
                                FrameDataInfo &FrameData) {
  DISubprogram *DIS = F.getSubprogram();
  // If there is no DISubprogram for F, it implies the function is not compiled
  // with debug info. So we also don't need to generate debug info for the
  // frame.
  if (!DIS || !DIS->getUnit() ||
      !dwarf::isCPlusPlus(
          (dwarf::SourceLanguage)DIS->getUnit()->getSourceLanguage()))
    return;

  assert(Shape.ABI == coro::ABI::Switch &&
         "We can only build debug information for C++ coroutines now.\n");

  DIBuilder DBuilder(*F.getParent(), /*AllowUnresolved*/ false);

  AllocaInst *PromiseAlloca = Shape.getPromiseAlloca();
  assert(PromiseAlloca &&
         "Coroutine with switch ABI should own Promise alloca");

  TinyPtrVector<DbgDeclareInst *> DIs = FindDbgDeclareUses(PromiseAlloca);
  if (DIs.empty())
    return;

  DbgDeclareInst *PromiseDDI = DIs.front();
  DILocalVariable *PromiseDIVariable = PromiseDDI->getVariable();
  DILocalScope *PromiseDIScope = PromiseDIVariable->getScope();
  DIFile *DFile = PromiseDIScope->getFile();
  DILocation *DILoc = PromiseDDI->getDebugLoc().get();
  unsigned LineNum = PromiseDIVariable->getLine();

  DICompositeType *FrameDITy = DBuilder.createStructType(
      DIS->getUnit(), Twine(F.getName() + ".coro_frame_ty").str(),
      DFile, LineNum, Shape.FrameSize * 8,
      Shape.FrameAlign.value() * 8, llvm::DINode::FlagArtificial, nullptr,
      llvm::DINodeArray());
  StructType *FrameTy = Shape.FrameTy;
  SmallVector<Metadata *, 16> Elements;
  DataLayout Layout = F.getParent()->getDataLayout();

  DenseMap<Value *, DILocalVariable *> DIVarCache;
  cacheDIVar(FrameData, DIVarCache);

  unsigned ResumeIndex = coro::Shape::SwitchFieldIndex::Resume;
  unsigned DestroyIndex = coro::Shape::SwitchFieldIndex::Destroy;
  unsigned IndexIndex = Shape.SwitchLowering.IndexField;

  DenseMap<unsigned, StringRef> NameCache;
  NameCache.insert({ResumeIndex, "__resume_fn"});
  NameCache.insert({DestroyIndex, "__destroy_fn"});
  NameCache.insert({IndexIndex, "__coro_index"});

  Type *ResumeFnTy = FrameTy->getElementType(ResumeIndex),
       *DestroyFnTy = FrameTy->getElementType(DestroyIndex),
       *IndexTy = FrameTy->getElementType(IndexIndex);

  DenseMap<unsigned, DIType *> TyCache;
  TyCache.insert(
      {ResumeIndex, DBuilder.createPointerType(
                        nullptr, Layout.getTypeSizeInBits(ResumeFnTy))});
  TyCache.insert(
      {DestroyIndex, DBuilder.createPointerType(
                         nullptr, Layout.getTypeSizeInBits(DestroyFnTy))});

  /// FIXME: If we fill the field `SizeInBits` with the actual size of
  /// __coro_index in bits, then __coro_index wouldn't show in the debugger.
  TyCache.insert({IndexIndex, DBuilder.createBasicType(
                                  "__coro_index",
                                  (Layout.getTypeSizeInBits(IndexTy) < 8)
                                      ? 8
                                      : Layout.getTypeSizeInBits(IndexTy),
                                  dwarf::DW_ATE_unsigned_char)});

  for (auto *V : FrameData.getAllDefs()) {
    if (!DIVarCache.contains(V))
      continue;

    auto Index = FrameData.getFieldIndex(V);

    NameCache.insert({Index, DIVarCache[V]->getName()});
    TyCache.insert({Index, DIVarCache[V]->getType()});
  }

  // Cache from index to (Align, Offset) pair.
  DenseMap<unsigned, std::pair<unsigned, unsigned>> OffsetCache;
  // The Align and Offset of the Resume function and Destroy function are
  // fixed.
  OffsetCache.insert({ResumeIndex, {8, 0}});
  OffsetCache.insert({DestroyIndex, {8, 8}});
  OffsetCache.insert(
      {IndexIndex,
       {Shape.SwitchLowering.IndexAlign, Shape.SwitchLowering.IndexOffset}});

  for (auto *V : FrameData.getAllDefs()) {
    auto Index = FrameData.getFieldIndex(V);

    OffsetCache.insert(
        {Index, {FrameData.getAlign(V).value(), FrameData.getOffset(V)}});
  }

  DenseMap<Type *, DIType *> DITypeCache;
  // This counter is used to avoid clashing type names. For example, there
  // may be many i32 and i64 types in one coroutine, so we use i32_0 and
  // i32_1 to tell them apart, since it makes no sense for the names of the
  // fields to conflict with each other.
  unsigned UnknownTypeNum = 0;
  for (unsigned Index = 0; Index < FrameTy->getNumElements(); Index++) {
    if (!OffsetCache.contains(Index))
      continue;

    std::string Name;
    uint64_t SizeInBits;
    uint32_t AlignInBits;
    uint64_t OffsetInBits;
    DIType *DITy = nullptr;

    Type *Ty = FrameTy->getElementType(Index);
    assert(Ty->isSized() && "We can't handle type which is not sized.\n");
    SizeInBits = Layout.getTypeSizeInBits(Ty).getFixedValue();
    AlignInBits = OffsetCache[Index].first * 8;
    OffsetInBits = OffsetCache[Index].second * 8;

    if (NameCache.contains(Index)) {
      Name = NameCache[Index].str();
      DITy = TyCache[Index];
    } else {
      DITy = solveDIType(DBuilder, Ty, Layout, FrameDITy, LineNum, DITypeCache);
      assert(DITy && "SolveDIType shouldn't return nullptr.\n");
      Name = DITy->getName().str();
      Name += "_" + std::to_string(UnknownTypeNum);
      UnknownTypeNum++;
    }

    Elements.push_back(DBuilder.createMemberType(
        FrameDITy, Name, DFile, LineNum, SizeInBits, AlignInBits, OffsetInBits,
        llvm::DINode::FlagArtificial, DITy));
  }

  DBuilder.replaceArrays(FrameDITy, DBuilder.getOrCreateArray(Elements));

  auto *FrameDIVar = DBuilder.createAutoVariable(PromiseDIScope, "__coro_frame",
                                                 DFile, LineNum, FrameDITy,
                                                 true, DINode::FlagArtificial);
  assert(FrameDIVar->isValidLocationForIntrinsic(PromiseDDI->getDebugLoc()));

  // The Subprogram has a RetainedNodes field that records the debug
  // variables it contains. So we need to add __coro_frame to its
  // RetainedNodes.
  //
  // If we don't add __coro_frame to the RetainedNodes, the user may get
  // `no symbol __coro_frame in context` rather than `__coro_frame is
  // optimized out`, which is more precise.
  if (auto *SubProgram = dyn_cast<DISubprogram>(PromiseDIScope)) {
    auto RetainedNodes = SubProgram->getRetainedNodes();
    SmallVector<Metadata *, 32> RetainedNodesVec(RetainedNodes.begin(),
                                                 RetainedNodes.end());
    RetainedNodesVec.push_back(FrameDIVar);
    SubProgram->replaceOperandWith(
        7, (MDTuple::get(F.getContext(), RetainedNodesVec)));
  }

  DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar,
                         DBuilder.createExpression(), DILoc,
                         Shape.getInsertPtAfterFramePtr());
}

// Build a struct that will keep state for an active coroutine.
//   struct f.frame {
//     ResumeFnTy ResumeFnAddr;
//     ResumeFnTy DestroyFnAddr;
//     int ResumeIndex;
//     ... promise (if present) ...
//     ... spills ...
//   };
static StructType *buildFrameType(Function &F, coro::Shape &Shape,
                                  FrameDataInfo &FrameData) {
  LLVMContext &C = F.getContext();
  const DataLayout &DL = F.getParent()->getDataLayout();
  StructType *FrameTy = [&] {
    SmallString<32> Name(F.getName());
    Name.append(".Frame");
    return StructType::create(C, Name);
  }();

  // We will use this value to cap the alignment of spilled values.
  std::optional<Align> MaxFrameAlignment;
  if (Shape.ABI == coro::ABI::Async)
    MaxFrameAlignment = Shape.AsyncLowering.getContextAlignment();
  FrameTypeBuilder B(C, DL, MaxFrameAlignment);

  AllocaInst *PromiseAlloca = Shape.getPromiseAlloca();
  std::optional<FieldIDType> SwitchIndexFieldId;

  if (Shape.ABI == coro::ABI::Switch) {
    auto *FramePtrTy = FrameTy->getPointerTo();
    auto *FnTy = FunctionType::get(Type::getVoidTy(C), FramePtrTy,
                                   /*IsVarArg=*/false);
    auto *FnPtrTy = FnTy->getPointerTo();

    // Add header fields for the resume and destroy functions.
    // We can rely on these being perfectly packed.
    (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true);
    (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true);

    // The PromiseAlloca field needs to be explicitly added here because it's
    // a header field with a fixed offset based on its alignment. Hence it
    // needs special handling and cannot be added to FrameData.Allocas.
    if (PromiseAlloca)
      FrameData.setFieldIndex(
          PromiseAlloca, B.addFieldForAlloca(PromiseAlloca, /*header*/ true));

    // Add a field to store the suspend index. This doesn't need to
    // be in the header.
    unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size()));
    Type *IndexType = Type::getIntNTy(C, IndexBits);

    SwitchIndexFieldId = B.addField(IndexType, std::nullopt);
  } else {
    assert(PromiseAlloca == nullptr && "lowering doesn't support promises");
  }

  // Because multiple allocas may own the same field slot,
  // we add the allocas to fields here.
  B.addFieldForAllocas(F, FrameData, Shape);
  // Add PromiseAlloca to the Allocas list so that
  // 1. updateLayoutIndex can update its index after
  //    `performOptimizedStructLayout`, and
  // 2. it is processed in insertSpills.
  if (Shape.ABI == coro::ABI::Switch && PromiseAlloca)
    // We assume that the promise alloca won't be modified before
    // CoroBegin and that no alias will be created before CoroBegin.
    FrameData.Allocas.emplace_back(
        PromiseAlloca, DenseMap<Instruction *, std::optional<APInt>>{}, false);
  // Create an entry for every spilled value.
  for (auto &S : FrameData.Spills) {
    Type *FieldType = S.first->getType();
    // For byval arguments, we need to store the pointed-to value in the
    // frame, instead of the pointer itself.
    if (const Argument *A = dyn_cast<Argument>(S.first))
      if (A->hasByValAttr())
        FieldType = A->getParamByValType();
    FieldIDType Id = B.addField(FieldType, std::nullopt, false /*header*/,
                                true /*IsSpillOfValue*/);
    FrameData.setFieldIndex(S.first, Id);
  }

  B.finish(FrameTy);
  FrameData.updateLayoutIndex(B);
  Shape.FrameAlign = B.getStructAlign();
  Shape.FrameSize = B.getStructSize();

  switch (Shape.ABI) {
  case coro::ABI::Switch: {
    // In the switch ABI, remember the switch-index field.
    auto IndexField = B.getLayoutField(*SwitchIndexFieldId);
    Shape.SwitchLowering.IndexField = IndexField.LayoutFieldIndex;
    Shape.SwitchLowering.IndexAlign = IndexField.Alignment.value();
    Shape.SwitchLowering.IndexOffset = IndexField.Offset;

    // Also round the frame size up to a multiple of its alignment, as is
    // generally expected in C/C++.
    Shape.FrameSize = alignTo(Shape.FrameSize, Shape.FrameAlign);
    break;
  }

  // In the retcon ABI, remember whether the frame is inline in the storage.
  case coro::ABI::Retcon:
  case coro::ABI::RetconOnce: {
    auto Id = Shape.getRetconCoroId();
    Shape.RetconLowering.IsFrameInlineInStorage
      = (B.getStructSize() <= Id->getStorageSize() &&
         B.getStructAlign() <= Id->getStorageAlignment());
    break;
  }
  case coro::ABI::Async: {
    Shape.AsyncLowering.FrameOffset =
        alignTo(Shape.AsyncLowering.ContextHeaderSize, Shape.FrameAlign);
    // Also make the final context size a multiple of the context alignment to
    // make allocation easier for allocators.
    Shape.AsyncLowering.ContextSize =
        alignTo(Shape.AsyncLowering.FrameOffset + Shape.FrameSize,
                Shape.AsyncLowering.getContextAlignment());
    if (Shape.AsyncLowering.getContextAlignment() < Shape.FrameAlign) {
      report_fatal_error(
          "The alignment requirement of frame variables cannot be higher than "
          "the alignment of the async function context");
    }
    break;
  }
  }

  return FrameTy;
}

// We use a pointer use visitor to track how an alloca is being used.
// The goal is to be able to answer the following three questions:
//   1. Should this alloca be allocated on the frame instead?
//   2. Could the content of the alloca be modified prior to CoroBegin, which
//      would require copying the data from the alloca to the frame after
//      CoroBegin?
//   3. Are there any aliases created for this alloca prior to CoroBegin but
//      used after CoroBegin? In that case, we will need to recreate the alias
//      after CoroBegin based off the frame.
//
// To answer question 1, we track two things:
//   a. The list of all BasicBlocks that use this alloca or any of its
//      aliases. In the end, we check if there exist any two basic blocks
//      that cross suspension points. If so, this alloca must be put on the
//      frame.
//   b. Whether the alloca or any alias of the alloca is escaped at some
//      point, either by storing the address somewhere, or by the address
//      being used in a function call that might capture. If it's ever
//      escaped, this alloca must be put on the frame conservatively.
// To answer question 2, we track through the variable MayWriteBeforeCoroBegin.
// Whenever a potential write happens, either through a store instruction, a
// function call or any of the memory intrinsics, we check whether this
// instruction is prior to CoroBegin.
// To answer question 3, we track the offsets of all aliases created for the
// alloca prior to CoroBegin but used after CoroBegin. std::optional is used to
// be able to represent the case when the offset is unknown (e.g. when you have
// a PHINode that takes in different offset values). We cannot handle unknown
// offsets and will assert. This is the potential issue left out. An ideal
// solution would likely require a significant redesign.
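//
// For illustration (hypothetical IR), a case that question 3 is about:
//
//   %a = alloca [8 x i8]
//   %p = getelementptr i8, ptr %a, i64 4   ; alias with known offset 4,
//                                          ; created before coro.begin
//   %hdl = call ptr @llvm.coro.begin(...)
//   ... suspend ...
//   use of %p                              ; used after coro.begin
//
// Here %p must be recreated after coro.begin as the frame address of %a
// plus the recorded offset 4.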
namespace {
struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
  using Base = PtrUseVisitor<AllocaUseVisitor>;
  AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT,
                   const CoroBeginInst &CB, const SuspendCrossingInfo &Checker,
                   bool ShouldUseLifetimeStartInfo)
      : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker),
        ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {}

  void visit(Instruction &I) {
    Users.insert(&I);
    Base::visit(I);
    // If the pointer is escaped prior to CoroBegin, we have to assume it would
    // be written into before CoroBegin as well.
    if (PI.isEscaped() && !DT.dominates(&CoroBegin, PI.getEscapingInst())) {
      MayWriteBeforeCoroBegin = true;
    }
  }
  // We need to provide this overload as PtrUseVisitor uses a pointer based
  // visiting function.
  void visit(Instruction *I) { return visit(*I); }

  void visitPHINode(PHINode &I) {
    enqueueUsers(I);
    handleAlias(I);
  }

  void visitSelectInst(SelectInst &I) {
    enqueueUsers(I);
    handleAlias(I);
  }

  void visitStoreInst(StoreInst &SI) {
    // Regardless of whether the alias of the alloca is the value operand or
    // the pointer operand, we need to assume the alloca is being written to.
    handleMayWrite(SI);

    if (SI.getValueOperand() != U->get())
      return;

    // We are storing the pointer into a memory location, potentially escaping.
    // As an optimization, we try to detect simple cases where it doesn't
    // actually escape, for example:
    //   %ptr = alloca ..
    //   %addr = alloca ..
    //   store %ptr, %addr
    //   %x = load %addr
    //   ..
    // If %addr is only used by loading from it, we could simply treat %x as
    // another alias of %ptr, and not consider %ptr escaped.
    auto IsSimpleStoreThenLoad = [&]() {
      auto *AI = dyn_cast<AllocaInst>(SI.getPointerOperand());
      // If the memory location we are storing to is not an alloca, it
      // could be an alias of some other memory location, which is difficult
      // to analyze.
      if (!AI)
        return false;
      // StoreAliases contains aliases of the memory location stored into.
      SmallVector<Instruction *, 4> StoreAliases = {AI};
      while (!StoreAliases.empty()) {
        Instruction *I = StoreAliases.pop_back_val();
        for (User *U : I->users()) {
          // If we are loading from the memory location, we are creating an
          // alias of the original pointer.
          if (auto *LI = dyn_cast<LoadInst>(U)) {
            enqueueUsers(*LI);
            handleAlias(*LI);
            continue;
          }
          // If we are overwriting the memory location, the pointer certainly
          // won't escape.
          if (auto *S = dyn_cast<StoreInst>(U))
            if (S->getPointerOperand() == I)
              continue;
          if (auto *II = dyn_cast<IntrinsicInst>(U))
            if (II->isLifetimeStartOrEnd())
              continue;
          // BitCastInst creates aliases of the memory location being stored
          // into.
          if (auto *BI = dyn_cast<BitCastInst>(U)) {
            StoreAliases.push_back(BI);
            continue;
          }
          return false;
        }
      }

      return true;
    };

    if (!IsSimpleStoreThenLoad())
      PI.setEscaped(&SI);
  }

  // All mem intrinsics modify the data.
  void visitMemIntrinsic(MemIntrinsic &MI) { handleMayWrite(MI); }

  void visitBitCastInst(BitCastInst &BC) {
    Base::visitBitCastInst(BC);
    handleAlias(BC);
  }

  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
    Base::visitAddrSpaceCastInst(ASC);
    handleAlias(ASC);
  }

  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    // The base visitor will adjust Offset accordingly.
    Base::visitGetElementPtrInst(GEPI);
    handleAlias(GEPI);
  }

  void visitIntrinsicInst(IntrinsicInst &II) {
    // When we find that the lifetime markers refer to a
    // subrange of the original alloca, ignore the lifetime
    // markers to avoid misleading the analysis.
    if (II.getIntrinsicID() != Intrinsic::lifetime_start || !IsOffsetKnown ||
        !Offset.isZero())
      return Base::visitIntrinsicInst(II);
    LifetimeStarts.insert(&II);
  }

  void visitCallBase(CallBase &CB) {
    for (unsigned Op = 0, OpCount = CB.arg_size(); Op < OpCount; ++Op)
      if (U->get() == CB.getArgOperand(Op) && !CB.doesNotCapture(Op))
        PI.setEscaped(&CB);
    handleMayWrite(CB);
  }

  bool getShouldLiveOnFrame() const {
    if (!ShouldLiveOnFrame)
      ShouldLiveOnFrame = computeShouldLiveOnFrame();
    return *ShouldLiveOnFrame;
  }

  bool getMayWriteBeforeCoroBegin() const { return MayWriteBeforeCoroBegin; }

  DenseMap<Instruction *, std::optional<APInt>> getAliasesCopy() const {
    assert(getShouldLiveOnFrame() && "This method should only be called if the "
                                     "alloca needs to live on the frame.");
    for (const auto &P : AliasOffetMap)
      if (!P.second)
        report_fatal_error("Unable to handle an alias with unknown offset "
                           "created before CoroBegin.");
    return AliasOffetMap;
  }

private:
  const DominatorTree &DT;
  const CoroBeginInst &CoroBegin;
  const SuspendCrossingInfo &Checker;
  // All aliases to the original AllocaInst, created before CoroBegin and used
  // after CoroBegin. Each entry contains the instruction and the offset in the
  // original Alloca. They need to be recreated after CoroBegin off the frame.
  DenseMap<Instruction *, std::optional<APInt>> AliasOffetMap{};
  SmallPtrSet<Instruction *, 4> Users{};
  SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{};
  bool MayWriteBeforeCoroBegin{false};
  bool ShouldUseLifetimeStartInfo{true};

  mutable std::optional<bool> ShouldLiveOnFrame{};
    if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) {
      for (auto *I : Users)
        for (auto *S : LifetimeStarts)
          if (Checker.isDefinitionAcrossSuspend(*S, I))
            return true;
      // Addresses are guaranteed to be identical after every lifetime.start so
      // we cannot use the local stack if the address escaped and there is a
      // suspend point between lifetime markers. This should also cover the
      // case of a single lifetime.start intrinsic in a loop with a suspend
      // point.
      if (PI.isEscaped()) {
        for (auto *A : LifetimeStarts) {
          for (auto *B : LifetimeStarts) {
            if (Checker.hasPathOrLoopCrossingSuspendPoint(A->getParent(),
                                                          B->getParent()))
              return true;
          }
        }
      }
      return false;
    }
    // FIXME: Ideally the isEscaped check should come at the beginning.
    // However there are a few loose ends that need to be fixed first before
    // we can do that. We need to make sure we are not over-conservative, so
    // that the data accessed in-between await_suspend and symmetric transfer
    // is always put on the stack, and also data accessed after coro.end is
    // always put on the stack (esp the return object). To fix that, we need
    // to:
    //   1) Potentially treat sret as nocapture in calls
    //   2) Special handle the return object and put it on the stack
    //   3) Utilize lifetime.end intrinsic
    if (PI.isEscaped())
      return true;

    for (auto *U1 : Users)
      for (auto *U2 : Users)
        if (Checker.isDefinitionAcrossSuspend(*U1, U2))
          return true;

    return false;
  }

  void handleMayWrite(const Instruction &I) {
    if (!DT.dominates(&CoroBegin, &I))
      MayWriteBeforeCoroBegin = true;
  }

  bool usedAfterCoroBegin(Instruction &I) {
    for (auto &U : I.uses())
      if (DT.dominates(&CoroBegin, U))
        return true;
    return false;
  }

  void handleAlias(Instruction &I) {
    // We track all aliases created prior to CoroBegin but used after.
    // These aliases may need to be recreated after CoroBegin if the alloca
    // needs to live on the frame.
    if (DT.dominates(&CoroBegin, &I) || !usedAfterCoroBegin(I))
      return;

    if (!IsOffsetKnown) {
      AliasOffsetMap[&I].reset();
    } else {
      auto Itr = AliasOffsetMap.find(&I);
      if (Itr == AliasOffsetMap.end()) {
        AliasOffsetMap[&I] = Offset;
      } else if (Itr->second && *Itr->second != Offset) {
        // If we have seen two different possible values for this alias, we set
        // it to empty.
        AliasOffsetMap[&I].reset();
      }
    }
  }
};
} // namespace

// We need to make room to insert a spill after the initial PHIs, but before
// the catchswitch instruction. Placing the spill before the catchswitch
// would violate the requirement that a catchswitch, like all other EH pads,
// must be the first non-PHI instruction in a block.
//
// Split away the catchswitch into a separate block and insert in its place:
//
// cleanuppad <InsertPt> cleanupret.
//
// The cleanupret instruction will act as an insert point for the spill.
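//
// For illustration only, a hedged sketch of the resulting IR (all block and
// value names here are invented for this example):
//
//    bb:
//      %p = phi ...
//      %pad = cleanuppad within %parent []
//      ; spills are inserted here, right before the cleanupret
//      cleanupret from %pad unwind label %bb.split
//
//    bb.split:                   ; block split off by the function below
//      %cs = catchswitch within %parent [...] unwind ...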
static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) {
  BasicBlock *CurrentBlock = CatchSwitch->getParent();
  BasicBlock *NewBlock = CurrentBlock->splitBasicBlock(CatchSwitch);
  CurrentBlock->getTerminator()->eraseFromParent();

  auto *CleanupPad =
      CleanupPadInst::Create(CatchSwitch->getParentPad(), {}, "", CurrentBlock);
  auto *CleanupRet =
      CleanupReturnInst::Create(CleanupPad, NewBlock, CurrentBlock);
  return CleanupRet;
}

static void createFramePtr(coro::Shape &Shape) {
  auto *CB = Shape.CoroBegin;
  IRBuilder<> Builder(CB->getNextNode());
  StructType *FrameTy = Shape.FrameTy;
  PointerType *FramePtrTy = FrameTy->getPointerTo();
  Shape.FramePtr =
      cast<Instruction>(Builder.CreateBitCast(CB, FramePtrTy, "FramePtr"));
}

// Replace all alloca and SSA values that are accessed across suspend points
// with GetElementPointer from coroutine frame + loads and stores. Create an
// AllocaSpillBB that will become the new entry block for the resume parts of
// the coroutine:
//
//    %hdl = coro.begin(...)
//    whatever
//
// becomes:
//
//    %hdl = coro.begin(...)
//    %FramePtr = bitcast i8* hdl to %f.frame*
//    br label %AllocaSpillBB
//
//  AllocaSpillBB:
//    ; geps corresponding to allocas that were moved to coroutine frame
//    br label PostSpill
//
//  PostSpill:
//    whatever
//
//
static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
  auto *CB = Shape.CoroBegin;
  LLVMContext &C = CB->getContext();
  Function *F = CB->getFunction();
  IRBuilder<> Builder(C);
  StructType *FrameTy = Shape.FrameTy;
  Value *FramePtr = Shape.FramePtr;
  DominatorTree DT(*F);
  SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap;

  // Create a GEP with the given index into the coroutine frame for the
  // original value Orig. Appends an extra 0 index for array-allocas,
  // preserving the original type.
  auto GetFramePointer = [&](Value *Orig) -> Value * {
    FieldIDType Index = FrameData.getFieldIndex(Orig);
    SmallVector<Value *, 3> Indices = {
        ConstantInt::get(Type::getInt32Ty(C), 0),
        ConstantInt::get(Type::getInt32Ty(C), Index),
    };

    if (auto *AI = dyn_cast<AllocaInst>(Orig)) {
      if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) {
        auto Count = CI->getValue().getZExtValue();
        if (Count > 1) {
          Indices.push_back(ConstantInt::get(Type::getInt32Ty(C), 0));
        }
      } else {
        report_fatal_error("Coroutines cannot handle non-static allocas yet");
      }
    }

    auto GEP = cast<GetElementPtrInst>(
        Builder.CreateInBoundsGEP(FrameTy, FramePtr, Indices));
    if (auto *AI = dyn_cast<AllocaInst>(Orig)) {
      if (FrameData.getDynamicAlign(Orig) != 0) {
        assert(FrameData.getDynamicAlign(Orig) == AI->getAlign().value());
        auto *M = AI->getModule();
        auto *IntPtrTy = M->getDataLayout().getIntPtrType(AI->getType());
        auto *PtrValue = Builder.CreatePtrToInt(GEP, IntPtrTy);
        auto *AlignMask =
            ConstantInt::get(IntPtrTy, AI->getAlign().value() - 1);
        PtrValue = Builder.CreateAdd(PtrValue, AlignMask);
        PtrValue = Builder.CreateAnd(PtrValue, Builder.CreateNot(AlignMask));
        return Builder.CreateIntToPtr(PtrValue, AI->getType());
      }
      // If the type of the GEP is not equal to the type of the AllocaInst,
      // it implies that the AllocaInst may share its frame slot with another
      // AllocaInst. So we cast the GEP to the type of the AllocaInst here to
      // reuse the frame storage.
      //
      // Note: If we change the strategy for dealing with alignment, we need
      // to refine this casting.
      if (GEP->getType() != Orig->getType())
        return Builder.CreateBitCast(GEP, Orig->getType(),
                                     Orig->getName() + Twine(".cast"));
    }
    return GEP;
  };

  for (auto const &E : FrameData.Spills) {
    Value *Def = E.first;
    auto SpillAlignment = Align(FrameData.getAlign(Def));
    // Create a store instruction storing the value into the
    // coroutine frame.
    Instruction *InsertPt = nullptr;
    Type *ByValTy = nullptr;
    if (auto *Arg = dyn_cast<Argument>(Def)) {
      // For arguments, we will place the store instruction right after
      // the coroutine frame pointer instruction, i.e. the bitcast of
      // coro.begin from i8* to %f.frame*.
      InsertPt = Shape.getInsertPtAfterFramePtr();

      // If we're spilling an Argument, make sure we clear 'nocapture'
      // from the coroutine function.
      Arg->getParent()->removeParamAttr(Arg->getArgNo(), Attribute::NoCapture);

      if (Arg->hasByValAttr())
        ByValTy = Arg->getParamByValType();
    } else if (auto *CSI = dyn_cast<AnyCoroSuspendInst>(Def)) {
      // Don't spill immediately after a suspend; splitting assumes
      // that the suspend will be followed by a branch.
      InsertPt = CSI->getParent()->getSingleSuccessor()->getFirstNonPHI();
    } else {
      auto *I = cast<Instruction>(Def);
      if (!DT.dominates(CB, I)) {
        // If it is not dominated by CoroBegin, then the spill should be
        // inserted immediately after the coroutine frame pointer is computed.
        InsertPt = Shape.getInsertPtAfterFramePtr();
      } else if (auto *II = dyn_cast<InvokeInst>(I)) {
        // If we are spilling the result of the invoke instruction, split
        // the normal edge and insert the spill in the new block.
        auto *NewBB = SplitEdge(II->getParent(), II->getNormalDest());
        InsertPt = NewBB->getTerminator();
      } else if (isa<PHINode>(I)) {
        // Skip the PHINodes and EH pad instructions.
        BasicBlock *DefBlock = I->getParent();
        if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator()))
          InsertPt = splitBeforeCatchSwitch(CSI);
        else
          InsertPt = &*DefBlock->getFirstInsertionPt();
      } else {
        assert(!I->isTerminator() && "unexpected terminator");
        // For all other values, the spill is placed immediately after
        // the definition.
        InsertPt = I->getNextNode();
      }
    }

    auto Index = FrameData.getFieldIndex(Def);
    Builder.SetInsertPoint(InsertPt);
    auto *G = Builder.CreateConstInBoundsGEP2_32(
        FrameTy, FramePtr, 0, Index, Def->getName() + Twine(".spill.addr"));
    if (ByValTy) {
      // For byval arguments, we need to store the pointed-to value in the
      // frame, instead of the pointer itself.
      auto *Value = Builder.CreateLoad(ByValTy, Def);
      Builder.CreateAlignedStore(Value, G, SpillAlignment);
    } else {
      Builder.CreateAlignedStore(Def, G, SpillAlignment);
    }

    BasicBlock *CurrentBlock = nullptr;
    Value *CurrentReload = nullptr;
    for (auto *U : E.second) {
      // If we have not seen the use block, create a load instruction to
      // reload the spilled value from the coroutine frame.
      if (CurrentBlock != U->getParent()) {
        CurrentBlock = U->getParent();
        Builder.SetInsertPoint(&*CurrentBlock->getFirstInsertionPt());

        auto *GEP = GetFramePointer(E.first);
        GEP->setName(E.first->getName() + Twine(".reload.addr"));
        if (ByValTy)
          CurrentReload = GEP;
        else
          CurrentReload = Builder.CreateAlignedLoad(
              FrameTy->getElementType(FrameData.getFieldIndex(E.first)), GEP,
              SpillAlignment, E.first->getName() + Twine(".reload"));

        TinyPtrVector<DbgDeclareInst *> DIs = FindDbgDeclareUses(Def);
        // Try our best to find a dbg.declare. If the spill is a temp, there
        // may not be a direct dbg.declare. Walk up the load chain to find one
        // from an alias.
        if (F->getSubprogram()) {
          auto *CurDef = Def;
          while (DIs.empty() && isa<LoadInst>(CurDef)) {
            auto *LdInst = cast<LoadInst>(CurDef);
            // Only consider ptr-to-ptr loads of the same type.
            if (LdInst->getPointerOperandType() != LdInst->getType())
              break;
            CurDef = LdInst->getPointerOperand();
            DIs = FindDbgDeclareUses(CurDef);
          }
        }

        for (DbgDeclareInst *DDI : DIs) {
          bool AllowUnresolved = false;
          // This dbg.declare is preserved for all coro-split function
          // fragments. It will be unreachable in the main function, and
          // processed by coro::salvageDebugInfo() by CoroCloner.
          DIBuilder(*CurrentBlock->getParent()->getParent(), AllowUnresolved)
              .insertDeclare(CurrentReload, DDI->getVariable(),
                             DDI->getExpression(), DDI->getDebugLoc(),
                             &*Builder.GetInsertPoint());
          // This dbg.declare is for the main function entry point. It
          // will be deleted in all coro-split functions.
          coro::salvageDebugInfo(ArgToAllocaMap, DDI, Shape.OptimizeFrame);
        }
      }

      // If we have a single-edge PHINode, remove it and replace it with a
      // reload from the coroutine frame. (We already took care of multi-edge
      // PHINodes by rewriting them in the rewritePHIs function.)
      if (auto *PN = dyn_cast<PHINode>(U)) {
        assert(PN->getNumIncomingValues() == 1 &&
               "unexpected number of incoming "
               "values in the PHINode");
        PN->replaceAllUsesWith(CurrentReload);
        PN->eraseFromParent();
        continue;
      }

      // Replace all uses of CurrentValue in the current instruction with
      // reload.
      U->replaceUsesOfWith(Def, CurrentReload);
    }
  }

  BasicBlock *FramePtrBB = Shape.getInsertPtAfterFramePtr()->getParent();

  auto SpillBlock = FramePtrBB->splitBasicBlock(
      Shape.getInsertPtAfterFramePtr(), "AllocaSpillBB");
  SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill");
  Shape.AllocaSpillBlock = SpillBlock;

  // retcon and retcon.once lowering assumes all uses have been sunk.
  if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce ||
      Shape.ABI == coro::ABI::Async) {
    // If we found any allocas, replace all of their remaining uses with GEPs.
    Builder.SetInsertPoint(&SpillBlock->front());
    for (const auto &P : FrameData.Allocas) {
      AllocaInst *Alloca = P.Alloca;
      auto *G = GetFramePointer(Alloca);

      // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G))
      // here, as we are changing the location of the instruction.
      G->takeName(Alloca);
      Alloca->replaceAllUsesWith(G);
      Alloca->eraseFromParent();
    }
    return;
  }

  // If we found any allocas, replace all of their remaining uses with GEP
  // instructions. To preserve debuggability, we replace the uses of allocas
  // in dbg.declares and dbg.values with the reload from the frame.
  // Note: We cannot replace the alloca with GEP instructions indiscriminately,
  // as some of the uses may not be dominated by CoroBegin.
  Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front());
  SmallVector<Instruction *, 4> UsersToUpdate;
  for (const auto &A : FrameData.Allocas) {
    AllocaInst *Alloca = A.Alloca;
    UsersToUpdate.clear();
    for (User *U : Alloca->users()) {
      auto *I = cast<Instruction>(U);
      if (DT.dominates(CB, I))
        UsersToUpdate.push_back(I);
    }
    if (UsersToUpdate.empty())
      continue;
    auto *G = GetFramePointer(Alloca);
    G->setName(Alloca->getName() + Twine(".reload.addr"));

    SmallVector<DbgVariableIntrinsic *, 4> DIs;
    findDbgUsers(DIs, Alloca);
    for (auto *DVI : DIs)
      DVI->replaceUsesOfWith(Alloca, G);

    for (Instruction *I : UsersToUpdate) {
      // It is meaningless to retain lifetime intrinsics that refer to members
      // of the coroutine frame, and such meaningless lifetime intrinsics may
      // block further optimizations.
      if (I->isLifetimeStartOrEnd()) {
        I->eraseFromParent();
        continue;
      }

      I->replaceUsesOfWith(Alloca, G);
    }
  }
  Builder.SetInsertPoint(Shape.getInsertPtAfterFramePtr());
  for (const auto &A : FrameData.Allocas) {
    AllocaInst *Alloca = A.Alloca;
    if (A.MayWriteBeforeCoroBegin) {
      // isEscaped really means potentially modified before CoroBegin.
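      // For illustration, a hedged sketch of the copy-in emitted below
      // (names invented for the sketch): it loads the current contents of
      // the alloca and stores them into the alloca's frame slot, so writes
      // made before CoroBegin remain visible through the frame afterwards:
      //    %v = load <ty>, ptr %alloca
      //    store <ty> %v, ptr %alloca.frame.gep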
      if (Alloca->isArrayAllocation())
        report_fatal_error(
            "Coroutines cannot handle copying of array allocas yet");

      auto *G = GetFramePointer(Alloca);
      auto *Value = Builder.CreateLoad(Alloca->getAllocatedType(), Alloca);
      Builder.CreateStore(Value, G);
    }
    // For each alias of Alloca created before CoroBegin but used after
    // CoroBegin, we recreate it after CoroBegin by applying the offset
    // to the pointer in the frame.
    for (const auto &Alias : A.Aliases) {
      auto *FramePtr = GetFramePointer(Alloca);
      auto *FramePtrRaw =
          Builder.CreateBitCast(FramePtr, Type::getInt8PtrTy(C));
      auto &Value = *Alias.second;
      auto ITy = IntegerType::get(C, Value.getBitWidth());
      auto *AliasPtr = Builder.CreateGEP(Type::getInt8Ty(C), FramePtrRaw,
                                         ConstantInt::get(ITy, Value));
      auto *AliasPtrTyped =
          Builder.CreateBitCast(AliasPtr, Alias.first->getType());
      Alias.first->replaceUsesWithIf(
          AliasPtrTyped, [&](Use &U) { return DT.dominates(CB, U); });
    }
  }

  // PromiseAlloca is not collected in FrameData.Allocas, so the code above
  // does not handle the case where the PromiseAlloca has writes before
  // CoroBegin. This may be problematic in edge cases. See
  // https://github.com/llvm/llvm-project/issues/57861 for an example.
  if (Shape.ABI == coro::ABI::Switch && Shape.SwitchLowering.PromiseAlloca) {
    AllocaInst *PA = Shape.SwitchLowering.PromiseAlloca;
    // Check whether there are any accesses to the promise alloca before
    // CoroBegin.
    bool HasAccessingPromiseBeforeCB = llvm::any_of(PA->uses(), [&](Use &U) {
      auto *Inst = dyn_cast<Instruction>(U.getUser());
      if (!Inst || DT.dominates(CB, Inst))
        return false;

      if (auto *CI = dyn_cast<CallInst>(Inst)) {
        // It is fine if the call wouldn't write to the Promise.
        // This is possible for @llvm.coro.id intrinsics, which
        // take the promise as the second argument as a marker.
        if (CI->onlyReadsMemory() ||
            CI->onlyReadsMemory(CI->getArgOperandNo(&U)))
          return false;
        return true;
      }

      return isa<StoreInst>(Inst) ||
             // It may take too much time to track the uses.
             // Be conservative in case the use may escape.
             isa<GetElementPtrInst>(Inst) ||
             // Before we enabled opaque pointers there would always be a
             // bitcast for the promise alloca; now that opaque pointers are
             // enabled by default, this should be fine.
             isa<BitCastInst>(Inst);
    });
    if (HasAccessingPromiseBeforeCB) {
      Builder.SetInsertPoint(Shape.getInsertPtAfterFramePtr());
      auto *G = GetFramePointer(PA);
      auto *Value = Builder.CreateLoad(PA->getAllocatedType(), PA);
      Builder.CreateStore(Value, G);
    }
  }
}

// Moves the values in the PHIs in SuccBB that correspond to PredBB into a new
// PHI in InsertedBB.
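//
// As a hedged illustration (all names invented for this sketch): if SuccBB
// contains
//    %v = phi i32 [ %a, %InsertedBB ], [ %b, %Other ]
// then this creates a single-value PHI at the front of InsertedBB
//    %a.SuccBB = phi i32 [ %a, %PredBB ]
// and rewrites SuccBB's PHI to take %a.SuccBB from InsertedBB instead.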
static void movePHIValuesToInsertedBlock(BasicBlock *SuccBB,
                                         BasicBlock *InsertedBB,
                                         BasicBlock *PredBB,
                                         PHINode *UntilPHI = nullptr) {
  auto *PN = cast<PHINode>(&SuccBB->front());
  do {
    int Index = PN->getBasicBlockIndex(InsertedBB);
    Value *V = PN->getIncomingValue(Index);
    PHINode *InputV = PHINode::Create(
        V->getType(), 1, V->getName() + Twine(".") + SuccBB->getName(),
        &InsertedBB->front());
    InputV->addIncoming(V, PredBB);
    PN->setIncomingValue(Index, InputV);
    PN = dyn_cast<PHINode>(PN->getNextNode());
  } while (PN != UntilPHI);
}

// Rewrites the PHI Nodes in a cleanuppad.
static void rewritePHIsForCleanupPad(BasicBlock *CleanupPadBB,
                                     CleanupPadInst *CleanupPad) {
  // For every incoming edge to a CleanupPad we will create a new block holding
  // all incoming values in single-value PHI nodes. We will then create another
  // block to act as a dispatcher (as all unwind edges for related EH blocks
  // must be the same).
  //
  // cleanuppad:
  //    %2 = phi i32[%0, %catchswitch], [%1, %catch.1]
  //    %3 = cleanuppad within none []
  //
  // It will create:
  //
  // cleanuppad.corodispatch:
  //    %2 = phi i8[0, %catchswitch], [1, %catch.1]
  //    %3 = cleanuppad within none []
  //    switch i8 %2, label %unreachable
  //            [i8 0, label %cleanuppad.from.catchswitch
  //             i8 1, label %cleanuppad.from.catch.1]
  // cleanuppad.from.catchswitch:
  //    %4 = phi i32 [%0, %catchswitch]
  //    br label %cleanuppad
  // cleanuppad.from.catch.1:
  //    %6 = phi i32 [%1, %catch.1]
  //    br label %cleanuppad
  // cleanuppad:
  //    %8 = phi i32 [%4, %cleanuppad.from.catchswitch],
  //                 [%6, %cleanuppad.from.catch.1]

  // Unreachable BB, in case switching on an invalid value in the dispatcher.
  auto *UnreachBB = BasicBlock::Create(
      CleanupPadBB->getContext(), "unreachable", CleanupPadBB->getParent());
  IRBuilder<> Builder(UnreachBB);
  Builder.CreateUnreachable();

  // Create a new cleanuppad which will be the dispatcher.
  auto *NewCleanupPadBB =
      BasicBlock::Create(CleanupPadBB->getContext(),
                         CleanupPadBB->getName() + Twine(".corodispatch"),
                         CleanupPadBB->getParent(), CleanupPadBB);
  Builder.SetInsertPoint(NewCleanupPadBB);
  auto *SwitchType = Builder.getInt8Ty();
  auto *SetDispatchValuePN =
      Builder.CreatePHI(SwitchType, pred_size(CleanupPadBB));
  CleanupPad->removeFromParent();
  CleanupPad->insertAfter(SetDispatchValuePN);
  auto *SwitchOnDispatch = Builder.CreateSwitch(SetDispatchValuePN, UnreachBB,
                                                pred_size(CleanupPadBB));

  int SwitchIndex = 0;
  SmallVector<BasicBlock *, 8> Preds(predecessors(CleanupPadBB));
  for (BasicBlock *Pred : Preds) {
    // Create a new cleanuppad and move the PHI values to there.
    auto *CaseBB = BasicBlock::Create(CleanupPadBB->getContext(),
                                      CleanupPadBB->getName() +
                                          Twine(".from.") + Pred->getName(),
                                      CleanupPadBB->getParent(), CleanupPadBB);
    updatePhiNodes(CleanupPadBB, Pred, CaseBB);
    CaseBB->setName(CleanupPadBB->getName() + Twine(".from.") +
                    Pred->getName());
    Builder.SetInsertPoint(CaseBB);
    Builder.CreateBr(CleanupPadBB);
    movePHIValuesToInsertedBlock(CleanupPadBB, CaseBB, NewCleanupPadBB);

    // Update this Pred to the new unwind point.
    setUnwindEdgeTo(Pred->getTerminator(), NewCleanupPadBB);

    // Setup the switch in the dispatcher.
    auto *SwitchConstant = ConstantInt::get(SwitchType, SwitchIndex);
    SetDispatchValuePN->addIncoming(SwitchConstant, Pred);
    SwitchOnDispatch->addCase(SwitchConstant, CaseBB);
    SwitchIndex++;
  }
}

static void cleanupSinglePredPHIs(Function &F) {
  SmallVector<PHINode *, 32> Worklist;
  for (auto &BB : F) {
    for (auto &Phi : BB.phis()) {
      if (Phi.getNumIncomingValues() == 1) {
        Worklist.push_back(&Phi);
      } else
        break;
    }
  }
  while (!Worklist.empty()) {
    auto *Phi = Worklist.pop_back_val();
    auto *OriginalValue = Phi->getIncomingValue(0);
    Phi->replaceAllUsesWith(OriginalValue);
  }
}

static void rewritePHIs(BasicBlock &BB) {
  // For every incoming edge we will create a block holding all
  // incoming values in single-value PHI nodes.
  //
  // loop:
  //    %n.val = phi i32[%n, %entry], [%inc, %loop]
  //
  // It will create:
  //
  // loop.from.entry:
  //    %n.loop.pre = phi i32 [%n, %entry]
  //    br label %loop
  // loop.from.loop:
  //    %inc.loop.pre = phi i32 [%inc, %loop]
  //    br label %loop
  //
  // After this rewrite, further analysis will ignore any phi nodes with more
  // than one incoming edge.

  // TODO: Simplify PHINodes in the basic block to remove duplicate
  // predecessors.

  // Special case for CleanupPad: all EH blocks must have the same unwind edge
  // so we need to create an additional "dispatcher" block.
  if (auto *CleanupPad =
          dyn_cast_or_null<CleanupPadInst>(BB.getFirstNonPHI())) {
    SmallVector<BasicBlock *, 8> Preds(predecessors(&BB));
    for (BasicBlock *Pred : Preds) {
      if (CatchSwitchInst *CS =
              dyn_cast<CatchSwitchInst>(Pred->getTerminator())) {
        // CleanupPad with a CatchSwitch predecessor: therefore this is an
        // unwind destination that needs to be handled specially.
        assert(CS->getUnwindDest() == &BB);
        (void)CS;
        rewritePHIsForCleanupPad(&BB, CleanupPad);
        return;
      }
    }
  }

  LandingPadInst *LandingPad = nullptr;
  PHINode *ReplPHI = nullptr;
  if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) {
    // ehAwareSplitEdge will clone the LandingPad in all the edge blocks.
    // We replace the original landing pad with a PHINode that will collect the
    // results from all of them.
    ReplPHI = PHINode::Create(LandingPad->getType(), 1, "", LandingPad);
    ReplPHI->takeName(LandingPad);
    LandingPad->replaceAllUsesWith(ReplPHI);
    // We will erase the original landing pad at the end of this function after
    // ehAwareSplitEdge cloned it in the transition blocks.
  }

  SmallVector<BasicBlock *, 8> Preds(predecessors(&BB));
  for (BasicBlock *Pred : Preds) {
    auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
    IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());

    // Stop the moving of values at ReplPHI, as this is either null or the PHI
    // that replaced the landing pad.
    movePHIValuesToInsertedBlock(&BB, IncomingBB, Pred, ReplPHI);
  }

  if (LandingPad) {
    // Calls to ehAwareSplitEdge cloned the original landing pad, so we no
    // longer need it.
    LandingPad->eraseFromParent();
  }
}

static void rewritePHIs(Function &F) {
  SmallVector<BasicBlock *, 8> WorkList;

  for (BasicBlock &BB : F)
    if (auto *PN = dyn_cast<PHINode>(&BB.front()))
      if (PN->getNumIncomingValues() > 1)
        WorkList.push_back(&BB);

  for (BasicBlock *BB : WorkList)
    rewritePHIs(*BB);
}

/// Default materializable callback
// Check for instructions that we can recreate on resume as opposed to
// spilling the result into a coroutine frame.
bool coro::defaultMaterializable(Instruction &V) {
  return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
          isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V));
}

// Check for structural coroutine intrinsics that should not be spilled into
// the coroutine frame.
static bool isCoroutineStructureIntrinsic(Instruction &I) {
  return isa<CoroIdInst>(&I) || isa<CoroSaveInst>(&I) ||
         isa<CoroSuspendInst>(&I);
}

// For each instruction identified as materializable across the suspend point,
// and its associated DAG of other rematerializable instructions,
// recreate the DAG of instructions after the suspend point.
static void rewriteMaterializableInstructions(
    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8>
        &AllRemats) {
  // This has to be done in 2 phases:
  // Do the remats and record the required defs to be replaced in the
  // original use instructions.
  // Once all the remats are complete, replace the uses in the final
  // instructions with the new defs.
  typedef struct {
    Instruction *Use;
    Instruction *Def;
    Instruction *Remat;
  } ProcessNode;

  SmallVector<ProcessNode> FinalInstructionsToProcess;

  for (const auto &E : AllRemats) {
    Instruction *Use = E.first;
    Instruction *CurrentMaterialization = nullptr;
    RematGraph *RG = E.second.get();
    ReversePostOrderTraversal<RematGraph *> RPOT(RG);
    SmallVector<Instruction *> InstructionsToProcess;

    // If the target use is actually a suspend instruction then we have to
    // insert the remats into the end of the predecessor (there should only be
    // one). This is so that suspend blocks always have the suspend instruction
    // as the first instruction.
    auto InsertPoint = &*Use->getParent()->getFirstInsertionPt();
    if (isa<AnyCoroSuspendInst>(Use)) {
      BasicBlock *SuspendPredecessorBlock =
          Use->getParent()->getSinglePredecessor();
      assert(SuspendPredecessorBlock && "malformed coro suspend instruction");
      InsertPoint = SuspendPredecessorBlock->getTerminator();
    }

    // Note: skip the first instruction as this is the actual use that we're
    // rematerializing everything for.
    auto I = RPOT.begin();
    ++I;
    for (; I != RPOT.end(); ++I) {
      Instruction *D = (*I)->Node;
      CurrentMaterialization = D->clone();
      CurrentMaterialization->setName(D->getName());
      CurrentMaterialization->insertBefore(InsertPoint);
      InsertPoint = CurrentMaterialization;

      // Replace all uses of Def in the instructions being added as part of
      // this rematerialization group.
      for (auto &I : InstructionsToProcess)
        I->replaceUsesOfWith(D, CurrentMaterialization);

      // Don't replace the final use at this point as this can cause problems
      // for other materializations.
      // Instead, for any final use that uses a def that's being
      // rematerialized, record the replacement values.
      for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i)
        if (Use->getOperand(i) == D) // Is this operand pointing to oldval?
          FinalInstructionsToProcess.push_back(
              {Use, D, CurrentMaterialization});

      InstructionsToProcess.push_back(CurrentMaterialization);
    }
  }

  // Finally, replace the uses with the defs that we've just rematerialized.
  for (auto &R : FinalInstructionsToProcess) {
    if (auto *PN = dyn_cast<PHINode>(R.Use)) {
      assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
                                                "values in the PHINode");
      PN->replaceAllUsesWith(R.Remat);
      PN->eraseFromParent();
      continue;
    }
    R.Use->replaceUsesOfWith(R.Def, R.Remat);
  }
}

// Splits the block at a particular instruction unless it is the first
// instruction in the block with a single predecessor.
static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) {
  auto *BB = I->getParent();
  if (&BB->front() == I) {
    if (BB->getSinglePredecessor()) {
      BB->setName(Name);
      return BB;
    }
  }
  return BB->splitBasicBlock(I, Name);
}

// Split above and below a particular instruction so that it
// will be all alone by itself in a block.
static void splitAround(Instruction *I, const Twine &Name) {
  splitBlockIfNotFirst(I, Name);
  splitBlockIfNotFirst(I->getNextNode(), "After" + Name);
}

static bool isSuspendBlock(BasicBlock *BB) {
  return isa<AnyCoroSuspendInst>(BB->front());
}

typedef SmallPtrSet<BasicBlock*, 8> VisitedBlocksSet;

/// Does control flow starting at the given block ever reach a suspend
/// instruction before reaching a block in VisitedOrFreeBBs?
static bool isSuspendReachableFrom(BasicBlock *From,
                                   VisitedBlocksSet &VisitedOrFreeBBs) {
  // Eagerly try to add this block to the visited set. If it's already
  // there, stop recursing; this path doesn't reach a suspend before
  // either looping or reaching a freeing block.
  if (!VisitedOrFreeBBs.insert(From).second)
    return false;

  // We assume that we'll already have split suspends into their own blocks.
  if (isSuspendBlock(From))
    return true;

  // Recurse on the successors.
  for (auto *Succ : successors(From)) {
    if (isSuspendReachableFrom(Succ, VisitedOrFreeBBs))
      return true;
  }

  return false;
}

/// Is the given alloca "local", i.e. bounded in lifetime to not cross a
/// suspend point?
static bool isLocalAlloca(CoroAllocaAllocInst *AI) {
  // Seed the visited set with all the basic blocks containing a free
  // so that we won't pass them up.
  VisitedBlocksSet VisitedOrFreeBBs;
  for (auto *User : AI->users()) {
    if (auto FI = dyn_cast<CoroAllocaFreeInst>(User))
      VisitedOrFreeBBs.insert(FI->getParent());
  }

  return !isSuspendReachableFrom(AI->getParent(), VisitedOrFreeBBs);
}

/// After we split the coroutine, will the given basic block be along
/// an obvious exit path for the resumption function?
static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB,
                                              unsigned depth = 3) {
  // If we've bottomed out our depth count, stop searching and assume
  // that the path might loop back.
  if (depth == 0) return false;

  // If this is a suspend block, we're about to exit the resumption function.
  if (isSuspendBlock(BB)) return true;

  // Recurse into the successors.
  for (auto *Succ : successors(BB)) {
    if (!willLeaveFunctionImmediatelyAfter(Succ, depth - 1))
      return false;
  }

  // If none of the successors leads back in a loop, we're on an exit/abort.
  return true;
}

static bool localAllocaNeedsStackSave(CoroAllocaAllocInst *AI) {
  // Look for a free that isn't sufficiently obviously followed by
  // either a suspend or a termination, i.e. something that will leave
  // the coro resumption frame.
  for (auto *U : AI->users()) {
    auto FI = dyn_cast<CoroAllocaFreeInst>(U);
    if (!FI) continue;

    if (!willLeaveFunctionImmediatelyAfter(FI->getParent()))
      return true;
  }

  // If we never found one, we don't need a stack save.
  return false;
}

/// Turn each of the given local allocas into a normal (dynamic) alloca
/// instruction.
static void lowerLocalAllocas(ArrayRef<CoroAllocaAllocInst*> LocalAllocas,
                              SmallVectorImpl<Instruction*> &DeadInsts) {
  for (auto *AI : LocalAllocas) {
    auto M = AI->getModule();
    IRBuilder<> Builder(AI);

    // Save the stack depth. Try to avoid doing this if the stackrestore
    // is going to immediately precede a return or something.
    Value *StackSave = nullptr;
    if (localAllocaNeedsStackSave(AI))
      StackSave = Builder.CreateCall(
          Intrinsic::getDeclaration(M, Intrinsic::stacksave));

    // Allocate memory.
    auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize());
    Alloca->setAlignment(AI->getAlignment());

    for (auto *U : AI->users()) {
      // Replace gets with the allocation.
      if (isa<CoroAllocaGetInst>(U)) {
        U->replaceAllUsesWith(Alloca);

        // Replace frees with stackrestores. This is safe because
        // alloca.alloc is required to obey a stack discipline, although we
        // don't enforce that structurally.
      } else {
        auto FI = cast<CoroAllocaFreeInst>(U);
        if (StackSave) {
          Builder.SetInsertPoint(FI);
          Builder.CreateCall(
              Intrinsic::getDeclaration(M, Intrinsic::stackrestore),
              StackSave);
        }
      }
      DeadInsts.push_back(cast<Instruction>(U));
    }

    DeadInsts.push_back(AI);
  }
}

/// Turn the given coro.alloca.alloc call into a dynamic allocation.
/// This happens during the all-instructions iteration, so it must not
/// delete the call.
static Instruction *lowerNonLocalAlloca(CoroAllocaAllocInst *AI,
                                        coro::Shape &Shape,
                                        SmallVectorImpl<Instruction*> &DeadInsts) {
  IRBuilder<> Builder(AI);
  auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr);

  for (User *U : AI->users()) {
    if (isa<CoroAllocaGetInst>(U)) {
      U->replaceAllUsesWith(Alloc);
    } else {
      auto FI = cast<CoroAllocaFreeInst>(U);
      Builder.SetInsertPoint(FI);
      Shape.emitDealloc(Builder, Alloc, nullptr);
    }
    DeadInsts.push_back(cast<Instruction>(U));
  }

  // Push this on last so that it gets deleted after all the others.
  DeadInsts.push_back(AI);

  // Return the new allocation value so that we can check for needed spills.
  return cast<Instruction>(Alloc);
}

/// Get the current swifterror value.
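///
/// As a hedged illustration (the pointee type is just an example): this emits
/// a call through a null function pointer, e.g.
///    %v = call %swift.error* null()
/// The call is recorded in Shape.SwiftErrorOps so that it can be replaced
/// with the real swifterror access when the coroutine is split.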
static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy,
                                     coro::Shape &Shape) {
  // Make a fake function pointer as a sort of intrinsic.
  auto FnTy = FunctionType::get(ValueTy, {}, false);
  auto Fn = ConstantPointerNull::get(FnTy->getPointerTo());

  auto Call = Builder.CreateCall(FnTy, Fn, {});
  Shape.SwiftErrorOps.push_back(Call);

  return Call;
}

/// Set the given value as the current swifterror value.
///
/// Returns a slot that can be used as a swifterror slot.
static Value *emitSetSwiftErrorValue(IRBuilder<> &Builder, Value *V,
                                     coro::Shape &Shape) {
  // Make a fake function pointer as a sort of intrinsic.
  auto FnTy = FunctionType::get(V->getType()->getPointerTo(),
                                {V->getType()}, false);
  auto Fn = ConstantPointerNull::get(FnTy->getPointerTo());

  auto Call = Builder.CreateCall(FnTy, Fn, { V });
  Shape.SwiftErrorOps.push_back(Call);

  return Call;
}

/// Set the swifterror value from the given alloca before a call,
/// then put it back in the alloca afterwards.
///
/// Returns an address that will stand in for the swifterror slot
/// until splitting.
static Value *emitSetAndGetSwiftErrorValueAround(Instruction *Call,
                                                 AllocaInst *Alloca,
                                                 coro::Shape &Shape) {
  auto ValueTy = Alloca->getAllocatedType();
  IRBuilder<> Builder(Call);

  // Load the current value from the alloca and set it as the
  // swifterror value.
  auto ValueBeforeCall = Builder.CreateLoad(ValueTy, Alloca);
  auto Addr = emitSetSwiftErrorValue(Builder, ValueBeforeCall, Shape);

  // Move to after the call. Since swifterror only has a guaranteed
  // value on normal exits, we can ignore implicit and explicit unwind
  // edges.
  if (isa<CallInst>(Call)) {
    Builder.SetInsertPoint(Call->getNextNode());
  } else {
    auto Invoke = cast<InvokeInst>(Call);
    Builder.SetInsertPoint(Invoke->getNormalDest()->getFirstNonPHIOrDbg());
  }

  // Get the current swifterror value and store it to the alloca.
  auto ValueAfterCall = emitGetSwiftErrorValue(Builder, ValueTy, Shape);
  Builder.CreateStore(ValueAfterCall, Alloca);

  return Addr;
}

/// Eliminate a formerly-swifterror alloca by inserting the get/set
/// intrinsics and attempting to MemToReg the alloca away.
static void eliminateSwiftErrorAlloca(Function &F, AllocaInst *Alloca,
                                      coro::Shape &Shape) {
  for (Use &Use : llvm::make_early_inc_range(Alloca->uses())) {
    // swifterror values can only be used in very specific ways.
    // We take advantage of that here.
    auto User = Use.getUser();
    if (isa<LoadInst>(User) || isa<StoreInst>(User))
      continue;

    assert(isa<CallInst>(User) || isa<InvokeInst>(User));
    auto Call = cast<Instruction>(User);

    auto Addr = emitSetAndGetSwiftErrorValueAround(Call, Alloca, Shape);

    // Use the returned slot address as the call argument.
    Use.set(Addr);
  }

  // All the uses should be loads and stores now.
  assert(isAllocaPromotable(Alloca));
}

/// "Eliminate" a swifterror argument by reducing it to the alloca case
/// and then loading and storing in the prologue and epilogue.
///
/// The argument keeps the swifterror flag.
static void eliminateSwiftErrorArgument(Function &F, Argument &Arg,
                                        coro::Shape &Shape,
                                        SmallVectorImpl<AllocaInst*> &AllocasToPromote) {
  IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg());

  auto ArgTy = cast<PointerType>(Arg.getType());
  auto ValueTy = PointerType::getUnqual(F.getContext());

  // Reduce to the alloca case:

  // Create an alloca and replace all uses of the arg with it.
  auto Alloca = Builder.CreateAlloca(ValueTy, ArgTy->getAddressSpace());
  Arg.replaceAllUsesWith(Alloca);

  // Set an initial value in the alloca. swifterror is always null on entry.
  auto InitialValue = Constant::getNullValue(ValueTy);
  Builder.CreateStore(InitialValue, Alloca);

  // Find all the suspends in the function and save and restore around them.
  for (auto *Suspend : Shape.CoroSuspends) {
    (void) emitSetAndGetSwiftErrorValueAround(Suspend, Alloca, Shape);
  }

  // Find all the coro.ends in the function and restore the error value.
  for (auto *End : Shape.CoroEnds) {
    Builder.SetInsertPoint(End);
    auto FinalValue = Builder.CreateLoad(ValueTy, Alloca);
    (void) emitSetSwiftErrorValue(Builder, FinalValue, Shape);
  }

  // Now we can use the alloca logic.
  AllocasToPromote.push_back(Alloca);
  eliminateSwiftErrorAlloca(F, Alloca, Shape);
}

/// Eliminate all problematic uses of swifterror arguments and allocas
/// from the function. We'll fix them up later when splitting the function.
static void eliminateSwiftError(Function &F, coro::Shape &Shape) {
  SmallVector<AllocaInst*, 4> AllocasToPromote;

  // Look for a swifterror argument.
  for (auto &Arg : F.args()) {
    if (!Arg.hasSwiftErrorAttr()) continue;

    eliminateSwiftErrorArgument(F, Arg, Shape, AllocasToPromote);
    break;
  }

  // Look for swifterror allocas.
  for (auto &Inst : F.getEntryBlock()) {
    auto Alloca = dyn_cast<AllocaInst>(&Inst);
    if (!Alloca || !Alloca->isSwiftError()) continue;

    // Clear the swifterror flag.
    Alloca->setSwiftError(false);

    AllocasToPromote.push_back(Alloca);
    eliminateSwiftErrorAlloca(F, Alloca, Shape);
  }

  // If we have any allocas to promote, compute a dominator tree and
  // promote them en masse.
  if (!AllocasToPromote.empty()) {
    DominatorTree DT(F);
    PromoteMemToReg(AllocasToPromote, DT);
  }
}

/// retcon and retcon.once conventions assume that all spill uses can be sunk
/// after the coro.begin intrinsic.
static void sinkSpillUsesAfterCoroBegin(Function &F,
                                        const FrameDataInfo &FrameData,
                                        CoroBeginInst *CoroBegin) {
  DominatorTree Dom(F);

  SmallSetVector<Instruction *, 32> ToMove;
  SmallVector<Instruction *, 32> Worklist;

  // Collect all users that precede coro.begin.
  for (auto *Def : FrameData.getAllDefs()) {
    for (User *U : Def->users()) {
      auto Inst = cast<Instruction>(U);
      if (Inst->getParent() != CoroBegin->getParent() ||
          Dom.dominates(CoroBegin, Inst))
        continue;
      if (ToMove.insert(Inst))
        Worklist.push_back(Inst);
    }
  }
  // Recursively collect users before coro.begin.
  while (!Worklist.empty()) {
    auto *Def = Worklist.pop_back_val();
    for (User *U : Def->users()) {
      auto Inst = cast<Instruction>(U);
      if (Dom.dominates(CoroBegin, Inst))
        continue;
      if (ToMove.insert(Inst))
        Worklist.push_back(Inst);
    }
  }

  // Sort by dominance.
  SmallVector<Instruction *, 64> InsertionList(ToMove.begin(), ToMove.end());
  llvm::sort(InsertionList, [&Dom](Instruction *A, Instruction *B) -> bool {
    // If A dominates B, it should precede (<) B.
    return Dom.dominates(A, B);
  });

  Instruction *InsertPt = CoroBegin->getNextNode();
  for (Instruction *Inst : InsertionList)
    Inst->moveBefore(InsertPt);
}

/// For each local variable whose users all live within a single suspended
/// region, sink its lifetime.start marker to the block just after the suspend
/// block. Doing so minimizes the lifetime of each variable, and hence
/// minimizes the amount of data we end up putting on the frame.
static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
                                     SuspendCrossingInfo &Checker) {
  if (F.hasOptNone())
    return;

  DominatorTree DT(F);

  // Collect all possible basic blocks which may dominate all uses of allocas.
  SmallPtrSet<BasicBlock *, 4> DomSet;
  DomSet.insert(&F.getEntryBlock());
  for (auto *CSI : Shape.CoroSuspends) {
    BasicBlock *SuspendBlock = CSI->getParent();
    assert(isSuspendBlock(SuspendBlock) && SuspendBlock->getSingleSuccessor() &&
           "should have split coro.suspend into its own block");
    DomSet.insert(SuspendBlock->getSingleSuccessor());
  }

  for (Instruction &I : instructions(F)) {
    AllocaInst *AI = dyn_cast<AllocaInst>(&I);
    if (!AI)
      continue;

    for (BasicBlock *DomBB : DomSet) {
      bool Valid = true;
      SmallVector<Instruction *, 1> Lifetimes;

      auto isLifetimeStart = [](Instruction *I) {
        if (auto *II = dyn_cast<IntrinsicInst>(I))
          return II->getIntrinsicID() == Intrinsic::lifetime_start;
        return false;
      };

      auto collectLifetimeStart = [&](Instruction *U, AllocaInst *AI) {
        if (isLifetimeStart(U)) {
          Lifetimes.push_back(U);
          return true;
        }
        if (!U->hasOneUse() || U->stripPointerCasts() != AI)
          return false;
        if (isLifetimeStart(U->user_back())) {
          Lifetimes.push_back(U->user_back());
          return true;
        }
        return false;
      };

      for (User *U : AI->users()) {
        Instruction *UI = cast<Instruction>(U);
        // For all users except lifetime.start markers, if they are all
        // dominated by one of the basic blocks and do not cross
        // suspend points as well, then there is no need to spill the
        // instruction.
        if (!DT.dominates(DomBB, UI->getParent()) ||
            Checker.isDefinitionAcrossSuspend(DomBB, UI)) {
          // Skip lifetime.start, and the GEPs and bitcasts used by
          // lifetime.start markers.
          if (collectLifetimeStart(UI, AI))
            continue;
          Valid = false;
          break;
        }
      }
      // Sink lifetime.start markers to the dominating block when they are
      // only used outside the region.
      if (Valid && Lifetimes.size() != 0) {
        // May be AI itself, when the type of AI is i8*.
        auto *NewBitCast = [&](AllocaInst *AI) -> Value* {
          if (isa<AllocaInst>(Lifetimes[0]->getOperand(1)))
            return AI;
          auto *Int8PtrTy = Type::getInt8PtrTy(F.getContext());
          return CastInst::Create(Instruction::BitCast, AI, Int8PtrTy, "",
                                  DomBB->getTerminator());
        }(AI);

        auto *NewLifetime = Lifetimes[0]->clone();
        NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(1), NewBitCast);
        NewLifetime->insertBefore(DomBB->getTerminator());

        // All the outside lifetime.start markers are no longer necessary.
        for (Instruction *S : Lifetimes)
          S->eraseFromParent();

        break;
      }
    }
  }
}

static void collectFrameAlloca(AllocaInst *AI, coro::Shape &Shape,
                               const SuspendCrossingInfo &Checker,
                               SmallVectorImpl<AllocaInfo> &Allocas,
                               const DominatorTree &DT) {
  if (Shape.CoroSuspends.empty())
    return;

  // The PromiseAlloca will be specially handled since it needs to be in a
  // fixed position in the frame.
  if (AI == Shape.SwitchLowering.PromiseAlloca)
    return;

  // The code that uses lifetime.start intrinsics does not work for functions
  // with loops without exits. Disable it on ABIs we know to generate such
  // code.
  bool ShouldUseLifetimeStartInfo =
      (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
       Shape.ABI != coro::ABI::RetconOnce);
  AllocaUseVisitor Visitor{AI->getModule()->getDataLayout(), DT,
                           *Shape.CoroBegin, Checker,
                           ShouldUseLifetimeStartInfo};
  Visitor.visitPtr(*AI);
  if (!Visitor.getShouldLiveOnFrame())
    return;
  Allocas.emplace_back(AI, Visitor.getAliasesCopy(),
                       Visitor.getMayWriteBeforeCoroBegin());
}

void coro::salvageDebugInfo(
    SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
    DbgVariableIntrinsic *DVI, bool OptimizeFrame) {
  Function *F = DVI->getFunction();
  IRBuilder<> Builder(F->getContext());
  auto InsertPt = F->getEntryBlock().getFirstInsertionPt();
  while (isa<IntrinsicInst>(InsertPt))
    ++InsertPt;
  Builder.SetInsertPoint(&F->getEntryBlock(), InsertPt);
  DIExpression *Expr = DVI->getExpression();
  // Follow the pointer arithmetic all the way to the incoming
  // function argument and convert into a DIExpression.
  bool SkipOutermostLoad = !isa<DbgValueInst>(DVI);
  Value *Storage = DVI->getVariableLocationOp(0);
  Value *OriginalStorage = Storage;

  while (auto *Inst = dyn_cast_or_null<Instruction>(Storage)) {
    if (auto *LdInst = dyn_cast<LoadInst>(Inst)) {
      Storage = LdInst->getPointerOperand();
      // FIXME: This is a heuristic that works around the fact that
      // LLVM IR debug intrinsics cannot yet distinguish between
      // memory and value locations: Because a dbg.declare(alloca) is
      // implicitly a memory location no DW_OP_deref operation for the
      // last direct load from an alloca is necessary. This condition
      // effectively drops the *last* DW_OP_deref in the expression.
      if (!SkipOutermostLoad)
        Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
    } else if (auto *StInst = dyn_cast<StoreInst>(Inst)) {
      Storage = StInst->getValueOperand();
    } else {
      SmallVector<uint64_t, 16> Ops;
      SmallVector<Value *, 0> AdditionalValues;
      Value *Op = llvm::salvageDebugInfoImpl(
          *Inst, Expr ?
          Expr->getNumLocationOperands() : 0, Ops,
          AdditionalValues);
      if (!Op || !AdditionalValues.empty()) {
        // If salvaging failed or salvaging produced more than one location
        // operand, give up.
        break;
      }
      Storage = Op;
      Expr = DIExpression::appendOpsToArg(Expr, Ops, 0, /*StackValue*/ false);
    }
    SkipOutermostLoad = false;
  }
  if (!Storage)
    return;

  auto *StorageAsArg = dyn_cast<Argument>(Storage);
  const bool IsSwiftAsyncArg =
      StorageAsArg && StorageAsArg->hasAttribute(Attribute::SwiftAsync);

  // Swift async arguments are described by an entry value of the ABI-defined
  // register containing the coroutine context.
  if (IsSwiftAsyncArg && !Expr->isEntryValue())
    Expr = DIExpression::prepend(Expr, DIExpression::EntryValue);

  // If the coroutine frame is an Argument, store it in an alloca to improve
  // its availability (e.g. registers may be clobbered).
  // Avoid this if optimizations are enabled (they would remove the alloca) or
  // if the value is guaranteed to be available through other means (e.g. swift
  // ABI guarantees).
  if (StorageAsArg && !OptimizeFrame && !IsSwiftAsyncArg) {
    auto &Cached = ArgToAllocaMap[StorageAsArg];
    if (!Cached) {
      Cached = Builder.CreateAlloca(Storage->getType(), 0, nullptr,
                                    Storage->getName() + ".debug");
      Builder.CreateStore(Storage, Cached);
    }
    Storage = Cached;
    // FIXME: LLVM lacks nuanced semantics to differentiate between
    // memory and direct locations at the IR level. The backend will
    // turn a dbg.declare(alloca, ..., DIExpression()) into a memory
    // location. Thus, if there are deref and offset operations in the
    // expression, we need to add a DW_OP_deref at the *start* of the
    // expression to first load the contents of the alloca before
    // adjusting it with the expression.
    Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
  }

  DVI->replaceVariableLocationOp(OriginalStorage, Storage);
  DVI->setExpression(Expr);
  // We only hoist dbg.declare today, since it doesn't make sense to hoist
  // dbg.value: it does not have the same function-wide guarantees that
  // dbg.declare does.
  if (isa<DbgDeclareInst>(DVI)) {
    Instruction *InsertPt = nullptr;
    if (auto *I = dyn_cast<Instruction>(Storage))
      InsertPt = I->getInsertionPointAfterDef();
    else if (isa<Argument>(Storage))
      InsertPt = &*F->getEntryBlock().begin();
    if (InsertPt)
      DVI->moveBefore(InsertPt);
  }
}

static void doRematerializations(
    Function &F, SuspendCrossingInfo &Checker,
    const std::function<bool(Instruction &)> &MaterializableCallback) {
  if (F.hasOptNone())
    return;

  SpillInfo Spills;

  // See if there are materializable instructions across suspend points.
  // We record these as the starting point to also identify materializable
  // defs of uses in these operations.
  for (Instruction &I : instructions(F)) {
    if (!MaterializableCallback(I))
      continue;
    for (User *U : I.users())
      if (Checker.isDefinitionAcrossSuspend(I, U))
        Spills[&I].push_back(cast<Instruction>(U));
  }

  // Process each of the identified rematerializable instructions
  // and add predecessor instructions that can also be rematerialized.
  // This is actually a graph of instructions since we could potentially
  // have multiple uses of a def in the set of predecessor instructions.
  // The approach here is to maintain a graph of instructions for each bottom
  // level instruction - where we have a unique set of instructions (nodes)
  // and edges between them. We then walk the graph in reverse post-order to
  // insert them past the suspend point, but ensure that ordering is correct.
  // We also rely on CSE removing duplicate defs for remats of different
  // instructions with a def in common (rather than maintaining more complex
  // graphs for each suspend point).

  // We can do this by adding new nodes to the list for each suspend
  // point. Then using standard GraphTraits to give a reverse post-order
  // traversal when we insert the nodes after the suspend.
  SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats;
  for (auto &E : Spills) {
    for (Instruction *U : E.second) {
      // Don't process a user twice (this can happen if the instruction uses
      // more than one rematerializable def).
      if (AllRemats.count(U))
        continue;

      // The constructor creates the whole RematGraph for the given use.
      auto RematUPtr =
          std::make_unique<RematGraph>(MaterializableCallback, U, Checker);

      LLVM_DEBUG(dbgs() << "***** Next remat group *****\n";
                 ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get());
                 for (auto I = RPOT.begin(); I != RPOT.end();
                      ++I) { (*I)->Node->dump(); } dbgs()
                 << "\n";);

      AllRemats[U] = std::move(RematUPtr);
    }
  }

  // Rewrite materializable instructions to be materialized at the use
  // point.
  LLVM_DEBUG(dumpRemats("Materializations", AllRemats));
  rewriteMaterializableInstructions(AllRemats);
}

void coro::buildCoroutineFrame(
    Function &F, Shape &Shape,
    const std::function<bool(Instruction &)> &MaterializableCallback) {
  // Don't eliminate swifterror in async functions that won't be split.
  if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty())
    eliminateSwiftError(F, Shape);

  if (Shape.ABI == coro::ABI::Switch &&
      Shape.SwitchLowering.PromiseAlloca) {
    Shape.getSwitchCoroId()->clearPromise();
  }

  // Make sure that all coro.save, coro.suspend and the fallthrough coro.end
  // intrinsics are in their own blocks to simplify the logic of building up
  // SuspendCrossing data.
  for (auto *CSI : Shape.CoroSuspends) {
    if (auto *Save = CSI->getCoroSave())
      splitAround(Save, "CoroSave");
    splitAround(CSI, "CoroSuspend");
  }

  // Put CoroEnds into their own blocks.
  for (AnyCoroEndInst *CE : Shape.CoroEnds) {
    splitAround(CE, "CoroEnd");

    // Emit the musttail call function in a new block before the CoroEnd.
    // We do this here so that the right suspend crossing info is computed
    // for the uses of the musttail call.
    // (Arguments to the coro.end instruction would be ignored.)
    if (auto *AsyncEnd = dyn_cast<CoroAsyncEndInst>(CE)) {
      auto *MustTailCallFn = AsyncEnd->getMustTailCallFunction();
      if (!MustTailCallFn)
        continue;
      IRBuilder<> Builder(AsyncEnd);
      SmallVector<Value *, 8> Args(AsyncEnd->args());
      auto Arguments = ArrayRef<Value *>(Args).drop_front(3);
      auto *Call = createMustTailCall(AsyncEnd->getDebugLoc(), MustTailCallFn,
                                      Arguments, Builder);
      splitAround(Call, "MustTailCall.Before.CoroEnd");
    }
  }

  // Later code makes structural assumptions about single-predecessor PHIs,
  // e.g. that they are not live across a suspend point.
  cleanupSinglePredPHIs(F);

  // Transform multi-edge PHI Nodes, so that any value feeding into a PHI will
  // never have its definition separated from the PHI by a suspend point.
  rewritePHIs(F);

  // Build suspend crossing info.
  SuspendCrossingInfo Checker(F, Shape);

  doRematerializations(F, Checker, MaterializableCallback);

  FrameDataInfo FrameData;
  SmallVector<CoroAllocaAllocInst*, 4> LocalAllocas;
  SmallVector<Instruction*, 4> DeadInstructions;
  if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
      Shape.ABI != coro::ABI::RetconOnce)
    sinkLifetimeStartMarkers(F, Shape, Checker);

  // Collect the spills for arguments and other not-materializable values.
  for (Argument &A : F.args())
    for (User *U : A.users())
      if (Checker.isDefinitionAcrossSuspend(A, U))
        FrameData.Spills[&A].push_back(cast<Instruction>(U));

  const DominatorTree DT(F);
  for (Instruction &I : instructions(F)) {
    // Values returned from coroutine structure intrinsics should not be part
    // of the Coroutine Frame.
    if (isCoroutineStructureIntrinsic(I) || &I == Shape.CoroBegin)
      continue;

    // Handle alloca.alloc specially here.
    if (auto AI = dyn_cast<CoroAllocaAllocInst>(&I)) {
      // Check whether the alloca's lifetime is bounded by suspend points.
      if (isLocalAlloca(AI)) {
        LocalAllocas.push_back(AI);
        continue;
      }

      // If not, do a quick rewrite of the alloca and then add spills of
      // the rewritten value. The rewrite doesn't invalidate anything in
      // Spills because the other alloca intrinsics have no other operands
      // besides AI, and it doesn't invalidate the iteration because we delay
      // erasing AI.
      auto Alloc = lowerNonLocalAlloca(AI, Shape, DeadInstructions);

      for (User *U : Alloc->users()) {
        if (Checker.isDefinitionAcrossSuspend(*Alloc, U))
          FrameData.Spills[Alloc].push_back(cast<Instruction>(U));
      }
      continue;
    }

    // Ignore alloca.get; we process this as part of coro.alloca.alloc.
    if (isa<CoroAllocaGetInst>(I))
      continue;

    if (auto *AI = dyn_cast<AllocaInst>(&I)) {
      collectFrameAlloca(AI, Shape, Checker, FrameData.Allocas, DT);
      continue;
    }

    for (User *U : I.users())
      if (Checker.isDefinitionAcrossSuspend(I, U)) {
        // We cannot spill a token.
        if (I.getType()->isTokenTy())
          report_fatal_error(
              "token definition is separated from the use by a suspend point");
        FrameData.Spills[&I].push_back(cast<Instruction>(U));
      }
  }

  LLVM_DEBUG(dumpAllocas(FrameData.Allocas));

  // We don't want the layout of the coroutine frame to be affected
  // by debug information.
  // So we only salvage DbgValueInsts whose value is already in the frame.
  // We handle the dbg.values for allocas specially.
  for (auto &Iter : FrameData.Spills) {
    auto *V = Iter.first;
    SmallVector<DbgValueInst *, 16> DVIs;
    findDbgValues(DVIs, V);
    for (DbgValueInst *DVI : DVIs)
      if (Checker.isDefinitionAcrossSuspend(*V, DVI))
        FrameData.Spills[V].push_back(DVI);
  }

  LLVM_DEBUG(dumpSpills("Spills", FrameData.Spills));
  if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce ||
      Shape.ABI == coro::ABI::Async)
    sinkSpillUsesAfterCoroBegin(F, FrameData, Shape.CoroBegin);
  Shape.FrameTy = buildFrameType(F, Shape, FrameData);
  createFramePtr(Shape);
  // For now, this works for C++ programs only.
  buildFrameDebugInfo(F, Shape, FrameData);
  insertSpills(FrameData, Shape);
  lowerLocalAllocas(LocalAllocas, DeadInstructions);

  for (auto *I : DeadInstructions)
    I->eraseFromParent();
}