1 //===- CoroFrame.cpp - Builds and manipulates coroutine frame -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // This file contains classes used to discover if for a particular value 9 // there from sue to definition that crosses a suspend block. 10 // 11 // Using the information discovered we form a Coroutine Frame structure to 12 // contain those values. All uses of those values are replaced with appropriate 13 // GEP + load from the coroutine frame. At the point of the definition we spill 14 // the value into the coroutine frame. 15 //===----------------------------------------------------------------------===// 16 17 #include "CoroInternal.h" 18 #include "llvm/ADT/BitVector.h" 19 #include "llvm/ADT/PostOrderIterator.h" 20 #include "llvm/ADT/ScopeExit.h" 21 #include "llvm/ADT/SmallString.h" 22 #include "llvm/Analysis/PtrUseVisitor.h" 23 #include "llvm/Analysis/StackLifetime.h" 24 #include "llvm/Config/llvm-config.h" 25 #include "llvm/IR/CFG.h" 26 #include "llvm/IR/DIBuilder.h" 27 #include "llvm/IR/DebugInfo.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/IRBuilder.h" 30 #include "llvm/IR/InstIterator.h" 31 #include "llvm/IR/IntrinsicInst.h" 32 #include "llvm/Support/Debug.h" 33 #include "llvm/Support/MathExtras.h" 34 #include "llvm/Support/OptimizedStructLayout.h" 35 #include "llvm/Support/circular_raw_ostream.h" 36 #include "llvm/Support/raw_ostream.h" 37 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 38 #include "llvm/Transforms/Utils/Local.h" 39 #include "llvm/Transforms/Utils/PromoteMemToReg.h" 40 #include <algorithm> 41 #include <deque> 42 #include <optional> 43 44 using namespace llvm; 45 46 // The "coro-suspend-crossing" flag is very noisy. There is another debug type, 47 // "coro-frame", which results in leaner debug spew. 48 #define DEBUG_TYPE "coro-suspend-crossing" 49 50 enum { SmallVectorThreshold = 32 }; 51 52 // Provides two way mapping between the blocks and numbers. 53 namespace { 54 class BlockToIndexMapping { 55 SmallVector<BasicBlock *, SmallVectorThreshold> V; 56 57 public: 58 size_t size() const { return V.size(); } 59 60 BlockToIndexMapping(Function &F) { 61 for (BasicBlock &BB : F) 62 V.push_back(&BB); 63 llvm::sort(V); 64 } 65 66 size_t blockToIndex(BasicBlock const *BB) const { 67 auto *I = llvm::lower_bound(V, BB); 68 assert(I != V.end() && *I == BB && "BasicBlockNumberng: Unknown block"); 69 return I - V.begin(); 70 } 71 72 BasicBlock *indexToBlock(unsigned Index) const { return V[Index]; } 73 }; 74 } // end anonymous namespace 75 76 // The SuspendCrossingInfo maintains data that allows to answer a question 77 // whether given two BasicBlocks A and B there is a path from A to B that 78 // passes through a suspend point. 79 // 80 // For every basic block 'i' it maintains a BlockData that consists of: 81 // Consumes: a bit vector which contains a set of indices of blocks that can 82 // reach block 'i'. A block can trivially reach itself. 83 // Kills: a bit vector which contains a set of indices of blocks that can 84 // reach block 'i' but there is a path crossing a suspend point 85 // not repeating 'i' (path to 'i' without cycles containing 'i'). 86 // Suspend: a boolean indicating whether block 'i' contains a suspend point. 87 // End: a boolean indicating whether block 'i' contains a coro.end intrinsic. 88 // KillLoop: There is a path from 'i' to 'i' not otherwise repeating 'i' that 89 // crosses a suspend point. 90 // 91 namespace { 92 class SuspendCrossingInfo { 93 BlockToIndexMapping Mapping; 94 95 struct BlockData { 96 BitVector Consumes; 97 BitVector Kills; 98 bool Suspend = false; 99 bool End = false; 100 bool KillLoop = false; 101 bool Changed = false; 102 }; 103 SmallVector<BlockData, SmallVectorThreshold> Block; 104 105 iterator_range<pred_iterator> predecessors(BlockData const &BD) const { 106 BasicBlock *BB = Mapping.indexToBlock(&BD - &Block[0]); 107 return llvm::predecessors(BB); 108 } 109 110 BlockData &getBlockData(BasicBlock *BB) { 111 return Block[Mapping.blockToIndex(BB)]; 112 } 113 114 /// Compute the BlockData for the current function in one iteration. 115 /// Initialize - Whether this is the first iteration, we can optimize 116 /// the initial case a little bit by manual loop switch. 117 /// Returns whether the BlockData changes in this iteration. 118 template <bool Initialize = false> 119 bool computeBlockData(const ReversePostOrderTraversal<Function *> &RPOT); 120 121 public: 122 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 123 void dump() const; 124 void dump(StringRef Label, BitVector const &BV) const; 125 #endif 126 127 SuspendCrossingInfo(Function &F, coro::Shape &Shape); 128 129 /// Returns true if there is a path from \p From to \p To crossing a suspend 130 /// point without crossing \p From a 2nd time. 131 bool hasPathCrossingSuspendPoint(BasicBlock *From, BasicBlock *To) const { 132 size_t const FromIndex = Mapping.blockToIndex(From); 133 size_t const ToIndex = Mapping.blockToIndex(To); 134 bool const Result = Block[ToIndex].Kills[FromIndex]; 135 LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName() 136 << " answer is " << Result << "\n"); 137 return Result; 138 } 139 140 /// Returns true if there is a path from \p From to \p To crossing a suspend 141 /// point without crossing \p From a 2nd time. If \p From is the same as \p To 142 /// this will also check if there is a looping path crossing a suspend point. 143 bool hasPathOrLoopCrossingSuspendPoint(BasicBlock *From, 144 BasicBlock *To) const { 145 size_t const FromIndex = Mapping.blockToIndex(From); 146 size_t const ToIndex = Mapping.blockToIndex(To); 147 bool Result = Block[ToIndex].Kills[FromIndex] || 148 (From == To && Block[ToIndex].KillLoop); 149 LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName() 150 << " answer is " << Result << " (path or loop)\n"); 151 return Result; 152 } 153 154 bool isDefinitionAcrossSuspend(BasicBlock *DefBB, User *U) const { 155 auto *I = cast<Instruction>(U); 156 157 // We rewrote PHINodes, so that only the ones with exactly one incoming 158 // value need to be analyzed. 159 if (auto *PN = dyn_cast<PHINode>(I)) 160 if (PN->getNumIncomingValues() > 1) 161 return false; 162 163 BasicBlock *UseBB = I->getParent(); 164 165 // As a special case, treat uses by an llvm.coro.suspend.retcon or an 166 // llvm.coro.suspend.async as if they were uses in the suspend's single 167 // predecessor: the uses conceptually occur before the suspend. 168 if (isa<CoroSuspendRetconInst>(I) || isa<CoroSuspendAsyncInst>(I)) { 169 UseBB = UseBB->getSinglePredecessor(); 170 assert(UseBB && "should have split coro.suspend into its own block"); 171 } 172 173 return hasPathCrossingSuspendPoint(DefBB, UseBB); 174 } 175 176 bool isDefinitionAcrossSuspend(Argument &A, User *U) const { 177 return isDefinitionAcrossSuspend(&A.getParent()->getEntryBlock(), U); 178 } 179 180 bool isDefinitionAcrossSuspend(Instruction &I, User *U) const { 181 auto *DefBB = I.getParent(); 182 183 // As a special case, treat values produced by an llvm.coro.suspend.* 184 // as if they were defined in the single successor: the uses 185 // conceptually occur after the suspend. 186 if (isa<AnyCoroSuspendInst>(I)) { 187 DefBB = DefBB->getSingleSuccessor(); 188 assert(DefBB && "should have split coro.suspend into its own block"); 189 } 190 191 return isDefinitionAcrossSuspend(DefBB, U); 192 } 193 194 bool isDefinitionAcrossSuspend(Value &V, User *U) const { 195 if (auto *Arg = dyn_cast<Argument>(&V)) 196 return isDefinitionAcrossSuspend(*Arg, U); 197 if (auto *Inst = dyn_cast<Instruction>(&V)) 198 return isDefinitionAcrossSuspend(*Inst, U); 199 200 llvm_unreachable( 201 "Coroutine could only collect Argument and Instruction now."); 202 } 203 }; 204 } // end anonymous namespace 205 206 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 207 LLVM_DUMP_METHOD void SuspendCrossingInfo::dump(StringRef Label, 208 BitVector const &BV) const { 209 dbgs() << Label << ":"; 210 for (size_t I = 0, N = BV.size(); I < N; ++I) 211 if (BV[I]) 212 dbgs() << " " << Mapping.indexToBlock(I)->getName(); 213 dbgs() << "\n"; 214 } 215 216 LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const { 217 for (size_t I = 0, N = Block.size(); I < N; ++I) { 218 BasicBlock *const B = Mapping.indexToBlock(I); 219 dbgs() << B->getName() << ":\n"; 220 dump(" Consumes", Block[I].Consumes); 221 dump(" Kills", Block[I].Kills); 222 } 223 dbgs() << "\n"; 224 } 225 #endif 226 227 template <bool Initialize> 228 bool SuspendCrossingInfo::computeBlockData( 229 const ReversePostOrderTraversal<Function *> &RPOT) { 230 bool Changed = false; 231 232 for (const BasicBlock *BB : RPOT) { 233 auto BBNo = Mapping.blockToIndex(BB); 234 auto &B = Block[BBNo]; 235 236 // We don't need to count the predecessors when initialization. 237 if constexpr (!Initialize) 238 // If all the predecessors of the current Block don't change, 239 // the BlockData for the current block must not change too. 240 if (all_of(predecessors(B), [this](BasicBlock *BB) { 241 return !Block[Mapping.blockToIndex(BB)].Changed; 242 })) { 243 B.Changed = false; 244 continue; 245 } 246 247 // Saved Consumes and Kills bitsets so that it is easy to see 248 // if anything changed after propagation. 249 auto SavedConsumes = B.Consumes; 250 auto SavedKills = B.Kills; 251 252 for (BasicBlock *PI : predecessors(B)) { 253 auto PrevNo = Mapping.blockToIndex(PI); 254 auto &P = Block[PrevNo]; 255 256 // Propagate Kills and Consumes from predecessors into B. 257 B.Consumes |= P.Consumes; 258 B.Kills |= P.Kills; 259 260 // If block P is a suspend block, it should propagate kills into block 261 // B for every block P consumes. 262 if (P.Suspend) 263 B.Kills |= P.Consumes; 264 } 265 266 if (B.Suspend) { 267 // If block B is a suspend block, it should kill all of the blocks it 268 // consumes. 269 B.Kills |= B.Consumes; 270 } else if (B.End) { 271 // If block B is an end block, it should not propagate kills as the 272 // blocks following coro.end() are reached during initial invocation 273 // of the coroutine while all the data are still available on the 274 // stack or in the registers. 275 B.Kills.reset(); 276 } else { 277 // This is reached when B block it not Suspend nor coro.end and it 278 // need to make sure that it is not in the kill set. 279 B.KillLoop |= B.Kills[BBNo]; 280 B.Kills.reset(BBNo); 281 } 282 283 if constexpr (!Initialize) { 284 B.Changed = (B.Kills != SavedKills) || (B.Consumes != SavedConsumes); 285 Changed |= B.Changed; 286 } 287 } 288 289 return Changed; 290 } 291 292 SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape) 293 : Mapping(F) { 294 const size_t N = Mapping.size(); 295 Block.resize(N); 296 297 // Initialize every block so that it consumes itself 298 for (size_t I = 0; I < N; ++I) { 299 auto &B = Block[I]; 300 B.Consumes.resize(N); 301 B.Kills.resize(N); 302 B.Consumes.set(I); 303 B.Changed = true; 304 } 305 306 // Mark all CoroEnd Blocks. We do not propagate Kills beyond coro.ends as 307 // the code beyond coro.end is reachable during initial invocation of the 308 // coroutine. 309 for (auto *CE : Shape.CoroEnds) 310 getBlockData(CE->getParent()).End = true; 311 312 // Mark all suspend blocks and indicate that they kill everything they 313 // consume. Note, that crossing coro.save also requires a spill, as any code 314 // between coro.save and coro.suspend may resume the coroutine and all of the 315 // state needs to be saved by that time. 316 auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) { 317 BasicBlock *SuspendBlock = BarrierInst->getParent(); 318 auto &B = getBlockData(SuspendBlock); 319 B.Suspend = true; 320 B.Kills |= B.Consumes; 321 }; 322 for (auto *CSI : Shape.CoroSuspends) { 323 markSuspendBlock(CSI); 324 if (auto *Save = CSI->getCoroSave()) 325 markSuspendBlock(Save); 326 } 327 328 // It is considered to be faster to use RPO traversal for forward-edges 329 // dataflow analysis. 330 ReversePostOrderTraversal<Function *> RPOT(&F); 331 computeBlockData</*Initialize=*/true>(RPOT); 332 while (computeBlockData</*Initialize*/ false>(RPOT)) 333 ; 334 335 LLVM_DEBUG(dump()); 336 } 337 338 namespace { 339 340 // RematGraph is used to construct a DAG for rematerializable instructions 341 // When the constructor is invoked with a candidate instruction (which is 342 // materializable) it builds a DAG of materializable instructions from that 343 // point. 344 // Typically, for each instruction identified as re-materializable across a 345 // suspend point, a RematGraph will be created. 346 struct RematGraph { 347 // Each RematNode in the graph contains the edges to instructions providing 348 // operands in the current node. 349 struct RematNode { 350 Instruction *Node; 351 SmallVector<RematNode *> Operands; 352 RematNode() = default; 353 RematNode(Instruction *V) : Node(V) {} 354 }; 355 356 RematNode *EntryNode; 357 using RematNodeMap = 358 SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>; 359 RematNodeMap Remats; 360 const std::function<bool(Instruction &)> &MaterializableCallback; 361 SuspendCrossingInfo &Checker; 362 363 RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback, 364 Instruction *I, SuspendCrossingInfo &Checker) 365 : MaterializableCallback(MaterializableCallback), Checker(Checker) { 366 std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I); 367 EntryNode = FirstNode.get(); 368 std::deque<std::unique_ptr<RematNode>> WorkList; 369 addNode(std::move(FirstNode), WorkList, cast<User>(I)); 370 while (WorkList.size()) { 371 std::unique_ptr<RematNode> N = std::move(WorkList.front()); 372 WorkList.pop_front(); 373 addNode(std::move(N), WorkList, cast<User>(I)); 374 } 375 } 376 377 void addNode(std::unique_ptr<RematNode> NUPtr, 378 std::deque<std::unique_ptr<RematNode>> &WorkList, 379 User *FirstUse) { 380 RematNode *N = NUPtr.get(); 381 if (Remats.count(N->Node)) 382 return; 383 384 // We haven't see this node yet - add to the list 385 Remats[N->Node] = std::move(NUPtr); 386 for (auto &Def : N->Node->operands()) { 387 Instruction *D = dyn_cast<Instruction>(Def.get()); 388 if (!D || !MaterializableCallback(*D) || 389 !Checker.isDefinitionAcrossSuspend(*D, FirstUse)) 390 continue; 391 392 if (Remats.count(D)) { 393 // Already have this in the graph 394 N->Operands.push_back(Remats[D].get()); 395 continue; 396 } 397 398 bool NoMatch = true; 399 for (auto &I : WorkList) { 400 if (I->Node == D) { 401 NoMatch = false; 402 N->Operands.push_back(I.get()); 403 break; 404 } 405 } 406 if (NoMatch) { 407 // Create a new node 408 std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D); 409 N->Operands.push_back(ChildNode.get()); 410 WorkList.push_back(std::move(ChildNode)); 411 } 412 } 413 } 414 415 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 416 void dump() const { 417 dbgs() << "Entry ("; 418 if (EntryNode->Node->getParent()->hasName()) 419 dbgs() << EntryNode->Node->getParent()->getName(); 420 else 421 EntryNode->Node->getParent()->printAsOperand(dbgs(), false); 422 dbgs() << ") : " << *EntryNode->Node << "\n"; 423 for (auto &E : Remats) { 424 dbgs() << *(E.first) << "\n"; 425 for (RematNode *U : E.second->Operands) 426 dbgs() << " " << *U->Node << "\n"; 427 } 428 } 429 #endif 430 }; 431 } // end anonymous namespace 432 433 namespace llvm { 434 435 template <> struct GraphTraits<RematGraph *> { 436 using NodeRef = RematGraph::RematNode *; 437 using ChildIteratorType = RematGraph::RematNode **; 438 439 static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; } 440 static ChildIteratorType child_begin(NodeRef N) { 441 return N->Operands.begin(); 442 } 443 static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); } 444 }; 445 446 } // end namespace llvm 447 448 #undef DEBUG_TYPE // "coro-suspend-crossing" 449 #define DEBUG_TYPE "coro-frame" 450 451 namespace { 452 class FrameTypeBuilder; 453 // Mapping from the to-be-spilled value to all the users that need reload. 454 using SpillInfo = SmallMapVector<Value *, SmallVector<Instruction *, 2>, 8>; 455 struct AllocaInfo { 456 AllocaInst *Alloca; 457 DenseMap<Instruction *, std::optional<APInt>> Aliases; 458 bool MayWriteBeforeCoroBegin; 459 AllocaInfo(AllocaInst *Alloca, 460 DenseMap<Instruction *, std::optional<APInt>> Aliases, 461 bool MayWriteBeforeCoroBegin) 462 : Alloca(Alloca), Aliases(std::move(Aliases)), 463 MayWriteBeforeCoroBegin(MayWriteBeforeCoroBegin) {} 464 }; 465 struct FrameDataInfo { 466 // All the values (that are not allocas) that needs to be spilled to the 467 // frame. 468 SpillInfo Spills; 469 // Allocas contains all values defined as allocas that need to live in the 470 // frame. 471 SmallVector<AllocaInfo, 8> Allocas; 472 473 SmallVector<Value *, 8> getAllDefs() const { 474 SmallVector<Value *, 8> Defs; 475 for (const auto &P : Spills) 476 Defs.push_back(P.first); 477 for (const auto &A : Allocas) 478 Defs.push_back(A.Alloca); 479 return Defs; 480 } 481 482 uint32_t getFieldIndex(Value *V) const { 483 auto Itr = FieldIndexMap.find(V); 484 assert(Itr != FieldIndexMap.end() && 485 "Value does not have a frame field index"); 486 return Itr->second; 487 } 488 489 void setFieldIndex(Value *V, uint32_t Index) { 490 assert((LayoutIndexUpdateStarted || FieldIndexMap.count(V) == 0) && 491 "Cannot set the index for the same field twice."); 492 FieldIndexMap[V] = Index; 493 } 494 495 Align getAlign(Value *V) const { 496 auto Iter = FieldAlignMap.find(V); 497 assert(Iter != FieldAlignMap.end()); 498 return Iter->second; 499 } 500 501 void setAlign(Value *V, Align AL) { 502 assert(FieldAlignMap.count(V) == 0); 503 FieldAlignMap.insert({V, AL}); 504 } 505 506 uint64_t getDynamicAlign(Value *V) const { 507 auto Iter = FieldDynamicAlignMap.find(V); 508 assert(Iter != FieldDynamicAlignMap.end()); 509 return Iter->second; 510 } 511 512 void setDynamicAlign(Value *V, uint64_t Align) { 513 assert(FieldDynamicAlignMap.count(V) == 0); 514 FieldDynamicAlignMap.insert({V, Align}); 515 } 516 517 uint64_t getOffset(Value *V) const { 518 auto Iter = FieldOffsetMap.find(V); 519 assert(Iter != FieldOffsetMap.end()); 520 return Iter->second; 521 } 522 523 void setOffset(Value *V, uint64_t Offset) { 524 assert(FieldOffsetMap.count(V) == 0); 525 FieldOffsetMap.insert({V, Offset}); 526 } 527 528 // Remap the index of every field in the frame, using the final layout index. 529 void updateLayoutIndex(FrameTypeBuilder &B); 530 531 private: 532 // LayoutIndexUpdateStarted is used to avoid updating the index of any field 533 // twice by mistake. 534 bool LayoutIndexUpdateStarted = false; 535 // Map from values to their slot indexes on the frame. They will be first set 536 // with their original insertion field index. After the frame is built, their 537 // indexes will be updated into the final layout index. 538 DenseMap<Value *, uint32_t> FieldIndexMap; 539 // Map from values to their alignment on the frame. They would be set after 540 // the frame is built. 541 DenseMap<Value *, Align> FieldAlignMap; 542 DenseMap<Value *, uint64_t> FieldDynamicAlignMap; 543 // Map from values to their offset on the frame. They would be set after 544 // the frame is built. 545 DenseMap<Value *, uint64_t> FieldOffsetMap; 546 }; 547 } // namespace 548 549 #ifndef NDEBUG 550 static void dumpSpills(StringRef Title, const SpillInfo &Spills) { 551 dbgs() << "------------- " << Title << "--------------\n"; 552 for (const auto &E : Spills) { 553 E.first->dump(); 554 dbgs() << " user: "; 555 for (auto *I : E.second) 556 I->dump(); 557 } 558 } 559 static void dumpRemats( 560 StringRef Title, 561 const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) { 562 dbgs() << "------------- " << Title << "--------------\n"; 563 for (const auto &E : RM) { 564 E.second->dump(); 565 dbgs() << "--\n"; 566 } 567 } 568 569 static void dumpAllocas(const SmallVectorImpl<AllocaInfo> &Allocas) { 570 dbgs() << "------------- Allocas --------------\n"; 571 for (const auto &A : Allocas) { 572 A.Alloca->dump(); 573 } 574 } 575 #endif 576 577 namespace { 578 using FieldIDType = size_t; 579 // We cannot rely solely on natural alignment of a type when building a 580 // coroutine frame and if the alignment specified on the Alloca instruction 581 // differs from the natural alignment of the alloca type we will need to insert 582 // padding. 583 class FrameTypeBuilder { 584 private: 585 struct Field { 586 uint64_t Size; 587 uint64_t Offset; 588 Type *Ty; 589 FieldIDType LayoutFieldIndex; 590 Align Alignment; 591 Align TyAlignment; 592 uint64_t DynamicAlignBuffer; 593 }; 594 595 const DataLayout &DL; 596 LLVMContext &Context; 597 uint64_t StructSize = 0; 598 Align StructAlign; 599 bool IsFinished = false; 600 601 std::optional<Align> MaxFrameAlignment; 602 603 SmallVector<Field, 8> Fields; 604 DenseMap<Value*, unsigned> FieldIndexByKey; 605 606 public: 607 FrameTypeBuilder(LLVMContext &Context, const DataLayout &DL, 608 std::optional<Align> MaxFrameAlignment) 609 : DL(DL), Context(Context), MaxFrameAlignment(MaxFrameAlignment) {} 610 611 /// Add a field to this structure for the storage of an `alloca` 612 /// instruction. 613 [[nodiscard]] FieldIDType addFieldForAlloca(AllocaInst *AI, 614 bool IsHeader = false) { 615 Type *Ty = AI->getAllocatedType(); 616 617 // Make an array type if this is a static array allocation. 618 if (AI->isArrayAllocation()) { 619 if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) 620 Ty = ArrayType::get(Ty, CI->getValue().getZExtValue()); 621 else 622 report_fatal_error("Coroutines cannot handle non static allocas yet"); 623 } 624 625 return addField(Ty, AI->getAlign(), IsHeader); 626 } 627 628 /// We want to put the allocas whose lifetime-ranges are not overlapped 629 /// into one slot of coroutine frame. 630 /// Consider the example at:https://bugs.llvm.org/show_bug.cgi?id=45566 631 /// 632 /// cppcoro::task<void> alternative_paths(bool cond) { 633 /// if (cond) { 634 /// big_structure a; 635 /// process(a); 636 /// co_await something(); 637 /// } else { 638 /// big_structure b; 639 /// process2(b); 640 /// co_await something(); 641 /// } 642 /// } 643 /// 644 /// We want to put variable a and variable b in the same slot to 645 /// reduce the size of coroutine frame. 646 /// 647 /// This function use StackLifetime algorithm to partition the AllocaInsts in 648 /// Spills to non-overlapped sets in order to put Alloca in the same 649 /// non-overlapped set into the same slot in the Coroutine Frame. Then add 650 /// field for the allocas in the same non-overlapped set by using the largest 651 /// type as the field type. 652 /// 653 /// Side Effects: Because We sort the allocas, the order of allocas in the 654 /// frame may be different with the order in the source code. 655 void addFieldForAllocas(const Function &F, FrameDataInfo &FrameData, 656 coro::Shape &Shape); 657 658 /// Add a field to this structure. 659 [[nodiscard]] FieldIDType addField(Type *Ty, MaybeAlign MaybeFieldAlignment, 660 bool IsHeader = false, 661 bool IsSpillOfValue = false) { 662 assert(!IsFinished && "adding fields to a finished builder"); 663 assert(Ty && "must provide a type for a field"); 664 665 // The field size is always the alloc size of the type. 666 uint64_t FieldSize = DL.getTypeAllocSize(Ty); 667 668 // For an alloca with size=0, we don't need to add a field and they 669 // can just point to any index in the frame. Use index 0. 670 if (FieldSize == 0) { 671 return 0; 672 } 673 674 // The field alignment might not be the type alignment, but we need 675 // to remember the type alignment anyway to build the type. 676 // If we are spilling values we don't need to worry about ABI alignment 677 // concerns. 678 Align ABIAlign = DL.getABITypeAlign(Ty); 679 Align TyAlignment = ABIAlign; 680 if (IsSpillOfValue && MaxFrameAlignment && *MaxFrameAlignment < ABIAlign) 681 TyAlignment = *MaxFrameAlignment; 682 Align FieldAlignment = MaybeFieldAlignment.value_or(TyAlignment); 683 684 // The field alignment could be bigger than the max frame case, in that case 685 // we request additional storage to be able to dynamically align the 686 // pointer. 687 uint64_t DynamicAlignBuffer = 0; 688 if (MaxFrameAlignment && (FieldAlignment > *MaxFrameAlignment)) { 689 DynamicAlignBuffer = 690 offsetToAlignment(MaxFrameAlignment->value(), FieldAlignment); 691 FieldAlignment = *MaxFrameAlignment; 692 FieldSize = FieldSize + DynamicAlignBuffer; 693 } 694 695 // Lay out header fields immediately. 696 uint64_t Offset; 697 if (IsHeader) { 698 Offset = alignTo(StructSize, FieldAlignment); 699 StructSize = Offset + FieldSize; 700 701 // Everything else has a flexible offset. 702 } else { 703 Offset = OptimizedStructLayoutField::FlexibleOffset; 704 } 705 706 Fields.push_back({FieldSize, Offset, Ty, 0, FieldAlignment, TyAlignment, 707 DynamicAlignBuffer}); 708 return Fields.size() - 1; 709 } 710 711 /// Finish the layout and set the body on the given type. 712 void finish(StructType *Ty); 713 714 uint64_t getStructSize() const { 715 assert(IsFinished && "not yet finished!"); 716 return StructSize; 717 } 718 719 Align getStructAlign() const { 720 assert(IsFinished && "not yet finished!"); 721 return StructAlign; 722 } 723 724 FieldIDType getLayoutFieldIndex(FieldIDType Id) const { 725 assert(IsFinished && "not yet finished!"); 726 return Fields[Id].LayoutFieldIndex; 727 } 728 729 Field getLayoutField(FieldIDType Id) const { 730 assert(IsFinished && "not yet finished!"); 731 return Fields[Id]; 732 } 733 }; 734 } // namespace 735 736 void FrameDataInfo::updateLayoutIndex(FrameTypeBuilder &B) { 737 auto Updater = [&](Value *I) { 738 auto Field = B.getLayoutField(getFieldIndex(I)); 739 setFieldIndex(I, Field.LayoutFieldIndex); 740 setAlign(I, Field.Alignment); 741 uint64_t dynamicAlign = 742 Field.DynamicAlignBuffer 743 ? Field.DynamicAlignBuffer + Field.Alignment.value() 744 : 0; 745 setDynamicAlign(I, dynamicAlign); 746 setOffset(I, Field.Offset); 747 }; 748 LayoutIndexUpdateStarted = true; 749 for (auto &S : Spills) 750 Updater(S.first); 751 for (const auto &A : Allocas) 752 Updater(A.Alloca); 753 LayoutIndexUpdateStarted = false; 754 } 755 756 void FrameTypeBuilder::addFieldForAllocas(const Function &F, 757 FrameDataInfo &FrameData, 758 coro::Shape &Shape) { 759 using AllocaSetType = SmallVector<AllocaInst *, 4>; 760 SmallVector<AllocaSetType, 4> NonOverlapedAllocas; 761 762 // We need to add field for allocas at the end of this function. 763 auto AddFieldForAllocasAtExit = make_scope_exit([&]() { 764 for (auto AllocaList : NonOverlapedAllocas) { 765 auto *LargestAI = *AllocaList.begin(); 766 FieldIDType Id = addFieldForAlloca(LargestAI); 767 for (auto *Alloca : AllocaList) 768 FrameData.setFieldIndex(Alloca, Id); 769 } 770 }); 771 772 if (!Shape.OptimizeFrame) { 773 for (const auto &A : FrameData.Allocas) { 774 AllocaInst *Alloca = A.Alloca; 775 NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca)); 776 } 777 return; 778 } 779 780 // Because there are paths from the lifetime.start to coro.end 781 // for each alloca, the liferanges for every alloca is overlaped 782 // in the blocks who contain coro.end and the successor blocks. 783 // So we choose to skip there blocks when we calculate the liferange 784 // for each alloca. It should be reasonable since there shouldn't be uses 785 // in these blocks and the coroutine frame shouldn't be used outside the 786 // coroutine body. 787 // 788 // Note that the user of coro.suspend may not be SwitchInst. However, this 789 // case seems too complex to handle. And it is harmless to skip these 790 // patterns since it just prevend putting the allocas to live in the same 791 // slot. 792 DenseMap<SwitchInst *, BasicBlock *> DefaultSuspendDest; 793 for (auto *CoroSuspendInst : Shape.CoroSuspends) { 794 for (auto *U : CoroSuspendInst->users()) { 795 if (auto *ConstSWI = dyn_cast<SwitchInst>(U)) { 796 auto *SWI = const_cast<SwitchInst *>(ConstSWI); 797 DefaultSuspendDest[SWI] = SWI->getDefaultDest(); 798 SWI->setDefaultDest(SWI->getSuccessor(1)); 799 } 800 } 801 } 802 803 auto ExtractAllocas = [&]() { 804 AllocaSetType Allocas; 805 Allocas.reserve(FrameData.Allocas.size()); 806 for (const auto &A : FrameData.Allocas) 807 Allocas.push_back(A.Alloca); 808 return Allocas; 809 }; 810 StackLifetime StackLifetimeAnalyzer(F, ExtractAllocas(), 811 StackLifetime::LivenessType::May); 812 StackLifetimeAnalyzer.run(); 813 auto IsAllocaInferenre = [&](const AllocaInst *AI1, const AllocaInst *AI2) { 814 return StackLifetimeAnalyzer.getLiveRange(AI1).overlaps( 815 StackLifetimeAnalyzer.getLiveRange(AI2)); 816 }; 817 auto GetAllocaSize = [&](const AllocaInfo &A) { 818 std::optional<TypeSize> RetSize = A.Alloca->getAllocationSize(DL); 819 assert(RetSize && "Variable Length Arrays (VLA) are not supported.\n"); 820 assert(!RetSize->isScalable() && "Scalable vectors are not yet supported"); 821 return RetSize->getFixedValue(); 822 }; 823 // Put larger allocas in the front. So the larger allocas have higher 824 // priority to merge, which can save more space potentially. Also each 825 // AllocaSet would be ordered. So we can get the largest Alloca in one 826 // AllocaSet easily. 827 sort(FrameData.Allocas, [&](const auto &Iter1, const auto &Iter2) { 828 return GetAllocaSize(Iter1) > GetAllocaSize(Iter2); 829 }); 830 for (const auto &A : FrameData.Allocas) { 831 AllocaInst *Alloca = A.Alloca; 832 bool Merged = false; 833 // Try to find if the Alloca is not inferenced with any existing 834 // NonOverlappedAllocaSet. If it is true, insert the alloca to that 835 // NonOverlappedAllocaSet. 836 for (auto &AllocaSet : NonOverlapedAllocas) { 837 assert(!AllocaSet.empty() && "Processing Alloca Set is not empty.\n"); 838 bool NoInference = none_of(AllocaSet, [&](auto Iter) { 839 return IsAllocaInferenre(Alloca, Iter); 840 }); 841 // If the alignment of A is multiple of the alignment of B, the address 842 // of A should satisfy the requirement for aligning for B. 843 // 844 // There may be other more fine-grained strategies to handle the alignment 845 // infomation during the merging process. But it seems hard to handle 846 // these strategies and benefit little. 847 bool Alignable = [&]() -> bool { 848 auto *LargestAlloca = *AllocaSet.begin(); 849 return LargestAlloca->getAlign().value() % Alloca->getAlign().value() == 850 0; 851 }(); 852 bool CouldMerge = NoInference && Alignable; 853 if (!CouldMerge) 854 continue; 855 AllocaSet.push_back(Alloca); 856 Merged = true; 857 break; 858 } 859 if (!Merged) { 860 NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca)); 861 } 862 } 863 // Recover the default target destination for each Switch statement 864 // reserved. 865 for (auto SwitchAndDefaultDest : DefaultSuspendDest) { 866 SwitchInst *SWI = SwitchAndDefaultDest.first; 867 BasicBlock *DestBB = SwitchAndDefaultDest.second; 868 SWI->setDefaultDest(DestBB); 869 } 870 // This Debug Info could tell us which allocas are merged into one slot. 871 LLVM_DEBUG(for (auto &AllocaSet 872 : NonOverlapedAllocas) { 873 if (AllocaSet.size() > 1) { 874 dbgs() << "In Function:" << F.getName() << "\n"; 875 dbgs() << "Find Union Set " 876 << "\n"; 877 dbgs() << "\tAllocas are \n"; 878 for (auto Alloca : AllocaSet) 879 dbgs() << "\t\t" << *Alloca << "\n"; 880 } 881 }); 882 } 883 884 void FrameTypeBuilder::finish(StructType *Ty) { 885 assert(!IsFinished && "already finished!"); 886 887 // Prepare the optimal-layout field array. 888 // The Id in the layout field is a pointer to our Field for it. 889 SmallVector<OptimizedStructLayoutField, 8> LayoutFields; 890 LayoutFields.reserve(Fields.size()); 891 for (auto &Field : Fields) { 892 LayoutFields.emplace_back(&Field, Field.Size, Field.Alignment, 893 Field.Offset); 894 } 895 896 // Perform layout. 897 auto SizeAndAlign = performOptimizedStructLayout(LayoutFields); 898 StructSize = SizeAndAlign.first; 899 StructAlign = SizeAndAlign.second; 900 901 auto getField = [](const OptimizedStructLayoutField &LayoutField) -> Field & { 902 return *static_cast<Field *>(const_cast<void*>(LayoutField.Id)); 903 }; 904 905 // We need to produce a packed struct type if there's a field whose 906 // assigned offset isn't a multiple of its natural type alignment. 907 bool Packed = [&] { 908 for (auto &LayoutField : LayoutFields) { 909 auto &F = getField(LayoutField); 910 if (!isAligned(F.TyAlignment, LayoutField.Offset)) 911 return true; 912 } 913 return false; 914 }(); 915 916 // Build the struct body. 917 SmallVector<Type*, 16> FieldTypes; 918 FieldTypes.reserve(LayoutFields.size() * 3 / 2); 919 uint64_t LastOffset = 0; 920 for (auto &LayoutField : LayoutFields) { 921 auto &F = getField(LayoutField); 922 923 auto Offset = LayoutField.Offset; 924 925 // Add a padding field if there's a padding gap and we're either 926 // building a packed struct or the padding gap is more than we'd 927 // get from aligning to the field type's natural alignment. 928 assert(Offset >= LastOffset); 929 if (Offset != LastOffset) { 930 if (Packed || alignTo(LastOffset, F.TyAlignment) != Offset) 931 FieldTypes.push_back(ArrayType::get(Type::getInt8Ty(Context), 932 Offset - LastOffset)); 933 } 934 935 F.Offset = Offset; 936 F.LayoutFieldIndex = FieldTypes.size(); 937 938 FieldTypes.push_back(F.Ty); 939 if (F.DynamicAlignBuffer) { 940 FieldTypes.push_back( 941 ArrayType::get(Type::getInt8Ty(Context), F.DynamicAlignBuffer)); 942 } 943 LastOffset = Offset + F.Size; 944 } 945 946 Ty->setBody(FieldTypes, Packed); 947 948 #ifndef NDEBUG 949 // Check that the IR layout matches the offsets we expect. 950 auto Layout = DL.getStructLayout(Ty); 951 for (auto &F : Fields) { 952 assert(Ty->getElementType(F.LayoutFieldIndex) == F.Ty); 953 assert(Layout->getElementOffset(F.LayoutFieldIndex) == F.Offset); 954 } 955 #endif 956 957 IsFinished = true; 958 } 959 960 static void cacheDIVar(FrameDataInfo &FrameData, 961 DenseMap<Value *, DILocalVariable *> &DIVarCache) { 962 for (auto *V : FrameData.getAllDefs()) { 963 if (DIVarCache.contains(V)) 964 continue; 965 966 SmallVector<DbgDeclareInst *, 1> DDIs; 967 SmallVector<DPValue *, 1> DPVs; 968 findDbgDeclares(DDIs, V, &DPVs); 969 auto CacheIt = [&DIVarCache, V](auto &Container) { 970 auto *I = llvm::find_if(Container, [](auto *DDI) { 971 return DDI->getExpression()->getNumElements() == 0; 972 }); 973 if (I != Container.end()) 974 DIVarCache.insert({V, (*I)->getVariable()}); 975 }; 976 CacheIt(DDIs); 977 CacheIt(DPVs); 978 } 979 } 980 981 /// Create name for Type. It uses MDString to store new created string to 982 /// avoid memory leak. 983 static StringRef solveTypeName(Type *Ty) { 984 if (Ty->isIntegerTy()) { 985 // The longest name in common may be '__int_128', which has 9 bits. 986 SmallString<16> Buffer; 987 raw_svector_ostream OS(Buffer); 988 OS << "__int_" << cast<IntegerType>(Ty)->getBitWidth(); 989 auto *MDName = MDString::get(Ty->getContext(), OS.str()); 990 return MDName->getString(); 991 } 992 993 if (Ty->isFloatingPointTy()) { 994 if (Ty->isFloatTy()) 995 return "__float_"; 996 if (Ty->isDoubleTy()) 997 return "__double_"; 998 return "__floating_type_"; 999 } 1000 1001 if (Ty->isPointerTy()) 1002 return "PointerType"; 1003 1004 if (Ty->isStructTy()) { 1005 if (!cast<StructType>(Ty)->hasName()) 1006 return "__LiteralStructType_"; 1007 1008 auto Name = Ty->getStructName(); 1009 1010 SmallString<16> Buffer(Name); 1011 for (auto &Iter : Buffer) 1012 if (Iter == '.' || Iter == ':') 1013 Iter = '_'; 1014 auto *MDName = MDString::get(Ty->getContext(), Buffer.str()); 1015 return MDName->getString(); 1016 } 1017 1018 return "UnknownType"; 1019 } 1020 1021 static DIType *solveDIType(DIBuilder &Builder, Type *Ty, 1022 const DataLayout &Layout, DIScope *Scope, 1023 unsigned LineNum, 1024 DenseMap<Type *, DIType *> &DITypeCache) { 1025 if (DIType *DT = DITypeCache.lookup(Ty)) 1026 return DT; 1027 1028 StringRef Name = solveTypeName(Ty); 1029 1030 DIType *RetType = nullptr; 1031 1032 if (Ty->isIntegerTy()) { 1033 auto BitWidth = cast<IntegerType>(Ty)->getBitWidth(); 1034 RetType = Builder.createBasicType(Name, BitWidth, dwarf::DW_ATE_signed, 1035 llvm::DINode::FlagArtificial); 1036 } else if (Ty->isFloatingPointTy()) { 1037 RetType = Builder.createBasicType(Name, Layout.getTypeSizeInBits(Ty), 1038 dwarf::DW_ATE_float, 1039 llvm::DINode::FlagArtificial); 1040 } else if (Ty->isPointerTy()) { 1041 // Construct PointerType points to null (aka void *) instead of exploring 1042 // pointee type to avoid infinite search problem. For example, we would be 1043 // in trouble if we traverse recursively: 1044 // 1045 // struct Node { 1046 // Node* ptr; 1047 // }; 1048 RetType = 1049 Builder.createPointerType(nullptr, Layout.getTypeSizeInBits(Ty), 1050 Layout.getABITypeAlign(Ty).value() * CHAR_BIT, 1051 /*DWARFAddressSpace=*/std::nullopt, Name); 1052 } else if (Ty->isStructTy()) { 1053 auto *DIStruct = Builder.createStructType( 1054 Scope, Name, Scope->getFile(), LineNum, Layout.getTypeSizeInBits(Ty), 1055 Layout.getPrefTypeAlign(Ty).value() * CHAR_BIT, 1056 llvm::DINode::FlagArtificial, nullptr, llvm::DINodeArray()); 1057 1058 auto *StructTy = cast<StructType>(Ty); 1059 SmallVector<Metadata *, 16> Elements; 1060 for (unsigned I = 0; I < StructTy->getNumElements(); I++) { 1061 DIType *DITy = solveDIType(Builder, StructTy->getElementType(I), Layout, 1062 Scope, LineNum, DITypeCache); 1063 assert(DITy); 1064 Elements.push_back(Builder.createMemberType( 1065 Scope, DITy->getName(), Scope->getFile(), LineNum, 1066 DITy->getSizeInBits(), DITy->getAlignInBits(), 1067 Layout.getStructLayout(StructTy)->getElementOffsetInBits(I), 1068 llvm::DINode::FlagArtificial, DITy)); 1069 } 1070 1071 Builder.replaceArrays(DIStruct, Builder.getOrCreateArray(Elements)); 1072 1073 RetType = DIStruct; 1074 } else { 1075 LLVM_DEBUG(dbgs() << "Unresolved Type: " << *Ty << "\n"); 1076 TypeSize Size = Layout.getTypeSizeInBits(Ty); 1077 auto *CharSizeType = Builder.createBasicType( 1078 Name, 8, dwarf::DW_ATE_unsigned_char, llvm::DINode::FlagArtificial); 1079 1080 if (Size <= 8) 1081 RetType = CharSizeType; 1082 else { 1083 if (Size % 8 != 0) 1084 Size = TypeSize::getFixed(Size + 8 - (Size % 8)); 1085 1086 RetType = Builder.createArrayType( 1087 Size, Layout.getPrefTypeAlign(Ty).value(), CharSizeType, 1088 Builder.getOrCreateArray(Builder.getOrCreateSubrange(0, Size / 8))); 1089 } 1090 } 1091 1092 DITypeCache.insert({Ty, RetType}); 1093 return RetType; 1094 } 1095 1096 /// Build artificial debug info for C++ coroutine frames to allow users to 1097 /// inspect the contents of the frame directly 1098 /// 1099 /// Create Debug information for coroutine frame with debug name "__coro_frame". 1100 /// The debug information for the fields of coroutine frame is constructed from 1101 /// the following way: 1102 /// 1. For all the value in the Frame, we search the use of dbg.declare to find 1103 /// the corresponding debug variables for the value. If we can find the 1104 /// debug variable, we can get full and accurate debug information. 1105 /// 2. If we can't get debug information in step 1 and 2, we could only try to 1106 /// build the DIType by Type. We did this in solveDIType. We only handle 1107 /// integer, float, double, integer type and struct type for now. 1108 static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, 1109 FrameDataInfo &FrameData) { 1110 DISubprogram *DIS = F.getSubprogram(); 1111 // If there is no DISubprogram for F, it implies the Function are not compiled 1112 // with debug info. So we also don't need to generate debug info for the frame 1113 // neither. 1114 if (!DIS || !DIS->getUnit() || 1115 !dwarf::isCPlusPlus( 1116 (dwarf::SourceLanguage)DIS->getUnit()->getSourceLanguage())) 1117 return; 1118 1119 assert(Shape.ABI == coro::ABI::Switch && 1120 "We could only build debug infomation for C++ coroutine now.\n"); 1121 1122 DIBuilder DBuilder(*F.getParent(), /*AllowUnresolved*/ false); 1123 1124 AllocaInst *PromiseAlloca = Shape.getPromiseAlloca(); 1125 assert(PromiseAlloca && 1126 "Coroutine with switch ABI should own Promise alloca"); 1127 1128 SmallVector<DbgDeclareInst *, 1> DIs; 1129 SmallVector<DPValue *, 1> DPVs; 1130 findDbgDeclares(DIs, PromiseAlloca, &DPVs); 1131 1132 DILocalVariable *PromiseDIVariable = nullptr; 1133 DILocation *DILoc = nullptr; 1134 if (!DIs.empty()) { 1135 DbgDeclareInst *PromiseDDI = DIs.front(); 1136 PromiseDIVariable = PromiseDDI->getVariable(); 1137 DILoc = PromiseDDI->getDebugLoc().get(); 1138 } else if (!DPVs.empty()) { 1139 DPValue *PromiseDPV = DPVs.front(); 1140 PromiseDIVariable = PromiseDPV->getVariable(); 1141 DILoc = PromiseDPV->getDebugLoc().get(); 1142 } else { 1143 return; 1144 } 1145 1146 DILocalScope *PromiseDIScope = PromiseDIVariable->getScope(); 1147 DIFile *DFile = PromiseDIScope->getFile(); 1148 unsigned LineNum = PromiseDIVariable->getLine(); 1149 1150 DICompositeType *FrameDITy = DBuilder.createStructType( 1151 DIS->getUnit(), Twine(F.getName() + ".coro_frame_ty").str(), 1152 DFile, LineNum, Shape.FrameSize * 8, 1153 Shape.FrameAlign.value() * 8, llvm::DINode::FlagArtificial, nullptr, 1154 llvm::DINodeArray()); 1155 StructType *FrameTy = Shape.FrameTy; 1156 SmallVector<Metadata *, 16> Elements; 1157 DataLayout Layout = F.getParent()->getDataLayout(); 1158 1159 DenseMap<Value *, DILocalVariable *> DIVarCache; 1160 cacheDIVar(FrameData, DIVarCache); 1161 1162 unsigned ResumeIndex = coro::Shape::SwitchFieldIndex::Resume; 1163 unsigned DestroyIndex = coro::Shape::SwitchFieldIndex::Destroy; 1164 unsigned IndexIndex = Shape.SwitchLowering.IndexField; 1165 1166 DenseMap<unsigned, StringRef> NameCache; 1167 NameCache.insert({ResumeIndex, "__resume_fn"}); 1168 NameCache.insert({DestroyIndex, "__destroy_fn"}); 1169 NameCache.insert({IndexIndex, "__coro_index"}); 1170 1171 Type *ResumeFnTy = FrameTy->getElementType(ResumeIndex), 1172 *DestroyFnTy = FrameTy->getElementType(DestroyIndex), 1173 *IndexTy = FrameTy->getElementType(IndexIndex); 1174 1175 DenseMap<unsigned, DIType *> TyCache; 1176 TyCache.insert( 1177 {ResumeIndex, DBuilder.createPointerType( 1178 nullptr, Layout.getTypeSizeInBits(ResumeFnTy))}); 1179 TyCache.insert( 1180 {DestroyIndex, DBuilder.createPointerType( 1181 nullptr, Layout.getTypeSizeInBits(DestroyFnTy))}); 1182 1183 /// FIXME: If we fill the field `SizeInBits` with the actual size of 1184 /// __coro_index in bits, then __coro_index wouldn't show in the debugger. 1185 TyCache.insert({IndexIndex, DBuilder.createBasicType( 1186 "__coro_index", 1187 (Layout.getTypeSizeInBits(IndexTy) < 8) 1188 ? 8 1189 : Layout.getTypeSizeInBits(IndexTy), 1190 dwarf::DW_ATE_unsigned_char)}); 1191 1192 for (auto *V : FrameData.getAllDefs()) { 1193 if (!DIVarCache.contains(V)) 1194 continue; 1195 1196 auto Index = FrameData.getFieldIndex(V); 1197 1198 NameCache.insert({Index, DIVarCache[V]->getName()}); 1199 TyCache.insert({Index, DIVarCache[V]->getType()}); 1200 } 1201 1202 // Cache from index to (Align, Offset Pair) 1203 DenseMap<unsigned, std::pair<unsigned, unsigned>> OffsetCache; 1204 // The Align and Offset of Resume function and Destroy function are fixed. 1205 OffsetCache.insert({ResumeIndex, {8, 0}}); 1206 OffsetCache.insert({DestroyIndex, {8, 8}}); 1207 OffsetCache.insert( 1208 {IndexIndex, 1209 {Shape.SwitchLowering.IndexAlign, Shape.SwitchLowering.IndexOffset}}); 1210 1211 for (auto *V : FrameData.getAllDefs()) { 1212 auto Index = FrameData.getFieldIndex(V); 1213 1214 OffsetCache.insert( 1215 {Index, {FrameData.getAlign(V).value(), FrameData.getOffset(V)}}); 1216 } 1217 1218 DenseMap<Type *, DIType *> DITypeCache; 1219 // This counter is used to avoid same type names. e.g., there would be 1220 // many i32 and i64 types in one coroutine. And we would use i32_0 and 1221 // i32_1 to avoid the same type. Since it makes no sense the name of the 1222 // fields confilicts with each other. 1223 unsigned UnknownTypeNum = 0; 1224 for (unsigned Index = 0; Index < FrameTy->getNumElements(); Index++) { 1225 if (!OffsetCache.contains(Index)) 1226 continue; 1227 1228 std::string Name; 1229 uint64_t SizeInBits; 1230 uint32_t AlignInBits; 1231 uint64_t OffsetInBits; 1232 DIType *DITy = nullptr; 1233 1234 Type *Ty = FrameTy->getElementType(Index); 1235 assert(Ty->isSized() && "We can't handle type which is not sized.\n"); 1236 SizeInBits = Layout.getTypeSizeInBits(Ty).getFixedValue(); 1237 AlignInBits = OffsetCache[Index].first * 8; 1238 OffsetInBits = OffsetCache[Index].second * 8; 1239 1240 if (NameCache.contains(Index)) { 1241 Name = NameCache[Index].str(); 1242 DITy = TyCache[Index]; 1243 } else { 1244 DITy = solveDIType(DBuilder, Ty, Layout, FrameDITy, LineNum, DITypeCache); 1245 assert(DITy && "SolveDIType shouldn't return nullptr.\n"); 1246 Name = DITy->getName().str(); 1247 Name += "_" + std::to_string(UnknownTypeNum); 1248 UnknownTypeNum++; 1249 } 1250 1251 Elements.push_back(DBuilder.createMemberType( 1252 FrameDITy, Name, DFile, LineNum, SizeInBits, AlignInBits, OffsetInBits, 1253 llvm::DINode::FlagArtificial, DITy)); 1254 } 1255 1256 DBuilder.replaceArrays(FrameDITy, DBuilder.getOrCreateArray(Elements)); 1257 1258 auto *FrameDIVar = DBuilder.createAutoVariable(PromiseDIScope, "__coro_frame", 1259 DFile, LineNum, FrameDITy, 1260 true, DINode::FlagArtificial); 1261 assert(FrameDIVar->isValidLocationForIntrinsic(DILoc)); 1262 1263 // Subprogram would have ContainedNodes field which records the debug 1264 // variables it contained. So we need to add __coro_frame to the 1265 // ContainedNodes of it. 1266 // 1267 // If we don't add __coro_frame to the RetainedNodes, user may get 1268 // `no symbol __coro_frame in context` rather than `__coro_frame` 1269 // is optimized out, which is more precise. 1270 if (auto *SubProgram = dyn_cast<DISubprogram>(PromiseDIScope)) { 1271 auto RetainedNodes = SubProgram->getRetainedNodes(); 1272 SmallVector<Metadata *, 32> RetainedNodesVec(RetainedNodes.begin(), 1273 RetainedNodes.end()); 1274 RetainedNodesVec.push_back(FrameDIVar); 1275 SubProgram->replaceOperandWith( 1276 7, (MDTuple::get(F.getContext(), RetainedNodesVec))); 1277 } 1278 1279 if (UseNewDbgInfoFormat) { 1280 DPValue *NewDPV = new DPValue(ValueAsMetadata::get(Shape.FramePtr), 1281 FrameDIVar, DBuilder.createExpression(), 1282 DILoc, DPValue::LocationType::Declare); 1283 BasicBlock::iterator It = Shape.getInsertPtAfterFramePtr(); 1284 It->getParent()->insertDPValueBefore(NewDPV, It); 1285 } else { 1286 DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar, 1287 DBuilder.createExpression(), DILoc, 1288 &*Shape.getInsertPtAfterFramePtr()); 1289 } 1290 } 1291 1292 // Build a struct that will keep state for an active coroutine. 1293 // struct f.frame { 1294 // ResumeFnTy ResumeFnAddr; 1295 // ResumeFnTy DestroyFnAddr; 1296 // int ResumeIndex; 1297 // ... promise (if present) ... 1298 // ... spills ... 1299 // }; 1300 static StructType *buildFrameType(Function &F, coro::Shape &Shape, 1301 FrameDataInfo &FrameData) { 1302 LLVMContext &C = F.getContext(); 1303 const DataLayout &DL = F.getParent()->getDataLayout(); 1304 StructType *FrameTy = [&] { 1305 SmallString<32> Name(F.getName()); 1306 Name.append(".Frame"); 1307 return StructType::create(C, Name); 1308 }(); 1309 1310 // We will use this value to cap the alignment of spilled values. 1311 std::optional<Align> MaxFrameAlignment; 1312 if (Shape.ABI == coro::ABI::Async) 1313 MaxFrameAlignment = Shape.AsyncLowering.getContextAlignment(); 1314 FrameTypeBuilder B(C, DL, MaxFrameAlignment); 1315 1316 AllocaInst *PromiseAlloca = Shape.getPromiseAlloca(); 1317 std::optional<FieldIDType> SwitchIndexFieldId; 1318 1319 if (Shape.ABI == coro::ABI::Switch) { 1320 auto *FnPtrTy = PointerType::getUnqual(C); 1321 1322 // Add header fields for the resume and destroy functions. 1323 // We can rely on these being perfectly packed. 1324 (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true); 1325 (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true); 1326 1327 // PromiseAlloca field needs to be explicitly added here because it's 1328 // a header field with a fixed offset based on its alignment. Hence it 1329 // needs special handling and cannot be added to FrameData.Allocas. 1330 if (PromiseAlloca) 1331 FrameData.setFieldIndex( 1332 PromiseAlloca, B.addFieldForAlloca(PromiseAlloca, /*header*/ true)); 1333 1334 // Add a field to store the suspend index. This doesn't need to 1335 // be in the header. 1336 unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size())); 1337 Type *IndexType = Type::getIntNTy(C, IndexBits); 1338 1339 SwitchIndexFieldId = B.addField(IndexType, std::nullopt); 1340 } else { 1341 assert(PromiseAlloca == nullptr && "lowering doesn't support promises"); 1342 } 1343 1344 // Because multiple allocas may own the same field slot, 1345 // we add allocas to field here. 1346 B.addFieldForAllocas(F, FrameData, Shape); 1347 // Add PromiseAlloca to Allocas list so that 1348 // 1. updateLayoutIndex could update its index after 1349 // `performOptimizedStructLayout` 1350 // 2. it is processed in insertSpills. 1351 if (Shape.ABI == coro::ABI::Switch && PromiseAlloca) 1352 // We assume that the promise alloca won't be modified before 1353 // CoroBegin and no alias will be create before CoroBegin. 1354 FrameData.Allocas.emplace_back( 1355 PromiseAlloca, DenseMap<Instruction *, std::optional<APInt>>{}, false); 1356 // Create an entry for every spilled value. 1357 for (auto &S : FrameData.Spills) { 1358 Type *FieldType = S.first->getType(); 1359 // For byval arguments, we need to store the pointed value in the frame, 1360 // instead of the pointer itself. 1361 if (const Argument *A = dyn_cast<Argument>(S.first)) 1362 if (A->hasByValAttr()) 1363 FieldType = A->getParamByValType(); 1364 FieldIDType Id = B.addField(FieldType, std::nullopt, false /*header*/, 1365 true /*IsSpillOfValue*/); 1366 FrameData.setFieldIndex(S.first, Id); 1367 } 1368 1369 B.finish(FrameTy); 1370 FrameData.updateLayoutIndex(B); 1371 Shape.FrameAlign = B.getStructAlign(); 1372 Shape.FrameSize = B.getStructSize(); 1373 1374 switch (Shape.ABI) { 1375 case coro::ABI::Switch: { 1376 // In the switch ABI, remember the switch-index field. 1377 auto IndexField = B.getLayoutField(*SwitchIndexFieldId); 1378 Shape.SwitchLowering.IndexField = IndexField.LayoutFieldIndex; 1379 Shape.SwitchLowering.IndexAlign = IndexField.Alignment.value(); 1380 Shape.SwitchLowering.IndexOffset = IndexField.Offset; 1381 1382 // Also round the frame size up to a multiple of its alignment, as is 1383 // generally expected in C/C++. 1384 Shape.FrameSize = alignTo(Shape.FrameSize, Shape.FrameAlign); 1385 break; 1386 } 1387 1388 // In the retcon ABI, remember whether the frame is inline in the storage. 1389 case coro::ABI::Retcon: 1390 case coro::ABI::RetconOnce: { 1391 auto Id = Shape.getRetconCoroId(); 1392 Shape.RetconLowering.IsFrameInlineInStorage 1393 = (B.getStructSize() <= Id->getStorageSize() && 1394 B.getStructAlign() <= Id->getStorageAlignment()); 1395 break; 1396 } 1397 case coro::ABI::Async: { 1398 Shape.AsyncLowering.FrameOffset = 1399 alignTo(Shape.AsyncLowering.ContextHeaderSize, Shape.FrameAlign); 1400 // Also make the final context size a multiple of the context alignment to 1401 // make allocation easier for allocators. 1402 Shape.AsyncLowering.ContextSize = 1403 alignTo(Shape.AsyncLowering.FrameOffset + Shape.FrameSize, 1404 Shape.AsyncLowering.getContextAlignment()); 1405 if (Shape.AsyncLowering.getContextAlignment() < Shape.FrameAlign) { 1406 report_fatal_error( 1407 "The alignment requirment of frame variables cannot be higher than " 1408 "the alignment of the async function context"); 1409 } 1410 break; 1411 } 1412 } 1413 1414 return FrameTy; 1415 } 1416 1417 // We use a pointer use visitor to track how an alloca is being used. 1418 // The goal is to be able to answer the following three questions: 1419 // 1. Should this alloca be allocated on the frame instead. 1420 // 2. Could the content of the alloca be modified prior to CoroBegn, which would 1421 // require copying the data from alloca to the frame after CoroBegin. 1422 // 3. Is there any alias created for this alloca prior to CoroBegin, but used 1423 // after CoroBegin. In that case, we will need to recreate the alias after 1424 // CoroBegin based off the frame. To answer question 1, we track two things: 1425 // a. List of all BasicBlocks that use this alloca or any of the aliases of 1426 // the alloca. In the end, we check if there exists any two basic blocks that 1427 // cross suspension points. If so, this alloca must be put on the frame. b. 1428 // Whether the alloca or any alias of the alloca is escaped at some point, 1429 // either by storing the address somewhere, or the address is used in a 1430 // function call that might capture. If it's ever escaped, this alloca must be 1431 // put on the frame conservatively. 1432 // To answer quetion 2, we track through the variable MayWriteBeforeCoroBegin. 1433 // Whenever a potential write happens, either through a store instruction, a 1434 // function call or any of the memory intrinsics, we check whether this 1435 // instruction is prior to CoroBegin. To answer question 3, we track the offsets 1436 // of all aliases created for the alloca prior to CoroBegin but used after 1437 // CoroBegin. std::optional is used to be able to represent the case when the 1438 // offset is unknown (e.g. when you have a PHINode that takes in different 1439 // offset values). We cannot handle unknown offsets and will assert. This is the 1440 // potential issue left out. An ideal solution would likely require a 1441 // significant redesign. 1442 namespace { 1443 struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> { 1444 using Base = PtrUseVisitor<AllocaUseVisitor>; 1445 AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT, 1446 const CoroBeginInst &CB, const SuspendCrossingInfo &Checker, 1447 bool ShouldUseLifetimeStartInfo) 1448 : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker), 1449 ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {} 1450 1451 void visit(Instruction &I) { 1452 Users.insert(&I); 1453 Base::visit(I); 1454 // If the pointer is escaped prior to CoroBegin, we have to assume it would 1455 // be written into before CoroBegin as well. 1456 if (PI.isEscaped() && !DT.dominates(&CoroBegin, PI.getEscapingInst())) { 1457 MayWriteBeforeCoroBegin = true; 1458 } 1459 } 1460 // We need to provide this overload as PtrUseVisitor uses a pointer based 1461 // visiting function. 1462 void visit(Instruction *I) { return visit(*I); } 1463 1464 void visitPHINode(PHINode &I) { 1465 enqueueUsers(I); 1466 handleAlias(I); 1467 } 1468 1469 void visitSelectInst(SelectInst &I) { 1470 enqueueUsers(I); 1471 handleAlias(I); 1472 } 1473 1474 void visitStoreInst(StoreInst &SI) { 1475 // Regardless whether the alias of the alloca is the value operand or the 1476 // pointer operand, we need to assume the alloca is been written. 1477 handleMayWrite(SI); 1478 1479 if (SI.getValueOperand() != U->get()) 1480 return; 1481 1482 // We are storing the pointer into a memory location, potentially escaping. 1483 // As an optimization, we try to detect simple cases where it doesn't 1484 // actually escape, for example: 1485 // %ptr = alloca .. 1486 // %addr = alloca .. 1487 // store %ptr, %addr 1488 // %x = load %addr 1489 // .. 1490 // If %addr is only used by loading from it, we could simply treat %x as 1491 // another alias of %ptr, and not considering %ptr being escaped. 1492 auto IsSimpleStoreThenLoad = [&]() { 1493 auto *AI = dyn_cast<AllocaInst>(SI.getPointerOperand()); 1494 // If the memory location we are storing to is not an alloca, it 1495 // could be an alias of some other memory locations, which is difficult 1496 // to analyze. 1497 if (!AI) 1498 return false; 1499 // StoreAliases contains aliases of the memory location stored into. 1500 SmallVector<Instruction *, 4> StoreAliases = {AI}; 1501 while (!StoreAliases.empty()) { 1502 Instruction *I = StoreAliases.pop_back_val(); 1503 for (User *U : I->users()) { 1504 // If we are loading from the memory location, we are creating an 1505 // alias of the original pointer. 1506 if (auto *LI = dyn_cast<LoadInst>(U)) { 1507 enqueueUsers(*LI); 1508 handleAlias(*LI); 1509 continue; 1510 } 1511 // If we are overriding the memory location, the pointer certainly 1512 // won't escape. 1513 if (auto *S = dyn_cast<StoreInst>(U)) 1514 if (S->getPointerOperand() == I) 1515 continue; 1516 if (auto *II = dyn_cast<IntrinsicInst>(U)) 1517 if (II->isLifetimeStartOrEnd()) 1518 continue; 1519 // BitCastInst creats aliases of the memory location being stored 1520 // into. 1521 if (auto *BI = dyn_cast<BitCastInst>(U)) { 1522 StoreAliases.push_back(BI); 1523 continue; 1524 } 1525 return false; 1526 } 1527 } 1528 1529 return true; 1530 }; 1531 1532 if (!IsSimpleStoreThenLoad()) 1533 PI.setEscaped(&SI); 1534 } 1535 1536 // All mem intrinsics modify the data. 1537 void visitMemIntrinsic(MemIntrinsic &MI) { handleMayWrite(MI); } 1538 1539 void visitBitCastInst(BitCastInst &BC) { 1540 Base::visitBitCastInst(BC); 1541 handleAlias(BC); 1542 } 1543 1544 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) { 1545 Base::visitAddrSpaceCastInst(ASC); 1546 handleAlias(ASC); 1547 } 1548 1549 void visitGetElementPtrInst(GetElementPtrInst &GEPI) { 1550 // The base visitor will adjust Offset accordingly. 1551 Base::visitGetElementPtrInst(GEPI); 1552 handleAlias(GEPI); 1553 } 1554 1555 void visitIntrinsicInst(IntrinsicInst &II) { 1556 // When we found the lifetime markers refers to a 1557 // subrange of the original alloca, ignore the lifetime 1558 // markers to avoid misleading the analysis. 1559 if (II.getIntrinsicID() != Intrinsic::lifetime_start || !IsOffsetKnown || 1560 !Offset.isZero()) 1561 return Base::visitIntrinsicInst(II); 1562 LifetimeStarts.insert(&II); 1563 } 1564 1565 void visitCallBase(CallBase &CB) { 1566 for (unsigned Op = 0, OpCount = CB.arg_size(); Op < OpCount; ++Op) 1567 if (U->get() == CB.getArgOperand(Op) && !CB.doesNotCapture(Op)) 1568 PI.setEscaped(&CB); 1569 handleMayWrite(CB); 1570 } 1571 1572 bool getShouldLiveOnFrame() const { 1573 if (!ShouldLiveOnFrame) 1574 ShouldLiveOnFrame = computeShouldLiveOnFrame(); 1575 return *ShouldLiveOnFrame; 1576 } 1577 1578 bool getMayWriteBeforeCoroBegin() const { return MayWriteBeforeCoroBegin; } 1579 1580 DenseMap<Instruction *, std::optional<APInt>> getAliasesCopy() const { 1581 assert(getShouldLiveOnFrame() && "This method should only be called if the " 1582 "alloca needs to live on the frame."); 1583 for (const auto &P : AliasOffetMap) 1584 if (!P.second) 1585 report_fatal_error("Unable to handle an alias with unknown offset " 1586 "created before CoroBegin."); 1587 return AliasOffetMap; 1588 } 1589 1590 private: 1591 const DominatorTree &DT; 1592 const CoroBeginInst &CoroBegin; 1593 const SuspendCrossingInfo &Checker; 1594 // All alias to the original AllocaInst, created before CoroBegin and used 1595 // after CoroBegin. Each entry contains the instruction and the offset in the 1596 // original Alloca. They need to be recreated after CoroBegin off the frame. 1597 DenseMap<Instruction *, std::optional<APInt>> AliasOffetMap{}; 1598 SmallPtrSet<Instruction *, 4> Users{}; 1599 SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{}; 1600 bool MayWriteBeforeCoroBegin{false}; 1601 bool ShouldUseLifetimeStartInfo{true}; 1602 1603 mutable std::optional<bool> ShouldLiveOnFrame{}; 1604 1605 bool computeShouldLiveOnFrame() const { 1606 // If lifetime information is available, we check it first since it's 1607 // more precise. We look at every pair of lifetime.start intrinsic and 1608 // every basic block that uses the pointer to see if they cross suspension 1609 // points. The uses cover both direct uses as well as indirect uses. 1610 if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) { 1611 for (auto *I : Users) 1612 for (auto *S : LifetimeStarts) 1613 if (Checker.isDefinitionAcrossSuspend(*S, I)) 1614 return true; 1615 // Addresses are guaranteed to be identical after every lifetime.start so 1616 // we cannot use the local stack if the address escaped and there is a 1617 // suspend point between lifetime markers. This should also cover the 1618 // case of a single lifetime.start intrinsic in a loop with suspend point. 1619 if (PI.isEscaped()) { 1620 for (auto *A : LifetimeStarts) { 1621 for (auto *B : LifetimeStarts) { 1622 if (Checker.hasPathOrLoopCrossingSuspendPoint(A->getParent(), 1623 B->getParent())) 1624 return true; 1625 } 1626 } 1627 } 1628 return false; 1629 } 1630 // FIXME: Ideally the isEscaped check should come at the beginning. 1631 // However there are a few loose ends that need to be fixed first before 1632 // we can do that. We need to make sure we are not over-conservative, so 1633 // that the data accessed in-between await_suspend and symmetric transfer 1634 // is always put on the stack, and also data accessed after coro.end is 1635 // always put on the stack (esp the return object). To fix that, we need 1636 // to: 1637 // 1) Potentially treat sret as nocapture in calls 1638 // 2) Special handle the return object and put it on the stack 1639 // 3) Utilize lifetime.end intrinsic 1640 if (PI.isEscaped()) 1641 return true; 1642 1643 for (auto *U1 : Users) 1644 for (auto *U2 : Users) 1645 if (Checker.isDefinitionAcrossSuspend(*U1, U2)) 1646 return true; 1647 1648 return false; 1649 } 1650 1651 void handleMayWrite(const Instruction &I) { 1652 if (!DT.dominates(&CoroBegin, &I)) 1653 MayWriteBeforeCoroBegin = true; 1654 } 1655 1656 bool usedAfterCoroBegin(Instruction &I) { 1657 for (auto &U : I.uses()) 1658 if (DT.dominates(&CoroBegin, U)) 1659 return true; 1660 return false; 1661 } 1662 1663 void handleAlias(Instruction &I) { 1664 // We track all aliases created prior to CoroBegin but used after. 1665 // These aliases may need to be recreated after CoroBegin if the alloca 1666 // need to live on the frame. 1667 if (DT.dominates(&CoroBegin, &I) || !usedAfterCoroBegin(I)) 1668 return; 1669 1670 if (!IsOffsetKnown) { 1671 AliasOffetMap[&I].reset(); 1672 } else { 1673 auto Itr = AliasOffetMap.find(&I); 1674 if (Itr == AliasOffetMap.end()) { 1675 AliasOffetMap[&I] = Offset; 1676 } else if (Itr->second && *Itr->second != Offset) { 1677 // If we have seen two different possible values for this alias, we set 1678 // it to empty. 1679 AliasOffetMap[&I].reset(); 1680 } 1681 } 1682 } 1683 }; 1684 } // namespace 1685 1686 // We need to make room to insert a spill after initial PHIs, but before 1687 // catchswitch instruction. Placing it before violates the requirement that 1688 // catchswitch, like all other EHPads must be the first nonPHI in a block. 1689 // 1690 // Split away catchswitch into a separate block and insert in its place: 1691 // 1692 // cleanuppad <InsertPt> cleanupret. 1693 // 1694 // cleanupret instruction will act as an insert point for the spill. 1695 static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) { 1696 BasicBlock *CurrentBlock = CatchSwitch->getParent(); 1697 BasicBlock *NewBlock = CurrentBlock->splitBasicBlock(CatchSwitch); 1698 CurrentBlock->getTerminator()->eraseFromParent(); 1699 1700 auto *CleanupPad = 1701 CleanupPadInst::Create(CatchSwitch->getParentPad(), {}, "", CurrentBlock); 1702 auto *CleanupRet = 1703 CleanupReturnInst::Create(CleanupPad, NewBlock, CurrentBlock); 1704 return CleanupRet; 1705 } 1706 1707 // Replace all alloca and SSA values that are accessed across suspend points 1708 // with GetElementPointer from coroutine frame + loads and stores. Create an 1709 // AllocaSpillBB that will become the new entry block for the resume parts of 1710 // the coroutine: 1711 // 1712 // %hdl = coro.begin(...) 1713 // whatever 1714 // 1715 // becomes: 1716 // 1717 // %hdl = coro.begin(...) 1718 // br label %AllocaSpillBB 1719 // 1720 // AllocaSpillBB: 1721 // ; geps corresponding to allocas that were moved to coroutine frame 1722 // br label PostSpill 1723 // 1724 // PostSpill: 1725 // whatever 1726 // 1727 // 1728 static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { 1729 auto *CB = Shape.CoroBegin; 1730 LLVMContext &C = CB->getContext(); 1731 Function *F = CB->getFunction(); 1732 IRBuilder<> Builder(C); 1733 StructType *FrameTy = Shape.FrameTy; 1734 Value *FramePtr = Shape.FramePtr; 1735 DominatorTree DT(*F); 1736 SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap; 1737 1738 // Create a GEP with the given index into the coroutine frame for the original 1739 // value Orig. Appends an extra 0 index for array-allocas, preserving the 1740 // original type. 1741 auto GetFramePointer = [&](Value *Orig) -> Value * { 1742 FieldIDType Index = FrameData.getFieldIndex(Orig); 1743 SmallVector<Value *, 3> Indices = { 1744 ConstantInt::get(Type::getInt32Ty(C), 0), 1745 ConstantInt::get(Type::getInt32Ty(C), Index), 1746 }; 1747 1748 if (auto *AI = dyn_cast<AllocaInst>(Orig)) { 1749 if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) { 1750 auto Count = CI->getValue().getZExtValue(); 1751 if (Count > 1) { 1752 Indices.push_back(ConstantInt::get(Type::getInt32Ty(C), 0)); 1753 } 1754 } else { 1755 report_fatal_error("Coroutines cannot handle non static allocas yet"); 1756 } 1757 } 1758 1759 auto GEP = cast<GetElementPtrInst>( 1760 Builder.CreateInBoundsGEP(FrameTy, FramePtr, Indices)); 1761 if (auto *AI = dyn_cast<AllocaInst>(Orig)) { 1762 if (FrameData.getDynamicAlign(Orig) != 0) { 1763 assert(FrameData.getDynamicAlign(Orig) == AI->getAlign().value()); 1764 auto *M = AI->getModule(); 1765 auto *IntPtrTy = M->getDataLayout().getIntPtrType(AI->getType()); 1766 auto *PtrValue = Builder.CreatePtrToInt(GEP, IntPtrTy); 1767 auto *AlignMask = 1768 ConstantInt::get(IntPtrTy, AI->getAlign().value() - 1); 1769 PtrValue = Builder.CreateAdd(PtrValue, AlignMask); 1770 PtrValue = Builder.CreateAnd(PtrValue, Builder.CreateNot(AlignMask)); 1771 return Builder.CreateIntToPtr(PtrValue, AI->getType()); 1772 } 1773 // If the type of GEP is not equal to the type of AllocaInst, it implies 1774 // that the AllocaInst may be reused in the Frame slot of other 1775 // AllocaInst. So We cast GEP to the AllocaInst here to re-use 1776 // the Frame storage. 1777 // 1778 // Note: If we change the strategy dealing with alignment, we need to refine 1779 // this casting. 1780 if (GEP->getType() != Orig->getType()) 1781 return Builder.CreateAddrSpaceCast(GEP, Orig->getType(), 1782 Orig->getName() + Twine(".cast")); 1783 } 1784 return GEP; 1785 }; 1786 1787 for (auto const &E : FrameData.Spills) { 1788 Value *Def = E.first; 1789 auto SpillAlignment = Align(FrameData.getAlign(Def)); 1790 // Create a store instruction storing the value into the 1791 // coroutine frame. 1792 BasicBlock::iterator InsertPt; 1793 Type *ByValTy = nullptr; 1794 if (auto *Arg = dyn_cast<Argument>(Def)) { 1795 // For arguments, we will place the store instruction right after 1796 // the coroutine frame pointer instruction, i.e. coro.begin. 1797 InsertPt = Shape.getInsertPtAfterFramePtr(); 1798 1799 // If we're spilling an Argument, make sure we clear 'nocapture' 1800 // from the coroutine function. 1801 Arg->getParent()->removeParamAttr(Arg->getArgNo(), Attribute::NoCapture); 1802 1803 if (Arg->hasByValAttr()) 1804 ByValTy = Arg->getParamByValType(); 1805 } else if (auto *CSI = dyn_cast<AnyCoroSuspendInst>(Def)) { 1806 // Don't spill immediately after a suspend; splitting assumes 1807 // that the suspend will be followed by a branch. 1808 InsertPt = CSI->getParent()->getSingleSuccessor()->getFirstNonPHIIt(); 1809 } else { 1810 auto *I = cast<Instruction>(Def); 1811 if (!DT.dominates(CB, I)) { 1812 // If it is not dominated by CoroBegin, then spill should be 1813 // inserted immediately after CoroFrame is computed. 1814 InsertPt = Shape.getInsertPtAfterFramePtr(); 1815 } else if (auto *II = dyn_cast<InvokeInst>(I)) { 1816 // If we are spilling the result of the invoke instruction, split 1817 // the normal edge and insert the spill in the new block. 1818 auto *NewBB = SplitEdge(II->getParent(), II->getNormalDest()); 1819 InsertPt = NewBB->getTerminator()->getIterator(); 1820 } else if (isa<PHINode>(I)) { 1821 // Skip the PHINodes and EH pads instructions. 1822 BasicBlock *DefBlock = I->getParent(); 1823 if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator())) 1824 InsertPt = splitBeforeCatchSwitch(CSI)->getIterator(); 1825 else 1826 InsertPt = DefBlock->getFirstInsertionPt(); 1827 } else { 1828 assert(!I->isTerminator() && "unexpected terminator"); 1829 // For all other values, the spill is placed immediately after 1830 // the definition. 1831 InsertPt = I->getNextNode()->getIterator(); 1832 } 1833 } 1834 1835 auto Index = FrameData.getFieldIndex(Def); 1836 Builder.SetInsertPoint(InsertPt->getParent(), InsertPt); 1837 auto *G = Builder.CreateConstInBoundsGEP2_32( 1838 FrameTy, FramePtr, 0, Index, Def->getName() + Twine(".spill.addr")); 1839 if (ByValTy) { 1840 // For byval arguments, we need to store the pointed value in the frame, 1841 // instead of the pointer itself. 1842 auto *Value = Builder.CreateLoad(ByValTy, Def); 1843 Builder.CreateAlignedStore(Value, G, SpillAlignment); 1844 } else { 1845 Builder.CreateAlignedStore(Def, G, SpillAlignment); 1846 } 1847 1848 BasicBlock *CurrentBlock = nullptr; 1849 Value *CurrentReload = nullptr; 1850 for (auto *U : E.second) { 1851 // If we have not seen the use block, create a load instruction to reload 1852 // the spilled value from the coroutine frame. Populates the Value pointer 1853 // reference provided with the frame GEP. 1854 if (CurrentBlock != U->getParent()) { 1855 CurrentBlock = U->getParent(); 1856 Builder.SetInsertPoint(CurrentBlock, 1857 CurrentBlock->getFirstInsertionPt()); 1858 1859 auto *GEP = GetFramePointer(E.first); 1860 GEP->setName(E.first->getName() + Twine(".reload.addr")); 1861 if (ByValTy) 1862 CurrentReload = GEP; 1863 else 1864 CurrentReload = Builder.CreateAlignedLoad( 1865 FrameTy->getElementType(FrameData.getFieldIndex(E.first)), GEP, 1866 SpillAlignment, E.first->getName() + Twine(".reload")); 1867 1868 SmallVector<DbgDeclareInst *, 1> DIs; 1869 SmallVector<DPValue *, 1> DPVs; 1870 findDbgDeclares(DIs, Def, &DPVs); 1871 // Try best to find dbg.declare. If the spill is a temp, there may not 1872 // be a direct dbg.declare. Walk up the load chain to find one from an 1873 // alias. 1874 if (F->getSubprogram()) { 1875 auto *CurDef = Def; 1876 while (DIs.empty() && isa<LoadInst>(CurDef)) { 1877 auto *LdInst = cast<LoadInst>(CurDef); 1878 // Only consider ptr to ptr same type load. 1879 if (LdInst->getPointerOperandType() != LdInst->getType()) 1880 break; 1881 CurDef = LdInst->getPointerOperand(); 1882 if (!isa<AllocaInst, LoadInst>(CurDef)) 1883 break; 1884 DIs.clear(); 1885 DPVs.clear(); 1886 findDbgDeclares(DIs, CurDef, &DPVs); 1887 } 1888 } 1889 1890 auto SalvageOne = [&](auto *DDI) { 1891 bool AllowUnresolved = false; 1892 // This dbg.declare is preserved for all coro-split function 1893 // fragments. It will be unreachable in the main function, and 1894 // processed by coro::salvageDebugInfo() by CoroCloner. 1895 if (UseNewDbgInfoFormat) { 1896 DPValue *NewDPV = 1897 new DPValue(ValueAsMetadata::get(CurrentReload), 1898 DDI->getVariable(), DDI->getExpression(), 1899 DDI->getDebugLoc(), DPValue::LocationType::Declare); 1900 Builder.GetInsertPoint()->getParent()->insertDPValueBefore( 1901 NewDPV, Builder.GetInsertPoint()); 1902 } else { 1903 DIBuilder(*CurrentBlock->getParent()->getParent(), AllowUnresolved) 1904 .insertDeclare(CurrentReload, DDI->getVariable(), 1905 DDI->getExpression(), DDI->getDebugLoc(), 1906 &*Builder.GetInsertPoint()); 1907 } 1908 // This dbg.declare is for the main function entry point. It 1909 // will be deleted in all coro-split functions. 1910 coro::salvageDebugInfo(ArgToAllocaMap, *DDI, Shape.OptimizeFrame, 1911 false /*UseEntryValue*/); 1912 }; 1913 for_each(DIs, SalvageOne); 1914 for_each(DPVs, SalvageOne); 1915 } 1916 1917 // If we have a single edge PHINode, remove it and replace it with a 1918 // reload from the coroutine frame. (We already took care of multi edge 1919 // PHINodes by rewriting them in the rewritePHIs function). 1920 if (auto *PN = dyn_cast<PHINode>(U)) { 1921 assert(PN->getNumIncomingValues() == 1 && 1922 "unexpected number of incoming " 1923 "values in the PHINode"); 1924 PN->replaceAllUsesWith(CurrentReload); 1925 PN->eraseFromParent(); 1926 continue; 1927 } 1928 1929 // Replace all uses of CurrentValue in the current instruction with 1930 // reload. 1931 U->replaceUsesOfWith(Def, CurrentReload); 1932 // Instructions are added to Def's user list if the attached 1933 // debug records use Def. Update those now. 1934 for (auto &DPV : U->getDbgValueRange()) 1935 DPV.replaceVariableLocationOp(Def, CurrentReload, true); 1936 } 1937 } 1938 1939 BasicBlock *FramePtrBB = Shape.getInsertPtAfterFramePtr()->getParent(); 1940 1941 auto SpillBlock = FramePtrBB->splitBasicBlock( 1942 Shape.getInsertPtAfterFramePtr(), "AllocaSpillBB"); 1943 SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill"); 1944 Shape.AllocaSpillBlock = SpillBlock; 1945 1946 // retcon and retcon.once lowering assumes all uses have been sunk. 1947 if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce || 1948 Shape.ABI == coro::ABI::Async) { 1949 // If we found any allocas, replace all of their remaining uses with Geps. 1950 Builder.SetInsertPoint(SpillBlock, SpillBlock->begin()); 1951 for (const auto &P : FrameData.Allocas) { 1952 AllocaInst *Alloca = P.Alloca; 1953 auto *G = GetFramePointer(Alloca); 1954 1955 // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G)) 1956 // here, as we are changing location of the instruction. 1957 G->takeName(Alloca); 1958 Alloca->replaceAllUsesWith(G); 1959 Alloca->eraseFromParent(); 1960 } 1961 return; 1962 } 1963 1964 // If we found any alloca, replace all of their remaining uses with GEP 1965 // instructions. To remain debugbility, we replace the uses of allocas for 1966 // dbg.declares and dbg.values with the reload from the frame. 1967 // Note: We cannot replace the alloca with GEP instructions indiscriminately, 1968 // as some of the uses may not be dominated by CoroBegin. 1969 Builder.SetInsertPoint(Shape.AllocaSpillBlock, 1970 Shape.AllocaSpillBlock->begin()); 1971 SmallVector<Instruction *, 4> UsersToUpdate; 1972 for (const auto &A : FrameData.Allocas) { 1973 AllocaInst *Alloca = A.Alloca; 1974 UsersToUpdate.clear(); 1975 for (User *U : Alloca->users()) { 1976 auto *I = cast<Instruction>(U); 1977 if (DT.dominates(CB, I)) 1978 UsersToUpdate.push_back(I); 1979 } 1980 if (UsersToUpdate.empty()) 1981 continue; 1982 auto *G = GetFramePointer(Alloca); 1983 G->setName(Alloca->getName() + Twine(".reload.addr")); 1984 1985 SmallVector<DbgVariableIntrinsic *, 4> DIs; 1986 SmallVector<DPValue *> DPValues; 1987 findDbgUsers(DIs, Alloca, &DPValues); 1988 for (auto *DVI : DIs) 1989 DVI->replaceUsesOfWith(Alloca, G); 1990 for (auto *DPV : DPValues) 1991 DPV->replaceVariableLocationOp(Alloca, G); 1992 1993 for (Instruction *I : UsersToUpdate) { 1994 // It is meaningless to retain the lifetime intrinsics refer for the 1995 // member of coroutine frames and the meaningless lifetime intrinsics 1996 // are possible to block further optimizations. 1997 if (I->isLifetimeStartOrEnd()) { 1998 I->eraseFromParent(); 1999 continue; 2000 } 2001 2002 I->replaceUsesOfWith(Alloca, G); 2003 } 2004 } 2005 Builder.SetInsertPoint(&*Shape.getInsertPtAfterFramePtr()); 2006 for (const auto &A : FrameData.Allocas) { 2007 AllocaInst *Alloca = A.Alloca; 2008 if (A.MayWriteBeforeCoroBegin) { 2009 // isEscaped really means potentially modified before CoroBegin. 2010 if (Alloca->isArrayAllocation()) 2011 report_fatal_error( 2012 "Coroutines cannot handle copying of array allocas yet"); 2013 2014 auto *G = GetFramePointer(Alloca); 2015 auto *Value = Builder.CreateLoad(Alloca->getAllocatedType(), Alloca); 2016 Builder.CreateStore(Value, G); 2017 } 2018 // For each alias to Alloca created before CoroBegin but used after 2019 // CoroBegin, we recreate them after CoroBegin by appplying the offset 2020 // to the pointer in the frame. 2021 for (const auto &Alias : A.Aliases) { 2022 auto *FramePtr = GetFramePointer(Alloca); 2023 auto &Value = *Alias.second; 2024 auto ITy = IntegerType::get(C, Value.getBitWidth()); 2025 auto *AliasPtr = Builder.CreateGEP(Type::getInt8Ty(C), FramePtr, 2026 ConstantInt::get(ITy, Value)); 2027 Alias.first->replaceUsesWithIf( 2028 AliasPtr, [&](Use &U) { return DT.dominates(CB, U); }); 2029 } 2030 } 2031 2032 // PromiseAlloca is not collected in FrameData.Allocas. So we don't handle 2033 // the case that the PromiseAlloca may have writes before CoroBegin in the 2034 // above codes. And it may be problematic in edge cases. See 2035 // https://github.com/llvm/llvm-project/issues/57861 for an example. 2036 if (Shape.ABI == coro::ABI::Switch && Shape.SwitchLowering.PromiseAlloca) { 2037 AllocaInst *PA = Shape.SwitchLowering.PromiseAlloca; 2038 // If there is memory accessing to promise alloca before CoroBegin; 2039 bool HasAccessingPromiseBeforeCB = llvm::any_of(PA->uses(), [&](Use &U) { 2040 auto *Inst = dyn_cast<Instruction>(U.getUser()); 2041 if (!Inst || DT.dominates(CB, Inst)) 2042 return false; 2043 2044 if (auto *CI = dyn_cast<CallInst>(Inst)) { 2045 // It is fine if the call wouldn't write to the Promise. 2046 // This is possible for @llvm.coro.id intrinsics, which 2047 // would take the promise as the second argument as a 2048 // marker. 2049 if (CI->onlyReadsMemory() || 2050 CI->onlyReadsMemory(CI->getArgOperandNo(&U))) 2051 return false; 2052 return true; 2053 } 2054 2055 return isa<StoreInst>(Inst) || 2056 // It may take too much time to track the uses. 2057 // Be conservative about the case the use may escape. 2058 isa<GetElementPtrInst>(Inst) || 2059 // There would always be a bitcast for the promise alloca 2060 // before we enabled Opaque pointers. And now given 2061 // opaque pointers are enabled by default. This should be 2062 // fine. 2063 isa<BitCastInst>(Inst); 2064 }); 2065 if (HasAccessingPromiseBeforeCB) { 2066 Builder.SetInsertPoint(&*Shape.getInsertPtAfterFramePtr()); 2067 auto *G = GetFramePointer(PA); 2068 auto *Value = Builder.CreateLoad(PA->getAllocatedType(), PA); 2069 Builder.CreateStore(Value, G); 2070 } 2071 } 2072 } 2073 2074 // Moves the values in the PHIs in SuccBB that correspong to PredBB into a new 2075 // PHI in InsertedBB. 2076 static void movePHIValuesToInsertedBlock(BasicBlock *SuccBB, 2077 BasicBlock *InsertedBB, 2078 BasicBlock *PredBB, 2079 PHINode *UntilPHI = nullptr) { 2080 auto *PN = cast<PHINode>(&SuccBB->front()); 2081 do { 2082 int Index = PN->getBasicBlockIndex(InsertedBB); 2083 Value *V = PN->getIncomingValue(Index); 2084 PHINode *InputV = PHINode::Create( 2085 V->getType(), 1, V->getName() + Twine(".") + SuccBB->getName()); 2086 InputV->insertBefore(InsertedBB->begin()); 2087 InputV->addIncoming(V, PredBB); 2088 PN->setIncomingValue(Index, InputV); 2089 PN = dyn_cast<PHINode>(PN->getNextNode()); 2090 } while (PN != UntilPHI); 2091 } 2092 2093 // Rewrites the PHI Nodes in a cleanuppad. 2094 static void rewritePHIsForCleanupPad(BasicBlock *CleanupPadBB, 2095 CleanupPadInst *CleanupPad) { 2096 // For every incoming edge to a CleanupPad we will create a new block holding 2097 // all incoming values in single-value PHI nodes. We will then create another 2098 // block to act as a dispather (as all unwind edges for related EH blocks 2099 // must be the same). 2100 // 2101 // cleanuppad: 2102 // %2 = phi i32[%0, %catchswitch], [%1, %catch.1] 2103 // %3 = cleanuppad within none [] 2104 // 2105 // It will create: 2106 // 2107 // cleanuppad.corodispatch 2108 // %2 = phi i8[0, %catchswitch], [1, %catch.1] 2109 // %3 = cleanuppad within none [] 2110 // switch i8 % 2, label %unreachable 2111 // [i8 0, label %cleanuppad.from.catchswitch 2112 // i8 1, label %cleanuppad.from.catch.1] 2113 // cleanuppad.from.catchswitch: 2114 // %4 = phi i32 [%0, %catchswitch] 2115 // br %label cleanuppad 2116 // cleanuppad.from.catch.1: 2117 // %6 = phi i32 [%1, %catch.1] 2118 // br %label cleanuppad 2119 // cleanuppad: 2120 // %8 = phi i32 [%4, %cleanuppad.from.catchswitch], 2121 // [%6, %cleanuppad.from.catch.1] 2122 2123 // Unreachable BB, in case switching on an invalid value in the dispatcher. 2124 auto *UnreachBB = BasicBlock::Create( 2125 CleanupPadBB->getContext(), "unreachable", CleanupPadBB->getParent()); 2126 IRBuilder<> Builder(UnreachBB); 2127 Builder.CreateUnreachable(); 2128 2129 // Create a new cleanuppad which will be the dispatcher. 2130 auto *NewCleanupPadBB = 2131 BasicBlock::Create(CleanupPadBB->getContext(), 2132 CleanupPadBB->getName() + Twine(".corodispatch"), 2133 CleanupPadBB->getParent(), CleanupPadBB); 2134 Builder.SetInsertPoint(NewCleanupPadBB); 2135 auto *SwitchType = Builder.getInt8Ty(); 2136 auto *SetDispatchValuePN = 2137 Builder.CreatePHI(SwitchType, pred_size(CleanupPadBB)); 2138 CleanupPad->removeFromParent(); 2139 CleanupPad->insertAfter(SetDispatchValuePN); 2140 auto *SwitchOnDispatch = Builder.CreateSwitch(SetDispatchValuePN, UnreachBB, 2141 pred_size(CleanupPadBB)); 2142 2143 int SwitchIndex = 0; 2144 SmallVector<BasicBlock *, 8> Preds(predecessors(CleanupPadBB)); 2145 for (BasicBlock *Pred : Preds) { 2146 // Create a new cleanuppad and move the PHI values to there. 2147 auto *CaseBB = BasicBlock::Create(CleanupPadBB->getContext(), 2148 CleanupPadBB->getName() + 2149 Twine(".from.") + Pred->getName(), 2150 CleanupPadBB->getParent(), CleanupPadBB); 2151 updatePhiNodes(CleanupPadBB, Pred, CaseBB); 2152 CaseBB->setName(CleanupPadBB->getName() + Twine(".from.") + 2153 Pred->getName()); 2154 Builder.SetInsertPoint(CaseBB); 2155 Builder.CreateBr(CleanupPadBB); 2156 movePHIValuesToInsertedBlock(CleanupPadBB, CaseBB, NewCleanupPadBB); 2157 2158 // Update this Pred to the new unwind point. 2159 setUnwindEdgeTo(Pred->getTerminator(), NewCleanupPadBB); 2160 2161 // Setup the switch in the dispatcher. 2162 auto *SwitchConstant = ConstantInt::get(SwitchType, SwitchIndex); 2163 SetDispatchValuePN->addIncoming(SwitchConstant, Pred); 2164 SwitchOnDispatch->addCase(SwitchConstant, CaseBB); 2165 SwitchIndex++; 2166 } 2167 } 2168 2169 static void cleanupSinglePredPHIs(Function &F) { 2170 SmallVector<PHINode *, 32> Worklist; 2171 for (auto &BB : F) { 2172 for (auto &Phi : BB.phis()) { 2173 if (Phi.getNumIncomingValues() == 1) { 2174 Worklist.push_back(&Phi); 2175 } else 2176 break; 2177 } 2178 } 2179 while (!Worklist.empty()) { 2180 auto *Phi = Worklist.pop_back_val(); 2181 auto *OriginalValue = Phi->getIncomingValue(0); 2182 Phi->replaceAllUsesWith(OriginalValue); 2183 } 2184 } 2185 2186 static void rewritePHIs(BasicBlock &BB) { 2187 // For every incoming edge we will create a block holding all 2188 // incoming values in a single PHI nodes. 2189 // 2190 // loop: 2191 // %n.val = phi i32[%n, %entry], [%inc, %loop] 2192 // 2193 // It will create: 2194 // 2195 // loop.from.entry: 2196 // %n.loop.pre = phi i32 [%n, %entry] 2197 // br %label loop 2198 // loop.from.loop: 2199 // %inc.loop.pre = phi i32 [%inc, %loop] 2200 // br %label loop 2201 // 2202 // After this rewrite, further analysis will ignore any phi nodes with more 2203 // than one incoming edge. 2204 2205 // TODO: Simplify PHINodes in the basic block to remove duplicate 2206 // predecessors. 2207 2208 // Special case for CleanupPad: all EH blocks must have the same unwind edge 2209 // so we need to create an additional "dispatcher" block. 2210 if (auto *CleanupPad = 2211 dyn_cast_or_null<CleanupPadInst>(BB.getFirstNonPHI())) { 2212 SmallVector<BasicBlock *, 8> Preds(predecessors(&BB)); 2213 for (BasicBlock *Pred : Preds) { 2214 if (CatchSwitchInst *CS = 2215 dyn_cast<CatchSwitchInst>(Pred->getTerminator())) { 2216 // CleanupPad with a CatchSwitch predecessor: therefore this is an 2217 // unwind destination that needs to be handle specially. 2218 assert(CS->getUnwindDest() == &BB); 2219 (void)CS; 2220 rewritePHIsForCleanupPad(&BB, CleanupPad); 2221 return; 2222 } 2223 } 2224 } 2225 2226 LandingPadInst *LandingPad = nullptr; 2227 PHINode *ReplPHI = nullptr; 2228 if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) { 2229 // ehAwareSplitEdge will clone the LandingPad in all the edge blocks. 2230 // We replace the original landing pad with a PHINode that will collect the 2231 // results from all of them. 2232 ReplPHI = PHINode::Create(LandingPad->getType(), 1, ""); 2233 ReplPHI->insertBefore(LandingPad->getIterator()); 2234 ReplPHI->takeName(LandingPad); 2235 LandingPad->replaceAllUsesWith(ReplPHI); 2236 // We will erase the original landing pad at the end of this function after 2237 // ehAwareSplitEdge cloned it in the transition blocks. 2238 } 2239 2240 SmallVector<BasicBlock *, 8> Preds(predecessors(&BB)); 2241 for (BasicBlock *Pred : Preds) { 2242 auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI); 2243 IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName()); 2244 2245 // Stop the moving of values at ReplPHI, as this is either null or the PHI 2246 // that replaced the landing pad. 2247 movePHIValuesToInsertedBlock(&BB, IncomingBB, Pred, ReplPHI); 2248 } 2249 2250 if (LandingPad) { 2251 // Calls to ehAwareSplitEdge function cloned the original lading pad. 2252 // No longer need it. 2253 LandingPad->eraseFromParent(); 2254 } 2255 } 2256 2257 static void rewritePHIs(Function &F) { 2258 SmallVector<BasicBlock *, 8> WorkList; 2259 2260 for (BasicBlock &BB : F) 2261 if (auto *PN = dyn_cast<PHINode>(&BB.front())) 2262 if (PN->getNumIncomingValues() > 1) 2263 WorkList.push_back(&BB); 2264 2265 for (BasicBlock *BB : WorkList) 2266 rewritePHIs(*BB); 2267 } 2268 2269 /// Default materializable callback 2270 // Check for instructions that we can recreate on resume as opposed to spill 2271 // the result into a coroutine frame. 2272 bool coro::defaultMaterializable(Instruction &V) { 2273 return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) || 2274 isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V)); 2275 } 2276 2277 // Check for structural coroutine intrinsics that should not be spilled into 2278 // the coroutine frame. 2279 static bool isCoroutineStructureIntrinsic(Instruction &I) { 2280 return isa<CoroIdInst>(&I) || isa<CoroSaveInst>(&I) || 2281 isa<CoroSuspendInst>(&I); 2282 } 2283 2284 // For each instruction identified as materializable across the suspend point, 2285 // and its associated DAG of other rematerializable instructions, 2286 // recreate the DAG of instructions after the suspend point. 2287 static void rewriteMaterializableInstructions( 2288 const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> 2289 &AllRemats) { 2290 // This has to be done in 2 phases 2291 // Do the remats and record the required defs to be replaced in the 2292 // original use instructions 2293 // Once all the remats are complete, replace the uses in the final 2294 // instructions with the new defs 2295 typedef struct { 2296 Instruction *Use; 2297 Instruction *Def; 2298 Instruction *Remat; 2299 } ProcessNode; 2300 2301 SmallVector<ProcessNode> FinalInstructionsToProcess; 2302 2303 for (const auto &E : AllRemats) { 2304 Instruction *Use = E.first; 2305 Instruction *CurrentMaterialization = nullptr; 2306 RematGraph *RG = E.second.get(); 2307 ReversePostOrderTraversal<RematGraph *> RPOT(RG); 2308 SmallVector<Instruction *> InstructionsToProcess; 2309 2310 // If the target use is actually a suspend instruction then we have to 2311 // insert the remats into the end of the predecessor (there should only be 2312 // one). This is so that suspend blocks always have the suspend instruction 2313 // as the first instruction. 2314 auto InsertPoint = &*Use->getParent()->getFirstInsertionPt(); 2315 if (isa<AnyCoroSuspendInst>(Use)) { 2316 BasicBlock *SuspendPredecessorBlock = 2317 Use->getParent()->getSinglePredecessor(); 2318 assert(SuspendPredecessorBlock && "malformed coro suspend instruction"); 2319 InsertPoint = SuspendPredecessorBlock->getTerminator(); 2320 } 2321 2322 // Note: skip the first instruction as this is the actual use that we're 2323 // rematerializing everything for. 2324 auto I = RPOT.begin(); 2325 ++I; 2326 for (; I != RPOT.end(); ++I) { 2327 Instruction *D = (*I)->Node; 2328 CurrentMaterialization = D->clone(); 2329 CurrentMaterialization->setName(D->getName()); 2330 CurrentMaterialization->insertBefore(InsertPoint); 2331 InsertPoint = CurrentMaterialization; 2332 2333 // Replace all uses of Def in the instructions being added as part of this 2334 // rematerialization group 2335 for (auto &I : InstructionsToProcess) 2336 I->replaceUsesOfWith(D, CurrentMaterialization); 2337 2338 // Don't replace the final use at this point as this can cause problems 2339 // for other materializations. Instead, for any final use that uses a 2340 // define that's being rematerialized, record the replace values 2341 for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i) 2342 if (Use->getOperand(i) == D) // Is this operand pointing to oldval? 2343 FinalInstructionsToProcess.push_back( 2344 {Use, D, CurrentMaterialization}); 2345 2346 InstructionsToProcess.push_back(CurrentMaterialization); 2347 } 2348 } 2349 2350 // Finally, replace the uses with the defines that we've just rematerialized 2351 for (auto &R : FinalInstructionsToProcess) { 2352 if (auto *PN = dyn_cast<PHINode>(R.Use)) { 2353 assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming " 2354 "values in the PHINode"); 2355 PN->replaceAllUsesWith(R.Remat); 2356 PN->eraseFromParent(); 2357 continue; 2358 } 2359 R.Use->replaceUsesOfWith(R.Def, R.Remat); 2360 } 2361 } 2362 2363 // Splits the block at a particular instruction unless it is the first 2364 // instruction in the block with a single predecessor. 2365 static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) { 2366 auto *BB = I->getParent(); 2367 if (&BB->front() == I) { 2368 if (BB->getSinglePredecessor()) { 2369 BB->setName(Name); 2370 return BB; 2371 } 2372 } 2373 return BB->splitBasicBlock(I, Name); 2374 } 2375 2376 // Split above and below a particular instruction so that it 2377 // will be all alone by itself in a block. 2378 static void splitAround(Instruction *I, const Twine &Name) { 2379 splitBlockIfNotFirst(I, Name); 2380 splitBlockIfNotFirst(I->getNextNode(), "After" + Name); 2381 } 2382 2383 static bool isSuspendBlock(BasicBlock *BB) { 2384 return isa<AnyCoroSuspendInst>(BB->front()); 2385 } 2386 2387 typedef SmallPtrSet<BasicBlock*, 8> VisitedBlocksSet; 2388 2389 /// Does control flow starting at the given block ever reach a suspend 2390 /// instruction before reaching a block in VisitedOrFreeBBs? 2391 static bool isSuspendReachableFrom(BasicBlock *From, 2392 VisitedBlocksSet &VisitedOrFreeBBs) { 2393 // Eagerly try to add this block to the visited set. If it's already 2394 // there, stop recursing; this path doesn't reach a suspend before 2395 // either looping or reaching a freeing block. 2396 if (!VisitedOrFreeBBs.insert(From).second) 2397 return false; 2398 2399 // We assume that we'll already have split suspends into their own blocks. 2400 if (isSuspendBlock(From)) 2401 return true; 2402 2403 // Recurse on the successors. 2404 for (auto *Succ : successors(From)) { 2405 if (isSuspendReachableFrom(Succ, VisitedOrFreeBBs)) 2406 return true; 2407 } 2408 2409 return false; 2410 } 2411 2412 /// Is the given alloca "local", i.e. bounded in lifetime to not cross a 2413 /// suspend point? 2414 static bool isLocalAlloca(CoroAllocaAllocInst *AI) { 2415 // Seed the visited set with all the basic blocks containing a free 2416 // so that we won't pass them up. 2417 VisitedBlocksSet VisitedOrFreeBBs; 2418 for (auto *User : AI->users()) { 2419 if (auto FI = dyn_cast<CoroAllocaFreeInst>(User)) 2420 VisitedOrFreeBBs.insert(FI->getParent()); 2421 } 2422 2423 return !isSuspendReachableFrom(AI->getParent(), VisitedOrFreeBBs); 2424 } 2425 2426 /// After we split the coroutine, will the given basic block be along 2427 /// an obvious exit path for the resumption function? 2428 static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB, 2429 unsigned depth = 3) { 2430 // If we've bottomed out our depth count, stop searching and assume 2431 // that the path might loop back. 2432 if (depth == 0) return false; 2433 2434 // If this is a suspend block, we're about to exit the resumption function. 2435 if (isSuspendBlock(BB)) return true; 2436 2437 // Recurse into the successors. 2438 for (auto *Succ : successors(BB)) { 2439 if (!willLeaveFunctionImmediatelyAfter(Succ, depth - 1)) 2440 return false; 2441 } 2442 2443 // If none of the successors leads back in a loop, we're on an exit/abort. 2444 return true; 2445 } 2446 2447 static bool localAllocaNeedsStackSave(CoroAllocaAllocInst *AI) { 2448 // Look for a free that isn't sufficiently obviously followed by 2449 // either a suspend or a termination, i.e. something that will leave 2450 // the coro resumption frame. 2451 for (auto *U : AI->users()) { 2452 auto FI = dyn_cast<CoroAllocaFreeInst>(U); 2453 if (!FI) continue; 2454 2455 if (!willLeaveFunctionImmediatelyAfter(FI->getParent())) 2456 return true; 2457 } 2458 2459 // If we never found one, we don't need a stack save. 2460 return false; 2461 } 2462 2463 /// Turn each of the given local allocas into a normal (dynamic) alloca 2464 /// instruction. 2465 static void lowerLocalAllocas(ArrayRef<CoroAllocaAllocInst*> LocalAllocas, 2466 SmallVectorImpl<Instruction*> &DeadInsts) { 2467 for (auto *AI : LocalAllocas) { 2468 IRBuilder<> Builder(AI); 2469 2470 // Save the stack depth. Try to avoid doing this if the stackrestore 2471 // is going to immediately precede a return or something. 2472 Value *StackSave = nullptr; 2473 if (localAllocaNeedsStackSave(AI)) 2474 StackSave = Builder.CreateStackSave(); 2475 2476 // Allocate memory. 2477 auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize()); 2478 Alloca->setAlignment(AI->getAlignment()); 2479 2480 for (auto *U : AI->users()) { 2481 // Replace gets with the allocation. 2482 if (isa<CoroAllocaGetInst>(U)) { 2483 U->replaceAllUsesWith(Alloca); 2484 2485 // Replace frees with stackrestores. This is safe because 2486 // alloca.alloc is required to obey a stack discipline, although we 2487 // don't enforce that structurally. 2488 } else { 2489 auto FI = cast<CoroAllocaFreeInst>(U); 2490 if (StackSave) { 2491 Builder.SetInsertPoint(FI); 2492 Builder.CreateStackRestore(StackSave); 2493 } 2494 } 2495 DeadInsts.push_back(cast<Instruction>(U)); 2496 } 2497 2498 DeadInsts.push_back(AI); 2499 } 2500 } 2501 2502 /// Turn the given coro.alloca.alloc call into a dynamic allocation. 2503 /// This happens during the all-instructions iteration, so it must not 2504 /// delete the call. 2505 static Instruction *lowerNonLocalAlloca(CoroAllocaAllocInst *AI, 2506 coro::Shape &Shape, 2507 SmallVectorImpl<Instruction*> &DeadInsts) { 2508 IRBuilder<> Builder(AI); 2509 auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr); 2510 2511 for (User *U : AI->users()) { 2512 if (isa<CoroAllocaGetInst>(U)) { 2513 U->replaceAllUsesWith(Alloc); 2514 } else { 2515 auto FI = cast<CoroAllocaFreeInst>(U); 2516 Builder.SetInsertPoint(FI); 2517 Shape.emitDealloc(Builder, Alloc, nullptr); 2518 } 2519 DeadInsts.push_back(cast<Instruction>(U)); 2520 } 2521 2522 // Push this on last so that it gets deleted after all the others. 2523 DeadInsts.push_back(AI); 2524 2525 // Return the new allocation value so that we can check for needed spills. 2526 return cast<Instruction>(Alloc); 2527 } 2528 2529 /// Get the current swifterror value. 2530 static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy, 2531 coro::Shape &Shape) { 2532 // Make a fake function pointer as a sort of intrinsic. 2533 auto FnTy = FunctionType::get(ValueTy, {}, false); 2534 auto Fn = ConstantPointerNull::get(Builder.getPtrTy()); 2535 2536 auto Call = Builder.CreateCall(FnTy, Fn, {}); 2537 Shape.SwiftErrorOps.push_back(Call); 2538 2539 return Call; 2540 } 2541 2542 /// Set the given value as the current swifterror value. 2543 /// 2544 /// Returns a slot that can be used as a swifterror slot. 2545 static Value *emitSetSwiftErrorValue(IRBuilder<> &Builder, Value *V, 2546 coro::Shape &Shape) { 2547 // Make a fake function pointer as a sort of intrinsic. 2548 auto FnTy = FunctionType::get(Builder.getPtrTy(), 2549 {V->getType()}, false); 2550 auto Fn = ConstantPointerNull::get(Builder.getPtrTy()); 2551 2552 auto Call = Builder.CreateCall(FnTy, Fn, { V }); 2553 Shape.SwiftErrorOps.push_back(Call); 2554 2555 return Call; 2556 } 2557 2558 /// Set the swifterror value from the given alloca before a call, 2559 /// then put in back in the alloca afterwards. 2560 /// 2561 /// Returns an address that will stand in for the swifterror slot 2562 /// until splitting. 2563 static Value *emitSetAndGetSwiftErrorValueAround(Instruction *Call, 2564 AllocaInst *Alloca, 2565 coro::Shape &Shape) { 2566 auto ValueTy = Alloca->getAllocatedType(); 2567 IRBuilder<> Builder(Call); 2568 2569 // Load the current value from the alloca and set it as the 2570 // swifterror value. 2571 auto ValueBeforeCall = Builder.CreateLoad(ValueTy, Alloca); 2572 auto Addr = emitSetSwiftErrorValue(Builder, ValueBeforeCall, Shape); 2573 2574 // Move to after the call. Since swifterror only has a guaranteed 2575 // value on normal exits, we can ignore implicit and explicit unwind 2576 // edges. 2577 if (isa<CallInst>(Call)) { 2578 Builder.SetInsertPoint(Call->getNextNode()); 2579 } else { 2580 auto Invoke = cast<InvokeInst>(Call); 2581 Builder.SetInsertPoint(Invoke->getNormalDest()->getFirstNonPHIOrDbg()); 2582 } 2583 2584 // Get the current swifterror value and store it to the alloca. 2585 auto ValueAfterCall = emitGetSwiftErrorValue(Builder, ValueTy, Shape); 2586 Builder.CreateStore(ValueAfterCall, Alloca); 2587 2588 return Addr; 2589 } 2590 2591 /// Eliminate a formerly-swifterror alloca by inserting the get/set 2592 /// intrinsics and attempting to MemToReg the alloca away. 2593 static void eliminateSwiftErrorAlloca(Function &F, AllocaInst *Alloca, 2594 coro::Shape &Shape) { 2595 for (Use &Use : llvm::make_early_inc_range(Alloca->uses())) { 2596 // swifterror values can only be used in very specific ways. 2597 // We take advantage of that here. 2598 auto User = Use.getUser(); 2599 if (isa<LoadInst>(User) || isa<StoreInst>(User)) 2600 continue; 2601 2602 assert(isa<CallInst>(User) || isa<InvokeInst>(User)); 2603 auto Call = cast<Instruction>(User); 2604 2605 auto Addr = emitSetAndGetSwiftErrorValueAround(Call, Alloca, Shape); 2606 2607 // Use the returned slot address as the call argument. 2608 Use.set(Addr); 2609 } 2610 2611 // All the uses should be loads and stores now. 2612 assert(isAllocaPromotable(Alloca)); 2613 } 2614 2615 /// "Eliminate" a swifterror argument by reducing it to the alloca case 2616 /// and then loading and storing in the prologue and epilog. 2617 /// 2618 /// The argument keeps the swifterror flag. 2619 static void eliminateSwiftErrorArgument(Function &F, Argument &Arg, 2620 coro::Shape &Shape, 2621 SmallVectorImpl<AllocaInst*> &AllocasToPromote) { 2622 IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg()); 2623 2624 auto ArgTy = cast<PointerType>(Arg.getType()); 2625 auto ValueTy = PointerType::getUnqual(F.getContext()); 2626 2627 // Reduce to the alloca case: 2628 2629 // Create an alloca and replace all uses of the arg with it. 2630 auto Alloca = Builder.CreateAlloca(ValueTy, ArgTy->getAddressSpace()); 2631 Arg.replaceAllUsesWith(Alloca); 2632 2633 // Set an initial value in the alloca. swifterror is always null on entry. 2634 auto InitialValue = Constant::getNullValue(ValueTy); 2635 Builder.CreateStore(InitialValue, Alloca); 2636 2637 // Find all the suspends in the function and save and restore around them. 2638 for (auto *Suspend : Shape.CoroSuspends) { 2639 (void) emitSetAndGetSwiftErrorValueAround(Suspend, Alloca, Shape); 2640 } 2641 2642 // Find all the coro.ends in the function and restore the error value. 2643 for (auto *End : Shape.CoroEnds) { 2644 Builder.SetInsertPoint(End); 2645 auto FinalValue = Builder.CreateLoad(ValueTy, Alloca); 2646 (void) emitSetSwiftErrorValue(Builder, FinalValue, Shape); 2647 } 2648 2649 // Now we can use the alloca logic. 2650 AllocasToPromote.push_back(Alloca); 2651 eliminateSwiftErrorAlloca(F, Alloca, Shape); 2652 } 2653 2654 /// Eliminate all problematic uses of swifterror arguments and allocas 2655 /// from the function. We'll fix them up later when splitting the function. 2656 static void eliminateSwiftError(Function &F, coro::Shape &Shape) { 2657 SmallVector<AllocaInst*, 4> AllocasToPromote; 2658 2659 // Look for a swifterror argument. 2660 for (auto &Arg : F.args()) { 2661 if (!Arg.hasSwiftErrorAttr()) continue; 2662 2663 eliminateSwiftErrorArgument(F, Arg, Shape, AllocasToPromote); 2664 break; 2665 } 2666 2667 // Look for swifterror allocas. 2668 for (auto &Inst : F.getEntryBlock()) { 2669 auto Alloca = dyn_cast<AllocaInst>(&Inst); 2670 if (!Alloca || !Alloca->isSwiftError()) continue; 2671 2672 // Clear the swifterror flag. 2673 Alloca->setSwiftError(false); 2674 2675 AllocasToPromote.push_back(Alloca); 2676 eliminateSwiftErrorAlloca(F, Alloca, Shape); 2677 } 2678 2679 // If we have any allocas to promote, compute a dominator tree and 2680 // promote them en masse. 2681 if (!AllocasToPromote.empty()) { 2682 DominatorTree DT(F); 2683 PromoteMemToReg(AllocasToPromote, DT); 2684 } 2685 } 2686 2687 /// retcon and retcon.once conventions assume that all spill uses can be sunk 2688 /// after the coro.begin intrinsic. 2689 static void sinkSpillUsesAfterCoroBegin(Function &F, 2690 const FrameDataInfo &FrameData, 2691 CoroBeginInst *CoroBegin) { 2692 DominatorTree Dom(F); 2693 2694 SmallSetVector<Instruction *, 32> ToMove; 2695 SmallVector<Instruction *, 32> Worklist; 2696 2697 // Collect all users that precede coro.begin. 2698 for (auto *Def : FrameData.getAllDefs()) { 2699 for (User *U : Def->users()) { 2700 auto Inst = cast<Instruction>(U); 2701 if (Inst->getParent() != CoroBegin->getParent() || 2702 Dom.dominates(CoroBegin, Inst)) 2703 continue; 2704 if (ToMove.insert(Inst)) 2705 Worklist.push_back(Inst); 2706 } 2707 } 2708 // Recursively collect users before coro.begin. 2709 while (!Worklist.empty()) { 2710 auto *Def = Worklist.pop_back_val(); 2711 for (User *U : Def->users()) { 2712 auto Inst = cast<Instruction>(U); 2713 if (Dom.dominates(CoroBegin, Inst)) 2714 continue; 2715 if (ToMove.insert(Inst)) 2716 Worklist.push_back(Inst); 2717 } 2718 } 2719 2720 // Sort by dominance. 2721 SmallVector<Instruction *, 64> InsertionList(ToMove.begin(), ToMove.end()); 2722 llvm::sort(InsertionList, [&Dom](Instruction *A, Instruction *B) -> bool { 2723 // If a dominates b it should preceed (<) b. 2724 return Dom.dominates(A, B); 2725 }); 2726 2727 Instruction *InsertPt = CoroBegin->getNextNode(); 2728 for (Instruction *Inst : InsertionList) 2729 Inst->moveBefore(InsertPt); 2730 } 2731 2732 /// For each local variable that all of its user are only used inside one of 2733 /// suspended region, we sink their lifetime.start markers to the place where 2734 /// after the suspend block. Doing so minimizes the lifetime of each variable, 2735 /// hence minimizing the amount of data we end up putting on the frame. 2736 static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape, 2737 SuspendCrossingInfo &Checker) { 2738 if (F.hasOptNone()) 2739 return; 2740 2741 DominatorTree DT(F); 2742 2743 // Collect all possible basic blocks which may dominate all uses of allocas. 2744 SmallPtrSet<BasicBlock *, 4> DomSet; 2745 DomSet.insert(&F.getEntryBlock()); 2746 for (auto *CSI : Shape.CoroSuspends) { 2747 BasicBlock *SuspendBlock = CSI->getParent(); 2748 assert(isSuspendBlock(SuspendBlock) && SuspendBlock->getSingleSuccessor() && 2749 "should have split coro.suspend into its own block"); 2750 DomSet.insert(SuspendBlock->getSingleSuccessor()); 2751 } 2752 2753 for (Instruction &I : instructions(F)) { 2754 AllocaInst* AI = dyn_cast<AllocaInst>(&I); 2755 if (!AI) 2756 continue; 2757 2758 for (BasicBlock *DomBB : DomSet) { 2759 bool Valid = true; 2760 SmallVector<Instruction *, 1> Lifetimes; 2761 2762 auto isLifetimeStart = [](Instruction* I) { 2763 if (auto* II = dyn_cast<IntrinsicInst>(I)) 2764 return II->getIntrinsicID() == Intrinsic::lifetime_start; 2765 return false; 2766 }; 2767 2768 auto collectLifetimeStart = [&](Instruction *U, AllocaInst *AI) { 2769 if (isLifetimeStart(U)) { 2770 Lifetimes.push_back(U); 2771 return true; 2772 } 2773 if (!U->hasOneUse() || U->stripPointerCasts() != AI) 2774 return false; 2775 if (isLifetimeStart(U->user_back())) { 2776 Lifetimes.push_back(U->user_back()); 2777 return true; 2778 } 2779 return false; 2780 }; 2781 2782 for (User *U : AI->users()) { 2783 Instruction *UI = cast<Instruction>(U); 2784 // For all users except lifetime.start markers, if they are all 2785 // dominated by one of the basic blocks and do not cross 2786 // suspend points as well, then there is no need to spill the 2787 // instruction. 2788 if (!DT.dominates(DomBB, UI->getParent()) || 2789 Checker.isDefinitionAcrossSuspend(DomBB, UI)) { 2790 // Skip lifetime.start, GEP and bitcast used by lifetime.start 2791 // markers. 2792 if (collectLifetimeStart(UI, AI)) 2793 continue; 2794 Valid = false; 2795 break; 2796 } 2797 } 2798 // Sink lifetime.start markers to dominate block when they are 2799 // only used outside the region. 2800 if (Valid && Lifetimes.size() != 0) { 2801 auto *NewLifetime = Lifetimes[0]->clone(); 2802 NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(1), AI); 2803 NewLifetime->insertBefore(DomBB->getTerminator()); 2804 2805 // All the outsided lifetime.start markers are no longer necessary. 2806 for (Instruction *S : Lifetimes) 2807 S->eraseFromParent(); 2808 2809 break; 2810 } 2811 } 2812 } 2813 } 2814 2815 static void collectFrameAlloca(AllocaInst *AI, coro::Shape &Shape, 2816 const SuspendCrossingInfo &Checker, 2817 SmallVectorImpl<AllocaInfo> &Allocas, 2818 const DominatorTree &DT) { 2819 if (Shape.CoroSuspends.empty()) 2820 return; 2821 2822 // The PromiseAlloca will be specially handled since it needs to be in a 2823 // fixed position in the frame. 2824 if (AI == Shape.SwitchLowering.PromiseAlloca) 2825 return; 2826 2827 // The __coro_gro alloca should outlive the promise, make sure we 2828 // keep it outside the frame. 2829 if (AI->hasMetadata(LLVMContext::MD_coro_outside_frame)) 2830 return; 2831 2832 // The code that uses lifetime.start intrinsic does not work for functions 2833 // with loops without exit. Disable it on ABIs we know to generate such 2834 // code. 2835 bool ShouldUseLifetimeStartInfo = 2836 (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon && 2837 Shape.ABI != coro::ABI::RetconOnce); 2838 AllocaUseVisitor Visitor{AI->getModule()->getDataLayout(), DT, 2839 *Shape.CoroBegin, Checker, 2840 ShouldUseLifetimeStartInfo}; 2841 Visitor.visitPtr(*AI); 2842 if (!Visitor.getShouldLiveOnFrame()) 2843 return; 2844 Allocas.emplace_back(AI, Visitor.getAliasesCopy(), 2845 Visitor.getMayWriteBeforeCoroBegin()); 2846 } 2847 2848 static std::optional<std::pair<Value &, DIExpression &>> 2849 salvageDebugInfoImpl(SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, 2850 bool OptimizeFrame, bool UseEntryValue, Function *F, 2851 Value *Storage, DIExpression *Expr, 2852 bool SkipOutermostLoad) { 2853 IRBuilder<> Builder(F->getContext()); 2854 auto InsertPt = F->getEntryBlock().getFirstInsertionPt(); 2855 while (isa<IntrinsicInst>(InsertPt)) 2856 ++InsertPt; 2857 Builder.SetInsertPoint(&F->getEntryBlock(), InsertPt); 2858 2859 while (auto *Inst = dyn_cast_or_null<Instruction>(Storage)) { 2860 if (auto *LdInst = dyn_cast<LoadInst>(Inst)) { 2861 Storage = LdInst->getPointerOperand(); 2862 // FIXME: This is a heuristic that works around the fact that 2863 // LLVM IR debug intrinsics cannot yet distinguish between 2864 // memory and value locations: Because a dbg.declare(alloca) is 2865 // implicitly a memory location no DW_OP_deref operation for the 2866 // last direct load from an alloca is necessary. This condition 2867 // effectively drops the *last* DW_OP_deref in the expression. 2868 if (!SkipOutermostLoad) 2869 Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore); 2870 } else if (auto *StInst = dyn_cast<StoreInst>(Inst)) { 2871 Storage = StInst->getValueOperand(); 2872 } else { 2873 SmallVector<uint64_t, 16> Ops; 2874 SmallVector<Value *, 0> AdditionalValues; 2875 Value *Op = llvm::salvageDebugInfoImpl( 2876 *Inst, Expr ? Expr->getNumLocationOperands() : 0, Ops, 2877 AdditionalValues); 2878 if (!Op || !AdditionalValues.empty()) { 2879 // If salvaging failed or salvaging produced more than one location 2880 // operand, give up. 2881 break; 2882 } 2883 Storage = Op; 2884 Expr = DIExpression::appendOpsToArg(Expr, Ops, 0, /*StackValue*/ false); 2885 } 2886 SkipOutermostLoad = false; 2887 } 2888 if (!Storage) 2889 return std::nullopt; 2890 2891 auto *StorageAsArg = dyn_cast<Argument>(Storage); 2892 const bool IsSwiftAsyncArg = 2893 StorageAsArg && StorageAsArg->hasAttribute(Attribute::SwiftAsync); 2894 2895 // Swift async arguments are described by an entry value of the ABI-defined 2896 // register containing the coroutine context. 2897 // Entry values in variadic expressions are not supported. 2898 if (IsSwiftAsyncArg && UseEntryValue && !Expr->isEntryValue() && 2899 Expr->isSingleLocationExpression()) 2900 Expr = DIExpression::prepend(Expr, DIExpression::EntryValue); 2901 2902 // If the coroutine frame is an Argument, store it in an alloca to improve 2903 // its availability (e.g. registers may be clobbered). 2904 // Avoid this if optimizations are enabled (they would remove the alloca) or 2905 // if the value is guaranteed to be available through other means (e.g. swift 2906 // ABI guarantees). 2907 if (StorageAsArg && !OptimizeFrame && !IsSwiftAsyncArg) { 2908 auto &Cached = ArgToAllocaMap[StorageAsArg]; 2909 if (!Cached) { 2910 Cached = Builder.CreateAlloca(Storage->getType(), 0, nullptr, 2911 Storage->getName() + ".debug"); 2912 Builder.CreateStore(Storage, Cached); 2913 } 2914 Storage = Cached; 2915 // FIXME: LLVM lacks nuanced semantics to differentiate between 2916 // memory and direct locations at the IR level. The backend will 2917 // turn a dbg.declare(alloca, ..., DIExpression()) into a memory 2918 // location. Thus, if there are deref and offset operations in the 2919 // expression, we need to add a DW_OP_deref at the *start* of the 2920 // expression to first load the contents of the alloca before 2921 // adjusting it with the expression. 2922 Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore); 2923 } 2924 2925 return {{*Storage, *Expr}}; 2926 } 2927 2928 void coro::salvageDebugInfo( 2929 SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, 2930 DbgVariableIntrinsic &DVI, bool OptimizeFrame, bool UseEntryValue) { 2931 2932 Function *F = DVI.getFunction(); 2933 // Follow the pointer arithmetic all the way to the incoming 2934 // function argument and convert into a DIExpression. 2935 bool SkipOutermostLoad = !isa<DbgValueInst>(DVI); 2936 Value *OriginalStorage = DVI.getVariableLocationOp(0); 2937 2938 auto SalvagedInfo = ::salvageDebugInfoImpl( 2939 ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage, 2940 DVI.getExpression(), SkipOutermostLoad); 2941 if (!SalvagedInfo) 2942 return; 2943 2944 Value *Storage = &SalvagedInfo->first; 2945 DIExpression *Expr = &SalvagedInfo->second; 2946 2947 DVI.replaceVariableLocationOp(OriginalStorage, Storage); 2948 DVI.setExpression(Expr); 2949 // We only hoist dbg.declare today since it doesn't make sense to hoist 2950 // dbg.value since it does not have the same function wide guarantees that 2951 // dbg.declare does. 2952 if (isa<DbgDeclareInst>(DVI)) { 2953 std::optional<BasicBlock::iterator> InsertPt; 2954 if (auto *I = dyn_cast<Instruction>(Storage)) 2955 InsertPt = I->getInsertionPointAfterDef(); 2956 else if (isa<Argument>(Storage)) 2957 InsertPt = F->getEntryBlock().begin(); 2958 if (InsertPt) 2959 DVI.moveBefore(*(*InsertPt)->getParent(), *InsertPt); 2960 } 2961 } 2962 2963 void coro::salvageDebugInfo( 2964 SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, DPValue &DPV, 2965 bool OptimizeFrame, bool UseEntryValue) { 2966 2967 Function *F = DPV.getFunction(); 2968 // Follow the pointer arithmetic all the way to the incoming 2969 // function argument and convert into a DIExpression. 2970 bool SkipOutermostLoad = DPV.getType() == DPValue::LocationType::Declare; 2971 Value *OriginalStorage = DPV.getVariableLocationOp(0); 2972 2973 auto SalvagedInfo = ::salvageDebugInfoImpl( 2974 ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage, 2975 DPV.getExpression(), SkipOutermostLoad); 2976 if (!SalvagedInfo) 2977 return; 2978 2979 Value *Storage = &SalvagedInfo->first; 2980 DIExpression *Expr = &SalvagedInfo->second; 2981 2982 DPV.replaceVariableLocationOp(OriginalStorage, Storage); 2983 DPV.setExpression(Expr); 2984 // We only hoist dbg.declare today since it doesn't make sense to hoist 2985 // dbg.value since it does not have the same function wide guarantees that 2986 // dbg.declare does. 2987 if (DPV.getType() == DPValue::LocationType::Declare) { 2988 std::optional<BasicBlock::iterator> InsertPt; 2989 if (auto *I = dyn_cast<Instruction>(Storage)) 2990 InsertPt = I->getInsertionPointAfterDef(); 2991 else if (isa<Argument>(Storage)) 2992 InsertPt = F->getEntryBlock().begin(); 2993 if (InsertPt) { 2994 DPV.removeFromParent(); 2995 (*InsertPt)->getParent()->insertDPValueBefore(&DPV, *InsertPt); 2996 } 2997 } 2998 } 2999 3000 static void doRematerializations( 3001 Function &F, SuspendCrossingInfo &Checker, 3002 const std::function<bool(Instruction &)> &MaterializableCallback) { 3003 if (F.hasOptNone()) 3004 return; 3005 3006 SpillInfo Spills; 3007 3008 // See if there are materializable instructions across suspend points 3009 // We record these as the starting point to also identify materializable 3010 // defs of uses in these operations 3011 for (Instruction &I : instructions(F)) { 3012 if (!MaterializableCallback(I)) 3013 continue; 3014 for (User *U : I.users()) 3015 if (Checker.isDefinitionAcrossSuspend(I, U)) 3016 Spills[&I].push_back(cast<Instruction>(U)); 3017 } 3018 3019 // Process each of the identified rematerializable instructions 3020 // and add predecessor instructions that can also be rematerialized. 3021 // This is actually a graph of instructions since we could potentially 3022 // have multiple uses of a def in the set of predecessor instructions. 3023 // The approach here is to maintain a graph of instructions for each bottom 3024 // level instruction - where we have a unique set of instructions (nodes) 3025 // and edges between them. We then walk the graph in reverse post-dominator 3026 // order to insert them past the suspend point, but ensure that ordering is 3027 // correct. We also rely on CSE removing duplicate defs for remats of 3028 // different instructions with a def in common (rather than maintaining more 3029 // complex graphs for each suspend point) 3030 3031 // We can do this by adding new nodes to the list for each suspend 3032 // point. Then using standard GraphTraits to give a reverse post-order 3033 // traversal when we insert the nodes after the suspend 3034 SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats; 3035 for (auto &E : Spills) { 3036 for (Instruction *U : E.second) { 3037 // Don't process a user twice (this can happen if the instruction uses 3038 // more than one rematerializable def) 3039 if (AllRemats.count(U)) 3040 continue; 3041 3042 // Constructor creates the whole RematGraph for the given Use 3043 auto RematUPtr = 3044 std::make_unique<RematGraph>(MaterializableCallback, U, Checker); 3045 3046 LLVM_DEBUG(dbgs() << "***** Next remat group *****\n"; 3047 ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get()); 3048 for (auto I = RPOT.begin(); I != RPOT.end(); 3049 ++I) { (*I)->Node->dump(); } dbgs() 3050 << "\n";); 3051 3052 AllRemats[U] = std::move(RematUPtr); 3053 } 3054 } 3055 3056 // Rewrite materializable instructions to be materialized at the use 3057 // point. 3058 LLVM_DEBUG(dumpRemats("Materializations", AllRemats)); 3059 rewriteMaterializableInstructions(AllRemats); 3060 } 3061 3062 void coro::buildCoroutineFrame( 3063 Function &F, Shape &Shape, 3064 const std::function<bool(Instruction &)> &MaterializableCallback) { 3065 // Don't eliminate swifterror in async functions that won't be split. 3066 if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty()) 3067 eliminateSwiftError(F, Shape); 3068 3069 if (Shape.ABI == coro::ABI::Switch && 3070 Shape.SwitchLowering.PromiseAlloca) { 3071 Shape.getSwitchCoroId()->clearPromise(); 3072 } 3073 3074 // Make sure that all coro.save, coro.suspend and the fallthrough coro.end 3075 // intrinsics are in their own blocks to simplify the logic of building up 3076 // SuspendCrossing data. 3077 for (auto *CSI : Shape.CoroSuspends) { 3078 if (auto *Save = CSI->getCoroSave()) 3079 splitAround(Save, "CoroSave"); 3080 splitAround(CSI, "CoroSuspend"); 3081 } 3082 3083 // Put CoroEnds into their own blocks. 3084 for (AnyCoroEndInst *CE : Shape.CoroEnds) { 3085 splitAround(CE, "CoroEnd"); 3086 3087 // Emit the musttail call function in a new block before the CoroEnd. 3088 // We do this here so that the right suspend crossing info is computed for 3089 // the uses of the musttail call function call. (Arguments to the coro.end 3090 // instructions would be ignored) 3091 if (auto *AsyncEnd = dyn_cast<CoroAsyncEndInst>(CE)) { 3092 auto *MustTailCallFn = AsyncEnd->getMustTailCallFunction(); 3093 if (!MustTailCallFn) 3094 continue; 3095 IRBuilder<> Builder(AsyncEnd); 3096 SmallVector<Value *, 8> Args(AsyncEnd->args()); 3097 auto Arguments = ArrayRef<Value *>(Args).drop_front(3); 3098 auto *Call = createMustTailCall(AsyncEnd->getDebugLoc(), MustTailCallFn, 3099 Arguments, Builder); 3100 splitAround(Call, "MustTailCall.Before.CoroEnd"); 3101 } 3102 } 3103 3104 // Later code makes structural assumptions about single predecessors phis e.g 3105 // that they are not live across a suspend point. 3106 cleanupSinglePredPHIs(F); 3107 3108 // Transforms multi-edge PHI Nodes, so that any value feeding into a PHI will 3109 // never has its definition separated from the PHI by the suspend point. 3110 rewritePHIs(F); 3111 3112 // Build suspend crossing info. 3113 SuspendCrossingInfo Checker(F, Shape); 3114 3115 doRematerializations(F, Checker, MaterializableCallback); 3116 3117 FrameDataInfo FrameData; 3118 SmallVector<CoroAllocaAllocInst*, 4> LocalAllocas; 3119 SmallVector<Instruction*, 4> DeadInstructions; 3120 if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon && 3121 Shape.ABI != coro::ABI::RetconOnce) 3122 sinkLifetimeStartMarkers(F, Shape, Checker); 3123 3124 // Collect the spills for arguments and other not-materializable values. 3125 for (Argument &A : F.args()) 3126 for (User *U : A.users()) 3127 if (Checker.isDefinitionAcrossSuspend(A, U)) 3128 FrameData.Spills[&A].push_back(cast<Instruction>(U)); 3129 3130 const DominatorTree DT(F); 3131 for (Instruction &I : instructions(F)) { 3132 // Values returned from coroutine structure intrinsics should not be part 3133 // of the Coroutine Frame. 3134 if (isCoroutineStructureIntrinsic(I) || &I == Shape.CoroBegin) 3135 continue; 3136 3137 // Handle alloca.alloc specially here. 3138 if (auto AI = dyn_cast<CoroAllocaAllocInst>(&I)) { 3139 // Check whether the alloca's lifetime is bounded by suspend points. 3140 if (isLocalAlloca(AI)) { 3141 LocalAllocas.push_back(AI); 3142 continue; 3143 } 3144 3145 // If not, do a quick rewrite of the alloca and then add spills of 3146 // the rewritten value. The rewrite doesn't invalidate anything in 3147 // Spills because the other alloca intrinsics have no other operands 3148 // besides AI, and it doesn't invalidate the iteration because we delay 3149 // erasing AI. 3150 auto Alloc = lowerNonLocalAlloca(AI, Shape, DeadInstructions); 3151 3152 for (User *U : Alloc->users()) { 3153 if (Checker.isDefinitionAcrossSuspend(*Alloc, U)) 3154 FrameData.Spills[Alloc].push_back(cast<Instruction>(U)); 3155 } 3156 continue; 3157 } 3158 3159 // Ignore alloca.get; we process this as part of coro.alloca.alloc. 3160 if (isa<CoroAllocaGetInst>(I)) 3161 continue; 3162 3163 if (auto *AI = dyn_cast<AllocaInst>(&I)) { 3164 collectFrameAlloca(AI, Shape, Checker, FrameData.Allocas, DT); 3165 continue; 3166 } 3167 3168 for (User *U : I.users()) 3169 if (Checker.isDefinitionAcrossSuspend(I, U)) { 3170 // We cannot spill a token. 3171 if (I.getType()->isTokenTy()) 3172 report_fatal_error( 3173 "token definition is separated from the use by a suspend point"); 3174 FrameData.Spills[&I].push_back(cast<Instruction>(U)); 3175 } 3176 } 3177 3178 LLVM_DEBUG(dumpAllocas(FrameData.Allocas)); 3179 3180 // We don't want the layout of coroutine frame to be affected 3181 // by debug information. So we only choose to salvage DbgValueInst for 3182 // whose value is already in the frame. 3183 // We would handle the dbg.values for allocas specially 3184 for (auto &Iter : FrameData.Spills) { 3185 auto *V = Iter.first; 3186 SmallVector<DbgValueInst *, 16> DVIs; 3187 SmallVector<DPValue *, 16> DPVs; 3188 findDbgValues(DVIs, V, &DPVs); 3189 for (DbgValueInst *DVI : DVIs) 3190 if (Checker.isDefinitionAcrossSuspend(*V, DVI)) 3191 FrameData.Spills[V].push_back(DVI); 3192 // Add the instructions which carry debug info that is in the frame. 3193 for (DPValue *DPV : DPVs) 3194 if (Checker.isDefinitionAcrossSuspend(*V, DPV->Marker->MarkedInstr)) 3195 FrameData.Spills[V].push_back(DPV->Marker->MarkedInstr); 3196 } 3197 3198 LLVM_DEBUG(dumpSpills("Spills", FrameData.Spills)); 3199 if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce || 3200 Shape.ABI == coro::ABI::Async) 3201 sinkSpillUsesAfterCoroBegin(F, FrameData, Shape.CoroBegin); 3202 Shape.FrameTy = buildFrameType(F, Shape, FrameData); 3203 Shape.FramePtr = Shape.CoroBegin; 3204 // For now, this works for C++ programs only. 3205 buildFrameDebugInfo(F, Shape, FrameData); 3206 insertSpills(FrameData, Shape); 3207 lowerLocalAllocas(LocalAllocas, DeadInstructions); 3208 3209 for (auto *I : DeadInstructions) 3210 I->eraseFromParent(); 3211 } 3212