/*========================== begin_copyright_notice ============================

Copyright (C) 2018-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

/*========================== begin_copyright_notice ============================

This file is distributed under the University of Illinois Open Source License.
See LICENSE.TXT for details.

============================= end_copyright_notice ===========================*/

// This file implements the visit functions for load, store and alloca.

#include "common/LLVMWarningsPush.hpp"
#include "InstCombineInternal.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "Probe/Assertion.h"

using namespace llvm;
using namespace PatternMatch;
using namespace IGCombiner;

#define DEBUG_TYPE "instcombine"

STATISTIC(NumDeadStore, "Number of dead stores eliminated");
STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");

/// pointsToConstantGlobal - Return true if V (possibly indirectly) points to
/// some part of a constant global variable.  This intentionally only accepts
/// constant expressions because we can't rewrite arbitrary instructions.
static bool pointsToConstantGlobal(Value* V) {
    if (GlobalVariable * GV = dyn_cast<GlobalVariable>(V))
        return GV->isConstant();

    if (ConstantExpr * CE = dyn_cast<ConstantExpr>(V)) {
        if (CE->getOpcode() == Instruction::BitCast ||
            CE->getOpcode() == Instruction::AddrSpaceCast ||
            CE->getOpcode() == Instruction::GetElementPtr)
            return pointsToConstantGlobal(CE->getOperand(0));
    }
    return false;
}

/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
/// pointer to an alloca.  Ignore any reads of the pointer, return false if we
/// see any stores or other unknown uses.  If we see pointer arithmetic, keep
/// track of whether it moves the pointer (with IsOffset) but otherwise traverse
/// the uses.  If we see a memcpy/memmove that targets an unoffseted pointer to
/// the alloca, and if the source pointer is a pointer to a constant global, we
/// can optimize this.
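///
/// Illustrative IR sketch of the pattern this recognizes (the names are
/// hypothetical, not taken from any particular test case):
///
///     @cst = private unnamed_addr constant [4 x i32] [i32 1, i32 2, i32 3, i32 4]
///     ...
///     %a = alloca [4 x i32]
///     %p = bitcast [4 x i32]* %a to i8*
///     call void @llvm.memcpy.p0i8.p0i8.i64(i8* %p,
///         i8* bitcast ([4 x i32]* @cst to i8*), i64 16, i1 false)
///     ; only loads of %a follow
///
/// Here TheCopy is set to the memcpy, and the caller may rewrite users of %a
/// to read directly from @cst.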
static bool
isOnlyCopiedFromConstantGlobal(Value* V, MemTransferInst*& TheCopy,
    SmallVectorImpl<Instruction*>& ToDelete) {
    // We track lifetime intrinsics as we encounter them.  If we decide to go
    // ahead and replace the value with the global, this lets the caller quickly
    // eliminate the markers.

    SmallVector<std::pair<Value*, bool>, 35> ValuesToInspect;
    ValuesToInspect.emplace_back(V, false);
    while (!ValuesToInspect.empty()) {
        auto ValuePair = ValuesToInspect.pop_back_val();
        const bool IsOffset = ValuePair.second;
        for (auto& U : ValuePair.first->uses()) {
            auto* I = cast<Instruction>(U.getUser());

            if (auto * LI = dyn_cast<LoadInst>(I)) {
                // Ignore non-volatile loads, they are always ok.
                if (!LI->isSimple()) return false;
                continue;
            }

            if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
                // If uses of the bitcast are ok, we are ok.
                ValuesToInspect.emplace_back(I, IsOffset);
                continue;
            }
            if (auto * GEP = dyn_cast<GetElementPtrInst>(I)) {
                // If the GEP has all zero indices, it doesn't offset the pointer. If it
                // doesn't, it does.
                ValuesToInspect.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices());
                continue;
            }

            if (auto CS = CallSite(I)) {
                // If this is the function being called then we treat it like a load and
                // ignore it.
                if (CS.isCallee(&U))
                    continue;

                unsigned DataOpNo = CS.getDataOperandNo(&U);
                bool IsArgOperand = CS.isArgOperand(&U);

                // Inalloca arguments are clobbered by the call.
                if (IsArgOperand && CS.isInAllocaArgument(DataOpNo))
                    return false;

                // If this is a readonly/readnone call site, then we know it is just a
                // load (but one that potentially returns the value itself), so we can
                // ignore it if we know that the value isn't captured.
                if (CS.onlyReadsMemory() &&
                    (CS.getInstruction()->use_empty() || CS.doesNotCapture(DataOpNo)))
                    continue;

                // If this is being passed as a byval argument, the caller is making a
                // copy, so it is only a read of the alloca.
                if (IsArgOperand && CS.isByValArgument(DataOpNo))
                    continue;
            }

            // Lifetime intrinsics can be handled by the caller.
            if (IntrinsicInst * II = dyn_cast<IntrinsicInst>(I)) {
                if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
                    II->getIntrinsicID() == Intrinsic::lifetime_end) {
                    IGC_ASSERT_MESSAGE(II->use_empty(), "Lifetime markers have no result to use!");
                    ToDelete.push_back(II);
                    continue;
                }
            }
            // If this isn't our memcpy/memmove, reject it as something we can't
            // handle.
            MemTransferInst* MI = dyn_cast<MemTransferInst>(I);
            if (!MI)
                return false;

            // If the transfer is using the alloca as a source of the transfer, then
            // ignore it since it is a load (unless the transfer is volatile).
            if (U.getOperandNo() == 1) {
                if (MI->isVolatile()) return false;
                continue;
            }

            // If we already have seen a copy, reject the second one.
            if (TheCopy) return false;

            // If the pointer has been offset from the start of the alloca, we can't
            // safely handle this.
            if (IsOffset) return false;

            // If the memintrinsic isn't using the alloca as the dest, reject it.
            if (U.getOperandNo() != 0) return false;

            // If the source of the memcpy/move is not a constant global, reject it.
            if (!pointsToConstantGlobal(MI->getSource()))
                return false;

            // Otherwise, the transform is safe.  Remember the copy instruction.
            TheCopy = MI;
        }
    }
    return true;
}

/// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only
/// modified by a copy from a constant global.  If we can prove this, we can
/// replace any uses of the alloca with uses of the global directly.
static MemTransferInst*
isOnlyCopiedFromConstantGlobal(AllocaInst* AI,
    SmallVectorImpl<Instruction*>& ToDelete) {
    MemTransferInst* TheCopy = nullptr;
    if (isOnlyCopiedFromConstantGlobal(AI, TheCopy, ToDelete))
        return TheCopy;
    return nullptr;
}

/// Returns true if V is dereferenceable for size of alloca.
static bool isDereferenceableForAllocaSize(const Value* V, const AllocaInst* AI,
    const DataLayout& DL) {
    if (AI->isArrayAllocation())
        return false;
    uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
    if (!AllocaSize)
        return false;
    return isDereferenceableAndAlignedPointer(V, AI->getAlignment(),
        APInt(64, AllocaSize), DL);
}

static Instruction* simplifyAllocaArraySize(InstCombiner& IC, AllocaInst& AI) {
    // Check for array size of 1 (scalar allocation).
    if (!AI.isArrayAllocation()) {
        // i32 1 is the canonical array size for scalar allocations.
        if (AI.getArraySize()->getType()->isIntegerTy(32))
            return nullptr;

        // Canonicalize it.
        Value* V = IC.Builder.getInt32(1);
        AI.setOperand(0, V);
        return &AI;
    }

    // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
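    //
    // For example (illustrative):
    //   %a = alloca i32, i32 4, align 4
    // becomes
    //   %a1    = alloca [4 x i32], align 4
    //   %a.sub = getelementptr inbounds [4 x i32], [4 x i32]* %a1, i64 0, i64 0
    // and all uses of %a are replaced with %a.sub.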
    if (const ConstantInt * C = dyn_cast<ConstantInt>(AI.getArraySize())) {
        Type* NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
        AllocaInst* New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName());
        New->setAlignment(AI.getAlignment());

        // Scan to the end of the allocation instructions, to skip over a block of
        // allocas if possible...also skip interleaved debug info
        //
        BasicBlock::iterator It(New);
        while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
            ++It;

        // Now that It points to the first non-allocation instruction in the block,
        // insert our getelementptr instruction...
        //
        Type* IdxTy = IC.getDataLayout().getIntPtrType(AI.getType());
        Value* NullIdx = Constant::getNullValue(IdxTy);
        Value* Idx[2] = { NullIdx, NullIdx };
        Instruction* GEP =
            GetElementPtrInst::CreateInBounds(New, Idx, New->getName() + ".sub");
        IC.InsertNewInstBefore(GEP, *It);

        // Now make everything use the getelementptr instead of the original
        // allocation.
        return IC.replaceInstUsesWith(AI, GEP);
    }

    if (isa<UndefValue>(AI.getArraySize()))
        return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));

    // Ensure that the alloca array size argument has type intptr_t, so that
    // any casting is exposed early.
    Type* IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType());
    if (AI.getArraySize()->getType() != IntPtrTy) {
        Value* V = IC.Builder.CreateIntCast(AI.getArraySize(), IntPtrTy, false);
        AI.setOperand(0, V);
        return &AI;
    }

    return nullptr;
}

namespace {
    // If I and V are pointers in different address spaces, it is not allowed to
    // use replaceAllUsesWith since I and V have different types. A
    // non-target-specific transformation should not use addrspacecast on V since
    // the two address spaces may be disjoint depending on the target.
    //
    // This class chases down uses of the old pointer until reaching the load
    // instructions, then replaces the old pointer in the load instructions with
    // the new pointer. If during the chasing it sees a bitcast or GEP, it will
    // create a new bitcast or GEP with the new pointer and use them in the load
    // instruction.
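    //
    // For example (an illustrative sketch; addrspace(2) for the constant global
    // is an assumption), when replacing %a by @g:
    //   %a = alloca i32                              ; old pointer
    //   %p = bitcast i32* %a to float*
    //   %v = load float, float* %p
    // the load is rewritten to go through a new chain built on the new pointer:
    //   %p2 = bitcast i32 addrspace(2)* @g to float addrspace(2)*
    //   %v  = load float, float addrspace(2)* %p2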
    class PointerReplacer {
    public:
        PointerReplacer(InstCombiner& IC) : IC(IC) {}
        void replacePointer(Instruction& I, Value* V);

    private:
        void findLoadAndReplace(Instruction& I);
        void replace(Instruction* I);
        Value* getReplacement(Value* I);

        SmallVector<Instruction*, 4> Path;
        MapVector<Value*, Value*> WorkMap;
        InstCombiner& IC;
    };
} // end anonymous namespace

void PointerReplacer::findLoadAndReplace(Instruction& I) {
    for (auto U : I.users()) {
        auto* Inst = dyn_cast<Instruction>(&*U);
        if (!Inst)
            return;
        LLVM_DEBUG(dbgs() << "Found pointer user: " << *U << '\n');
        if (isa<LoadInst>(Inst)) {
            for (auto P : Path)
                replace(P);
            replace(Inst);
        }
        else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) {
            Path.push_back(Inst);
            findLoadAndReplace(*Inst);
            Path.pop_back();
        }
        else {
            return;
        }
    }
}

Value* PointerReplacer::getReplacement(Value* V) {
    auto Loc = WorkMap.find(V);
    if (Loc != WorkMap.end())
        return Loc->second;
    return nullptr;
}

void PointerReplacer::replace(Instruction* I) {
    if (getReplacement(I))
        return;

    if (auto * LT = dyn_cast<LoadInst>(I)) {
        auto* V = getReplacement(LT->getPointerOperand());
        IGC_ASSERT_MESSAGE(V, "Operand not replaced");
        auto* NewI = new LoadInst(V);
        NewI->takeName(LT);
        IC.InsertNewInstWith(NewI, *LT);
        IC.replaceInstUsesWith(*LT, NewI);
        WorkMap[LT] = NewI;
    }
    else if (auto * GEP = dyn_cast<GetElementPtrInst>(I)) {
        auto* V = getReplacement(GEP->getPointerOperand());
        IGC_ASSERT_MESSAGE(V, "Operand not replaced");
        SmallVector<Value*, 8> Indices;
        Indices.append(GEP->idx_begin(), GEP->idx_end());
        auto* NewI = GetElementPtrInst::Create(
            V->getType()->getPointerElementType(), V, Indices);
        IC.InsertNewInstWith(NewI, *GEP);
        NewI->takeName(GEP);
        WorkMap[GEP] = NewI;
    }
    else if (auto * BC = dyn_cast<BitCastInst>(I)) {
        auto* V = getReplacement(BC->getOperand(0));
        IGC_ASSERT_MESSAGE(V, "Operand not replaced");
        auto* NewT = PointerType::get(BC->getType()->getPointerElementType(),
            V->getType()->getPointerAddressSpace());
        auto* NewI = new BitCastInst(V, NewT);
        IC.InsertNewInstWith(NewI, *BC);
        NewI->takeName(BC);
        WorkMap[BC] = NewI;
    }
    else {
        IGC_ASSERT_EXIT_MESSAGE(0, "should never reach here");
    }
}

void PointerReplacer::replacePointer(Instruction& I, Value* V) {
    IGC_ASSERT(nullptr != V);
    IGC_ASSERT(nullptr != cast<PointerType>(I.getType()));
    IGC_ASSERT(nullptr != cast<PointerType>(V->getType()));
    IGC_ASSERT(cast<PointerType>(I.getType()) != cast<PointerType>(V->getType()));
    IGC_ASSERT(cast<PointerType>(I.getType())->getElementType() == cast<PointerType>(V->getType())->getElementType());
    WorkMap[&I] = V;
    findLoadAndReplace(I);
}

Instruction* InstCombiner::visitAllocaInst(AllocaInst& AI) {
    if (auto * I = simplifyAllocaArraySize(*this, AI))
        return I;

    if (AI.getAllocatedType()->isSized()) {
        // If the alignment is 0 (unspecified), assign it the preferred alignment.
        if (AI.getAlignment() == 0)
            AI.setAlignment(DL.getPrefTypeAlignment(AI.getAllocatedType()));

        // Move all alloca's of zero byte objects to the entry block and merge them
        // together.  Note that we only do this for alloca's, because malloc should
        // allocate and return a unique pointer, even for a zero byte allocation.
        if (DL.getTypeAllocSize(AI.getAllocatedType()) == 0) {
            // For a zero sized alloca there is no point in doing an array allocation.
            // This is helpful if the array size is a complicated expression not used
            // elsewhere.
            if (AI.isArrayAllocation()) {
                AI.setOperand(0, ConstantInt::get(AI.getArraySize()->getType(), 1));
                return &AI;
            }

            // Get the first instruction in the entry block.
            BasicBlock& EntryBlock = AI.getParent()->getParent()->getEntryBlock();
            Instruction* FirstInst = EntryBlock.getFirstNonPHIOrDbg();
            if (FirstInst != &AI) {
                // If the entry block doesn't start with a zero-size alloca then move
                // this one to the start of the entry block.  There is no problem with
                // dominance as the array size was forced to a constant earlier already.
                AllocaInst* EntryAI = dyn_cast<AllocaInst>(FirstInst);
                if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
                    DL.getTypeAllocSize(EntryAI->getAllocatedType()) != 0) {
                    AI.moveBefore(FirstInst);
                    return &AI;
                }

                // If the alignment of the entry block alloca is 0 (unspecified),
                // assign it the preferred alignment.
                if (EntryAI->getAlignment() == 0)
                    EntryAI->setAlignment(
                        DL.getPrefTypeAlignment(EntryAI->getAllocatedType()));
                // Replace this zero-sized alloca with the one at the start of the entry
                // block after ensuring that the address will be aligned enough for both
                // types.
                unsigned MaxAlign = std::max(EntryAI->getAlignment(),
                    AI.getAlignment());
                EntryAI->setAlignment(MaxAlign);
                if (AI.getType() != EntryAI->getType())
                    return new BitCastInst(EntryAI, AI.getType());
                return replaceInstUsesWith(AI, EntryAI);
            }
        }
    }

    if (AI.getAlignment()) {
        // Check to see if this allocation is only modified by a memcpy/memmove from
        // a constant global whose alignment is equal to or exceeds that of the
        // allocation.  If this is the case, we can change all users to use
        // the constant global instead.  This is commonly produced by the CFE by
        // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
        // is only subsequently read.
        SmallVector<Instruction*, 4> ToDelete;
        if (MemTransferInst * Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
            unsigned SourceAlign = getOrEnforceKnownAlignment(
                Copy->getSource(), AI.getAlignment(), DL, &AI, &AC, &DT);
            if (AI.getAlignment() <= SourceAlign &&
                isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) {
                LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
                LLVM_DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');
                for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
                    eraseInstFromFunction(*ToDelete[i]);
                Constant* TheSrc = cast<Constant>(Copy->getSource());
                auto* SrcTy = TheSrc->getType();
                auto* DestTy = PointerType::get(AI.getType()->getPointerElementType(),
                    SrcTy->getPointerAddressSpace());
                Constant* Cast =
                    ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
                if (AI.getType()->getPointerAddressSpace() ==
                    SrcTy->getPointerAddressSpace()) {
                    Instruction* NewI = replaceInstUsesWith(AI, Cast);
                    eraseInstFromFunction(*Copy);
                    ++NumGlobalCopies;
                    return NewI;
                }
                else {
                    PointerReplacer PtrReplacer(*this);
                    PtrReplacer.replacePointer(AI, Cast);
                    ++NumGlobalCopies;
                }
            }
        }
    }

    // At last, use the generic allocation site handler to aggressively remove
    // unused allocas.
    return visitAllocSite(AI);
}

// Are we allowed to form an atomic load or store of this type?
static bool isSupportedAtomicType(Type* Ty) {
    return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy();
}

/// Helper to combine a load to a new type.
///
/// This just does the work of combining a load to a new type. It handles
/// metadata, etc., and returns the new instruction. The \c NewTy should be the
/// loaded *value* type. This will convert it to a pointer, cast the operand to
/// that pointer type, load it, etc.
///
/// Note that this will create all of the instructions with whatever insert
/// point the \c InstCombiner currently is using.
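///
/// For example (illustrative), with NewTy == float this turns
///     %v = load i32, i32* %p
/// into
///     %p.cast = bitcast i32* %p to float*
///     %v1     = load float, float* %p.cast
/// while copying over the metadata that remains valid for the new type.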
static LoadInst* combineLoadToNewType(InstCombiner& IC, LoadInst& LI, Type* NewTy,
    const Twine& Suffix = "") {
    IGC_ASSERT_MESSAGE((!LI.isAtomic() || isSupportedAtomicType(NewTy)), "can't fold an atomic load to requested type");

    Value* Ptr = LI.getPointerOperand();
    unsigned AS = LI.getPointerAddressSpace();
    SmallVector<std::pair<unsigned, MDNode*>, 8> MD;
    LI.getAllMetadata(MD);

    Value* NewPtr = nullptr;
    if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) &&
        NewPtr->getType()->getPointerElementType() == NewTy &&
        NewPtr->getType()->getPointerAddressSpace() == AS))
        NewPtr = IC.Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS));

    LoadInst* NewLoad = IC.Builder.CreateAlignedLoad(
        NewPtr, LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix);
    NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
    MDBuilder MDB(NewLoad->getContext());
    for (const auto& MDPair : MD) {
        unsigned ID = MDPair.first;
        MDNode* N = MDPair.second;
        // Note, essentially every kind of metadata should be preserved here! This
        // routine is supposed to clone a load instruction changing *only its type*.
        // The only metadata it makes sense to drop is metadata which is invalidated
        // when the pointer type changes. This should essentially never be the case
        // in LLVM, but we explicitly switch over only known metadata to be
        // conservatively correct. If you are adding metadata to LLVM which pertains
        // to loads, you almost certainly want to add it here.
        switch (ID) {
        case LLVMContext::MD_dbg:
        case LLVMContext::MD_tbaa:
        case LLVMContext::MD_prof:
        case LLVMContext::MD_fpmath:
        case LLVMContext::MD_tbaa_struct:
        case LLVMContext::MD_invariant_load:
        case LLVMContext::MD_alias_scope:
        case LLVMContext::MD_noalias:
        case LLVMContext::MD_nontemporal:
        case LLVMContext::MD_mem_parallel_loop_access:
            // All of these directly apply.
            NewLoad->setMetadata(ID, N);
            break;

        case LLVMContext::MD_nonnull:
            copyNonnullMetadata(LI, N, *NewLoad);
            break;
        case LLVMContext::MD_align:
        case LLVMContext::MD_dereferenceable:
        case LLVMContext::MD_dereferenceable_or_null:
            // These only directly apply if the new type is also a pointer.
            if (NewTy->isPointerTy())
                NewLoad->setMetadata(ID, N);
            break;
        case LLVMContext::MD_range:
            copyRangeMetadata(IC.getDataLayout(), LI, N, *NewLoad);
            break;
        }
    }
    return NewLoad;
}

/// Combine a store to a new type.
///
/// Returns the newly created store instruction.
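///
/// For example (illustrative), storing a float value through an i32* pointer
/// becomes
///     %p.cast = bitcast i32* %p to float*
///     store float %v, float* %p.cast
/// with the applicable metadata copied to the new store.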
static StoreInst* combineStoreToNewValue(InstCombiner& IC, StoreInst& SI, Value* V) {
    IGC_ASSERT_MESSAGE((!SI.isAtomic() || isSupportedAtomicType(V->getType())), "can't fold an atomic store of requested type");

    Value* Ptr = SI.getPointerOperand();
    unsigned AS = SI.getPointerAddressSpace();
    SmallVector<std::pair<unsigned, MDNode*>, 8> MD;
    SI.getAllMetadata(MD);

    StoreInst* NewStore = IC.Builder.CreateAlignedStore(
        V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
        SI.getAlignment(), SI.isVolatile());
    NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
    for (const auto& MDPair : MD) {
        unsigned ID = MDPair.first;
        MDNode* N = MDPair.second;
        // Note, essentially every kind of metadata should be preserved here! This
        // routine is supposed to clone a store instruction changing *only its
        // type*. The only metadata it makes sense to drop is metadata which is
        // invalidated when the pointer type changes. This should essentially
        // never be the case in LLVM, but we explicitly switch over only known
        // metadata to be conservatively correct. If you are adding metadata to
        // LLVM which pertains to stores, you almost certainly want to add it
        // here.
        switch (ID) {
        case LLVMContext::MD_dbg:
        case LLVMContext::MD_tbaa:
        case LLVMContext::MD_prof:
        case LLVMContext::MD_fpmath:
        case LLVMContext::MD_tbaa_struct:
        case LLVMContext::MD_alias_scope:
        case LLVMContext::MD_noalias:
        case LLVMContext::MD_nontemporal:
        case LLVMContext::MD_mem_parallel_loop_access:
            // All of these directly apply.
            NewStore->setMetadata(ID, N);
            break;

        case LLVMContext::MD_invariant_load:
        case LLVMContext::MD_nonnull:
        case LLVMContext::MD_range:
        case LLVMContext::MD_align:
        case LLVMContext::MD_dereferenceable:
        case LLVMContext::MD_dereferenceable_or_null:
            // These don't apply for stores.
            break;
        }
    }

    return NewStore;
}

/// Returns true if the instruction represents a minmax pattern like:
///   select ((cmp load V1, load V2), V1, V2).
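///
/// A concrete (illustrative) instance of the pattern, where V is the select:
///     %l1  = load i32, i32* %a
///     %l2  = load i32, i32* %b
///     %cmp = icmp slt i32 %l1, %l2
///     %sel = select i1 %cmp, i32* %a, i32* %b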
static bool isMinMaxWithLoads(Value* V) {
    IGC_ASSERT_MESSAGE(V->getType()->isPointerTy(), "Expected pointer type.");
    // Ignore possible ty* to ixx* bitcast.
    V = peekThroughBitcast(V);
    // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax
    // pattern.
    CmpInst::Predicate Pred;
    Instruction* L1;
    Instruction* L2;
    Value* LHS;
    Value* RHS;
    if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)),
        m_Value(LHS), m_Value(RHS))))
        return false;
    return (match(L1, m_Load(m_Specific(LHS))) &&
        match(L2, m_Load(m_Specific(RHS)))) ||
        (match(L1, m_Load(m_Specific(RHS))) &&
            match(L2, m_Load(m_Specific(LHS))));
}

/// Combine loads to match the type of their uses' value after looking
/// through intervening bitcasts.
///
/// The core idea here is that if the result of a load is used in an operation,
/// we should load the type most conducive to that operation. For example, when
/// loading an integer and converting that immediately to a pointer, we should
/// instead directly load a pointer.
///
/// However, this routine must never change the width of a load or the number of
/// loads as that would introduce a semantic change. This combine is expected to
/// be a semantic no-op which just allows loads to more closely model the types
/// of their consuming operations.
///
/// Currently, we also refuse to change the precise type used for an atomic load
/// or a volatile load. This is debatable, and might be reasonable to change
/// later. However, it is risky in case some backend or other part of LLVM is
/// relying on the exact type loaded to select appropriate atomic operations.
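///
/// For example (illustrative), a load whose only use is a no-op cast
///     %i = load i32, i32* %p
///     %f = bitcast i32 %i to float
/// is canonicalized to load the casted-to type directly:
///     %p.cast = bitcast i32* %p to float*
///     %f      = load float, float* %p.cast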
static Instruction* combineLoadToOperationType(InstCombiner& IC, LoadInst& LI) {
    // FIXME: We could probably with some care handle both volatile and ordered
    // atomic loads here but it isn't clear that this is important.
    if (!LI.isUnordered())
        return nullptr;

    if (LI.use_empty())
        return nullptr;

    // swifterror values can't be bitcasted.
    if (LI.getPointerOperand()->isSwiftError())
        return nullptr;

    Type* Ty = LI.getType();
    const DataLayout& DL = IC.getDataLayout();

    // Try to canonicalize loads which are only ever stored to operate over
    // integers instead of any other type. We only do this when the loaded type
    // is sized and has a size exactly the same as its store size and the store
    // size is a legal integer type.
    // Do not perform canonicalization if minmax pattern is found (to avoid
    // infinite loop).
    if (!Ty->isIntegerTy() && Ty->isSized() &&
        DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) &&
        DL.getTypeStoreSizeInBits(Ty) == DL.getTypeSizeInBits(Ty) &&
        !DL.isNonIntegralPointerType(Ty) &&
        !isMinMaxWithLoads(
            peekThroughBitcast(LI.getPointerOperand(), /*OneUseOnly=*/true))) {
        if (all_of(LI.users(), [&LI](User* U) {
            auto* SI = dyn_cast<StoreInst>(U);
            return SI && SI->getPointerOperand() != &LI &&
                !SI->getPointerOperand()->isSwiftError();
        })) {
            LoadInst* NewLoad = combineLoadToNewType(
                IC, LI,
                Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty)));
            // Replace all the stores with stores of the newly loaded value.
            for (auto UI = LI.user_begin(), UE = LI.user_end(); UI != UE;) {
                auto* SI = cast<StoreInst>(*UI++);
                IC.Builder.SetInsertPoint(SI);
                combineStoreToNewValue(IC, *SI, NewLoad);
                IC.eraseInstFromFunction(*SI);
            }
            IGC_ASSERT_MESSAGE(LI.use_empty(), "Failed to remove all users of the load!");
            // Return the old load so the combiner can delete it safely.
            return &LI;
        }
    }

    // Fold away bit casts of the loaded value by loading the desired type.
    // We can do this for BitCastInsts as well as casts from and to pointer types,
    // as long as those are noops (i.e., the source or dest type have the same
    // bitwidth as the target's pointers).
    if (LI.hasOneUse())
        if (auto * CI = dyn_cast<CastInst>(LI.user_back()))
            if (CI->isNoopCast(DL))
                if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) {
                    LoadInst* NewLoad = combineLoadToNewType(IC, LI, CI->getDestTy());
                    CI->replaceAllUsesWith(NewLoad);
                    IC.eraseInstFromFunction(*CI);
                    return &LI;
                }

    // FIXME: We should also canonicalize loads of vectors when their elements are
    // cast to other types.
    return nullptr;
}

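/// Unpack a load of an aggregate into loads of its members.
///
/// For example (illustrative), a load of a single-element struct
///     %v = load { i32 }, { i32 }* %p
/// becomes a load of the element re-wrapped with insertvalue:
///     %p.cast   = bitcast { i32 }* %p to i32*
///     %v.unpack = load i32, i32* %p.cast
///     %v        = insertvalue { i32 } undef, i32 %v.unpack, 0
/// Multi-element structs (without padding) and small arrays are split into
/// per-element GEPs and loads in the same way.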
static Instruction* unpackLoadToAggregate(InstCombiner& IC, LoadInst& LI) {
    // FIXME: We could probably with some care handle both volatile and atomic
    // stores here but it isn't clear that this is important.
    if (!LI.isSimple())
        return nullptr;

    Type* T = LI.getType();
    if (!T->isAggregateType())
        return nullptr;

    StringRef Name = LI.getName();
    IGC_ASSERT_MESSAGE(LI.getAlignment(), "Alignment must be set at this point");

    if (auto * ST = dyn_cast<StructType>(T)) {
        // If the struct only has one element, we unpack.
        auto NumElements = ST->getNumElements();
        if (NumElements == 1) {
            LoadInst* NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U),
                ".unpack");
            AAMDNodes AAMD;
            LI.getAAMetadata(AAMD);
            NewLoad->setAAMetadata(AAMD);
            return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
                UndefValue::get(T), NewLoad, 0, Name));
        }

        // We don't want to break loads with padding here as we'd lose
        // the knowledge that padding exists for the rest of the pipeline.
        const DataLayout& DL = IC.getDataLayout();
        auto* SL = DL.getStructLayout(ST);
        if (SL->hasPadding())
            return nullptr;

        auto Align = LI.getAlignment();
        if (!Align)
            Align = DL.getABITypeAlignment(ST);

        auto* Addr = LI.getPointerOperand();
        auto* IdxType = Type::getInt32Ty(T->getContext());
        auto* Zero = ConstantInt::get(IdxType, 0);

        Value* V = UndefValue::get(T);
        for (unsigned i = 0; i < NumElements; i++) {
            Value* Indices[2] = {
              Zero,
              ConstantInt::get(IdxType, i),
            };
            auto* Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
                Name + ".elt");
            auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
            auto* L = IC.Builder.CreateAlignedLoad(Ptr, EltAlign, Name + ".unpack");
            // Propagate AA metadata. It'll still be valid on the narrowed load.
            AAMDNodes AAMD;
            LI.getAAMetadata(AAMD);
            L->setAAMetadata(AAMD);
            V = IC.Builder.CreateInsertValue(V, L, i);
        }

        V->setName(Name);
        return IC.replaceInstUsesWith(LI, V);
    }

    if (auto * AT = dyn_cast<ArrayType>(T)) {
        auto* ET = AT->getElementType();
        auto NumElements = AT->getNumElements();
        if (NumElements == 1) {
            LoadInst* NewLoad = combineLoadToNewType(IC, LI, ET, ".unpack");
            AAMDNodes AAMD;
            LI.getAAMetadata(AAMD);
            NewLoad->setAAMetadata(AAMD);
            return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
                UndefValue::get(T), NewLoad, 0, Name));
        }

        // Bail out if the array is too large. Ideally we would like to optimize
        // arrays of arbitrary size but this has a terrible impact on compile time.
        // The threshold here is chosen arbitrarily, maybe needs a little bit of
        // tuning.
        if (NumElements > IC.MaxArraySizeForCombine)
            return nullptr;

        const DataLayout& DL = IC.getDataLayout();
        auto EltSize = DL.getTypeAllocSize(ET);
        auto Align = LI.getAlignment();
        if (!Align)
            Align = DL.getABITypeAlignment(T);

        auto* Addr = LI.getPointerOperand();
        auto* IdxType = Type::getInt64Ty(T->getContext());
        auto* Zero = ConstantInt::get(IdxType, 0);

        Value* V = UndefValue::get(T);
        uint64_t Offset = 0;
        for (uint64_t i = 0; i < NumElements; i++) {
            Value* Indices[2] = {
              Zero,
              ConstantInt::get(IdxType, i),
            };
            auto* Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
                Name + ".elt");
            auto* L = IC.Builder.CreateAlignedLoad(Ptr, MinAlign(Align, Offset),
                Name + ".unpack");
            AAMDNodes AAMD;
            LI.getAAMetadata(AAMD);
            L->setAAMetadata(AAMD);
            V = IC.Builder.CreateInsertValue(V, L, i);
            Offset += EltSize;
        }

        V->setName(Name);
        return IC.replaceInstUsesWith(LI, V);
    }

    return nullptr;
}

// If we can determine that all possible objects pointed to by the provided
// pointer value are not only dereferenceable but also definitively less than
// or equal to the provided maximum size, then return true. Otherwise, return
// false. (Constant globals and allocas are examples of objects whose size we
// can determine.)
//
// FIXME: This should probably live in ValueTracking (or similar).
static bool isObjectSizeLessThanOrEq(Value* V, uint64_t MaxSize,
    const DataLayout& DL) {
    SmallPtrSet<Value*, 4> Visited;
    SmallVector<Value*, 4> Worklist(1, V);

    do {
        Value* P = Worklist.pop_back_val();
        P = P->stripPointerCasts();

        if (!Visited.insert(P).second)
            continue;

        if (SelectInst * SI = dyn_cast<SelectInst>(P)) {
            Worklist.push_back(SI->getTrueValue());
            Worklist.push_back(SI->getFalseValue());
            continue;
        }

        if (PHINode * PN = dyn_cast<PHINode>(P)) {
            for (Value* IncValue : PN->incoming_values())
                Worklist.push_back(IncValue);
            continue;
        }

        if (GlobalAlias * GA = dyn_cast<GlobalAlias>(P)) {
            if (GA->isInterposable())
                return false;
            Worklist.push_back(GA->getAliasee());
            continue;
        }

        // If we know how big this object is, and it is less than MaxSize, continue
        // searching. Otherwise, return false.
        if (AllocaInst * AI = dyn_cast<AllocaInst>(P)) {
            if (!AI->getAllocatedType()->isSized())
                return false;

            ConstantInt* CS = dyn_cast<ConstantInt>(AI->getArraySize());
            if (!CS)
                return false;

            uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType());
            // Make sure that, even if the multiplication below would wrap as a
            // uint64_t, we still do the right thing.
            if ((CS->getValue().zextOrSelf(128) * APInt(128, TypeSize)).ugt(MaxSize))
                return false;
            continue;
        }

        if (GlobalVariable * GV = dyn_cast<GlobalVariable>(P)) {
            if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
                return false;

            uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
            if (InitSize > MaxSize)
                return false;
            continue;
        }

        return false;
    } while (!Worklist.empty());

    return true;
}

// If we're indexing into an object of a known size, and the outer index is
// not a constant, but having any value but zero would lead to undefined
// behavior, replace it with zero.
//
// For example, if we have:
// @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
// ...
// %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
// ... = load i32* %arrayidx, align 4
// Then we know that we can replace %x in the GEP with i64 0.
//
// FIXME: We could fold any GEP index to zero that would cause UB if it were
// not zero. Currently, we only handle the first such index. We could also
// search through non-zero constant indices if we kept track of the offsets
// those indices implied.
static bool canReplaceGEPIdxWithZero(InstCombiner& IC, GetElementPtrInst* GEPI,
    Instruction* MemI, unsigned& Idx) {
    if (GEPI->getNumOperands() < 2)
        return false;

    // Find the first non-zero index of a GEP. If all indices are zero, return
    // one past the last index.
    auto FirstNZIdx = [](const GetElementPtrInst* GEPI) {
        unsigned I = 1;
        for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
            Value* V = GEPI->getOperand(I);
            if (const ConstantInt * CI = dyn_cast<ConstantInt>(V))
                if (CI->isZero())
                    continue;

            break;
        }

        return I;
    };

    // Skip through initial 'zero' indices, and find the corresponding pointer
    // type. See if the next index is not a constant.
    Idx = FirstNZIdx(GEPI);
    if (Idx == GEPI->getNumOperands())
        return false;
    if (isa<Constant>(GEPI->getOperand(Idx)))
        return false;

    SmallVector<Value*, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
    Type* AllocTy =
        GetElementPtrInst::getIndexedType(GEPI->getSourceElementType(), Ops);
    if (!AllocTy || !AllocTy->isSized())
        return false;
    const DataLayout& DL = IC.getDataLayout();
    uint64_t TyAllocSize = DL.getTypeAllocSize(AllocTy);

    // If there are more indices after the one we might replace with a zero, make
    // sure they're all non-negative. If any of them are negative, the overall
    // address being computed might be before the base address determined by the
    // first non-zero index.
    auto IsAllNonNegative = [&]() {
        for (unsigned i = Idx + 1, e = GEPI->getNumOperands(); i != e; ++i) {
            KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
            if (Known.isNonNegative())
                continue;
            return false;
        }

        return true;
    };

    // FIXME: If the GEP is not inbounds, and there are extra indices after the
    // one we'll replace, those could cause the address computation to wrap
    // (rendering the IsAllNonNegative() check below insufficient). We can do
    // better, ignoring zero indices (and other indices we can prove small
    // enough not to wrap).
    if (Idx + 1 != GEPI->getNumOperands() && !GEPI->isInBounds())
        return false;

    // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
    // also known to be dereferenceable.
    return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
        IsAllNonNegative();
}

// If we're indexing into an object with a variable index for the memory
// access, but the object has only one element, we can assume that the index
// will always be zero. If we replace the GEP, return it.
template <typename T>
static Instruction* replaceGEPIdxWithZero(InstCombiner& IC, Value* Ptr,
    T& MemI) {
    if (GetElementPtrInst * GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
        unsigned Idx;
        if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
            Instruction* NewGEPI = GEPI->clone();
            NewGEPI->setOperand(Idx,
                ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
            NewGEPI->insertBefore(GEPI);
            MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI);
            return NewGEPI;
        }
    }

    return nullptr;
}

static bool canSimplifyNullStoreOrGEP(StoreInst& SI) {
    if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()))
        return false;

    auto* Ptr = SI.getPointerOperand();
    if (GetElementPtrInst * GEPI = dyn_cast<GetElementPtrInst>(Ptr))
        Ptr = GEPI->getOperand(0);
    return (isa<ConstantPointerNull>(Ptr) &&
        !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()));
}

static bool canSimplifyNullLoadOrGEP(LoadInst& LI, Value* Op) {
    if (GetElementPtrInst * GEPI = dyn_cast<GetElementPtrInst>(Op)) {
        const Value* GEPI0 = GEPI->getOperand(0);
        if (isa<ConstantPointerNull>(GEPI0) &&
            !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace()))
            return true;
    }
    if (isa<UndefValue>(Op) ||
        (isa<ConstantPointerNull>(Op) &&
            !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace())))
        return true;
    return false;
}

Instruction* InstCombiner::visitLoadInst(LoadInst& LI) {
    Value* Op = LI.getOperand(0);

    // Try to canonicalize the loaded type.
    if (Instruction * Res = combineLoadToOperationType(*this, LI))
        return Res;

    // Attempt to improve the alignment.
    unsigned KnownAlign = getOrEnforceKnownAlignment(
        Op, DL.getPrefTypeAlignment(LI.getType()), DL, &LI, &AC, &DT);
    unsigned LoadAlign = LI.getAlignment();
    unsigned EffectiveLoadAlign =
        LoadAlign != 0 ? LoadAlign : DL.getABITypeAlignment(LI.getType());

    if (KnownAlign > EffectiveLoadAlign)
        LI.setAlignment(KnownAlign);
    else if (LoadAlign == 0)
        LI.setAlignment(EffectiveLoadAlign);

    // Replace GEP indices if possible.
    if (Instruction * NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) {
        Worklist.Add(NewGEPI);
        return &LI;
    }

    if (Instruction * Res = unpackLoadToAggregate(*this, LI))
        return Res;

    // Do really simple store-to-load forwarding and load CSE, to catch cases
    // where there are several consecutive memory accesses to the same location,
    // separated by a few arithmetic operations.
    BasicBlock::iterator BBI(LI);
    bool IsLoadCSE = false;
    if (Value * AvailableVal = FindAvailableLoadedValue(
        &LI, LI.getParent(), BBI, DefMaxInstsToScan, AA, &IsLoadCSE)) {
        if (IsLoadCSE)
            combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI);

        return replaceInstUsesWith(
            LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
                LI.getName() + ".cast"));
    }

    // None of the following transforms are legal for volatile/ordered atomic
    // loads.  Most of them do apply for unordered atomics.
    if (!LI.isUnordered()) return nullptr;

    // load(gep null, ...) -> unreachable
    // load null/undef -> unreachable
    // TODO: Consider a target hook for valid address spaces for this transform.
    if (canSimplifyNullLoadOrGEP(LI, Op)) {
        // Insert a new store to null instruction before the load to indicate
        // that this code is not reachable.  We do this instead of inserting
        // an unreachable instruction directly because we cannot modify the
        // CFG.
        StoreInst* SI = new StoreInst(UndefValue::get(LI.getType()),
            Constant::getNullValue(Op->getType()), &LI);
        SI->setDebugLoc(LI.getDebugLoc());
        return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
    }

    if (Op->hasOneUse()) {
        // Change select and PHI nodes to select values instead of addresses: this
        // helps alias analysis out a lot, allows many other simplifications, and
        // exposes redundancy in the code.
        //
        // Note that we cannot do the transformation unless we know that the
        // introduced loads cannot trap!  Something like this is valid as long as
        // the condition is always false: load (select bool %C, int* null, int* %G),
        // but it would not be valid if we transformed it to load from null
        // unconditionally.
        //
        if (SelectInst * SI = dyn_cast<SelectInst>(Op)) {
            // load (select (Cond, &V1, &V2))  --> select(Cond, load &V1, load &V2).
            unsigned Align = LI.getAlignment();
            if (isSafeToLoadUnconditionally(SI->getOperand(1), Align, DL, SI) &&
                isSafeToLoadUnconditionally(SI->getOperand(2), Align, DL, SI)) {
                LoadInst* V1 = Builder.CreateLoad(SI->getOperand(1),
                    SI->getOperand(1)->getName() + ".val");
                LoadInst* V2 = Builder.CreateLoad(SI->getOperand(2),
                    SI->getOperand(2)->getName() + ".val");
                IGC_ASSERT_MESSAGE(LI.isUnordered(), "implied by above");
                V1->setAlignment(Align);
                V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
                V2->setAlignment(Align);
                V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
                return SelectInst::Create(SI->getCondition(), V1, V2);
            }

            // load (select (cond, null, P)) -> load P
            if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
                !NullPointerIsDefined(SI->getFunction(),
                    LI.getPointerAddressSpace())) {
                LI.setOperand(0, SI->getOperand(2));
                return &LI;
            }

            // load (select (cond, P, null)) -> load P
            if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
                !NullPointerIsDefined(SI->getFunction(),
                    LI.getPointerAddressSpace())) {
                LI.setOperand(0, SI->getOperand(1));
                return &LI;
            }
        }
    }
    return nullptr;
}

/// Look for extractelement/insertvalue sequence that acts like a bitcast.
///
/// \returns underlying value that was "cast", or nullptr otherwise.
///
/// For example, if we have:
///
///     %E0 = extractelement <2 x double> %U, i32 0
///     %V0 = insertvalue [2 x double] undef, double %E0, 0
///     %E1 = extractelement <2 x double> %U, i32 1
///     %V1 = insertvalue [2 x double] %V0, double %E1, 1
///
/// and the layout of a <2 x double> is isomorphic to a [2 x double],
/// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
/// Note that %U may contain non-undef values where %V1 has undef.
static Value* likeBitCastFromVector(InstCombiner& IC, Value* V) {
    Value* U = nullptr;
    while (auto * IV = dyn_cast<InsertValueInst>(V)) {
        auto* E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
        if (!E)
            return nullptr;
        auto* W = E->getVectorOperand();
        if (!U)
            U = W;
        else if (U != W)
            return nullptr;
        auto* CI = dyn_cast<ConstantInt>(E->getIndexOperand());
        if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
            return nullptr;
        V = IV->getAggregateOperand();
    }
    if (!isa<UndefValue>(V) || !U)
        return nullptr;

    auto* UT = cast<VectorType>(U->getType());
    auto* VT = V->getType();
    // Check that types UT and VT are bitwise isomorphic.
    const auto& DL = IC.getDataLayout();
    if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
        return nullptr;
    }
    if (auto * AT = dyn_cast<ArrayType>(VT)) {
        if (AT->getNumElements() != UT->getNumElements())
            return nullptr;
    }
    else {
        auto* ST = cast<StructType>(VT);
        if (ST->getNumElements() != UT->getNumElements())
            return nullptr;
        for (const auto* EltT : ST->elements()) {
            if (EltT != UT->getElementType())
                return nullptr;
        }
    }
    return U;
}

/// Combine stores to match the type of value being stored.
///
/// The core idea here is that the memory does not have any intrinsic type and
/// where we can we should match the type of a store to the type of value being
/// stored.
///
/// However, this routine must never change the width of a store or the number of
/// stores as that would introduce a semantic change. This combine is expected to
/// be a semantic no-op which just allows stores to more closely model the types
/// of their incoming values.
///
/// Currently, we also refuse to change the precise type used for an atomic or
/// volatile store. This is debatable, and might be reasonable to change later.
/// However, it is risky in case some backend or other part of LLVM is relying
/// on the exact type stored to select appropriate atomic operations.
///
/// \returns true if the store was successfully combined away. This indicates
/// the caller must erase the store instruction. We have to let the caller erase
/// the store instruction as otherwise there is no way to signal whether it was
/// combined or not: IC.EraseInstFromFunction returns a null pointer.
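///
/// For example (illustrative), storing a bitcast value
///     %b = bitcast float %v to i32
///     store i32 %b, i32* %p
/// is rewritten to store the original value through a casted pointer:
///     %p.cast = bitcast i32* %p to float*
///     store float %v, float* %p.cast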
static bool combineStoreToValueType(InstCombiner& IC, StoreInst& SI) {
    // FIXME: We could probably with some care handle both volatile and ordered
    // atomic stores here but it isn't clear that this is important.
    if (!SI.isUnordered())
        return false;

    // swifterror values can't be bitcasted.
    if (SI.getPointerOperand()->isSwiftError())
        return false;

    Value* V = SI.getValueOperand();

    // Fold away bit casts of the stored value by storing the original type.
    if (auto * BC = dyn_cast<BitCastInst>(V)) {
        V = BC->getOperand(0);
        if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
            combineStoreToNewValue(IC, SI, V);
            return true;
        }
    }

    if (Value * U = likeBitCastFromVector(IC, V))
        if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
            combineStoreToNewValue(IC, SI, U);
            return true;
        }

    // FIXME: We should also canonicalize stores of vectors when their elements
    // are cast to other types.
    return false;
}

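/// Unpack a store of an aggregate into stores of its members.
///
/// For example (illustrative), a store of a two-element struct without padding
///     store { i32, i32 } %v, { i32, i32 }* %p
/// becomes per-element extractvalue/GEP/store sequences:
///     %v.elt    = extractvalue { i32, i32 } %v, 0
///     %p.repack = getelementptr inbounds { i32, i32 }, { i32, i32 }* %p, i32 0, i32 0
///     store i32 %v.elt, i32* %p.repack
/// and likewise for element 1.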
static bool unpackStoreToAggregate(InstCombiner& IC, StoreInst& SI) {
    // FIXME: We could probably with some care handle both volatile and atomic
    // stores here but it isn't clear that this is important.
    if (!SI.isSimple())
        return false;

    Value* V = SI.getValueOperand();
    Type* T = V->getType();

    if (!T->isAggregateType())
        return false;

    if (auto * ST = dyn_cast<StructType>(T)) {
1232         // If the struct only have one element, we unpack.
1233         unsigned Count = ST->getNumElements();
1234         if (Count == 1) {
1235             V = IC.Builder.CreateExtractValue(V, 0);
1236             combineStoreToNewValue(IC, SI, V);
1237             return true;
1238         }
1239 
1240         // We don't want to break loads with padding here as we'd loose
1241         // the knowledge that padding exists for the rest of the pipeline.
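        // Illustrative example (not from this code): under a typical data layout,
        // a struct such as { i8, i32 } carries 3 bytes of padding after the i8,
        // so it is deliberately left as a single aggregate store here.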
        const DataLayout& DL = IC.getDataLayout();
        auto* SL = DL.getStructLayout(ST);
        if (SL->hasPadding())
            return false;

        auto Align = SI.getAlignment();
        if (!Align)
            Align = DL.getABITypeAlignment(ST);

        SmallString<16> EltName = V->getName();
        EltName += ".elt";
        auto* Addr = SI.getPointerOperand();
        SmallString<16> AddrName = Addr->getName();
        AddrName += ".repack";

        auto* IdxType = Type::getInt32Ty(ST->getContext());
        auto* Zero = ConstantInt::get(IdxType, 0);
        for (unsigned i = 0; i < Count; i++) {
            Value* Indices[2] = {
              Zero,
              ConstantInt::get(IdxType, i),
            };
            auto* Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
                AddrName);
            auto* Val = IC.Builder.CreateExtractValue(V, i, EltName);
            auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
            Instruction* NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
            AAMDNodes AAMD;
            SI.getAAMetadata(AAMD);
            NS->setAAMetadata(AAMD);
        }

        return true;
    }

    if (auto* AT = dyn_cast<ArrayType>(T)) {
        // If the array only has one element, we unpack.
        auto NumElements = AT->getNumElements();
        if (NumElements == 1) {
            V = IC.Builder.CreateExtractValue(V, 0);
            combineStoreToNewValue(IC, SI, V);
            return true;
        }

        // Bail out if the array is too large. Ideally we would like to optimize
        // arrays of arbitrary size but this has a terrible impact on compile time.
        // The threshold here is chosen arbitrarily and may need some tuning.
        if (NumElements > IC.MaxArraySizeForCombine)
            return false;

        const DataLayout& DL = IC.getDataLayout();
        auto EltSize = DL.getTypeAllocSize(AT->getElementType());
        auto Align = SI.getAlignment();
        if (!Align)
            Align = DL.getABITypeAlignment(T);

        SmallString<16> EltName = V->getName();
        EltName += ".elt";
        auto* Addr = SI.getPointerOperand();
        SmallString<16> AddrName = Addr->getName();
        AddrName += ".repack";

        auto* IdxType = Type::getInt64Ty(T->getContext());
        auto* Zero = ConstantInt::get(IdxType, 0);

        uint64_t Offset = 0;
        for (uint64_t i = 0; i < NumElements; i++) {
            Value* Indices[2] = {
              Zero,
              ConstantInt::get(IdxType, i),
            };
            auto* Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
                AddrName);
            auto* Val = IC.Builder.CreateExtractValue(V, i, EltName);
            auto EltAlign = MinAlign(Align, Offset);
            Instruction* NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
            AAMDNodes AAMD;
            SI.getAAMetadata(AAMD);
            NS->setAAMetadata(AAMD);
            Offset += EltSize;
        }

        return true;
    }

    return false;
}

/// equivalentAddressValues - Test if A and B will obviously have the same
/// value. This includes recognizing that %t0 and %t1 will have the same
/// value in code like this:
///   %t0 = getelementptr \@a, 0, 3
///   store i32 0, i32* %t0
///   %t1 = getelementptr \@a, 0, 3
///   %t2 = load i32* %t1
///
static bool equivalentAddressValues(Value* A, Value* B) {
    // Test if the values are trivially equivalent.
    if (A == B) return true;

    // Test if the values come from identical arithmetic instructions.
    // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
    // it's only used to compare two uses within the same basic block, which
    // means that they'll always either have the same value or one of them
    // will have an undefined value.
    if (isa<BinaryOperator>(A) ||
        isa<CastInst>(A) ||
        isa<PHINode>(A) ||
        isa<GetElementPtrInst>(A))
        if (Instruction* BI = dyn_cast<Instruction>(B))
            if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
                return true;

    // Otherwise they may not be equivalent.
    return false;
}

/// Converts store (bitcast (load (bitcast (select ...)))) to
/// store (load (select ...)), where select is minmax:
/// select ((cmp load V1, load V2), V1, V2).
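///
/// For illustration only (hypothetical IR; float chosen arbitrarily): given
///   %sel   = select i1 %cmp, float* %a, float* %b   ; min/max over the loads of %a and %b
///   %sel.i = bitcast float* %sel to i32*
///   %v     = load i32, i32* %sel.i
///   %dst.i = bitcast float* %dst to i32*
///   store i32 %v, i32* %dst.i
/// the load is retyped to the pointee type and the store of %v is redirected
/// to the new value, roughly
///   %v.f = load float, float* %sel
///   store float %v.f, float* %dst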
static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner& IC,
    StoreInst& SI) {
    // bitcast?
    if (!match(SI.getPointerOperand(), m_BitCast(m_Value())))
        return false;
    // load? integer?
    Value* LoadAddr;
    if (!match(SI.getValueOperand(), m_Load(m_BitCast(m_Value(LoadAddr)))))
        return false;
    auto* LI = cast<LoadInst>(SI.getValueOperand());
    if (!LI->getType()->isIntegerTy())
        return false;
    if (!isMinMaxWithLoads(LoadAddr))
        return false;

    if (!all_of(LI->users(), [LI, LoadAddr](User* U) {
        auto* SI = dyn_cast<StoreInst>(U);
        return SI && SI->getPointerOperand() != LI &&
            peekThroughBitcast(SI->getPointerOperand()) != LoadAddr &&
            !SI->getPointerOperand()->isSwiftError();
    }))
        return false;

    IC.Builder.SetInsertPoint(LI);
    LoadInst* NewLI = combineLoadToNewType(
        IC, *LI, LoadAddr->getType()->getPointerElementType());
    // Replace all the stores with stores of the newly loaded value.
    for (auto* UI : LI->users()) {
        auto* USI = cast<StoreInst>(UI);
        IC.Builder.SetInsertPoint(USI);
        combineStoreToNewValue(IC, *USI, NewLI);
    }
    IC.replaceInstUsesWith(*LI, UndefValue::get(LI->getType()));
    IC.eraseInstFromFunction(*LI);
    return true;
}

Instruction* InstCombiner::visitStoreInst(StoreInst& SI) {
    Value* Val = SI.getOperand(0);
    Value* Ptr = SI.getOperand(1);

    // Try to canonicalize the stored type.
    if (combineStoreToValueType(*this, SI))
        return eraseInstFromFunction(SI);

    // Attempt to improve the alignment.
    unsigned KnownAlign = getOrEnforceKnownAlignment(
        Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT);
    unsigned StoreAlign = SI.getAlignment();
    unsigned EffectiveStoreAlign =
        StoreAlign != 0 ? StoreAlign : DL.getABITypeAlignment(Val->getType());

    if (KnownAlign > EffectiveStoreAlign)
        SI.setAlignment(KnownAlign);
    else if (StoreAlign == 0)
        SI.setAlignment(EffectiveStoreAlign);

    // Try to unpack an aggregate store into per-element stores.
    if (unpackStoreToAggregate(*this, SI))
        return eraseInstFromFunction(SI);

    if (removeBitcastsFromLoadStoreOnMinMax(*this, SI))
        return eraseInstFromFunction(SI);

    // Replace GEP indices if possible.
    if (Instruction* NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
        Worklist.Add(NewGEPI);
        return &SI;
    }

    // Don't hack volatile/ordered stores.
    // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
    if (!SI.isUnordered()) return nullptr;

    // If the pointer operand is an alloca (or a GEP into one) with a single use,
    // delete the store, making the alloca dead.
    if (Ptr->hasOneUse()) {
        if (isa<AllocaInst>(Ptr))
            return eraseInstFromFunction(SI);
        if (GetElementPtrInst* GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
            if (isa<AllocaInst>(GEP->getOperand(0))) {
                if (GEP->getOperand(0)->hasOneUse())
                    return eraseInstFromFunction(SI);
            }
        }
    }

    // Do really simple DSE, to catch cases where there are several consecutive
    // stores to the same location, separated by a few arithmetic operations. This
    // situation often occurs with bitfield accesses.
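    //
    // Illustrative example (hypothetical IR): in a sequence such as
    //   store i32 %a, i32* %p
    //   %b = or i32 %a, 65536      ; update another bitfield
    //   store i32 %b, i32* %p
    // the first store is dead and is removed by the scan below.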
    BasicBlock::iterator BBI(SI);
    for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
        --ScanInsts) {
        --BBI;
        // Don't count debug info directives, lest they affect codegen,
        // and we skip pointer-to-pointer bitcasts, which are NOPs.
        if (isa<DbgInfoIntrinsic>(BBI) ||
            (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
            ScanInsts++;
            continue;
        }

        if (StoreInst* PrevSI = dyn_cast<StoreInst>(BBI)) {
            // Prev store isn't volatile, and stores to the same location?
            if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1),
                SI.getOperand(1))) {
                ++NumDeadStore;
                ++BBI;
                eraseInstFromFunction(*PrevSI);
                continue;
            }
            break;
        }

        // If this is a load, we have to stop.  However, if the value we're storing
        // was just loaded from the very same pointer we're storing to, then *this*
        // store is dead (X = load P; store X -> P).
        if (LoadInst* LI = dyn_cast<LoadInst>(BBI)) {
            if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) {
                IGC_ASSERT_MESSAGE(SI.isUnordered(), "can't eliminate ordering operation");
                return eraseInstFromFunction(SI);
            }

            // Otherwise, this is a load from some other location.  Stores before it
            // may not be dead.
            break;
        }

        // Don't skip over loads, throws or things that can modify memory.
        if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
            break;
    }

    // store X, null    -> turns into 'unreachable' in SimplifyCFG
    // store X, GEP(null, Y) -> turns into 'unreachable' in SimplifyCFG
    if (canSimplifyNullStoreOrGEP(SI)) {
        if (!isa<UndefValue>(Val)) {
            SI.setOperand(0, UndefValue::get(Val->getType()));
            if (Instruction* U = dyn_cast<Instruction>(Val))
                Worklist.Add(U);  // Dropped a use.
        }
        return nullptr;  // Do not modify these!
    }

    // store undef, Ptr -> noop
    if (isa<UndefValue>(Val))
        return eraseInstFromFunction(SI);

    // If this store is the last instruction in the basic block (possibly
    // excepting debug info instructions), and if the block ends with an
    // unconditional branch, try to move the store to the successor block.
    BBI = SI.getIterator();
    do {
        ++BBI;
    } while (isa<DbgInfoIntrinsic>(BBI) ||
        (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy()));
    if (BranchInst* BI = dyn_cast<BranchInst>(BBI))
        if (BI->isUnconditional())
            if (SimplifyStoreAtEndOfBlock(SI))
                return nullptr;  // xform done!

    return nullptr;
}

/// SimplifyStoreAtEndOfBlock - Turn things like:
///   if () { *P = v1; } else { *P = v2; }
/// into a phi node with a store in the successor.
///
/// Simplify things like:
///   *P = v1; if () { *P = v2; }
/// into a phi node with a store in the successor.
///
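///
/// For illustration only (hypothetical IR and block names), the if/then/else
/// form above ends up roughly as:
///   DestBB:
///     %storemerge = phi i32 [ %v1, %ThenBB ], [ %v2, %ElseBB ]
///     store i32 %storemerge, i32* %P
/// with the two original stores removed.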
bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst& SI) {
    IGC_ASSERT_MESSAGE(SI.isUnordered(), "this code has not been audited for the volatile or ordered store case");

    BasicBlock* StoreBB = SI.getParent();

    // Check to see if the successor block has exactly two incoming edges.  If
    // so, see if the other predecessor contains a store to the same location.
    // If so, insert a PHI node (if needed) and move the stores down.
    BasicBlock* DestBB = StoreBB->getTerminator()->getSuccessor(0);

    // Determine whether Dest has exactly two predecessors and, if so, compute
    // the other predecessor.
    pred_iterator PI = pred_begin(DestBB);
    BasicBlock* P = *PI;
    BasicBlock* OtherBB = nullptr;

    if (P != StoreBB)
        OtherBB = P;

    if (++PI == pred_end(DestBB))
        return false;

    P = *PI;
    if (P != StoreBB) {
        if (OtherBB)
            return false;
        OtherBB = P;
    }
    if (++PI != pred_end(DestBB))
        return false;

    // Bail out if the relevant blocks aren't all distinct (this can happen,
    // for example, if SI is in an infinite loop).
    if (StoreBB == DestBB || OtherBB == DestBB)
        return false;

    // Verify that the other block ends in a branch and is not otherwise empty.
    BasicBlock::iterator BBI(OtherBB->getTerminator());
    BranchInst* OtherBr = dyn_cast<BranchInst>(BBI);
    if (!OtherBr || BBI == OtherBB->begin())
        return false;

    // If the other block ends in an unconditional branch, check for the 'if then
    // else' case.  There is an instruction before the branch.
    StoreInst* OtherStore = nullptr;
    if (OtherBr->isUnconditional()) {
        --BBI;
        // Skip over debugging info.
        while (isa<DbgInfoIntrinsic>(BBI) ||
            (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
            if (BBI == OtherBB->begin())
                return false;
            --BBI;
        }
        // If this isn't a store, isn't a store to the same location, or is not the
        // right kind of store, bail out.
        OtherStore = dyn_cast<StoreInst>(BBI);
        if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
            !SI.isSameOperationAs(OtherStore))
            return false;
    }
    else {
        // Otherwise, the other block ended with a conditional branch. If one of the
        // destinations is StoreBB, then we have the if/then case.
        if (OtherBr->getSuccessor(0) != StoreBB &&
            OtherBr->getSuccessor(1) != StoreBB)
            return false;

        // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
        // if/then triangle.  See if there is a store to the same ptr as SI that
        // lives in OtherBB.
        for (;; --BBI) {
            // Check to see if we find the matching store.
            if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
                if (OtherStore->getOperand(1) != SI.getOperand(1) ||
                    !SI.isSameOperationAs(OtherStore))
                    return false;
                break;
            }
            // If we find something that may be using or overwriting the stored
            // value, or if we run out of instructions, we can't do the xform.
            if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
                BBI->mayWriteToMemory() || BBI == OtherBB->begin())
                return false;
        }

        // In order to eliminate the store in OtherBB, we have to make sure
        // nothing reads or overwrites the stored value in StoreBB.
        for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
            // FIXME: This should really be AA driven.
            if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
                return false;
        }
    }

    // Insert a PHI node now if we need it.
    Value* MergedVal = OtherStore->getOperand(0);
    if (MergedVal != SI.getOperand(0)) {
        PHINode* PN = PHINode::Create(MergedVal->getType(), 2, "storemerge");
        PN->addIncoming(SI.getOperand(0), SI.getParent());
        PN->addIncoming(OtherStore->getOperand(0), OtherBB);
        MergedVal = InsertNewInstBefore(PN, DestBB->front());
    }

    // Advance to a place where it is safe to insert the new store and
    // insert it.
    BBI = DestBB->getFirstInsertionPt();
    StoreInst* NewSI = new StoreInst(MergedVal, SI.getOperand(1),
        SI.isVolatile(),
        SI.getAlignment(),
        SI.getOrdering(),
        SI.getSyncScopeID());
    InsertNewInstBefore(NewSI, *BBI);
    // The debug locations of the original instructions might differ; merge them.
    NewSI->applyMergedLocation(SI.getDebugLoc(), OtherStore->getDebugLoc());

    // If the two stores had AA tags, merge them.
    AAMDNodes AATags;
    SI.getAAMetadata(AATags);
    if (AATags) {
        OtherStore->getAAMetadata(AATags, /* Merge = */ true);
        NewSI->setAAMetadata(AATags);
    }

    // Nuke the old stores.
    eraseInstFromFunction(SI);
    eraseInstFromFunction(*OtherStore);
    return true;
}
#include "common/LLVMWarningsPop.hpp"