/*========================== begin_copyright_notice ============================

Copyright (C) 2018-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

/*========================== begin_copyright_notice ============================

This file is distributed under the University of Illinois Open Source License.
See LICENSE.TXT for details.

============================= end_copyright_notice ===========================*/

// This file implements the visit functions for load, store and alloca.

#include "common/LLVMWarningsPush.hpp"
#include "InstCombineInternal.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "Probe/Assertion.h"

using namespace llvm;
using namespace PatternMatch;
using namespace IGCombiner;

#define DEBUG_TYPE "instcombine"

STATISTIC(NumDeadStore, "Number of dead stores eliminated");
STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");

/// pointsToConstantGlobal - Return true if V (possibly indirectly) points to
/// some part of a constant global variable. This intentionally only accepts
/// constant expressions because we can't rewrite arbitrary instructions.
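/// For example (an illustrative sketch; @g is a hypothetical name), given
///   @g = internal constant [4 x i32] [i32 1, i32 2, i32 3, i32 4]
/// this returns true for @g itself and for constant expressions derived from
/// it, such as
///   getelementptr ([4 x i32], [4 x i32]* @g, i64 0, i64 2)
/// but false once the chain passes through an arbitrary instruction.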
static bool pointsToConstantGlobal(Value* V) {
    if (GlobalVariable* GV = dyn_cast<GlobalVariable>(V))
        return GV->isConstant();

    if (ConstantExpr* CE = dyn_cast<ConstantExpr>(V)) {
        if (CE->getOpcode() == Instruction::BitCast ||
            CE->getOpcode() == Instruction::AddrSpaceCast ||
            CE->getOpcode() == Instruction::GetElementPtr)
            return pointsToConstantGlobal(CE->getOperand(0));
    }
    return false;
}

/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
/// pointer to an alloca. Ignore any reads of the pointer, return false if we
/// see any stores or other unknown uses. If we see pointer arithmetic, keep
/// track of whether it moves the pointer (with IsOffset) but otherwise traverse
/// the uses. If we see a memcpy/memmove that targets an unoffset pointer to
/// the alloca, and if the source pointer is a pointer to a constant global, we
/// can optimize this.
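/// A typical pattern this recognizes (illustrative IR only; names are made up):
///   %A = alloca [4 x i32]
///   %p = bitcast [4 x i32]* %A to i8*
///   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %p,
///            i8* bitcast ([4 x i32]* @g to i8*), i64 16, i1 false)
///   ; ... only loads of %A afterwards ...
/// Here TheCopy is set to the memcpy and the alloca can later be replaced by @g.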
static bool
isOnlyCopiedFromConstantGlobal(Value* V, MemTransferInst*& TheCopy,
                               SmallVectorImpl<Instruction*>& ToDelete) {
    // We track lifetime intrinsics as we encounter them. If we decide to go
    // ahead and replace the value with the global, this lets the caller quickly
    // eliminate the markers.

    SmallVector<std::pair<Value*, bool>, 35> ValuesToInspect;
    ValuesToInspect.emplace_back(V, false);
    while (!ValuesToInspect.empty()) {
        auto ValuePair = ValuesToInspect.pop_back_val();
        const bool IsOffset = ValuePair.second;
        for (auto& U : ValuePair.first->uses()) {
            auto* I = cast<Instruction>(U.getUser());

            if (auto* LI = dyn_cast<LoadInst>(I)) {
                // Ignore non-volatile loads, they are always ok.
                if (!LI->isSimple()) return false;
                continue;
            }

            if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
                // If uses of the bitcast are ok, we are ok.
                ValuesToInspect.emplace_back(I, IsOffset);
                continue;
            }
            if (auto* GEP = dyn_cast<GetElementPtrInst>(I)) {
                // If the GEP has all zero indices, it doesn't offset the pointer;
                // otherwise it does.
                ValuesToInspect.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices());
                continue;
            }

            if (auto CS = CallSite(I)) {
                // If this is the function being called then we treat it like a load and
                // ignore it.
                if (CS.isCallee(&U))
                    continue;

                unsigned DataOpNo = CS.getDataOperandNo(&U);
                bool IsArgOperand = CS.isArgOperand(&U);

                // Inalloca arguments are clobbered by the call.
                if (IsArgOperand && CS.isInAllocaArgument(DataOpNo))
                    return false;

                // If this is a readonly/readnone call site, then we know it is just a
                // load (but one that potentially returns the value itself), so we can
                // ignore it if we know that the value isn't captured.
                if (CS.onlyReadsMemory() &&
                    (CS.getInstruction()->use_empty() || CS.doesNotCapture(DataOpNo)))
                    continue;

                // If this is being passed as a byval argument, the caller is making a
                // copy, so it is only a read of the alloca.
                if (IsArgOperand && CS.isByValArgument(DataOpNo))
                    continue;
            }

            // Lifetime intrinsics can be handled by the caller.
            if (IntrinsicInst* II = dyn_cast<IntrinsicInst>(I)) {
                if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
                    II->getIntrinsicID() == Intrinsic::lifetime_end) {
                    IGC_ASSERT_MESSAGE(II->use_empty(), "Lifetime markers have no result to use!");
                    ToDelete.push_back(II);
                    continue;
                }
            }

            // If this isn't our memcpy/memmove, reject it as something we can't
            // handle.
            MemTransferInst* MI = dyn_cast<MemTransferInst>(I);
            if (!MI)
                return false;

            // If the transfer is using the alloca as a source of the transfer, then
            // ignore it since it is a load (unless the transfer is volatile).
            if (U.getOperandNo() == 1) {
                if (MI->isVolatile()) return false;
                continue;
            }

            // If we have already seen a copy, reject the second one.
            if (TheCopy) return false;

            // If the pointer has been offset from the start of the alloca, we can't
            // safely handle this.
            if (IsOffset) return false;

            // If the memintrinsic isn't using the alloca as the dest, reject it.
            if (U.getOperandNo() != 0) return false;

            // If the source of the memcpy/move is not a constant global, reject it.
            if (!pointsToConstantGlobal(MI->getSource()))
                return false;

            // Otherwise, the transform is safe. Remember the copy instruction.
            TheCopy = MI;
        }
    }
    return true;
}

/// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only
/// modified by a copy from a constant global. If we can prove this, we can
/// replace any uses of the alloca with uses of the global directly.
static MemTransferInst*
isOnlyCopiedFromConstantGlobal(AllocaInst* AI,
                               SmallVectorImpl<Instruction*>& ToDelete) {
    MemTransferInst* TheCopy = nullptr;
    if (isOnlyCopiedFromConstantGlobal(AI, TheCopy, ToDelete))
        return TheCopy;
    return nullptr;
}

/// Returns true if V is dereferenceable for the size of the alloca.
static bool isDereferenceableForAllocaSize(const Value* V, const AllocaInst* AI,
                                           const DataLayout& DL) {
    if (AI->isArrayAllocation())
        return false;
    uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
    if (!AllocaSize)
        return false;
    return isDereferenceableAndAlignedPointer(V, AI->getAlignment(),
                                              APInt(64, AllocaSize), DL);
}

static Instruction* simplifyAllocaArraySize(InstCombiner& IC, AllocaInst& AI) {
    // Check for array size of 1 (scalar allocation).
    if (!AI.isArrayAllocation()) {
        // i32 1 is the canonical array size for scalar allocations.
        if (AI.getArraySize()->getType()->isIntegerTy(32))
            return nullptr;

        // Canonicalize it.
        Value* V = IC.Builder.getInt32(1);
        AI.setOperand(0, V);
        return &AI;
    }

    // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
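    // For example (illustrative IR):
    //   %a = alloca i32, i32 4
    // becomes
    //   %1 = alloca [4 x i32]
    //   %a.sub = getelementptr inbounds [4 x i32], [4 x i32]* %1, i64 0, i64 0
    // with all users of %a redirected to %a.sub.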
    if (const ConstantInt* C = dyn_cast<ConstantInt>(AI.getArraySize())) {
        Type* NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
        AllocaInst* New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName());
        New->setAlignment(AI.getAlignment());

        // Scan to the end of the allocation instructions, to skip over a block of
        // allocas if possible...also skip interleaved debug info
        //
        BasicBlock::iterator It(New);
        while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
            ++It;

        // Now that It is pointing to the first non-allocation-inst in the block,
        // insert our getelementptr instruction...
        //
        Type* IdxTy = IC.getDataLayout().getIntPtrType(AI.getType());
        Value* NullIdx = Constant::getNullValue(IdxTy);
        Value* Idx[2] = { NullIdx, NullIdx };
        Instruction* GEP =
            GetElementPtrInst::CreateInBounds(New, Idx, New->getName() + ".sub");
        IC.InsertNewInstBefore(GEP, *It);

        // Now make everything use the getelementptr instead of the original
        // allocation.
        return IC.replaceInstUsesWith(AI, GEP);
    }

    if (isa<UndefValue>(AI.getArraySize()))
        return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));

    // Ensure that the alloca array size argument has type intptr_t, so that
    // any casting is exposed early.
    Type* IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType());
    if (AI.getArraySize()->getType() != IntPtrTy) {
        Value* V = IC.Builder.CreateIntCast(AI.getArraySize(), IntPtrTy, false);
        AI.setOperand(0, V);
        return &AI;
    }

    return nullptr;
}

namespace {
// If I and V are pointers in different address spaces, it is not allowed to
// use replaceAllUsesWith since I and V have different types. A
// non-target-specific transformation should not use addrspacecast on V since
// the two address spaces may be disjoint depending on the target.
//
// This class chases down uses of the old pointer until reaching the load
// instructions, then replaces the old pointer in the load instructions with
// the new pointer. If during the chasing it sees a bitcast or GEP, it will
// create a new bitcast or GEP with the new pointer and use them in the load
// instruction.
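//
// For example (an illustrative sketch; @g and the value names are made up),
// when an alloca is replaced by a constant global in addrspace(2):
//   %p = getelementptr [4 x i32], [4 x i32]* %alloca, i64 0, i64 1
//   %v = load i32, i32* %p
// the replacer rebuilds the GEP on the new pointer and rewrites the load:
//   %p2 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @g, i64 0, i64 1
//   %v = load i32, i32 addrspace(2)* %p2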
class PointerReplacer {
public:
    PointerReplacer(InstCombiner& IC) : IC(IC) {}
    void replacePointer(Instruction& I, Value* V);

private:
    void findLoadAndReplace(Instruction& I);
    void replace(Instruction* I);
    Value* getReplacement(Value* I);

    SmallVector<Instruction*, 4> Path;
    MapVector<Value*, Value*> WorkMap;
    InstCombiner& IC;
};
} // end anonymous namespace

void PointerReplacer::findLoadAndReplace(Instruction& I) {
    for (auto U : I.users()) {
        auto* Inst = dyn_cast<Instruction>(&*U);
        if (!Inst)
            return;
        LLVM_DEBUG(dbgs() << "Found pointer user: " << *U << '\n');
        if (isa<LoadInst>(Inst)) {
            for (auto P : Path)
                replace(P);
            replace(Inst);
        }
        else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) {
            Path.push_back(Inst);
            findLoadAndReplace(*Inst);
            Path.pop_back();
        }
        else {
            return;
        }
    }
}

Value* PointerReplacer::getReplacement(Value* V) {
    auto Loc = WorkMap.find(V);
    if (Loc != WorkMap.end())
        return Loc->second;
    return nullptr;
}

void PointerReplacer::replace(Instruction* I) {
    if (getReplacement(I))
        return;

    if (auto* LT = dyn_cast<LoadInst>(I)) {
        auto* V = getReplacement(LT->getPointerOperand());
        IGC_ASSERT_MESSAGE(V, "Operand not replaced");
        auto* NewI = new LoadInst(V);
        NewI->takeName(LT);
        IC.InsertNewInstWith(NewI, *LT);
        IC.replaceInstUsesWith(*LT, NewI);
        WorkMap[LT] = NewI;
    }
    else if (auto* GEP = dyn_cast<GetElementPtrInst>(I)) {
        auto* V = getReplacement(GEP->getPointerOperand());
        IGC_ASSERT_MESSAGE(V, "Operand not replaced");
        SmallVector<Value*, 8> Indices;
        Indices.append(GEP->idx_begin(), GEP->idx_end());
        auto* NewI = GetElementPtrInst::Create(
            V->getType()->getPointerElementType(), V, Indices);
        IC.InsertNewInstWith(NewI, *GEP);
        NewI->takeName(GEP);
        WorkMap[GEP] = NewI;
    }
    else if (auto* BC = dyn_cast<BitCastInst>(I)) {
        auto* V = getReplacement(BC->getOperand(0));
        IGC_ASSERT_MESSAGE(V, "Operand not replaced");
        auto* NewT = PointerType::get(BC->getType()->getPointerElementType(),
                                      V->getType()->getPointerAddressSpace());
        auto* NewI = new BitCastInst(V, NewT);
        IC.InsertNewInstWith(NewI, *BC);
        NewI->takeName(BC);
        WorkMap[BC] = NewI;
    }
    else {
        IGC_ASSERT_EXIT_MESSAGE(0, "should never reach here");
    }
}

void PointerReplacer::replacePointer(Instruction& I, Value* V) {
    IGC_ASSERT(nullptr != V);
    IGC_ASSERT(nullptr != cast<PointerType>(I.getType()));
    IGC_ASSERT(nullptr != cast<PointerType>(V->getType()));
    IGC_ASSERT(cast<PointerType>(I.getType()) != cast<PointerType>(V->getType()));
    IGC_ASSERT(cast<PointerType>(I.getType())->getElementType() == cast<PointerType>(V->getType())->getElementType());
    WorkMap[&I] = V;
    findLoadAndReplace(I);
}

Instruction* InstCombiner::visitAllocaInst(AllocaInst& AI) {
    if (auto* I = simplifyAllocaArraySize(*this, AI))
        return I;

    if (AI.getAllocatedType()->isSized()) {
        // If the alignment is 0 (unspecified), assign it the preferred alignment.
        if (AI.getAlignment() == 0)
            AI.setAlignment(DL.getPrefTypeAlignment(AI.getAllocatedType()));

        // Move all allocas of zero-byte objects to the entry block and merge them
        // together. Note that we only do this for allocas, because malloc should
        // allocate and return a unique pointer, even for a zero byte allocation.
        if (DL.getTypeAllocSize(AI.getAllocatedType()) == 0) {
            // For a zero sized alloca there is no point in doing an array allocation.
            // This is helpful if the array size is a complicated expression not used
            // elsewhere.
            if (AI.isArrayAllocation()) {
                AI.setOperand(0, ConstantInt::get(AI.getArraySize()->getType(), 1));
                return &AI;
            }

            // Get the first instruction in the entry block.
            BasicBlock& EntryBlock = AI.getParent()->getParent()->getEntryBlock();
            Instruction* FirstInst = EntryBlock.getFirstNonPHIOrDbg();
            if (FirstInst != &AI) {
                // If the entry block doesn't start with a zero-size alloca then move
                // this one to the start of the entry block. There is no problem with
                // dominance as the array size was forced to a constant earlier already.
                AllocaInst* EntryAI = dyn_cast<AllocaInst>(FirstInst);
                if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
                    DL.getTypeAllocSize(EntryAI->getAllocatedType()) != 0) {
                    AI.moveBefore(FirstInst);
                    return &AI;
                }

                // If the alignment of the entry block alloca is 0 (unspecified),
                // assign it the preferred alignment.
                if (EntryAI->getAlignment() == 0)
                    EntryAI->setAlignment(
                        DL.getPrefTypeAlignment(EntryAI->getAllocatedType()));
                // Replace this zero-sized alloca with the one at the start of the entry
                // block after ensuring that the address will be aligned enough for both
                // types.
                unsigned MaxAlign = std::max(EntryAI->getAlignment(),
                                             AI.getAlignment());
                EntryAI->setAlignment(MaxAlign);
                if (AI.getType() != EntryAI->getType())
                    return new BitCastInst(EntryAI, AI.getType());
                return replaceInstUsesWith(AI, EntryAI);
            }
        }
    }

    if (AI.getAlignment()) {
        // Check to see if this allocation is only modified by a memcpy/memmove from
        // a constant global whose alignment is equal to or exceeds that of the
        // allocation. If this is the case, we can change all users to use
        // the constant global instead. This is commonly produced by the CFE by
        // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
        // is only subsequently read.
        SmallVector<Instruction*, 4> ToDelete;
        if (MemTransferInst* Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
            unsigned SourceAlign = getOrEnforceKnownAlignment(
                Copy->getSource(), AI.getAlignment(), DL, &AI, &AC, &DT);
            if (AI.getAlignment() <= SourceAlign &&
                isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) {
                LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
                LLVM_DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');
                for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
                    eraseInstFromFunction(*ToDelete[i]);
                Constant* TheSrc = cast<Constant>(Copy->getSource());
                auto* SrcTy = TheSrc->getType();
                auto* DestTy = PointerType::get(AI.getType()->getPointerElementType(),
                                                SrcTy->getPointerAddressSpace());
                Constant* Cast =
                    ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
                if (AI.getType()->getPointerAddressSpace() ==
                    SrcTy->getPointerAddressSpace()) {
                    Instruction* NewI = replaceInstUsesWith(AI, Cast);
                    eraseInstFromFunction(*Copy);
                    ++NumGlobalCopies;
                    return NewI;
                }
                else {
                    PointerReplacer PtrReplacer(*this);
                    PtrReplacer.replacePointer(AI, Cast);
                    ++NumGlobalCopies;
                }
            }
        }
    }

    // At last, use the generic allocation site handler to aggressively remove
    // unused allocas.
    return visitAllocSite(AI);
}

// Are we allowed to form an atomic load or store of this type?
static bool isSupportedAtomicType(Type* Ty) {
    return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy();
}

/// Helper to combine a load to a new type.
///
/// This just does the work of combining a load to a new type. It handles
/// metadata, etc., and returns the new instruction. The \c NewTy should be the
/// loaded *value* type. This will convert it to a pointer, cast the operand to
/// that pointer type, load it, etc.
///
/// Note that this will create all of the instructions with whatever insert
/// point the \c InstCombiner currently is using.
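///
/// For example (illustrative IR), retyping an i32 load as a float load:
///   %v = load i32, i32* %p
/// becomes
///   %p.cast = bitcast i32* %p to float*
///   %v = load float, float* %p.cast
/// with the original load's metadata carried over where it still applies.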
static LoadInst* combineLoadToNewType(InstCombiner& IC, LoadInst& LI, Type* NewTy,
                                      const Twine& Suffix = "") {
    IGC_ASSERT_MESSAGE((!LI.isAtomic() || isSupportedAtomicType(NewTy)), "can't fold an atomic load to requested type");

    Value* Ptr = LI.getPointerOperand();
    unsigned AS = LI.getPointerAddressSpace();
    SmallVector<std::pair<unsigned, MDNode*>, 8> MD;
    LI.getAllMetadata(MD);

    Value* NewPtr = nullptr;
    if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) &&
          NewPtr->getType()->getPointerElementType() == NewTy &&
          NewPtr->getType()->getPointerAddressSpace() == AS))
        NewPtr = IC.Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS));

    LoadInst* NewLoad = IC.Builder.CreateAlignedLoad(
        NewPtr, LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix);
    NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
    MDBuilder MDB(NewLoad->getContext());
    for (const auto& MDPair : MD) {
        unsigned ID = MDPair.first;
        MDNode* N = MDPair.second;
        // Note, essentially every kind of metadata should be preserved here! This
        // routine is supposed to clone a load instruction changing *only its type*.
        // The only metadata it makes sense to drop is metadata which is invalidated
        // when the pointer type changes. This should essentially never be the case
        // in LLVM, but we explicitly switch over only known metadata to be
        // conservatively correct. If you are adding metadata to LLVM which pertains
        // to loads, you almost certainly want to add it here.
        switch (ID) {
        case LLVMContext::MD_dbg:
        case LLVMContext::MD_tbaa:
        case LLVMContext::MD_prof:
        case LLVMContext::MD_fpmath:
        case LLVMContext::MD_tbaa_struct:
        case LLVMContext::MD_invariant_load:
        case LLVMContext::MD_alias_scope:
        case LLVMContext::MD_noalias:
        case LLVMContext::MD_nontemporal:
        case LLVMContext::MD_mem_parallel_loop_access:
            // All of these directly apply.
            NewLoad->setMetadata(ID, N);
            break;

        case LLVMContext::MD_nonnull:
            copyNonnullMetadata(LI, N, *NewLoad);
            break;
        case LLVMContext::MD_align:
        case LLVMContext::MD_dereferenceable:
        case LLVMContext::MD_dereferenceable_or_null:
            // These only directly apply if the new type is also a pointer.
            if (NewTy->isPointerTy())
                NewLoad->setMetadata(ID, N);
            break;
        case LLVMContext::MD_range:
            copyRangeMetadata(IC.getDataLayout(), LI, N, *NewLoad);
            break;
        }
    }
    return NewLoad;
}

/// Combine a store to a new type.
///
/// Returns the newly created store instruction.
static StoreInst* combineStoreToNewValue(InstCombiner& IC, StoreInst& SI, Value* V) {
    IGC_ASSERT_MESSAGE((!SI.isAtomic() || isSupportedAtomicType(V->getType())), "can't fold an atomic store of requested type");

    Value* Ptr = SI.getPointerOperand();
    unsigned AS = SI.getPointerAddressSpace();
    SmallVector<std::pair<unsigned, MDNode*>, 8> MD;
    SI.getAllMetadata(MD);

    StoreInst* NewStore = IC.Builder.CreateAlignedStore(
        V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
        SI.getAlignment(), SI.isVolatile());
    NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
    for (const auto& MDPair : MD) {
        unsigned ID = MDPair.first;
        MDNode* N = MDPair.second;
        // Note, essentially every kind of metadata should be preserved here! This
        // routine is supposed to clone a store instruction changing *only its
        // type*. The only metadata it makes sense to drop is metadata which is
        // invalidated when the pointer type changes. This should essentially
        // never be the case in LLVM, but we explicitly switch over only known
        // metadata to be conservatively correct. If you are adding metadata to
        // LLVM which pertains to stores, you almost certainly want to add it
        // here.
        switch (ID) {
        case LLVMContext::MD_dbg:
        case LLVMContext::MD_tbaa:
        case LLVMContext::MD_prof:
        case LLVMContext::MD_fpmath:
        case LLVMContext::MD_tbaa_struct:
        case LLVMContext::MD_alias_scope:
        case LLVMContext::MD_noalias:
        case LLVMContext::MD_nontemporal:
        case LLVMContext::MD_mem_parallel_loop_access:
            // All of these directly apply.
            NewStore->setMetadata(ID, N);
            break;

        case LLVMContext::MD_invariant_load:
        case LLVMContext::MD_nonnull:
        case LLVMContext::MD_range:
        case LLVMContext::MD_align:
        case LLVMContext::MD_dereferenceable:
        case LLVMContext::MD_dereferenceable_or_null:
            // These don't apply for stores.
            break;
        }
    }

    return NewStore;
}

/// Returns true if the instruction represents a minmax pattern like:
/// select ((cmp load V1, load V2), V1, V2).
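/// For example (illustrative IR; note the select is over the *pointers*):
///   %l1 = load i32, i32* %p1
///   %l2 = load i32, i32* %p2
///   %cmp = icmp slt i32 %l1, %l2
///   %min.p = select i1 %cmp, i32* %p1, i32* %p2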
static bool isMinMaxWithLoads(Value* V) {
    IGC_ASSERT_MESSAGE(V->getType()->isPointerTy(), "Expected pointer type.");
    // Ignore possible ty* to ixx* bitcast.
    V = peekThroughBitcast(V);
    // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax
    // pattern.
    CmpInst::Predicate Pred;
    Instruction* L1;
    Instruction* L2;
    Value* LHS;
    Value* RHS;
    if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)),
                           m_Value(LHS), m_Value(RHS))))
        return false;
    return (match(L1, m_Load(m_Specific(LHS))) &&
            match(L2, m_Load(m_Specific(RHS)))) ||
           (match(L1, m_Load(m_Specific(RHS))) &&
            match(L2, m_Load(m_Specific(LHS))));
}

/// Combine loads to match the type of their uses' value after looking
/// through intervening bitcasts.
///
/// The core idea here is that if the result of a load is used in an operation,
/// we should load the type most conducive to that operation. For example, when
/// loading an integer and converting that immediately to a pointer, we should
/// instead directly load a pointer.
///
/// However, this routine must never change the width of a load or the number of
/// loads as that would introduce a semantic change. This combine is expected to
/// be a semantic no-op which just allows loads to more closely model the types
/// of their consuming operations.
///
/// Currently, we also refuse to change the precise type used for an atomic load
/// or a volatile load. This is debatable, and might be reasonable to change
/// later. However, it is risky in case some backend or other part of LLVM is
/// relying on the exact type loaded to select appropriate atomic operations.
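///
/// For example (illustrative IR), a load whose only use is a no-op cast:
///   %f = load float, float* %p
///   %i = bitcast float %f to i32
/// becomes
///   %p.cast = bitcast float* %p to i32*
///   %i = load i32, i32* %p.cast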
static Instruction* combineLoadToOperationType(InstCombiner& IC, LoadInst& LI) {
    // FIXME: We could probably with some care handle both volatile and ordered
    // atomic loads here but it isn't clear that this is important.
    if (!LI.isUnordered())
        return nullptr;

    if (LI.use_empty())
        return nullptr;

    // swifterror values can't be bitcasted.
    if (LI.getPointerOperand()->isSwiftError())
        return nullptr;

    Type* Ty = LI.getType();
    const DataLayout& DL = IC.getDataLayout();

    // Try to canonicalize loads which are only ever stored to operate over
    // integers instead of any other type. We only do this when the loaded type
    // is sized and has a size exactly the same as its store size and the store
    // size is a legal integer type.
    // Do not perform canonicalization if minmax pattern is found (to avoid
    // infinite loop).
    if (!Ty->isIntegerTy() && Ty->isSized() &&
        DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) &&
        DL.getTypeStoreSizeInBits(Ty) == DL.getTypeSizeInBits(Ty) &&
        !DL.isNonIntegralPointerType(Ty) &&
        !isMinMaxWithLoads(
            peekThroughBitcast(LI.getPointerOperand(), /*OneUseOnly=*/true))) {
        if (all_of(LI.users(), [&LI](User* U) {
                auto* SI = dyn_cast<StoreInst>(U);
                return SI && SI->getPointerOperand() != &LI &&
                       !SI->getPointerOperand()->isSwiftError();
            })) {
            LoadInst* NewLoad = combineLoadToNewType(
                IC, LI,
                Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty)));
            // Replace all the stores with stores of the newly loaded value.
            for (auto UI = LI.user_begin(), UE = LI.user_end(); UI != UE;) {
                auto* SI = cast<StoreInst>(*UI++);
                IC.Builder.SetInsertPoint(SI);
                combineStoreToNewValue(IC, *SI, NewLoad);
                IC.eraseInstFromFunction(*SI);
            }
            IGC_ASSERT_MESSAGE(LI.use_empty(), "Failed to remove all users of the load!");
            // Return the old load so the combiner can delete it safely.
            return &LI;
        }
    }

    // Fold away bit casts of the loaded value by loading the desired type.
    // We can do this for BitCastInsts as well as casts from and to pointer types,
    // as long as those are noops (i.e., the source or dest type have the same
    // bitwidth as the target's pointers).
    if (LI.hasOneUse())
        if (auto* CI = dyn_cast<CastInst>(LI.user_back()))
            if (CI->isNoopCast(DL))
                if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) {
                    LoadInst* NewLoad = combineLoadToNewType(IC, LI, CI->getDestTy());
                    CI->replaceAllUsesWith(NewLoad);
                    IC.eraseInstFromFunction(*CI);
                    return &LI;
                }

    // FIXME: We should also canonicalize loads of vectors when their elements are
    // cast to other types.
    return nullptr;
}

static Instruction* unpackLoadToAggregate(InstCombiner& IC, LoadInst& LI) {
    // FIXME: We could probably with some care handle both volatile and atomic
    // stores here but it isn't clear that this is important.
    if (!LI.isSimple())
        return nullptr;

    Type* T = LI.getType();
    if (!T->isAggregateType())
        return nullptr;

    StringRef Name = LI.getName();
    IGC_ASSERT_MESSAGE(LI.getAlignment(), "Alignment must be set at this point");

    if (auto* ST = dyn_cast<StructType>(T)) {
        // If the struct only has one element, we unpack.
        auto NumElements = ST->getNumElements();
        if (NumElements == 1) {
            LoadInst* NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U),
                                                     ".unpack");
            AAMDNodes AAMD;
            LI.getAAMetadata(AAMD);
            NewLoad->setAAMetadata(AAMD);
            return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
                UndefValue::get(T), NewLoad, 0, Name));
        }

        // We don't want to break loads with padding here as we'd lose
        // the knowledge that padding exists for the rest of the pipeline.
        const DataLayout& DL = IC.getDataLayout();
        auto* SL = DL.getStructLayout(ST);
        if (SL->hasPadding())
            return nullptr;

        auto Align = LI.getAlignment();
        if (!Align)
            Align = DL.getABITypeAlignment(ST);

        auto* Addr = LI.getPointerOperand();
        auto* IdxType = Type::getInt32Ty(T->getContext());
        auto* Zero = ConstantInt::get(IdxType, 0);

        Value* V = UndefValue::get(T);
        for (unsigned i = 0; i < NumElements; i++) {
            Value* Indices[2] = {
                Zero,
                ConstantInt::get(IdxType, i),
            };
            auto* Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
                                                     Name + ".elt");
            auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
            auto* L = IC.Builder.CreateAlignedLoad(Ptr, EltAlign, Name + ".unpack");
            // Propagate AA metadata. It'll still be valid on the narrowed load.
            AAMDNodes AAMD;
            LI.getAAMetadata(AAMD);
            L->setAAMetadata(AAMD);
            V = IC.Builder.CreateInsertValue(V, L, i);
        }

        V->setName(Name);
        return IC.replaceInstUsesWith(LI, V);
    }

    if (auto* AT = dyn_cast<ArrayType>(T)) {
        auto* ET = AT->getElementType();
        auto NumElements = AT->getNumElements();
        if (NumElements == 1) {
            LoadInst* NewLoad = combineLoadToNewType(IC, LI, ET, ".unpack");
            AAMDNodes AAMD;
            LI.getAAMetadata(AAMD);
            NewLoad->setAAMetadata(AAMD);
            return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
                UndefValue::get(T), NewLoad, 0, Name));
        }

        // Bail out if the array is too large. Ideally we would like to optimize
        // arrays of arbitrary size but this has a terrible impact on compile time.
        // The threshold here is chosen arbitrarily, maybe needs a little bit of
        // tuning.
        if (NumElements > IC.MaxArraySizeForCombine)
            return nullptr;

        const DataLayout& DL = IC.getDataLayout();
        auto EltSize = DL.getTypeAllocSize(ET);
        auto Align = LI.getAlignment();
        if (!Align)
            Align = DL.getABITypeAlignment(T);

        auto* Addr = LI.getPointerOperand();
        auto* IdxType = Type::getInt64Ty(T->getContext());
        auto* Zero = ConstantInt::get(IdxType, 0);

        Value* V = UndefValue::get(T);
        uint64_t Offset = 0;
        for (uint64_t i = 0; i < NumElements; i++) {
            Value* Indices[2] = {
                Zero,
                ConstantInt::get(IdxType, i),
            };
            auto* Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
                                                     Name + ".elt");
            auto* L = IC.Builder.CreateAlignedLoad(Ptr, MinAlign(Align, Offset),
                                                   Name + ".unpack");
            AAMDNodes AAMD;
            LI.getAAMetadata(AAMD);
            L->setAAMetadata(AAMD);
            V = IC.Builder.CreateInsertValue(V, L, i);
            Offset += EltSize;
        }

        V->setName(Name);
        return IC.replaceInstUsesWith(LI, V);
    }

    return nullptr;
}

// If we can determine that all possible objects pointed to by the provided
// pointer value are, not only dereferenceable, but also definitively less than
// or equal to the provided maximum size, then return true. Otherwise, return
// false (constant global values and allocas fall into this category).
//
// FIXME: This should probably live in ValueTracking (or similar).
static bool isObjectSizeLessThanOrEq(Value* V, uint64_t MaxSize,
                                     const DataLayout& DL) {
    SmallPtrSet<Value*, 4> Visited;
    SmallVector<Value*, 4> Worklist(1, V);

    do {
        Value* P = Worklist.pop_back_val();
        P = P->stripPointerCasts();

        if (!Visited.insert(P).second)
            continue;

        if (SelectInst* SI = dyn_cast<SelectInst>(P)) {
            Worklist.push_back(SI->getTrueValue());
            Worklist.push_back(SI->getFalseValue());
            continue;
        }

        if (PHINode* PN = dyn_cast<PHINode>(P)) {
            for (Value* IncValue : PN->incoming_values())
                Worklist.push_back(IncValue);
            continue;
        }

        if (GlobalAlias* GA = dyn_cast<GlobalAlias>(P)) {
            if (GA->isInterposable())
                return false;
            Worklist.push_back(GA->getAliasee());
            continue;
        }

        // If we know how big this object is, and it is less than MaxSize, continue
        // searching. Otherwise, return false.
        if (AllocaInst* AI = dyn_cast<AllocaInst>(P)) {
            if (!AI->getAllocatedType()->isSized())
                return false;

            ConstantInt* CS = dyn_cast<ConstantInt>(AI->getArraySize());
            if (!CS)
                return false;

            uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType());
            // Make sure that, even if the multiplication below would wrap as a
            // uint64_t, we still do the right thing.
            if ((CS->getValue().zextOrSelf(128) * APInt(128, TypeSize)).ugt(MaxSize))
                return false;
            continue;
        }

        if (GlobalVariable* GV = dyn_cast<GlobalVariable>(P)) {
            if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
                return false;

            uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
            if (InitSize > MaxSize)
                return false;
            continue;
        }

        return false;
    } while (!Worklist.empty());

    return true;
}

// If we're indexing into an object of a known size, and the outer index is
// not a constant, but having any value but zero would lead to undefined
// behavior, replace it with zero.
//
// For example, if we have:
//   @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
//   ...
//   %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
//   ... = load i32* %arrayidx, align 4
// Then we know that we can replace %x in the GEP with i64 0.
//
// FIXME: We could fold any GEP index to zero that would cause UB if it were
// not zero. Currently, we only handle the first such index. Also, we could
// search through non-zero constant indices if we kept track of the offsets
// those indices implied.
static bool canReplaceGEPIdxWithZero(InstCombiner& IC, GetElementPtrInst* GEPI,
                                     Instruction* MemI, unsigned& Idx) {
    if (GEPI->getNumOperands() < 2)
        return false;

    // Find the first non-zero index of a GEP. If all indices are zero, return
    // one past the last index.
    auto FirstNZIdx = [](const GetElementPtrInst* GEPI) {
        unsigned I = 1;
        for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
            Value* V = GEPI->getOperand(I);
            if (const ConstantInt* CI = dyn_cast<ConstantInt>(V))
                if (CI->isZero())
                    continue;

            break;
        }

        return I;
    };

    // Skip through initial 'zero' indices, and find the corresponding pointer
    // type. See if the next index is not a constant.
    Idx = FirstNZIdx(GEPI);
    if (Idx == GEPI->getNumOperands())
        return false;
    if (isa<Constant>(GEPI->getOperand(Idx)))
        return false;

    SmallVector<Value*, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
    Type* AllocTy =
        GetElementPtrInst::getIndexedType(GEPI->getSourceElementType(), Ops);
    if (!AllocTy || !AllocTy->isSized())
        return false;
    const DataLayout& DL = IC.getDataLayout();
    uint64_t TyAllocSize = DL.getTypeAllocSize(AllocTy);

    // If there are more indices after the one we might replace with a zero, make
    // sure they're all non-negative. If any of them are negative, the overall
    // address being computed might be before the base address determined by the
    // first non-zero index.
    auto IsAllNonNegative = [&]() {
        for (unsigned i = Idx + 1, e = GEPI->getNumOperands(); i != e; ++i) {
            KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
            if (Known.isNonNegative())
                continue;
            return false;
        }

        return true;
    };

    // FIXME: If the GEP is not inbounds, and there are extra indices after the
    // one we'll replace, those could cause the address computation to wrap
    // (rendering the IsAllNonNegative() check below insufficient). We can do
    // better, ignoring zero indices (and other indices we can prove small
    // enough not to wrap).
    if (Idx + 1 != GEPI->getNumOperands() && !GEPI->isInBounds())
        return false;

    // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
    // also known to be dereferenceable.
    return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
           IsAllNonNegative();
}

// If we're indexing into an object with a variable index for the memory
// access, but the object has only one element, we can assume that the index
// will always be zero. If we replace the GEP, return it.
template <typename T>
static Instruction* replaceGEPIdxWithZero(InstCombiner& IC, Value* Ptr,
                                          T& MemI) {
    if (GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
        unsigned Idx;
        if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
            Instruction* NewGEPI = GEPI->clone();
            NewGEPI->setOperand(Idx,
                ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
            NewGEPI->insertBefore(GEPI);
            MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI);
            return NewGEPI;
        }
    }

    return nullptr;
}

static bool canSimplifyNullStoreOrGEP(StoreInst& SI) {
    if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()))
        return false;

    auto* Ptr = SI.getPointerOperand();
    if (GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(Ptr))
        Ptr = GEPI->getOperand(0);
    return (isa<ConstantPointerNull>(Ptr) &&
            !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()));
}

static bool canSimplifyNullLoadOrGEP(LoadInst& LI, Value* Op) {
    if (GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(Op)) {
        const Value* GEPI0 = GEPI->getOperand(0);
        if (isa<ConstantPointerNull>(GEPI0) &&
            !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace()))
            return true;
    }
    if (isa<UndefValue>(Op) ||
        (isa<ConstantPointerNull>(Op) &&
         !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace())))
        return true;
    return false;
}

Instruction* InstCombiner::visitLoadInst(LoadInst& LI) {
    Value* Op = LI.getOperand(0);

    // Try to canonicalize the loaded type.
    if (Instruction* Res = combineLoadToOperationType(*this, LI))
        return Res;

    // Attempt to improve the alignment.
    unsigned KnownAlign = getOrEnforceKnownAlignment(
        Op, DL.getPrefTypeAlignment(LI.getType()), DL, &LI, &AC, &DT);
    unsigned LoadAlign = LI.getAlignment();
    unsigned EffectiveLoadAlign =
        LoadAlign != 0 ? LoadAlign : DL.getABITypeAlignment(LI.getType());

    if (KnownAlign > EffectiveLoadAlign)
        LI.setAlignment(KnownAlign);
    else if (LoadAlign == 0)
        LI.setAlignment(EffectiveLoadAlign);

    // Replace GEP indices if possible.
    if (Instruction* NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) {
        Worklist.Add(NewGEPI);
        return &LI;
    }

    if (Instruction* Res = unpackLoadToAggregate(*this, LI))
        return Res;

    // Do really simple store-to-load forwarding and load CSE, to catch cases
    // where there are several consecutive memory accesses to the same location,
    // separated by a few arithmetic operations.
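    //
    // For example (illustrative IR):
    //   store i32 %x, i32* %p
    //   %y = add i32 %a, %b         ; unrelated arithmetic
    //   %v = load i32, i32* %p      ; forwarded: %v is replaced by %x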
    BasicBlock::iterator BBI(LI);
    bool IsLoadCSE = false;
    if (Value* AvailableVal = FindAvailableLoadedValue(
            &LI, LI.getParent(), BBI, DefMaxInstsToScan, AA, &IsLoadCSE)) {
        if (IsLoadCSE)
            combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI);

        return replaceInstUsesWith(
            LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
                                               LI.getName() + ".cast"));
    }

    // None of the following transforms are legal for volatile/ordered atomic
    // loads. Most of them do apply for unordered atomics.
    if (!LI.isUnordered()) return nullptr;

    // load(gep null, ...) -> unreachable
    // load null/undef -> unreachable
    // TODO: Consider a target hook for valid address spaces for this xform.
    if (canSimplifyNullLoadOrGEP(LI, Op)) {
        // Insert a new store to null instruction before the load to indicate
        // that this code is not reachable. We do this instead of inserting
        // an unreachable instruction directly because we cannot modify the
        // CFG.
        StoreInst* SI = new StoreInst(UndefValue::get(LI.getType()),
                                      Constant::getNullValue(Op->getType()), &LI);
        SI->setDebugLoc(LI.getDebugLoc());
        return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
    }

    if (Op->hasOneUse()) {
        // Change select and PHI nodes to select values instead of addresses: this
        // helps alias analysis out a lot, allows many other simplifications, and
        // exposes redundancy in the code.
        //
        // Note that we cannot do the transformation unless we know that the
        // introduced loads cannot trap! Something like this is valid as long as
        // the condition is always false: load (select bool %C, int* null, int* %G),
        // but it would not be valid if we transformed it to load from null
        // unconditionally.
        //
        if (SelectInst* SI = dyn_cast<SelectInst>(Op)) {
            // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
            unsigned Align = LI.getAlignment();
            if (isSafeToLoadUnconditionally(SI->getOperand(1), Align, DL, SI) &&
                isSafeToLoadUnconditionally(SI->getOperand(2), Align, DL, SI)) {
                LoadInst* V1 = Builder.CreateLoad(SI->getOperand(1),
                                                  SI->getOperand(1)->getName() + ".val");
                LoadInst* V2 = Builder.CreateLoad(SI->getOperand(2),
                                                  SI->getOperand(2)->getName() + ".val");
                IGC_ASSERT_MESSAGE(LI.isUnordered(), "implied by above");
                V1->setAlignment(Align);
                V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
                V2->setAlignment(Align);
                V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
                return SelectInst::Create(SI->getCondition(), V1, V2);
            }

            // load (select (cond, null, P)) -> load P
            if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
                !NullPointerIsDefined(SI->getFunction(),
                                      LI.getPointerAddressSpace())) {
                LI.setOperand(0, SI->getOperand(2));
                return &LI;
            }

            // load (select (cond, P, null)) -> load P
            if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
                !NullPointerIsDefined(SI->getFunction(),
                                      LI.getPointerAddressSpace())) {
                LI.setOperand(0, SI->getOperand(1));
                return &LI;
            }
        }
    }
    return nullptr;
}

/// Look for extractelement/insertvalue sequence that acts like a bitcast.
///
/// \returns underlying value that was "cast", or nullptr otherwise.
///
/// For example, if we have:
///
///   %E0 = extractelement <2 x double> %U, i32 0
///   %V0 = insertvalue [2 x double] undef, double %E0, 0
///   %E1 = extractelement <2 x double> %U, i32 1
///   %V1 = insertvalue [2 x double] %V0, double %E1, 1
///
/// and the layout of a <2 x double> is isomorphic to a [2 x double],
/// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
/// Note that %U may contain non-undef values where %V1 has undef.
static Value* likeBitCastFromVector(InstCombiner& IC, Value* V) {
    Value* U = nullptr;
    while (auto* IV = dyn_cast<InsertValueInst>(V)) {
        auto* E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
        if (!E)
            return nullptr;
        auto* W = E->getVectorOperand();
        if (!U)
            U = W;
        else if (U != W)
            return nullptr;
        auto* CI = dyn_cast<ConstantInt>(E->getIndexOperand());
        if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
            return nullptr;
        V = IV->getAggregateOperand();
    }
    if (!isa<UndefValue>(V) || !U)
        return nullptr;

    auto* UT = cast<VectorType>(U->getType());
    auto* VT = V->getType();
    // Check that types UT and VT are bitwise isomorphic.
    const auto& DL = IC.getDataLayout();
    if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
        return nullptr;
    }
    if (auto* AT = dyn_cast<ArrayType>(VT)) {
        if (AT->getNumElements() != UT->getNumElements())
            return nullptr;
    }
    else {
        auto* ST = cast<StructType>(VT);
        if (ST->getNumElements() != UT->getNumElements())
            return nullptr;
        for (const auto* EltT : ST->elements()) {
            if (EltT != UT->getElementType())
                return nullptr;
        }
    }
    return U;
}

/// Combine stores to match the type of value being stored.
///
/// The core idea here is that the memory does not have any intrinsic type and
/// where we can we should match the type of a store to the type of value being
/// stored.
///
/// However, this routine must never change the width of a store or the number of
/// stores as that would introduce a semantic change. This combine is expected to
/// be a semantic no-op which just allows stores to more closely model the types
/// of their incoming values.
///
/// Currently, we also refuse to change the precise type used for an atomic or
/// volatile store. This is debatable, and might be reasonable to change later.
/// However, it is risky in case some backend or other part of LLVM is relying
/// on the exact type stored to select appropriate atomic operations.
///
/// \returns true if the store was successfully combined away. This indicates
/// the caller must erase the store instruction. We have to let the caller erase
/// the store instruction as otherwise there is no way to signal whether it was
/// combined or not: IC.EraseInstFromFunction returns a null pointer.
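///
/// For example (illustrative IR), storing a value through a bitcast:
///   %i = bitcast float %f to i32
///   store i32 %i, i32* %p
/// becomes
///   %p.cast = bitcast i32* %p to float*
///   store float %f, float* %p.cast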
static bool combineStoreToValueType(InstCombiner& IC, StoreInst& SI) {
    // FIXME: We could probably with some care handle both volatile and ordered
    // atomic stores here but it isn't clear that this is important.
    if (!SI.isUnordered())
        return false;

    // swifterror values can't be bitcasted.
    if (SI.getPointerOperand()->isSwiftError())
        return false;

    Value* V = SI.getValueOperand();

    // Fold away bit casts of the stored value by storing the original type.
    if (auto* BC = dyn_cast<BitCastInst>(V)) {
        V = BC->getOperand(0);
        if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
            combineStoreToNewValue(IC, SI, V);
            return true;
        }
    }

    if (Value* U = likeBitCastFromVector(IC, V))
        if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
            combineStoreToNewValue(IC, SI, U);
            return true;
        }

    // FIXME: We should also canonicalize stores of vectors when their elements
    // are cast to other types.
    return false;
}

static bool unpackStoreToAggregate(InstCombiner& IC, StoreInst& SI) {
    // FIXME: We could probably with some care handle both volatile and atomic
    // stores here but it isn't clear that this is important.
    if (!SI.isSimple())
        return false;

    Value* V = SI.getValueOperand();
    Type* T = V->getType();

    if (!T->isAggregateType())
        return false;

    if (auto* ST = dyn_cast<StructType>(T)) {
        // If the struct only has one element, we unpack.
        unsigned Count = ST->getNumElements();
        if (Count == 1) {
            V = IC.Builder.CreateExtractValue(V, 0);
            combineStoreToNewValue(IC, SI, V);
            return true;
        }

        // We don't want to break loads with padding here as we'd lose
        // the knowledge that padding exists for the rest of the pipeline.
        const DataLayout& DL = IC.getDataLayout();
        auto* SL = DL.getStructLayout(ST);
        if (SL->hasPadding())
            return false;

        auto Align = SI.getAlignment();
        if (!Align)
            Align = DL.getABITypeAlignment(ST);

        SmallString<16> EltName = V->getName();
        EltName += ".elt";
        auto* Addr = SI.getPointerOperand();
        SmallString<16> AddrName = Addr->getName();
        AddrName += ".repack";

        auto* IdxType = Type::getInt32Ty(ST->getContext());
        auto* Zero = ConstantInt::get(IdxType, 0);
        for (unsigned i = 0; i < Count; i++) {
            Value* Indices[2] = {
                Zero,
                ConstantInt::get(IdxType, i),
            };
            auto* Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
                                                     AddrName);
            auto* Val = IC.Builder.CreateExtractValue(V, i, EltName);
            auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
            llvm::Instruction* NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
            AAMDNodes AAMD;
            SI.getAAMetadata(AAMD);
            NS->setAAMetadata(AAMD);
        }

        return true;
    }

    if (auto* AT = dyn_cast<ArrayType>(T)) {
        // If the array only has one element, we unpack.
        auto NumElements = AT->getNumElements();
        if (NumElements == 1) {
            V = IC.Builder.CreateExtractValue(V, 0);
            combineStoreToNewValue(IC, SI, V);
            return true;
        }

        // Bail out if the array is too large. Ideally we would like to optimize
        // arrays of arbitrary size but this has a terrible impact on compile time.
        // The threshold here is chosen arbitrarily, maybe needs a little bit of
        // tuning.
        if (NumElements > IC.MaxArraySizeForCombine)
            return false;

        const DataLayout& DL = IC.getDataLayout();
        auto EltSize = DL.getTypeAllocSize(AT->getElementType());
        auto Align = SI.getAlignment();
        if (!Align)
            Align = DL.getABITypeAlignment(T);

        SmallString<16> EltName = V->getName();
        EltName += ".elt";
        auto* Addr = SI.getPointerOperand();
        SmallString<16> AddrName = Addr->getName();
        AddrName += ".repack";

        auto* IdxType = Type::getInt64Ty(T->getContext());
        auto* Zero = ConstantInt::get(IdxType, 0);

        uint64_t Offset = 0;
        for (uint64_t i = 0; i < NumElements; i++) {
            Value* Indices[2] = {
                Zero,
                ConstantInt::get(IdxType, i),
            };
            auto* Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
                                                     AddrName);
            auto* Val = IC.Builder.CreateExtractValue(V, i, EltName);
            auto EltAlign = MinAlign(Align, Offset);
            Instruction* NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
            AAMDNodes AAMD;
            SI.getAAMetadata(AAMD);
            NS->setAAMetadata(AAMD);
            Offset += EltSize;
        }

        return true;
    }

    return false;
}

/// equivalentAddressValues - Test if A and B will obviously have the same
/// value. This includes recognizing that %t0 and %t1 will have the same
/// value in code like this:
///   %t0 = getelementptr \@a, 0, 3
///   store i32 0, i32* %t0
///   %t1 = getelementptr \@a, 0, 3
///   %t2 = load i32* %t1
///
static bool equivalentAddressValues(Value* A, Value* B) {
    // Test if the values are trivially equivalent.
    if (A == B) return true;

    // Test if the values come from identical arithmetic instructions.
    // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
    // it's only used to compare two uses within the same basic block, which
    // means that they'll always either have the same value or one of them
    // will have an undefined value.
    if (isa<BinaryOperator>(A) ||
        isa<CastInst>(A) ||
        isa<PHINode>(A) ||
        isa<GetElementPtrInst>(A))
        if (Instruction* BI = dyn_cast<Instruction>(B))
            if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
                return true;

    // Otherwise they may not be equivalent.
    return false;
}

/// Converts store (bitcast (load (bitcast (select ...)))) to
/// store (load (select ...)), where select is minmax:
/// select ((cmp load V1, load V2), V1, V2).
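///
/// For example (illustrative IR), a float minmax that was rewritten over i32:
///   %sel = select i1 %cmp, float* %p1, float* %p2
///   %sel.i = bitcast float* %sel to i32*
///   %v = load i32, i32* %sel.i
///   %dst.i = bitcast float* %dst to i32*
///   store i32 %v, i32* %dst.i
/// is turned back into a float-typed load of %sel and float-typed stores.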
static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner& IC,
                                                StoreInst& SI) {
    // bitcast?
    if (!match(SI.getPointerOperand(), m_BitCast(m_Value())))
        return false;
    // load? integer?
    Value* LoadAddr;
    if (!match(SI.getValueOperand(), m_Load(m_BitCast(m_Value(LoadAddr)))))
        return false;
    auto* LI = cast<LoadInst>(SI.getValueOperand());
    if (!LI->getType()->isIntegerTy())
        return false;
    if (!isMinMaxWithLoads(LoadAddr))
        return false;

    if (!all_of(LI->users(), [LI, LoadAddr](User* U) {
            auto* SI = dyn_cast<StoreInst>(U);
            return SI && SI->getPointerOperand() != LI &&
                   peekThroughBitcast(SI->getPointerOperand()) != LoadAddr &&
                   !SI->getPointerOperand()->isSwiftError();
        }))
        return false;

    IC.Builder.SetInsertPoint(LI);
    LoadInst* NewLI = combineLoadToNewType(
        IC, *LI, LoadAddr->getType()->getPointerElementType());
    // Replace all the stores with stores of the newly loaded value.
    for (auto* UI : LI->users()) {
        auto* USI = cast<StoreInst>(UI);
        IC.Builder.SetInsertPoint(USI);
        combineStoreToNewValue(IC, *USI, NewLI);
    }
    IC.replaceInstUsesWith(*LI, UndefValue::get(LI->getType()));
    IC.eraseInstFromFunction(*LI);
    return true;
}

visitStoreInst(StoreInst & SI)1400 Instruction* InstCombiner::visitStoreInst(StoreInst& SI) {
1401 Value* Val = SI.getOperand(0);
1402 Value* Ptr = SI.getOperand(1);
1403
1404 // Try to canonicalize the stored type.
1405 if (combineStoreToValueType(*this, SI))
1406 return eraseInstFromFunction(SI);
1407
1408 // Attempt to improve the alignment.
1409 unsigned KnownAlign = getOrEnforceKnownAlignment(
1410 Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT);
1411 unsigned StoreAlign = SI.getAlignment();
1412 unsigned EffectiveStoreAlign =
1413 StoreAlign != 0 ? StoreAlign : DL.getABITypeAlignment(Val->getType());
1414
1415 if (KnownAlign > EffectiveStoreAlign)
1416 SI.setAlignment(KnownAlign);
1417 else if (StoreAlign == 0)
1418 SI.setAlignment(EffectiveStoreAlign);
1419
1420 // Try to canonicalize the stored type.
1421 if (unpackStoreToAggregate(*this, SI))
1422 return eraseInstFromFunction(SI);
1423
1424 if (removeBitcastsFromLoadStoreOnMinMax(*this, SI))
1425 return eraseInstFromFunction(SI);
1426
1427 // Replace GEP indices if possible.
1428 if (Instruction * NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
1429 Worklist.Add(NewGEPI);
1430 return &SI;
1431 }
1432
1433 // Don't hack volatile/ordered stores.
1434 // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
1435 if (!SI.isUnordered()) return nullptr;
1436
1437 // If the RHS is an alloca with a single use, zapify the store, making the
1438 // alloca dead.
1439 if (Ptr->hasOneUse()) {
1440 if (isa<AllocaInst>(Ptr))
1441 return eraseInstFromFunction(SI);
1442 if (GetElementPtrInst * GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1443 if (isa<AllocaInst>(GEP->getOperand(0))) {
1444 if (GEP->getOperand(0)->hasOneUse())
1445 return eraseInstFromFunction(SI);
1446 }
1447 }
1448 }
1449
1450 // Do really simple DSE, to catch cases where there are several consecutive
1451 // stores to the same location, separated by a few arithmetic operations. This
1452 // situation often occurs with bitfield accesses.
    BasicBlock::iterator BBI(SI);
    for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
         --ScanInsts) {
        --BBI;
        // Don't count debug info directives, lest they affect codegen;
        // likewise skip pointer-to-pointer bitcasts, which are no-ops.
        if (isa<DbgInfoIntrinsic>(BBI) ||
            (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
            ScanInsts++;
            continue;
        }

        if (StoreInst* PrevSI = dyn_cast<StoreInst>(BBI)) {
            // Is the previous store unordered and to the same location?
            if (PrevSI->isUnordered() &&
                equivalentAddressValues(PrevSI->getOperand(1), SI.getOperand(1))) {
                ++NumDeadStore;
                ++BBI;
                eraseInstFromFunction(*PrevSI);
                continue;
            }
            break;
        }

        // If this is a load, we have to stop. However, if the load is from the
        // pointer we're storing to and produces the value we're storing, then
        // *this* store is dead (X = load P; store X -> P).
        if (LoadInst* LI = dyn_cast<LoadInst>(BBI)) {
            if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) {
                IGC_ASSERT_MESSAGE(SI.isUnordered(), "can't eliminate ordering operation");
                return eraseInstFromFunction(SI);
            }

            // Otherwise, this is a load from some other location. Stores before
            // it may not be dead.
            break;
        }

        // Don't skip over loads, throws or things that can modify memory.
        if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
            break;
    }

    // store X, null -> turns into 'unreachable' in SimplifyCFG
    // store X, GEP(null, Y) -> turns into 'unreachable' in SimplifyCFG
    if (canSimplifyNullStoreOrGEP(SI)) {
        if (!isa<UndefValue>(Val)) {
            SI.setOperand(0, UndefValue::get(Val->getType()));
            if (Instruction* U = dyn_cast<Instruction>(Val))
                Worklist.Add(U); // Dropped a use.
        }
        return nullptr; // Do not modify these!
    }

    // store undef, Ptr -> noop
    if (isa<UndefValue>(Val))
        return eraseInstFromFunction(SI);

    // If this store is the last instruction in the basic block (possibly
    // excepting debug info instructions), and if the block ends with an
    // unconditional branch, try to move the store into the successor block.
    BBI = SI.getIterator();
    do {
        ++BBI;
    } while (isa<DbgInfoIntrinsic>(BBI) ||
             (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy()));
    if (BranchInst* BI = dyn_cast<BranchInst>(BBI))
        if (BI->isUnconditional())
            if (SimplifyStoreAtEndOfBlock(SI))
                return nullptr; // xform done!

    return nullptr;
}

/// SimplifyStoreAtEndOfBlock - Turn things like:
///   if () { *P = v1; } else { *P = v2 }
/// into a phi node with a store in the successor.
///
/// Simplify things like:
///   *P = v1; if () { *P = v2; }
/// into a phi node with a store in the successor.
///
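/// An illustrative IR sketch of the first form (block and value names are
/// hypothetical):
///   then:
///     store i32 %v1, i32* %P
///     br label %merge
///   else:
///     store i32 %v2, i32* %P
///     br label %merge
/// becomes:
///   merge:
///     %storemerge = phi i32 [ %v1, %then ], [ %v2, %else ]
///     store i32 %storemerge, i32* %P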
bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst& SI) {
    IGC_ASSERT_MESSAGE(SI.isUnordered(), "this code has not been audited for the volatile or ordered store case");

    BasicBlock* StoreBB = SI.getParent();

    // Check to see if the successor block has exactly two incoming edges. If
    // so, see if the other predecessor contains a store to the same location.
    // If so, insert a PHI node (if needed) and move the stores down.
    BasicBlock* DestBB = StoreBB->getTerminator()->getSuccessor(0);

    // Determine whether Dest has exactly two predecessors and, if so, compute
    // the other predecessor.
    pred_iterator PI = pred_begin(DestBB);
    BasicBlock* P = *PI;
    BasicBlock* OtherBB = nullptr;

    if (P != StoreBB)
        OtherBB = P;

    if (++PI == pred_end(DestBB))
        return false;

    P = *PI;
    if (P != StoreBB) {
        if (OtherBB)
            return false;
        OtherBB = P;
    }
    if (++PI != pred_end(DestBB))
        return false;

    // Bail out if all the relevant blocks aren't distinct (this can happen,
    // for example, if SI is in an infinite loop).
    if (StoreBB == DestBB || OtherBB == DestBB)
        return false;

    // Verify that the other block ends in a branch and is not otherwise empty.
    BasicBlock::iterator BBI(OtherBB->getTerminator());
    BranchInst* OtherBr = dyn_cast<BranchInst>(BBI);
    if (!OtherBr || BBI == OtherBB->begin())
        return false;

    // If the other block ends in an unconditional branch, check for the 'if
    // then else' case: there is an instruction before the branch.
    StoreInst* OtherStore = nullptr;
    if (OtherBr->isUnconditional()) {
        --BBI;
        // Skip over debugging info.
        while (isa<DbgInfoIntrinsic>(BBI) ||
               (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
            if (BBI == OtherBB->begin())
                return false;
            --BBI;
        }
        // If this isn't a store, isn't a store to the same location, or is not
        // the right kind of store, bail out.
        OtherStore = dyn_cast<StoreInst>(BBI);
        if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
            !SI.isSameOperationAs(OtherStore))
            return false;
    }
    else {
        // Otherwise, the other block ended with a conditional branch. If one of
        // the destinations is StoreBB, then we have the if/then case.
        if (OtherBr->getSuccessor(0) != StoreBB &&
            OtherBr->getSuccessor(1) != StoreBB)
            return false;

        // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
        // if/then triangle. See if there is a store to the same ptr as SI that
        // lives in OtherBB.
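        // For example (illustrative IR), OtherBB might end with:
        //   store i32 %v2, i32* %P
        //   br i1 %c, label %StoreBB, label %DestBB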
        for (;; --BBI) {
            // Check to see if we find the matching store.
            if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
                if (OtherStore->getOperand(1) != SI.getOperand(1) ||
                    !SI.isSameOperationAs(OtherStore))
                    return false;
                break;
            }
            // If we find something that may be using or overwriting the stored
            // value, or if we run out of instructions, we can't do the xform.
            if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
                BBI->mayWriteToMemory() || BBI == OtherBB->begin())
                return false;
        }

        // In order to eliminate the store in OtherBB, we have to make sure
        // nothing reads or overwrites the stored value in StoreBB.
        for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
            // FIXME: This should really be AA driven.
            if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
                return false;
        }
    }

    // Insert a PHI node now if we need it.
    Value* MergedVal = OtherStore->getOperand(0);
    if (MergedVal != SI.getOperand(0)) {
        PHINode* PN = PHINode::Create(MergedVal->getType(), 2, "storemerge");
        PN->addIncoming(SI.getOperand(0), SI.getParent());
        PN->addIncoming(OtherStore->getOperand(0), OtherBB);
        MergedVal = InsertNewInstBefore(PN, DestBB->front());
    }

    // Advance to a place where it is safe to insert the new store and
    // insert it.
    BBI = DestBB->getFirstInsertionPt();
    StoreInst* NewSI = new StoreInst(MergedVal, SI.getOperand(1),
                                     SI.isVolatile(),
                                     SI.getAlignment(),
                                     SI.getOrdering(),
                                     SI.getSyncScopeID());
    InsertNewInstBefore(NewSI, *BBI);
    // The debug locations of the original instructions might differ; merge them.
    NewSI->applyMergedLocation(SI.getDebugLoc(), OtherStore->getDebugLoc());

    // If the two stores had AA tags, merge them.
    AAMDNodes AATags;
    SI.getAAMetadata(AATags);
    if (AATags) {
        OtherStore->getAAMetadata(AATags, /* Merge = */ true);
        NewSI->setAAMetadata(AATags);
    }

    // Nuke the old stores.
    eraseInstFromFunction(SI);
    eraseInstFromFunction(*OtherStore);
    return true;
}
#include "common/LLVMWarningsPop.hpp"