1 //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains an optimization for div and rem on architectures that
10 // execute short instructions significantly faster than longer instructions.
11 // For example, on Intel Atom 32-bit divides are slow enough that during
12 // runtime it is profitable to check the value of the operands, and if they are
13 // positive and less than 256 use an unsigned 8-bit divide.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
18 #include "llvm/ADT/DenseMap.h"
19 #include "llvm/ADT/None.h"
20 #include "llvm/ADT/Optional.h"
21 #include "llvm/ADT/STLExtras.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/Transforms/Utils/Local.h"
24 #include "llvm/Analysis/ValueTracking.h"
25 #include "llvm/IR/BasicBlock.h"
26 #include "llvm/IR/Constants.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/Function.h"
29 #include "llvm/IR/IRBuilder.h"
30 #include "llvm/IR/Instruction.h"
31 #include "llvm/IR/Instructions.h"
32 #include "llvm/IR/Module.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/IR/Value.h"
35 #include "llvm/Support/Casting.h"
36 #include "llvm/Support/KnownBits.h"
37 #include <cassert>
38 #include <cstdint>
39 
40 using namespace llvm;
41 
42 #define DEBUG_TYPE "bypass-slow-division"
43 
44 namespace {
45 
46   struct QuotRemPair {
47     Value *Quotient;
48     Value *Remainder;
49 
50     QuotRemPair(Value *InQuotient, Value *InRemainder)
51         : Quotient(InQuotient), Remainder(InRemainder) {}
52   };
53 
54   /// A quotient and remainder, plus a BB from which they logically "originate".
55   /// If you use Quotient or Remainder in a Phi node, you should use BB as its
56   /// corresponding predecessor.
57   struct QuotRemWithBB {
58     BasicBlock *BB = nullptr;
59     Value *Quotient = nullptr;
60     Value *Remainder = nullptr;
61   };
62 
63 using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
64 using BypassWidthsTy = DenseMap<unsigned, unsigned>;
65 using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
66 
67 enum ValueRange {
68   /// Operand definitely fits into BypassType. No runtime checks are needed.
69   VALRNG_KNOWN_SHORT,
70   /// A runtime check is required, as value range is unknown.
71   VALRNG_UNKNOWN,
72   /// Operand is unlikely to fit into BypassType. The bypassing should be
73   /// disabled.
74   VALRNG_LIKELY_LONG
75 };
76 
77 class FastDivInsertionTask {
78   bool IsValidTask = false;
79   Instruction *SlowDivOrRem = nullptr;
80   IntegerType *BypassType = nullptr;
81   BasicBlock *MainBB = nullptr;
82 
83   bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
84   ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
85   QuotRemWithBB createSlowBB(BasicBlock *Successor);
86   QuotRemWithBB createFastBB(BasicBlock *Successor);
87   QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
88                                    BasicBlock *PhiBB);
89   Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
90   Optional<QuotRemPair> insertFastDivAndRem();
91 
92   bool isSignedOp() {
93     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
94            SlowDivOrRem->getOpcode() == Instruction::SRem;
95   }
96 
97   bool isDivisionOp() {
98     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
99            SlowDivOrRem->getOpcode() == Instruction::UDiv;
100   }
101 
102   Type *getSlowType() { return SlowDivOrRem->getType(); }
103 
104 public:
105   FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
106 
107   Value *getReplacement(DivCacheTy &Cache);
108 };
109 
110 } // end anonymous namespace
111 
112 FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
113                                            const BypassWidthsTy &BypassWidths) {
114   switch (I->getOpcode()) {
115   case Instruction::UDiv:
116   case Instruction::SDiv:
117   case Instruction::URem:
118   case Instruction::SRem:
119     SlowDivOrRem = I;
120     break;
121   default:
122     // I is not a div/rem operation.
123     return;
124   }
125 
126   // Skip division on vector types. Only optimize integer instructions.
127   IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
128   if (!SlowType)
129     return;
130 
131   // Skip if this bitwidth is not bypassed.
132   auto BI = BypassWidths.find(SlowType->getBitWidth());
133   if (BI == BypassWidths.end())
134     return;
135 
136   // Get type for div/rem instruction with bypass bitwidth.
137   IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
138   BypassType = BT;
139 
140   // The original basic block.
141   MainBB = I->getParent();
142 
143   // The instruction is indeed a slow div or rem operation.
144   IsValidTask = true;
145 }
146 
147 /// Reuses previously-computed dividend or remainder from the current BB if
148 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to
149 /// perform the optimization and caches the resulting dividend and remainder.
150 /// If no replacement can be generated, nullptr is returned.
151 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
152   // First, make sure that the task is valid.
153   if (!IsValidTask)
154     return nullptr;
155 
156   // Then, look for a value in Cache.
157   Value *Dividend = SlowDivOrRem->getOperand(0);
158   Value *Divisor = SlowDivOrRem->getOperand(1);
159   DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
160   auto CacheI = Cache.find(Key);
161 
162   if (CacheI == Cache.end()) {
163     // If previous instance does not exist, try to insert fast div.
164     Optional<QuotRemPair> OptResult = insertFastDivAndRem();
165     // Bail out if insertFastDivAndRem has failed.
166     if (!OptResult)
167       return nullptr;
168     CacheI = Cache.insert({Key, *OptResult}).first;
169   }
170 
171   QuotRemPair &Value = CacheI->second;
172   return isDivisionOp() ? Value.Quotient : Value.Remainder;
173 }
174 
175 /// Check if a value looks like a hash.
176 ///
177 /// The routine is expected to detect values computed using the most common hash
178 /// algorithms. Typically, hash computations end with one of the following
179 /// instructions:
180 ///
181 /// 1) MUL with a constant wider than BypassType
182 /// 2) XOR instruction
183 ///
184 /// And even if we are wrong and the value is not a hash, it is still quite
185 /// unlikely that such values will fit into BypassType.
186 ///
187 /// To detect string hash algorithms like FNV we have to look through PHI-nodes.
188 /// It is implemented as a depth-first search for values that look neither long
189 /// nor hash-like.
190 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
191   Instruction *I = dyn_cast<Instruction>(V);
192   if (!I)
193     return false;
194 
195   switch (I->getOpcode()) {
196   case Instruction::Xor:
197     return true;
198   case Instruction::Mul: {
199     // After Constant Hoisting pass, long constants may be represented as
200     // bitcast instructions. As a result, some constants may look like an
201     // instruction at first, and an additional check is necessary to find out if
202     // an operand is actually a constant.
203     Value *Op1 = I->getOperand(1);
204     ConstantInt *C = dyn_cast<ConstantInt>(Op1);
205     if (!C && isa<BitCastInst>(Op1))
206       C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
207     return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
208   }
209   case Instruction::PHI:
210     // Stop IR traversal in case of a crazy input code. This limits recursion
211     // depth.
212     if (Visited.size() >= 16)
213       return false;
214     // Do not visit nodes that have been visited already. We return true because
215     // it means that we couldn't find any value that doesn't look hash-like.
216     if (!Visited.insert(I).second)
217       return true;
218     return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
219       // Ignore undef values as they probably don't affect the division
220       // operands.
221       return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
222              isa<UndefValue>(V);
223     });
224   default:
225     return false;
226   }
227 }
228 
229 /// Check if an integer value fits into our bypass type.
230 ValueRange FastDivInsertionTask::getValueRange(Value *V,
231                                                VisitedSetTy &Visited) {
232   unsigned ShortLen = BypassType->getBitWidth();
233   unsigned LongLen = V->getType()->getIntegerBitWidth();
234 
235   assert(LongLen > ShortLen && "Value type must be wider than BypassType");
236   unsigned HiBits = LongLen - ShortLen;
237 
238   const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
239   KnownBits Known(LongLen);
240 
241   computeKnownBits(V, Known, DL);
242 
243   if (Known.countMinLeadingZeros() >= HiBits)
244     return VALRNG_KNOWN_SHORT;
245 
246   if (Known.countMaxLeadingZeros() < HiBits)
247     return VALRNG_LIKELY_LONG;
248 
249   // Long integer divisions are often used in hashtable implementations. It's
250   // not worth bypassing such divisions because hash values are extremely
251   // unlikely to have enough leading zeros. The call below tries to detect
252   // values that are unlikely to fit BypassType (including hashes).
253   if (isHashLikeValue(V, Visited))
254     return VALRNG_LIKELY_LONG;
255 
256   return VALRNG_UNKNOWN;
257 }
258 
259 /// Add new basic block for slow div and rem operations and put it before
260 /// SuccessorBB.
261 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
262   QuotRemWithBB DivRemPair;
263   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
264                                      MainBB->getParent(), SuccessorBB);
265   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
266   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
267 
268   Value *Dividend = SlowDivOrRem->getOperand(0);
269   Value *Divisor = SlowDivOrRem->getOperand(1);
270 
271   if (isSignedOp()) {
272     DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
273     DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
274   } else {
275     DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
276     DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
277   }
278 
279   Builder.CreateBr(SuccessorBB);
280   return DivRemPair;
281 }
282 
283 /// Add new basic block for fast div and rem operations and put it before
284 /// SuccessorBB.
285 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
286   QuotRemWithBB DivRemPair;
287   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
288                                      MainBB->getParent(), SuccessorBB);
289   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
290   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
291 
292   Value *Dividend = SlowDivOrRem->getOperand(0);
293   Value *Divisor = SlowDivOrRem->getOperand(1);
294   Value *ShortDivisorV =
295       Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
296   Value *ShortDividendV =
297       Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
298 
299   // udiv/urem because this optimization only handles positive numbers.
300   Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
301   Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
302   DivRemPair.Quotient =
303       Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
304   DivRemPair.Remainder =
305       Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
306   Builder.CreateBr(SuccessorBB);
307 
308   return DivRemPair;
309 }
310 
311 /// Creates Phi nodes for result of Div and Rem.
312 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
313                                                        QuotRemWithBB &RHS,
314                                                        BasicBlock *PhiBB) {
315   IRBuilder<> Builder(PhiBB, PhiBB->begin());
316   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
317   PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
318   QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
319   QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
320   PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
321   RemPhi->addIncoming(LHS.Remainder, LHS.BB);
322   RemPhi->addIncoming(RHS.Remainder, RHS.BB);
323   return QuotRemPair(QuoPhi, RemPhi);
324 }
325 
326 /// Creates a runtime check to test whether both the divisor and dividend fit
327 /// into BypassType. The check is inserted at the end of MainBB. True return
328 /// value means that the operands fit. Either of the operands may be NULL if it
329 /// doesn't need a runtime check.
330 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
331   assert((Op1 || Op2) && "Nothing to check");
332   IRBuilder<> Builder(MainBB, MainBB->end());
333   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
334 
335   Value *OrV;
336   if (Op1 && Op2)
337     OrV = Builder.CreateOr(Op1, Op2);
338   else
339     OrV = Op1 ? Op1 : Op2;
340 
341   // BitMask is inverted to check if the operands are
342   // larger than the bypass type
343   uint64_t BitMask = ~BypassType->getBitMask();
344   Value *AndV = Builder.CreateAnd(OrV, BitMask);
345 
346   // Compare operand values
347   Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
348   return Builder.CreateICmpEQ(AndV, ZeroV);
349 }
350 
351 /// Substitutes the div/rem instruction with code that checks the value of the
352 /// operands and uses a shorter-faster div/rem instruction when possible.
353 Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
354   Value *Dividend = SlowDivOrRem->getOperand(0);
355   Value *Divisor = SlowDivOrRem->getOperand(1);
356 
357   VisitedSetTy SetL;
358   ValueRange DividendRange = getValueRange(Dividend, SetL);
359   if (DividendRange == VALRNG_LIKELY_LONG)
360     return None;
361 
362   VisitedSetTy SetR;
363   ValueRange DivisorRange = getValueRange(Divisor, SetR);
364   if (DivisorRange == VALRNG_LIKELY_LONG)
365     return None;
366 
367   bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
368   bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
369 
370   if (DividendShort && DivisorShort) {
371     // If both operands are known to be short then just replace the long
372     // division with a short one in-place.  Since we're not introducing control
373     // flow in this case, narrowing the division is always a win, even if the
374     // divisor is a constant (and will later get replaced by a multiplication).
375 
376     IRBuilder<> Builder(SlowDivOrRem);
377     Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
378     Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
379     Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
380     Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
381     Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
382     Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
383     return QuotRemPair(ExtDiv, ExtRem);
384   }
385 
386   if (isa<ConstantInt>(Divisor)) {
387     // If the divisor is not a constant, DAGCombiner will convert it to a
388     // multiplication by a magic constant.  It isn't clear if it is worth
389     // introducing control flow to get a narrower multiply.
390     return None;
391   }
392 
393   // After Constant Hoisting pass, long constants may be represented as
394   // bitcast instructions. As a result, some constants may look like an
395   // instruction at first, and an additional check is necessary to find out if
396   // an operand is actually a constant.
397   if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
398     if (BCI->getParent() == SlowDivOrRem->getParent() &&
399         isa<ConstantInt>(BCI->getOperand(0)))
400       return None;
401 
402   IRBuilder<> Builder(MainBB, MainBB->end());
403   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
404 
405   if (DividendShort && !isSignedOp()) {
406     // If the division is unsigned and Dividend is known to be short, then
407     // either
408     // 1) Divisor is less or equal to Dividend, and the result can be computed
409     //    with a short division.
410     // 2) Divisor is greater than Dividend. In this case, no division is needed
411     //    at all: The quotient is 0 and the remainder is equal to Dividend.
412     //
413     // So instead of checking at runtime whether Divisor fits into BypassType,
414     // we emit a runtime check to differentiate between these two cases. This
415     // lets us entirely avoid a long div.
416 
417     // Split the basic block before the div/rem.
418     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
419     // Remove the unconditional branch from MainBB to SuccessorBB.
420     MainBB->getInstList().back().eraseFromParent();
421     QuotRemWithBB Long;
422     Long.BB = MainBB;
423     Long.Quotient = ConstantInt::get(getSlowType(), 0);
424     Long.Remainder = Dividend;
425     QuotRemWithBB Fast = createFastBB(SuccessorBB);
426     QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
427     Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
428     Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
429     return Result;
430   } else {
431     // General case. Create both slow and fast div/rem pairs and choose one of
432     // them at runtime.
433 
434     // Split the basic block before the div/rem.
435     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
436     // Remove the unconditional branch from MainBB to SuccessorBB.
437     MainBB->getInstList().back().eraseFromParent();
438     QuotRemWithBB Fast = createFastBB(SuccessorBB);
439     QuotRemWithBB Slow = createSlowBB(SuccessorBB);
440     QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
441     Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
442                                             DivisorShort ? nullptr : Divisor);
443     Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
444     return Result;
445   }
446 }
447 
448 /// This optimization identifies DIV/REM instructions in a BB that can be
449 /// profitably bypassed and carried out with a shorter, faster divide.
450 bool llvm::bypassSlowDivision(BasicBlock *BB,
451                               const BypassWidthsTy &BypassWidths) {
452   DivCacheTy PerBBDivCache;
453 
454   bool MadeChange = false;
455   Instruction *Next = &*BB->begin();
456   while (Next != nullptr) {
457     // We may add instructions immediately after I, but we want to skip over
458     // them.
459     Instruction *I = Next;
460     Next = Next->getNextNode();
461 
462     // Ignore dead code to save time and avoid bugs.
463     if (I->hasNUses(0))
464       continue;
465 
466     FastDivInsertionTask Task(I, BypassWidths);
467     if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
468       I->replaceAllUsesWith(Replacement);
469       I->eraseFromParent();
470       MadeChange = true;
471     }
472   }
473 
474   // Above we eagerly create divs and rems, as pairs, so that we can efficiently
475   // create divrem machine instructions.  Now erase any unused divs / rems so we
476   // don't leave extra instructions sitting around.
477   for (auto &KV : PerBBDivCache)
478     for (Value *V : {KV.second.Quotient, KV.second.Remainder})
479       RecursivelyDeleteTriviallyDeadInstructions(V);
480 
481   return MadeChange;
482 }
483