1 //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains an optimization for div and rem on architectures that
10 // execute short instructions significantly faster than longer instructions.
11 // For example, on Intel Atom 32-bit divides are slow enough that during
12 // runtime it is profitable to check the value of the operands, and if they are
13 // positive and less than 256 use an unsigned 8-bit divide.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
18 #include "llvm/ADT/DenseMap.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallPtrSet.h"
21 #include "llvm/Transforms/Utils/Local.h"
22 #include "llvm/Analysis/ValueTracking.h"
23 #include "llvm/IR/BasicBlock.h"
24 #include "llvm/IR/Constants.h"
25 #include "llvm/IR/DerivedTypes.h"
26 #include "llvm/IR/Function.h"
27 #include "llvm/IR/IRBuilder.h"
28 #include "llvm/IR/Instruction.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/IR/Module.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/IR/Value.h"
33 #include "llvm/Support/Casting.h"
34 #include "llvm/Support/KnownBits.h"
35 #include <cassert>
36 #include <cstdint>
37 
38 using namespace llvm;
39 
40 #define DEBUG_TYPE "bypass-slow-division"
41 
42 namespace {
43 
44   struct QuotRemPair {
45     Value *Quotient;
46     Value *Remainder;
47 
48     QuotRemPair(Value *InQuotient, Value *InRemainder)
49         : Quotient(InQuotient), Remainder(InRemainder) {}
50   };
51 
52   /// A quotient and remainder, plus a BB from which they logically "originate".
53   /// If you use Quotient or Remainder in a Phi node, you should use BB as its
54   /// corresponding predecessor.
55   struct QuotRemWithBB {
56     BasicBlock *BB = nullptr;
57     Value *Quotient = nullptr;
58     Value *Remainder = nullptr;
59   };
60 
61 using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
62 using BypassWidthsTy = DenseMap<unsigned, unsigned>;
63 using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
64 
65 enum ValueRange {
66   /// Operand definitely fits into BypassType. No runtime checks are needed.
67   VALRNG_KNOWN_SHORT,
68   /// A runtime check is required, as value range is unknown.
69   VALRNG_UNKNOWN,
70   /// Operand is unlikely to fit into BypassType. The bypassing should be
71   /// disabled.
72   VALRNG_LIKELY_LONG
73 };
74 
75 class FastDivInsertionTask {
76   bool IsValidTask = false;
77   Instruction *SlowDivOrRem = nullptr;
78   IntegerType *BypassType = nullptr;
79   BasicBlock *MainBB = nullptr;
80 
81   bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
82   ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
83   QuotRemWithBB createSlowBB(BasicBlock *Successor);
84   QuotRemWithBB createFastBB(BasicBlock *Successor);
85   QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
86                                    BasicBlock *PhiBB);
87   Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
88   std::optional<QuotRemPair> insertFastDivAndRem();
89 
90   bool isSignedOp() {
91     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
92            SlowDivOrRem->getOpcode() == Instruction::SRem;
93   }
94 
95   bool isDivisionOp() {
96     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
97            SlowDivOrRem->getOpcode() == Instruction::UDiv;
98   }
99 
100   Type *getSlowType() { return SlowDivOrRem->getType(); }
101 
102 public:
103   FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
104 
105   Value *getReplacement(DivCacheTy &Cache);
106 };
107 
108 } // end anonymous namespace
109 
110 FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
111                                            const BypassWidthsTy &BypassWidths) {
112   switch (I->getOpcode()) {
113   case Instruction::UDiv:
114   case Instruction::SDiv:
115   case Instruction::URem:
116   case Instruction::SRem:
117     SlowDivOrRem = I;
118     break;
119   default:
120     // I is not a div/rem operation.
121     return;
122   }
123 
124   // Skip division on vector types. Only optimize integer instructions.
125   IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
126   if (!SlowType)
127     return;
128 
129   // Skip if this bitwidth is not bypassed.
130   auto BI = BypassWidths.find(SlowType->getBitWidth());
131   if (BI == BypassWidths.end())
132     return;
133 
134   // Get type for div/rem instruction with bypass bitwidth.
135   IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
136   BypassType = BT;
137 
138   // The original basic block.
139   MainBB = I->getParent();
140 
141   // The instruction is indeed a slow div or rem operation.
142   IsValidTask = true;
143 }
144 
145 /// Reuses previously-computed dividend or remainder from the current BB if
146 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to
147 /// perform the optimization and caches the resulting dividend and remainder.
148 /// If no replacement can be generated, nullptr is returned.
149 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
150   // First, make sure that the task is valid.
151   if (!IsValidTask)
152     return nullptr;
153 
154   // Then, look for a value in Cache.
155   Value *Dividend = SlowDivOrRem->getOperand(0);
156   Value *Divisor = SlowDivOrRem->getOperand(1);
157   DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
158   auto CacheI = Cache.find(Key);
159 
160   if (CacheI == Cache.end()) {
161     // If previous instance does not exist, try to insert fast div.
162     std::optional<QuotRemPair> OptResult = insertFastDivAndRem();
163     // Bail out if insertFastDivAndRem has failed.
164     if (!OptResult)
165       return nullptr;
166     CacheI = Cache.insert({Key, *OptResult}).first;
167   }
168 
169   QuotRemPair &Value = CacheI->second;
170   return isDivisionOp() ? Value.Quotient : Value.Remainder;
171 }
172 
173 /// Check if a value looks like a hash.
174 ///
175 /// The routine is expected to detect values computed using the most common hash
176 /// algorithms. Typically, hash computations end with one of the following
177 /// instructions:
178 ///
179 /// 1) MUL with a constant wider than BypassType
180 /// 2) XOR instruction
181 ///
182 /// And even if we are wrong and the value is not a hash, it is still quite
183 /// unlikely that such values will fit into BypassType.
184 ///
185 /// To detect string hash algorithms like FNV we have to look through PHI-nodes.
186 /// It is implemented as a depth-first search for values that look neither long
187 /// nor hash-like.
188 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
189   Instruction *I = dyn_cast<Instruction>(V);
190   if (!I)
191     return false;
192 
193   switch (I->getOpcode()) {
194   case Instruction::Xor:
195     return true;
196   case Instruction::Mul: {
197     // After Constant Hoisting pass, long constants may be represented as
198     // bitcast instructions. As a result, some constants may look like an
199     // instruction at first, and an additional check is necessary to find out if
200     // an operand is actually a constant.
201     Value *Op1 = I->getOperand(1);
202     ConstantInt *C = dyn_cast<ConstantInt>(Op1);
203     if (!C && isa<BitCastInst>(Op1))
204       C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
205     return C && C->getValue().getSignificantBits() > BypassType->getBitWidth();
206   }
207   case Instruction::PHI:
208     // Stop IR traversal in case of a crazy input code. This limits recursion
209     // depth.
210     if (Visited.size() >= 16)
211       return false;
212     // Do not visit nodes that have been visited already. We return true because
213     // it means that we couldn't find any value that doesn't look hash-like.
214     if (!Visited.insert(I).second)
215       return true;
216     return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
217       // Ignore undef values as they probably don't affect the division
218       // operands.
219       return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
220              isa<UndefValue>(V);
221     });
222   default:
223     return false;
224   }
225 }
226 
227 /// Check if an integer value fits into our bypass type.
228 ValueRange FastDivInsertionTask::getValueRange(Value *V,
229                                                VisitedSetTy &Visited) {
230   unsigned ShortLen = BypassType->getBitWidth();
231   unsigned LongLen = V->getType()->getIntegerBitWidth();
232 
233   assert(LongLen > ShortLen && "Value type must be wider than BypassType");
234   unsigned HiBits = LongLen - ShortLen;
235 
236   const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
237   KnownBits Known(LongLen);
238 
239   computeKnownBits(V, Known, DL);
240 
241   if (Known.countMinLeadingZeros() >= HiBits)
242     return VALRNG_KNOWN_SHORT;
243 
244   if (Known.countMaxLeadingZeros() < HiBits)
245     return VALRNG_LIKELY_LONG;
246 
247   // Long integer divisions are often used in hashtable implementations. It's
248   // not worth bypassing such divisions because hash values are extremely
249   // unlikely to have enough leading zeros. The call below tries to detect
250   // values that are unlikely to fit BypassType (including hashes).
251   if (isHashLikeValue(V, Visited))
252     return VALRNG_LIKELY_LONG;
253 
254   return VALRNG_UNKNOWN;
255 }
256 
257 /// Add new basic block for slow div and rem operations and put it before
258 /// SuccessorBB.
259 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
260   QuotRemWithBB DivRemPair;
261   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
262                                      MainBB->getParent(), SuccessorBB);
263   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
264   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
265 
266   Value *Dividend = SlowDivOrRem->getOperand(0);
267   Value *Divisor = SlowDivOrRem->getOperand(1);
268 
269   if (isSignedOp()) {
270     DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
271     DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
272   } else {
273     DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
274     DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
275   }
276 
277   Builder.CreateBr(SuccessorBB);
278   return DivRemPair;
279 }
280 
281 /// Add new basic block for fast div and rem operations and put it before
282 /// SuccessorBB.
283 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
284   QuotRemWithBB DivRemPair;
285   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
286                                      MainBB->getParent(), SuccessorBB);
287   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
288   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
289 
290   Value *Dividend = SlowDivOrRem->getOperand(0);
291   Value *Divisor = SlowDivOrRem->getOperand(1);
292   Value *ShortDivisorV =
293       Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
294   Value *ShortDividendV =
295       Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
296 
297   // udiv/urem because this optimization only handles positive numbers.
298   Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
299   Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
300   DivRemPair.Quotient =
301       Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
302   DivRemPair.Remainder =
303       Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
304   Builder.CreateBr(SuccessorBB);
305 
306   return DivRemPair;
307 }
308 
309 /// Creates Phi nodes for result of Div and Rem.
310 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
311                                                        QuotRemWithBB &RHS,
312                                                        BasicBlock *PhiBB) {
313   IRBuilder<> Builder(PhiBB, PhiBB->begin());
314   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
315   PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
316   QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
317   QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
318   PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
319   RemPhi->addIncoming(LHS.Remainder, LHS.BB);
320   RemPhi->addIncoming(RHS.Remainder, RHS.BB);
321   return QuotRemPair(QuoPhi, RemPhi);
322 }
323 
324 /// Creates a runtime check to test whether both the divisor and dividend fit
325 /// into BypassType. The check is inserted at the end of MainBB. True return
326 /// value means that the operands fit. Either of the operands may be NULL if it
327 /// doesn't need a runtime check.
328 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
329   assert((Op1 || Op2) && "Nothing to check");
330   IRBuilder<> Builder(MainBB, MainBB->end());
331   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
332 
333   Value *OrV;
334   if (Op1 && Op2)
335     OrV = Builder.CreateOr(Op1, Op2);
336   else
337     OrV = Op1 ? Op1 : Op2;
338 
339   // BitMask is inverted to check if the operands are
340   // larger than the bypass type
341   uint64_t BitMask = ~BypassType->getBitMask();
342   Value *AndV = Builder.CreateAnd(OrV, BitMask);
343 
344   // Compare operand values
345   Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
346   return Builder.CreateICmpEQ(AndV, ZeroV);
347 }
348 
349 /// Substitutes the div/rem instruction with code that checks the value of the
350 /// operands and uses a shorter-faster div/rem instruction when possible.
351 std::optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
352   Value *Dividend = SlowDivOrRem->getOperand(0);
353   Value *Divisor = SlowDivOrRem->getOperand(1);
354 
355   VisitedSetTy SetL;
356   ValueRange DividendRange = getValueRange(Dividend, SetL);
357   if (DividendRange == VALRNG_LIKELY_LONG)
358     return std::nullopt;
359 
360   VisitedSetTy SetR;
361   ValueRange DivisorRange = getValueRange(Divisor, SetR);
362   if (DivisorRange == VALRNG_LIKELY_LONG)
363     return std::nullopt;
364 
365   bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
366   bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
367 
368   if (DividendShort && DivisorShort) {
369     // If both operands are known to be short then just replace the long
370     // division with a short one in-place.  Since we're not introducing control
371     // flow in this case, narrowing the division is always a win, even if the
372     // divisor is a constant (and will later get replaced by a multiplication).
373 
374     IRBuilder<> Builder(SlowDivOrRem);
375     Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
376     Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
377     Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
378     Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
379     Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
380     Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
381     return QuotRemPair(ExtDiv, ExtRem);
382   }
383 
384   if (isa<ConstantInt>(Divisor)) {
385     // If the divisor is not a constant, DAGCombiner will convert it to a
386     // multiplication by a magic constant.  It isn't clear if it is worth
387     // introducing control flow to get a narrower multiply.
388     return std::nullopt;
389   }
390 
391   // After Constant Hoisting pass, long constants may be represented as
392   // bitcast instructions. As a result, some constants may look like an
393   // instruction at first, and an additional check is necessary to find out if
394   // an operand is actually a constant.
395   if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
396     if (BCI->getParent() == SlowDivOrRem->getParent() &&
397         isa<ConstantInt>(BCI->getOperand(0)))
398       return std::nullopt;
399 
400   IRBuilder<> Builder(MainBB, MainBB->end());
401   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
402 
403   if (DividendShort && !isSignedOp()) {
404     // If the division is unsigned and Dividend is known to be short, then
405     // either
406     // 1) Divisor is less or equal to Dividend, and the result can be computed
407     //    with a short division.
408     // 2) Divisor is greater than Dividend. In this case, no division is needed
409     //    at all: The quotient is 0 and the remainder is equal to Dividend.
410     //
411     // So instead of checking at runtime whether Divisor fits into BypassType,
412     // we emit a runtime check to differentiate between these two cases. This
413     // lets us entirely avoid a long div.
414 
415     // Split the basic block before the div/rem.
416     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
417     // Remove the unconditional branch from MainBB to SuccessorBB.
418     MainBB->back().eraseFromParent();
419     QuotRemWithBB Long;
420     Long.BB = MainBB;
421     Long.Quotient = ConstantInt::get(getSlowType(), 0);
422     Long.Remainder = Dividend;
423     QuotRemWithBB Fast = createFastBB(SuccessorBB);
424     QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
425     Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
426     Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
427     return Result;
428   } else {
429     // General case. Create both slow and fast div/rem pairs and choose one of
430     // them at runtime.
431 
432     // Split the basic block before the div/rem.
433     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
434     // Remove the unconditional branch from MainBB to SuccessorBB.
435     MainBB->back().eraseFromParent();
436     QuotRemWithBB Fast = createFastBB(SuccessorBB);
437     QuotRemWithBB Slow = createSlowBB(SuccessorBB);
438     QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
439     Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
440                                             DivisorShort ? nullptr : Divisor);
441     Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
442     return Result;
443   }
444 }
445 
446 /// This optimization identifies DIV/REM instructions in a BB that can be
447 /// profitably bypassed and carried out with a shorter, faster divide.
448 bool llvm::bypassSlowDivision(BasicBlock *BB,
449                               const BypassWidthsTy &BypassWidths) {
450   DivCacheTy PerBBDivCache;
451 
452   bool MadeChange = false;
453   Instruction *Next = &*BB->begin();
454   while (Next != nullptr) {
455     // We may add instructions immediately after I, but we want to skip over
456     // them.
457     Instruction *I = Next;
458     Next = Next->getNextNode();
459 
460     // Ignore dead code to save time and avoid bugs.
461     if (I->hasNUses(0))
462       continue;
463 
464     FastDivInsertionTask Task(I, BypassWidths);
465     if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
466       I->replaceAllUsesWith(Replacement);
467       I->eraseFromParent();
468       MadeChange = true;
469     }
470   }
471 
472   // Above we eagerly create divs and rems, as pairs, so that we can efficiently
473   // create divrem machine instructions.  Now erase any unused divs / rems so we
474   // don't leave extra instructions sitting around.
475   for (auto &KV : PerBBDivCache)
476     for (Value *V : {KV.second.Quotient, KV.second.Remainder})
477       RecursivelyDeleteTriviallyDeadInstructions(V);
478 
479   return MadeChange;
480 }
481