1 //===- IRSimilarityIdentifier.h - Find similarity in a module --------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // Interface file for the IRSimilarityIdentifier for identifying similarities in
11 // IR including the IRInstructionMapper, which maps an Instruction to unsigned
12 // integers.
13 //
14 // Two sequences of instructions are called "similar" if they perform the same
15 // series of operations for all inputs.
16 //
17 // \code
18 // %1 = add i32 %a, 10
19 // %2 = add i32 %a, %1
20 // %3 = icmp slt icmp %1, %2
21 // \endcode
22 //
23 // and
24 //
25 // \code
26 // %1 = add i32 11, %a
27 // %2 = sub i32 %a, %1
28 // %3 = icmp sgt icmp %2, %1
29 // \endcode
30 //
31 // ultimately have the same result, even if the inputs, and structure are
32 // slightly different.
33 //
34 // For instructions, we do not worry about operands that do not have fixed
35 // semantic meaning to the program.  We consider the opcode that the instruction
36 // has, the types, parameters, and extra information such as the function name,
37 // or comparison predicate.  These are used to create a hash to map instructions
38 // to integers to be used in similarity matching in sequences of instructions
39 //
40 // Terminology:
41 // An IRSimilarityCandidate is a region of IRInstructionData (wrapped
42 // Instructions), usually used to denote a region of similarity has been found.
43 //
44 // A SimilarityGroup is a set of IRSimilarityCandidates that are structurally
45 // similar to one another.
46 //
47 //===----------------------------------------------------------------------===//
48 
49 #ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
50 #define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
51 
52 #include "llvm/IR/InstVisitor.h"
53 #include "llvm/IR/Instructions.h"
54 #include "llvm/IR/Module.h"
55 #include "llvm/IR/PassManager.h"
56 #include "llvm/Pass.h"
57 #include "llvm/Support/Allocator.h"
58 
59 namespace llvm {
60 namespace IRSimilarity {
61 
62 struct IRInstructionDataList;
63 
64 /// This represents what is and is not supported when finding similarity in
65 /// Instructions.
66 ///
67 /// Legal Instructions are considered when looking at similarity between
68 /// Instructions.
69 ///
70 /// Illegal Instructions cannot be considered when looking for similarity
71 /// between Instructions. They act as boundaries between similarity regions.
72 ///
73 /// Invisible Instructions are skipped over during analysis.
74 // TODO: Shared with MachineOutliner
75 enum InstrType { Legal, Illegal, Invisible };
76 
77 /// This provides the utilities for hashing an Instruction to an unsigned
78 /// integer. Two IRInstructionDatas produce the same hash value when their
79 /// underlying Instructions perform the same operation (even if they don't have
80 /// the same input operands.)
81 /// As a more concrete example, consider the following:
82 ///
83 /// \code
84 /// %add1 = add i32 %a, %b
85 /// %add2 = add i32 %c, %d
86 /// %add3 = add i64 %e, %f
87 /// \endcode
88 ///
89 // Then the IRInstructionData wrappers for these Instructions may be hashed like
90 /// so:
91 ///
92 /// \code
93 /// ; These two adds have the same types and operand types, so they hash to the
94 /// ; same number.
95 /// %add1 = add i32 %a, %b ; Hash: 1
96 /// %add2 = add i32 %c, %d ; Hash: 1
97 /// ; This add produces an i64. This differentiates it from %add1 and %add2. So,
98 /// ; it hashes to a different number.
99 /// %add3 = add i64 %e, %f; Hash: 2
100 /// \endcode
101 ///
102 ///
103 /// This hashing scheme will be used to represent the program as a very long
104 /// string. This string can then be placed in a data structure which can be used
105 /// for similarity queries.
106 ///
107 /// TODO: Handle types of Instructions which can be equal even with different
108 /// operands. (E.g. comparisons with swapped predicates.)
109 /// TODO: Handle CallInsts, which are only checked for function type
110 /// by \ref isSameOperationAs.
111 /// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
112 /// exact same, and some do not.
113 struct IRInstructionData : ilist_node<IRInstructionData> {
114 
115   /// The source Instruction that is being wrapped.
116   Instruction *Inst = nullptr;
117   /// The values of the operands in the Instruction.
118   SmallVector<Value *, 4> OperVals;
119   /// The legality of the wrapped instruction. This is informed by InstrType,
120   /// and is used when checking when two instructions are considered similar.
121   /// If either instruction is not legal, the instructions are automatically not
122   /// considered similar.
123   bool Legal;
124 
125   /// This is only relevant if we are wrapping a CmpInst where we needed to
126   /// change the predicate of a compare instruction from a greater than form
127   /// to a less than form.  It is None otherwise.
128   Optional<CmpInst::Predicate> RevisedPredicate;
129 
130   /// Gather the information that is difficult to gather for an Instruction, or
131   /// is changed. i.e. the operands of an Instruction and the Types of those
132   /// operands. This extra information allows for similarity matching to make
133   /// assertions that allow for more flexibility when checking for whether an
134   /// Instruction performs the same operation.
135   IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL);
136 
137   /// Get the predicate that the compare instruction is using for hashing the
138   /// instruction. the IRInstructionData must be wrapping a CmpInst.
139   CmpInst::Predicate getPredicate() const;
140 
141   /// A function that swaps the predicates to their less than form if they are
142   /// in a greater than form. Otherwise, the predicate is unchanged.
143   ///
144   /// \param CI - The comparison operation to find a consistent preidcate for.
145   /// \return the consistent comparison predicate.
146   static CmpInst::Predicate predicateForConsistency(CmpInst *CI);
147 
148   /// Hashes \p Value based on its opcode, types, and operand types.
149   /// Two IRInstructionData instances produce the same hash when they perform
150   /// the same operation.
151   ///
152   /// As a simple example, consider the following instructions.
153   ///
154   /// \code
155   /// %add1 = add i32 %x1, %y1
156   /// %add2 = add i32 %x2, %y2
157   ///
158   /// %sub = sub i32 %x1, %y1
159   ///
160   /// %add_i64 = add i64 %x2, %y2
161   /// \endcode
162   ///
163   /// Because the first two adds operate the same types, and are performing the
164   /// same action, they will be hashed to the same value.
165   ///
166   /// However, the subtraction instruction is not the same as an addition, and
167   /// will be hashed to a different value.
168   ///
169   /// Finally, the last add has a different type compared to the first two add
170   /// instructions, so it will also be hashed to a different value that any of
171   /// the previous instructions.
172   ///
173   /// \param [in] ID - The IRInstructionData instance to be hashed.
174   /// \returns A hash_value of the IRInstructionData.
hash_valueIRInstructionData175   friend hash_code hash_value(const IRInstructionData &ID) {
176     SmallVector<Type *, 4> OperTypes;
177     for (Value *V : ID.OperVals)
178       OperTypes.push_back(V->getType());
179 
180     if (isa<CmpInst>(ID.Inst))
181       return llvm::hash_combine(
182           llvm::hash_value(ID.Inst->getOpcode()),
183           llvm::hash_value(ID.Inst->getType()),
184           llvm::hash_value(ID.getPredicate()),
185           llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
186     else if (CallInst *CI = dyn_cast<CallInst>(ID.Inst))
187       return llvm::hash_combine(
188           llvm::hash_value(ID.Inst->getOpcode()),
189           llvm::hash_value(ID.Inst->getType()),
190           llvm::hash_value(CI->getCalledFunction()->getName().str()),
191           llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
192     return llvm::hash_combine(
193         llvm::hash_value(ID.Inst->getOpcode()),
194         llvm::hash_value(ID.Inst->getType()),
195         llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
196   }
197 
198   IRInstructionDataList *IDL = nullptr;
199 };
200 
201 struct IRInstructionDataList : simple_ilist<IRInstructionData> {};
202 
203 /// Compare one IRInstructionData class to another IRInstructionData class for
204 /// whether they are performing a the same operation, and can mapped to the
205 /// same value. For regular instructions if the hash value is the same, then
206 /// they will also be close.
207 ///
208 /// \param A - The first IRInstructionData class to compare
209 /// \param B - The second IRInstructionData class to compare
210 /// \returns true if \p A and \p B are similar enough to be mapped to the same
211 /// value.
212 bool isClose(const IRInstructionData &A, const IRInstructionData &B);
213 
214 struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> {
getEmptyKeyIRInstructionDataTraits215   static inline IRInstructionData *getEmptyKey() { return nullptr; }
getTombstoneKeyIRInstructionDataTraits216   static inline IRInstructionData *getTombstoneKey() {
217     return reinterpret_cast<IRInstructionData *>(-1);
218   }
219 
getHashValueIRInstructionDataTraits220   static unsigned getHashValue(const IRInstructionData *E) {
221     using llvm::hash_value;
222     assert(E && "IRInstructionData is a nullptr?");
223     return hash_value(*E);
224   }
225 
isEqualIRInstructionDataTraits226   static bool isEqual(const IRInstructionData *LHS,
227                       const IRInstructionData *RHS) {
228     if (RHS == getEmptyKey() || RHS == getTombstoneKey() ||
229         LHS == getEmptyKey() || LHS == getTombstoneKey())
230       return LHS == RHS;
231 
232     assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?");
233     return isClose(*LHS, *RHS);
234   }
235 };
236 
237 /// Helper struct for converting the Instructions in a Module into a vector of
238 /// unsigned integers. This vector of unsigned integers can be thought of as a
239 /// "numeric string". This numeric string can then be queried by, for example,
240 /// data structures that find repeated substrings.
241 ///
242 /// This hashing is done per BasicBlock in the module. To hash Instructions
243 /// based off of their operations, each Instruction is wrapped in an
244 /// IRInstructionData struct. The unsigned integer for an IRInstructionData
245 /// depends on:
246 /// - The hash provided by the IRInstructionData.
247 /// - Which member of InstrType the IRInstructionData is classified as.
248 // See InstrType for more details on the possible classifications, and how they
249 // manifest in the numeric string.
250 ///
251 /// The numeric string for an individual BasicBlock is terminated by an unique
252 /// unsigned integer. This prevents data structures which rely on repetition
253 /// from matching across BasicBlocks. (For example, the SuffixTree.)
254 /// As a concrete example, if we have the following two BasicBlocks:
255 /// \code
256 /// bb0:
257 /// %add1 = add i32 %a, %b
258 /// %add2 = add i32 %c, %d
259 /// %add3 = add i64 %e, %f
260 /// bb1:
261 /// %sub = sub i32 %c, %d
262 /// \endcode
263 /// We may hash the Instructions like this (via IRInstructionData):
264 /// \code
265 /// bb0:
266 /// %add1 = add i32 %a, %b ; Hash: 1
267 /// %add2 = add i32 %c, %d; Hash: 1
268 /// %add3 = add i64 %e, %f; Hash: 2
269 /// bb1:
270 /// %sub = sub i32 %c, %d; Hash: 3
271 /// %add4 = add i32 %c, %d ; Hash: 1
272 /// \endcode
273 /// And produce a "numeric string representation" like so:
274 /// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2
275 ///
276 /// TODO: This is very similar to the MachineOutliner, and should be
277 /// consolidated into the same interface.
278 struct IRInstructionMapper {
279   /// The starting illegal instruction number to map to.
280   ///
281   /// Set to -3 for compatibility with DenseMapInfo<unsigned>.
282   unsigned IllegalInstrNumber = static_cast<unsigned>(-3);
283 
284   /// The next available integer to assign to a legal Instruction to.
285   unsigned LegalInstrNumber = 0;
286 
287   /// Correspondence from IRInstructionData to unsigned integers.
288   DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>
289       InstructionIntegerMap;
290 
291   /// Set if we added an illegal number in the previous step.
292   /// Since each illegal number is unique, we only need one of them between
293   /// each range of legal numbers. This lets us make sure we don't add more
294   /// than one illegal number per range.
295   bool AddedIllegalLastTime = false;
296 
297   /// Marks whether we found a illegal instruction in the previous step.
298   bool CanCombineWithPrevInstr = false;
299 
300   /// Marks whether we have found a set of instructions that is long enough
301   /// to be considered for similarity.
302   bool HaveLegalRange = false;
303 
304   /// This allocator pointer is in charge of holding on to the IRInstructionData
305   /// so it is not deallocated until whatever external tool is using it is done
306   /// with the information.
307   SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr;
308 
309   /// This allocator pointer is in charge of creating the IRInstructionDataList
310   /// so it is not deallocated until whatever external tool is using it is done
311   /// with the information.
312   SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr;
313 
314   /// Get an allocated IRInstructionData struct using the InstDataAllocator.
315   ///
316   /// \param I - The Instruction to wrap with IRInstructionData.
317   /// \param Legality - A boolean value that is true if the instruction is to
318   /// be considered for similarity, and false if not.
319   /// \param IDL - The InstructionDataList that the IRInstructionData is
320   /// inserted into.
321   /// \returns An allocated IRInstructionData struct.
322   IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality,
323                                                IRInstructionDataList &IDL);
324 
325   /// Get an allocated IRInstructionDataList object using the IDLAllocator.
326   ///
327   /// \returns An allocated IRInstructionDataList object.
328   IRInstructionDataList *allocateIRInstructionDataList();
329 
330   IRInstructionDataList *IDL = nullptr;
331 
332   /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers
333   /// determined by \p InstrType. Two Instructions are mapped to the same value
334   /// if they are close as defined by the InstructionData class above.
335   ///
336   /// \param [in] BB - The BasicBlock to be mapped to integers.
337   /// \param [in,out] InstrList - Vector of IRInstructionData to append to.
338   /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to.
339   void convertToUnsignedVec(BasicBlock &BB,
340                             std::vector<IRInstructionData *> &InstrList,
341                             std::vector<unsigned> &IntegerMapping);
342 
343   /// Maps an Instruction to a legal integer.
344   ///
345   /// \param [in] It - The Instruction to be mapped to an integer.
346   /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
347   /// append to.
348   /// \param [in,out] InstrListForBB - Vector of InstructionData to append to.
349   /// \returns The integer \p It was mapped to.
350   unsigned mapToLegalUnsigned(BasicBlock::iterator &It,
351                               std::vector<unsigned> &IntegerMappingForBB,
352                               std::vector<IRInstructionData *> &InstrListForBB);
353 
354   /// Maps an Instruction to an illegal integer.
355   ///
356   /// \param [in] It - The \p Instruction to be mapped to an integer.
357   /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
358   /// append to.
359   /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to.
360   /// \param End - true if creating a dummy IRInstructionData at the end of a
361   /// basic block.
362   /// \returns The integer \p It was mapped to.
363   unsigned mapToIllegalUnsigned(
364       BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
365       std::vector<IRInstructionData *> &InstrListForBB, bool End = false);
366 
IRInstructionMapperIRInstructionMapper367   IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA,
368                       SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA)
369       : InstDataAllocator(IDA), IDLAllocator(IDLA) {
370     // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
371     // changed.
372     assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) &&
373            "DenseMapInfo<unsigned>'s empty key isn't -1!");
374     assert(DenseMapInfo<unsigned>::getTombstoneKey() ==
375                static_cast<unsigned>(-2) &&
376            "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
377 
378     IDL = new (IDLAllocator->Allocate())
379         IRInstructionDataList();
380   }
381 
382   /// Custom InstVisitor to classify different instructions for whether it can
383   /// be analyzed for similarity.
384   struct InstructionClassification
385       : public InstVisitor<InstructionClassification, InstrType> {
InstructionClassificationIRInstructionMapper::InstructionClassification386     InstructionClassification() {}
387 
388     // TODO: Determine a scheme to resolve when the label is similar enough.
visitBranchInstIRInstructionMapper::InstructionClassification389     InstrType visitBranchInst(BranchInst &BI) { return Illegal; }
390     // TODO: Determine a scheme to resolve when the labels are similar enough.
visitPHINodeIRInstructionMapper::InstructionClassification391     InstrType visitPHINode(PHINode &PN) { return Illegal; }
392     // TODO: Handle allocas.
visitAllocaInstIRInstructionMapper::InstructionClassification393     InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; }
394     // We exclude variable argument instructions since variable arguments
395     // requires extra checking of the argument list.
visitVAArgInstIRInstructionMapper::InstructionClassification396     InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; }
397     // We exclude all exception handling cases since they are so context
398     // dependent.
visitLandingPadInstIRInstructionMapper::InstructionClassification399     InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
visitFuncletPadInstIRInstructionMapper::InstructionClassification400     InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
401     // DebugInfo should be included in the regions, but should not be
402     // analyzed for similarity as it has no bearing on the outcome of the
403     // program.
visitDbgInfoIntrinsicIRInstructionMapper::InstructionClassification404     InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
405     // TODO: Handle specific intrinsics.
visitIntrinsicInstIRInstructionMapper::InstructionClassification406     InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; }
407     // We only allow call instructions where the function has a name and
408     // is not an indirect call.
visitCallInstIRInstructionMapper::InstructionClassification409     InstrType visitCallInst(CallInst &CI) {
410       Function *F = CI.getCalledFunction();
411       if (!F || CI.isIndirectCall() || !F->hasName())
412         return Illegal;
413       return Legal;
414     }
415     // TODO: We do not current handle similarity that changes the control flow.
visitInvokeInstIRInstructionMapper::InstructionClassification416     InstrType visitInvokeInst(InvokeInst &II) { return Illegal; }
417     // TODO: We do not current handle similarity that changes the control flow.
visitCallBrInstIRInstructionMapper::InstructionClassification418     InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; }
419     // TODO: Handle interblock similarity.
visitTerminatorIRInstructionMapper::InstructionClassification420     InstrType visitTerminator(Instruction &I) { return Illegal; }
visitInstructionIRInstructionMapper::InstructionClassification421     InstrType visitInstruction(Instruction &I) { return Legal; }
422   };
423 
424   /// Maps an Instruction to a member of InstrType.
425   InstructionClassification InstClassifier;
426 };
427 
428 /// This is a class that wraps a range of IRInstructionData from one point to
429 /// another in the vector of IRInstructionData, which is a region of the
430 /// program.  It is also responsible for defining the structure within this
431 /// region of instructions.
432 ///
433 /// The structure of a region is defined through a value numbering system
434 /// assigned to each unique value in a region at the creation of the
435 /// IRSimilarityCandidate.
436 ///
437 /// For example, for each Instruction we add a mapping for each new
438 /// value seen in that Instruction.
439 /// IR:                    Mapping Added:
440 /// %add1 = add i32 %a, c1    %add1 -> 3, %a -> 1, c1 -> 2
441 /// %add2 = add i32 %a, %1    %add2 -> 4
442 /// %add3 = add i32 c2, c1    %add3 -> 6, c2 -> 5
443 ///
444 /// We can compare IRSimilarityCandidates against one another.
445 /// The \ref isSimilar function compares each IRInstructionData against one
446 /// another and if we have the same sequences of IRInstructionData that would
447 /// create the same hash, we have similar IRSimilarityCandidates.
448 ///
449 /// We can also compare the structure of IRSimilarityCandidates. If we can
450 /// create a mapping of registers in the region contained by one
451 /// IRSimilarityCandidate to the region contained by different
452 /// IRSimilarityCandidate, they can be considered structurally similar.
453 ///
454 /// IRSimilarityCandidate1:   IRSimilarityCandidate2:
455 /// %add1 = add i32 %a, %b    %add1 = add i32 %d, %e
456 /// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f
457 /// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4
458 ///
459 /// Can have the following mapping from candidate to candidate of:
460 /// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4
461 /// and can be considered similar.
462 ///
463 /// IRSimilarityCandidate1:   IRSimilarityCandidate2:
464 /// %add1 = add i32 %a, %b    %add1 = add i32 %d, c4
465 /// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f
466 /// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4
467 ///
468 /// We cannot create the same mapping since the use of c4 is not used in the
469 /// same way as %b or c2.
470 class IRSimilarityCandidate {
471 private:
472   /// The start index of this IRSimilarityCandidate in the instruction list.
473   unsigned StartIdx = 0;
474 
475   /// The number of instructions in this IRSimilarityCandidate.
476   unsigned Len = 0;
477 
478   /// The first instruction in this IRSimilarityCandidate.
479   IRInstructionData *FirstInst = nullptr;
480 
481   /// The last instruction in this IRSimilarityCandidate.
482   IRInstructionData *LastInst = nullptr;
483 
484   /// Global Value Numbering structures
485   /// @{
486   /// Stores the mapping of the value to the number assigned to it in the
487   /// IRSimilarityCandidate.
488   DenseMap<Value *, unsigned> ValueToNumber;
489   /// Stores the mapping of the number to the value assigned this number.
490   DenseMap<unsigned, Value *> NumberToValue;
491   /// @}
492 
493 public:
494   /// \param StartIdx - The starting location of the region.
495   /// \param Len - The length of the region.
496   /// \param FirstInstIt - The starting IRInstructionData of the region.
497   /// \param LastInstIt - The ending IRInstructionData of the region.
498   IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
499                         IRInstructionData *FirstInstIt,
500                         IRInstructionData *LastInstIt);
501 
502   /// \param A - The first IRInstructionCandidate to compare.
503   /// \param B - The second IRInstructionCandidate to compare.
504   /// \returns True when every IRInstructionData in \p A is similar to every
505   /// IRInstructionData in \p B.
506   static bool isSimilar(const IRSimilarityCandidate &A,
507                         const IRSimilarityCandidate &B);
508 
509   /// \param A - The first IRInstructionCandidate to compare.
510   /// \param B - The second IRInstructionCandidate to compare.
511   /// \returns True when every IRInstructionData in \p A is structurally similar
512   /// to \p B.
513   static bool compareStructure(const IRSimilarityCandidate &A,
514                                const IRSimilarityCandidate &B);
515 
516   struct OperandMapping {
517     /// The IRSimilarityCandidate that holds the instruction the OperVals were
518     /// pulled from.
519     const IRSimilarityCandidate &IRSC;
520 
521     /// The operand values to be analyzed.
522     ArrayRef<Value *> &OperVals;
523 
524     /// The current mapping of global value numbers from one IRSimilarityCandidate
525     /// to another IRSimilarityCandidate.
526     DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping;
527   };
528 
529   /// Compare the operands in \p A and \p B and check that the current mapping
530   /// of global value numbers from \p A to \p B and \p B to \A is consistent.
531   ///
532   /// \param A - The first IRInstructionCandidate, operand values, and current
533   /// operand mappings to compare.
534   /// \param B - The second IRInstructionCandidate, operand values, and current
535   /// operand mappings to compare.
536   /// \returns true if the IRSimilarityCandidates operands are compatible.
537   static bool compareNonCommutativeOperandMapping(OperandMapping A,
538                                                   OperandMapping B);
539 
540   /// Compare the operands in \p A and \p B and check that the current mapping
541   /// of global value numbers from \p A to \p B and \p B to \A is consistent
542   /// given that the operands are commutative.
543   ///
544   /// \param A - The first IRInstructionCandidate, operand values, and current
545   /// operand mappings to compare.
546   /// \param B - The second IRInstructionCandidate, operand values, and current
547   /// operand mappings to compare.
548   /// \returns true if the IRSimilarityCandidates operands are compatible.
549   static bool compareCommutativeOperandMapping(OperandMapping A,
550                                                OperandMapping B);
551 
552   /// Compare the start and end indices of the two IRSimilarityCandidates for
553   /// whether they overlap. If the start instruction of one
554   /// IRSimilarityCandidate is less than the end instruction of the other, and
555   /// the start instruction of one is greater than the start instruction of the
556   /// other, they overlap.
557   ///
558   /// \returns true if the IRSimilarityCandidates do not have overlapping
559   /// instructions.
560   static bool overlap(const IRSimilarityCandidate &A,
561                       const IRSimilarityCandidate &B);
562 
563   /// \returns the number of instructions in this Candidate.
getLength()564   unsigned getLength() const { return Len; }
565 
566   /// \returns the start index of this IRSimilarityCandidate.
getStartIdx()567   unsigned getStartIdx() const { return StartIdx; }
568 
569   /// \returns the end index of this IRSimilarityCandidate.
getEndIdx()570   unsigned getEndIdx() const { return StartIdx + Len - 1; }
571 
572   /// \returns The first IRInstructionData.
front()573   IRInstructionData *front() const { return FirstInst; }
574   /// \returns The last IRInstructionData.
back()575   IRInstructionData *back() const { return LastInst; }
576 
577   /// \returns The first Instruction.
frontInstruction()578   Instruction *frontInstruction() { return FirstInst->Inst; }
579   /// \returns The last Instruction
backInstruction()580   Instruction *backInstruction() { return LastInst->Inst; }
581 
582   /// \returns The BasicBlock the IRSimilarityCandidate starts in.
getStartBB()583   BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); }
584   /// \returns The BasicBlock the IRSimilarityCandidate ends in.
getEndBB()585   BasicBlock *getEndBB() { return LastInst->Inst->getParent(); }
586 
587   /// \returns The Function that the IRSimilarityCandidate is located in.
getFunction()588   Function *getFunction() { return getStartBB()->getParent(); }
589 
590   /// Finds the positive number associated with \p V if it has been mapped.
591   /// \param [in] V - the Value to find.
592   /// \returns The positive number corresponding to the value.
593   /// \returns None if not present.
getGVN(Value * V)594   Optional<unsigned> getGVN(Value *V) {
595     assert(V != nullptr && "Value is a nullptr?");
596     DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V);
597     if (VNIt == ValueToNumber.end())
598       return None;
599     return VNIt->second;
600   }
601 
602   /// Finds the Value associate with \p Num if it exists.
603   /// \param [in] Num - the number to find.
604   /// \returns The Value associated with the number.
605   /// \returns None if not present.
fromGVN(unsigned Num)606   Optional<Value *> fromGVN(unsigned Num) {
607     DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num);
608     if (VNIt == NumberToValue.end())
609       return None;
610     assert(VNIt->second != nullptr && "Found value is a nullptr!");
611     return VNIt->second;
612   }
613 
614   /// \param RHS -The IRSimilarityCandidate to compare against
615   /// \returns true if the IRSimilarityCandidate is occurs after the
616   /// IRSimilarityCandidate in the program.
617   bool operator<(const IRSimilarityCandidate &RHS) const {
618     return getStartIdx() > RHS.getStartIdx();
619   }
620 
621   using iterator = IRInstructionDataList::iterator;
begin()622   iterator begin() const { return iterator(front()); }
end()623   iterator end() const { return std::next(iterator(back())); }
624 };
625 
626 typedef std::vector<IRSimilarityCandidate> SimilarityGroup;
627 typedef std::vector<SimilarityGroup> SimilarityGroupList;
628 
629 /// This class puts all the pieces of the IRInstructionData,
630 /// IRInstructionMapper, IRSimilarityCandidate together.
631 ///
632 /// It first feeds the Module or vector of Modules into the IRInstructionMapper,
633 /// and puts all the mapped instructions into a single long list of
634 /// IRInstructionData.
635 ///
636 /// The list of unsigned integers is given to the Suffix Tree or similar data
637 /// structure to find repeated subsequences.  We construct an
638 /// IRSimilarityCandidate for each instance of the subsequence.  We compare them
639 /// against one another since  These repeated subsequences can have different
640 /// structure.  For each different kind of structure found, we create a
641 /// similarity group.
642 ///
643 /// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are
644 /// structurally similar to one another, while C is different we would have two
645 /// SimilarityGroups:
646 ///
647 /// SimilarityGroup 1:  SimilarityGroup 2
648 /// A, B, D             C
649 ///
650 /// A list of the different similarity groups is then returned after
651 /// analyzing the module.
652 class IRSimilarityIdentifier {
653 public:
IRSimilarityIdentifier()654   IRSimilarityIdentifier()
655       : Mapper(&InstDataAllocator, &InstDataListAllocator) {}
656 
657 private:
658   /// Map the instructions in the module to unsigned integers, using mapping
659   /// already present in the Mapper if possible.
660   ///
661   /// \param [in] M Module - To map to integers.
662   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
663   /// \param [in,out] IntegerMapping - The vector to append integers to.
664   void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList,
665                       std::vector<unsigned> &IntegerMapping);
666 
667   /// Map the instructions in the modules vector to unsigned integers, using
668   /// mapping already present in the mapper if possible.
669   ///
670   /// \param [in] Modules - The list of modules to use to populate the mapper
671   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
672   /// \param [in,out] IntegerMapping - The vector to append integers to.
673   void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules,
674                       std::vector<IRInstructionData *> &InstrList,
675                       std::vector<unsigned> &IntegerMapping);
676 
677   /// Find the similarity candidates in \p InstrList and corresponding
678   /// \p UnsignedVec
679   ///
680   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
681   /// \param [in,out] IntegerMapping - The vector to append integers to.
682   /// candidates found in the program.
683   void findCandidates(std::vector<IRInstructionData *> &InstrList,
684                       std::vector<unsigned> &IntegerMapping);
685 
686 public:
687   // Find the IRSimilarityCandidates in the \p Modules and group by structural
688   // similarity in a SimilarityGroup, each group is returned in a
689   // SimilarityGroupList.
690   //
691   // \param [in] Modules - the modules to analyze.
692   // \returns The groups of similarity ranges found in the modules.
693   SimilarityGroupList &
694   findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules);
695 
696   // Find the IRSimilarityCandidates in the given Module grouped by structural
697   // similarity in a SimilarityGroup, contained inside a SimilarityGroupList.
698   //
699   // \param [in] M - the module to analyze.
700   // \returns The groups of similarity ranges found in the module.
701   SimilarityGroupList &findSimilarity(Module &M);
702 
703   // Clears \ref SimilarityCandidates if it is already filled by a previous run.
resetSimilarityCandidates()704   void resetSimilarityCandidates() {
705     // If we've already analyzed a Module or set of Modules, so we must clear
706     // the SimilarityCandidates to make sure we do not have only old values
707     // hanging around.
708     if (SimilarityCandidates.hasValue())
709       SimilarityCandidates->clear();
710     else
711       SimilarityCandidates = SimilarityGroupList();
712   }
713 
714   // \returns The groups of similarity ranges found in the most recently passed
715   // set of modules.
getSimilarity()716   Optional<SimilarityGroupList> &getSimilarity() {
717     return SimilarityCandidates;
718   }
719 
720 private:
721   /// The allocator for IRInstructionData.
722   SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator;
723 
724   /// The allocator for IRInstructionDataLists.
725   SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator;
726 
727   /// Map Instructions to unsigned integers and wraps the Instruction in an
728   /// instance of IRInstructionData.
729   IRInstructionMapper Mapper;
730 
731   /// The SimilarityGroups found with the most recent run of \ref
732   /// findSimilarity. None if there is no recent run.
733   Optional<SimilarityGroupList> SimilarityCandidates;
734 };
735 
736 } // end namespace IRSimilarity
737 
738 /// An analysis pass based on legacy pass manager that runs and returns
739 /// IRSimilarityIdentifier run on the Module.
740 class IRSimilarityIdentifierWrapperPass : public ModulePass {
741   std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI;
742 
743 public:
744   static char ID;
745   IRSimilarityIdentifierWrapperPass();
746 
getIRSI()747   IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; }
getIRSI()748   const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; }
749 
750   bool doInitialization(Module &M) override;
751   bool doFinalization(Module &M) override;
752   bool runOnModule(Module &M) override;
getAnalysisUsage(AnalysisUsage & AU)753   void getAnalysisUsage(AnalysisUsage &AU) const override {
754     AU.setPreservesAll();
755   }
756 };
757 
758 /// An analysis pass that runs and returns the IRSimilarityIdentifier run on the
759 /// Module.
760 class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> {
761 public:
762   typedef IRSimilarity::IRSimilarityIdentifier Result;
763 
764   Result run(Module &M, ModuleAnalysisManager &);
765 
766 private:
767   friend AnalysisInfoMixin<IRSimilarityAnalysis>;
768   static AnalysisKey Key;
769 };
770 
771 /// Printer pass that uses \c IRSimilarityAnalysis.
772 class IRSimilarityAnalysisPrinterPass
773     : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> {
774   raw_ostream &OS;
775 
776 public:
IRSimilarityAnalysisPrinterPass(raw_ostream & OS)777   explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {}
778   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
779 };
780 
781 } // end namespace llvm
782 
783 #endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
784