1 //===- IRSimilarityIdentifier.h - Find similarity in a module --------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // Interface file for the IRSimilarityIdentifier for identifying similarities in
11 // IR including the IRInstructionMapper, which maps an Instruction to unsigned
12 // integers.
13 //
14 // Two sequences of instructions are called "similar" if they perform the same
15 // series of operations for all inputs.
16 //
17 // \code
18 // %1 = add i32 %a, 10
19 // %2 = add i32 %a, %1
20 // %3 = icmp slt icmp %1, %2
21 // \endcode
22 //
23 // and
24 //
25 // \code
26 // %1 = add i32 11, %a
27 // %2 = sub i32 %a, %1
28 // %3 = icmp sgt icmp %2, %1
29 // \endcode
30 //
31 // ultimately have the same result, even if the inputs, and structure are
32 // slightly different.
33 //
34 // For instructions, we do not worry about operands that do not have fixed
35 // semantic meaning to the program.  We consider the opcode that the instruction
36 // has, the types, parameters, and extra information such as the function name,
37 // or comparison predicate.  These are used to create a hash to map instructions
38 // to integers to be used in similarity matching in sequences of instructions
39 //
40 // Terminology:
41 // An IRSimilarityCandidate is a region of IRInstructionData (wrapped
42 // Instructions), usually used to denote a region of similarity has been found.
43 //
44 // A SimilarityGroup is a set of IRSimilarityCandidates that are structurally
45 // similar to one another.
46 //
47 //===----------------------------------------------------------------------===//
48 
49 #ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
50 #define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
51 
52 #include "llvm/IR/InstVisitor.h"
53 #include "llvm/IR/Instructions.h"
54 #include "llvm/IR/PassManager.h"
55 #include "llvm/Pass.h"
56 #include "llvm/Support/Allocator.h"
57 #include <optional>
58 
59 namespace llvm {
60 class Module;
61 
62 namespace IRSimilarity {
63 
64 struct IRInstructionDataList;
65 
66 /// This represents what is and is not supported when finding similarity in
67 /// Instructions.
68 ///
69 /// Legal Instructions are considered when looking at similarity between
70 /// Instructions.
71 ///
72 /// Illegal Instructions cannot be considered when looking for similarity
73 /// between Instructions. They act as boundaries between similarity regions.
74 ///
75 /// Invisible Instructions are skipped over during analysis.
76 // TODO: Shared with MachineOutliner
77 enum InstrType { Legal, Illegal, Invisible };
78 
79 /// This provides the utilities for hashing an Instruction to an unsigned
80 /// integer. Two IRInstructionDatas produce the same hash value when their
81 /// underlying Instructions perform the same operation (even if they don't have
82 /// the same input operands.)
83 /// As a more concrete example, consider the following:
84 ///
85 /// \code
86 /// %add1 = add i32 %a, %b
87 /// %add2 = add i32 %c, %d
88 /// %add3 = add i64 %e, %f
89 /// \endcode
90 ///
91 // Then the IRInstructionData wrappers for these Instructions may be hashed like
92 /// so:
93 ///
94 /// \code
95 /// ; These two adds have the same types and operand types, so they hash to the
96 /// ; same number.
97 /// %add1 = add i32 %a, %b ; Hash: 1
98 /// %add2 = add i32 %c, %d ; Hash: 1
99 /// ; This add produces an i64. This differentiates it from %add1 and %add2. So,
100 /// ; it hashes to a different number.
101 /// %add3 = add i64 %e, %f; Hash: 2
102 /// \endcode
103 ///
104 ///
105 /// This hashing scheme will be used to represent the program as a very long
106 /// string. This string can then be placed in a data structure which can be used
107 /// for similarity queries.
108 ///
109 /// TODO: Handle types of Instructions which can be equal even with different
110 /// operands. (E.g. comparisons with swapped predicates.)
111 /// TODO: Handle CallInsts, which are only checked for function type
112 /// by \ref isSameOperationAs.
113 /// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
114 /// exact same, and some do not.
115 struct IRInstructionData
116     : ilist_node<IRInstructionData, ilist_sentinel_tracking<true>> {
117 
118   /// The source Instruction that is being wrapped.
119   Instruction *Inst = nullptr;
120   /// The values of the operands in the Instruction.
121   SmallVector<Value *, 4> OperVals;
122   /// The legality of the wrapped instruction. This is informed by InstrType,
123   /// and is used when checking when two instructions are considered similar.
124   /// If either instruction is not legal, the instructions are automatically not
125   /// considered similar.
126   bool Legal = false;
127 
128   /// This is only relevant if we are wrapping a CmpInst where we needed to
129   /// change the predicate of a compare instruction from a greater than form
130   /// to a less than form.  It is std::nullopt otherwise.
131   std::optional<CmpInst::Predicate> RevisedPredicate;
132 
133   /// This is only relevant if we are wrapping a CallInst. If we are requiring
134   /// that the function calls have matching names as well as types, and the
135   /// call is not an indirect call, this will hold the name of the function.  If
136   /// it is an indirect string, it will be the empty string.  However, if this
137   /// requirement is not in place it will be the empty string regardless of the
138   /// function call type.  The value held here is used to create the hash of the
139   /// instruction, and check to make sure two instructions are close to one
140   /// another.
141   std::optional<std::string> CalleeName;
142 
143   /// This structure holds the distances of how far "ahead of" or "behind" the
144   /// target blocks of a branch, or the incoming blocks of a phi nodes are.
145   /// If the value is negative, it means that the block was registered before
146   /// the block of this instruction in terms of blocks in the function.
147   /// Code Example:
148   /// \code
149   /// block_1:
150   ///   br i1 %0, label %block_2, label %block_3
151   /// block_2:
152   ///   br i1 %1, label %block_1, label %block_2
153   /// block_3:
154   ///   br i1 %2, label %block_2, label %block_1
155   /// ; Replacing the labels with relative values, this becomes:
156   /// block_1:
157   ///   br i1 %0, distance 1, distance 2
158   /// block_2:
159   ///   br i1 %1, distance -1, distance 0
160   /// block_3:
161   ///   br i1 %2, distance -1, distance -2
162   /// \endcode
163   /// Taking block_2 as our example, block_1 is "behind" block_2, and block_2 is
164   /// "ahead" of block_2.
165   SmallVector<int, 4> RelativeBlockLocations;
166 
167   /// Gather the information that is difficult to gather for an Instruction, or
168   /// is changed. i.e. the operands of an Instruction and the Types of those
169   /// operands. This extra information allows for similarity matching to make
170   /// assertions that allow for more flexibility when checking for whether an
171   /// Instruction performs the same operation.
172   IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL);
173   IRInstructionData(IRInstructionDataList &IDL);
174 
175   /// Fills data stuctures for IRInstructionData when it is constructed from a
176   // reference or a pointer.
177   void initializeInstruction();
178 
179   /// Get the predicate that the compare instruction is using for hashing the
180   /// instruction. the IRInstructionData must be wrapping a CmpInst.
181   CmpInst::Predicate getPredicate() const;
182 
183   /// Get the callee name that the call instruction is using for hashing the
184   /// instruction. The IRInstructionData must be wrapping a CallInst.
185   StringRef getCalleeName() const;
186 
187   /// A function that swaps the predicates to their less than form if they are
188   /// in a greater than form. Otherwise, the predicate is unchanged.
189   ///
190   /// \param CI - The comparison operation to find a consistent preidcate for.
191   /// \return the consistent comparison predicate.
192   static CmpInst::Predicate predicateForConsistency(CmpInst *CI);
193 
194   /// For an IRInstructionData containing a branch, finds the
195   /// relative distances from the source basic block to the target by taking
196   /// the difference of the number assigned to the current basic block and the
197   /// target basic block of the branch.
198   ///
199   /// \param BasicBlockToInteger - The mapping of basic blocks to their location
200   /// in the module.
201   void
202   setBranchSuccessors(DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger);
203 
204   /// For an IRInstructionData containing a CallInst, set the function name
205   /// appropriately.  This will be an empty string if it is an indirect call,
206   /// or we are not matching by name of the called function.  It will be the
207   /// name of the function if \p MatchByName is true and it is not an indirect
208   /// call.  We may decide not to match by name in order to expand the
209   /// size of the regions we can match.  If a function name has the same type
210   /// signature, but the different name, the region of code is still almost the
211   /// same.  Since function names can be treated as constants, the name itself
212   /// could be extrapolated away.  However, matching by name provides a
213   /// specificity and more "identical" code than not matching by name.
214   ///
215   /// \param MatchByName - A flag to mark whether we are using the called
216   /// function name as a differentiating parameter.
217   void setCalleeName(bool MatchByName = true);
218 
219   /// For an IRInstructionData containing a PHINode, finds the
220   /// relative distances from the incoming basic block to the current block by
221   /// taking the difference of the number assigned to the current basic block
222   /// and the incoming basic block of the branch.
223   ///
224   /// \param BasicBlockToInteger - The mapping of basic blocks to their location
225   /// in the module.
226   void
227   setPHIPredecessors(DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger);
228 
229   /// Get the BasicBlock based operands for PHINodes and BranchInsts.
230   ///
231   /// \returns A list of relevant BasicBlocks.
232   ArrayRef<Value *> getBlockOperVals();
233 
234   /// Hashes \p Value based on its opcode, types, and operand types.
235   /// Two IRInstructionData instances produce the same hash when they perform
236   /// the same operation.
237   ///
238   /// As a simple example, consider the following instructions.
239   ///
240   /// \code
241   /// %add1 = add i32 %x1, %y1
242   /// %add2 = add i32 %x2, %y2
243   ///
244   /// %sub = sub i32 %x1, %y1
245   ///
246   /// %add_i64 = add i64 %x2, %y2
247   /// \endcode
248   ///
249   /// Because the first two adds operate the same types, and are performing the
250   /// same action, they will be hashed to the same value.
251   ///
252   /// However, the subtraction instruction is not the same as an addition, and
253   /// will be hashed to a different value.
254   ///
255   /// Finally, the last add has a different type compared to the first two add
256   /// instructions, so it will also be hashed to a different value that any of
257   /// the previous instructions.
258   ///
259   /// \param [in] ID - The IRInstructionData instance to be hashed.
260   /// \returns A hash_value of the IRInstructionData.
hash_valueIRInstructionData261   friend hash_code hash_value(const IRInstructionData &ID) {
262     SmallVector<Type *, 4> OperTypes;
263     for (Value *V : ID.OperVals)
264       OperTypes.push_back(V->getType());
265 
266     if (isa<CmpInst>(ID.Inst))
267       return llvm::hash_combine(
268           llvm::hash_value(ID.Inst->getOpcode()),
269           llvm::hash_value(ID.Inst->getType()),
270           llvm::hash_value(ID.getPredicate()),
271           llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
272 
273     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(ID.Inst)) {
274       // To hash intrinsics, we use the opcode, and types like the other
275       // instructions, but also, the Intrinsic ID, and the Name of the
276       // intrinsic.
277       Intrinsic::ID IntrinsicID = II->getIntrinsicID();
278       return llvm::hash_combine(
279           llvm::hash_value(ID.Inst->getOpcode()),
280           llvm::hash_value(ID.Inst->getType()), llvm::hash_value(IntrinsicID),
281           llvm::hash_value(*ID.CalleeName),
282           llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
283     }
284 
285     if (isa<CallInst>(ID.Inst)) {
286       std::string FunctionName = *ID.CalleeName;
287       return llvm::hash_combine(
288           llvm::hash_value(ID.Inst->getOpcode()),
289           llvm::hash_value(ID.Inst->getType()),
290           llvm::hash_value(ID.Inst->getType()), llvm::hash_value(FunctionName),
291           llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
292     }
293 
294     return llvm::hash_combine(
295         llvm::hash_value(ID.Inst->getOpcode()),
296         llvm::hash_value(ID.Inst->getType()),
297         llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
298   }
299 
300   IRInstructionDataList *IDL = nullptr;
301 };
302 
303 struct IRInstructionDataList
304     : simple_ilist<IRInstructionData, ilist_sentinel_tracking<true>> {};
305 
306 /// Compare one IRInstructionData class to another IRInstructionData class for
307 /// whether they are performing a the same operation, and can mapped to the
308 /// same value. For regular instructions if the hash value is the same, then
309 /// they will also be close.
310 ///
311 /// \param A - The first IRInstructionData class to compare
312 /// \param B - The second IRInstructionData class to compare
313 /// \returns true if \p A and \p B are similar enough to be mapped to the same
314 /// value.
315 bool isClose(const IRInstructionData &A, const IRInstructionData &B);
316 
317 struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> {
getEmptyKeyIRInstructionDataTraits318   static inline IRInstructionData *getEmptyKey() { return nullptr; }
getTombstoneKeyIRInstructionDataTraits319   static inline IRInstructionData *getTombstoneKey() {
320     return reinterpret_cast<IRInstructionData *>(-1);
321   }
322 
getHashValueIRInstructionDataTraits323   static unsigned getHashValue(const IRInstructionData *E) {
324     using llvm::hash_value;
325     assert(E && "IRInstructionData is a nullptr?");
326     return hash_value(*E);
327   }
328 
isEqualIRInstructionDataTraits329   static bool isEqual(const IRInstructionData *LHS,
330                       const IRInstructionData *RHS) {
331     if (RHS == getEmptyKey() || RHS == getTombstoneKey() ||
332         LHS == getEmptyKey() || LHS == getTombstoneKey())
333       return LHS == RHS;
334 
335     assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?");
336     return isClose(*LHS, *RHS);
337   }
338 };
339 
340 /// Helper struct for converting the Instructions in a Module into a vector of
341 /// unsigned integers. This vector of unsigned integers can be thought of as a
342 /// "numeric string". This numeric string can then be queried by, for example,
343 /// data structures that find repeated substrings.
344 ///
345 /// This hashing is done per BasicBlock in the module. To hash Instructions
346 /// based off of their operations, each Instruction is wrapped in an
347 /// IRInstructionData struct. The unsigned integer for an IRInstructionData
348 /// depends on:
349 /// - The hash provided by the IRInstructionData.
350 /// - Which member of InstrType the IRInstructionData is classified as.
351 // See InstrType for more details on the possible classifications, and how they
352 // manifest in the numeric string.
353 ///
354 /// The numeric string for an individual BasicBlock is terminated by an unique
355 /// unsigned integer. This prevents data structures which rely on repetition
356 /// from matching across BasicBlocks. (For example, the SuffixTree.)
357 /// As a concrete example, if we have the following two BasicBlocks:
358 /// \code
359 /// bb0:
360 /// %add1 = add i32 %a, %b
361 /// %add2 = add i32 %c, %d
362 /// %add3 = add i64 %e, %f
363 /// bb1:
364 /// %sub = sub i32 %c, %d
365 /// \endcode
366 /// We may hash the Instructions like this (via IRInstructionData):
367 /// \code
368 /// bb0:
369 /// %add1 = add i32 %a, %b ; Hash: 1
370 /// %add2 = add i32 %c, %d; Hash: 1
371 /// %add3 = add i64 %e, %f; Hash: 2
372 /// bb1:
373 /// %sub = sub i32 %c, %d; Hash: 3
374 /// %add4 = add i32 %c, %d ; Hash: 1
375 /// \endcode
376 /// And produce a "numeric string representation" like so:
377 /// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2
378 ///
379 /// TODO: This is very similar to the MachineOutliner, and should be
380 /// consolidated into the same interface.
381 struct IRInstructionMapper {
382   /// The starting illegal instruction number to map to.
383   ///
384   /// Set to -3 for compatibility with DenseMapInfo<unsigned>.
385   unsigned IllegalInstrNumber = static_cast<unsigned>(-3);
386 
387   /// The next available integer to assign to a legal Instruction to.
388   unsigned LegalInstrNumber = 0;
389 
390   /// Correspondence from IRInstructionData to unsigned integers.
391   DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>
392       InstructionIntegerMap;
393 
394   /// A mapping for a basic block in a module to its assigned number/location
395   /// in the module.
396   DenseMap<BasicBlock *, unsigned> BasicBlockToInteger;
397 
398   /// Set if we added an illegal number in the previous step.
399   /// Since each illegal number is unique, we only need one of them between
400   /// each range of legal numbers. This lets us make sure we don't add more
401   /// than one illegal number per range.
402   bool AddedIllegalLastTime = false;
403 
404   /// Marks whether we found a illegal instruction in the previous step.
405   bool CanCombineWithPrevInstr = false;
406 
407   /// Marks whether we have found a set of instructions that is long enough
408   /// to be considered for similarity.
409   bool HaveLegalRange = false;
410 
411   /// Marks whether we should use exact function names, as well as types to
412   /// find similarity between calls.
413   bool EnableMatchCallsByName = false;
414 
415   /// This allocator pointer is in charge of holding on to the IRInstructionData
416   /// so it is not deallocated until whatever external tool is using it is done
417   /// with the information.
418   SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr;
419 
420   /// This allocator pointer is in charge of creating the IRInstructionDataList
421   /// so it is not deallocated until whatever external tool is using it is done
422   /// with the information.
423   SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr;
424 
425   /// Get an allocated IRInstructionData struct using the InstDataAllocator.
426   ///
427   /// \param I - The Instruction to wrap with IRInstructionData.
428   /// \param Legality - A boolean value that is true if the instruction is to
429   /// be considered for similarity, and false if not.
430   /// \param IDL - The InstructionDataList that the IRInstructionData is
431   /// inserted into.
432   /// \returns An allocated IRInstructionData struct.
433   IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality,
434                                                IRInstructionDataList &IDL);
435 
436   /// Get an empty allocated IRInstructionData struct using the
437   /// InstDataAllocator.
438   ///
439   /// \param IDL - The InstructionDataList that the IRInstructionData is
440   /// inserted into.
441   /// \returns An allocated IRInstructionData struct.
442   IRInstructionData *allocateIRInstructionData(IRInstructionDataList &IDL);
443 
444   /// Get an allocated IRInstructionDataList object using the IDLAllocator.
445   ///
446   /// \returns An allocated IRInstructionDataList object.
447   IRInstructionDataList *allocateIRInstructionDataList();
448 
449   IRInstructionDataList *IDL = nullptr;
450 
451   /// Assigns values to all the basic blocks in function \p F starting from
452   /// integer \p BBNumber.
453   ///
454   /// \param F - The function containing the basic blocks to assign numbers to.
455   /// \param BBNumber - The number to start from.
initializeForBBsIRInstructionMapper456   void initializeForBBs(Function &F, unsigned &BBNumber) {
457     for (BasicBlock &BB : F)
458       BasicBlockToInteger.insert(std::make_pair(&BB, BBNumber++));
459   }
460 
461   /// Assigns values to all the basic blocks in Module \p M.
462   /// \param M - The module containing the basic blocks to assign numbers to.
initializeForBBsIRInstructionMapper463   void initializeForBBs(Module &M) {
464     unsigned BBNumber = 0;
465     for (Function &F : M)
466       initializeForBBs(F, BBNumber);
467   }
468 
469   /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers
470   /// determined by \p InstrType. Two Instructions are mapped to the same value
471   /// if they are close as defined by the InstructionData class above.
472   ///
473   /// \param [in] BB - The BasicBlock to be mapped to integers.
474   /// \param [in,out] InstrList - Vector of IRInstructionData to append to.
475   /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to.
476   void convertToUnsignedVec(BasicBlock &BB,
477                             std::vector<IRInstructionData *> &InstrList,
478                             std::vector<unsigned> &IntegerMapping);
479 
480   /// Maps an Instruction to a legal integer.
481   ///
482   /// \param [in] It - The Instruction to be mapped to an integer.
483   /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
484   /// append to.
485   /// \param [in,out] InstrListForBB - Vector of InstructionData to append to.
486   /// \returns The integer \p It was mapped to.
487   unsigned mapToLegalUnsigned(BasicBlock::iterator &It,
488                               std::vector<unsigned> &IntegerMappingForBB,
489                               std::vector<IRInstructionData *> &InstrListForBB);
490 
491   /// Maps an Instruction to an illegal integer.
492   ///
493   /// \param [in] It - The \p Instruction to be mapped to an integer.
494   /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
495   /// append to.
496   /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to.
497   /// \param End - true if creating a dummy IRInstructionData at the end of a
498   /// basic block.
499   /// \returns The integer \p It was mapped to.
500   unsigned mapToIllegalUnsigned(
501       BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
502       std::vector<IRInstructionData *> &InstrListForBB, bool End = false);
503 
IRInstructionMapperIRInstructionMapper504   IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA,
505                       SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA)
506       : InstDataAllocator(IDA), IDLAllocator(IDLA) {
507     // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
508     // changed.
509     assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) &&
510            "DenseMapInfo<unsigned>'s empty key isn't -1!");
511     assert(DenseMapInfo<unsigned>::getTombstoneKey() ==
512                static_cast<unsigned>(-2) &&
513            "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
514 
515     IDL = new (IDLAllocator->Allocate())
516         IRInstructionDataList();
517   }
518 
519   /// Custom InstVisitor to classify different instructions for whether it can
520   /// be analyzed for similarity.
521   struct InstructionClassification
522       : public InstVisitor<InstructionClassification, InstrType> {
523     InstructionClassification() = default;
524 
525     // TODO: Determine a scheme to resolve when the label is similar enough.
visitBranchInstIRInstructionMapper::InstructionClassification526     InstrType visitBranchInst(BranchInst &BI) {
527       if (EnableBranches)
528         return Legal;
529       return Illegal;
530     }
visitPHINodeIRInstructionMapper::InstructionClassification531     InstrType visitPHINode(PHINode &PN) {
532       if (EnableBranches)
533         return Legal;
534       return Illegal;
535     }
536     // TODO: Handle allocas.
visitAllocaInstIRInstructionMapper::InstructionClassification537     InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; }
538     // We exclude variable argument instructions since variable arguments
539     // requires extra checking of the argument list.
visitVAArgInstIRInstructionMapper::InstructionClassification540     InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; }
541     // We exclude all exception handling cases since they are so context
542     // dependent.
visitLandingPadInstIRInstructionMapper::InstructionClassification543     InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
visitFuncletPadInstIRInstructionMapper::InstructionClassification544     InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
545     // DebugInfo should be included in the regions, but should not be
546     // analyzed for similarity as it has no bearing on the outcome of the
547     // program.
visitDbgInfoIntrinsicIRInstructionMapper::InstructionClassification548     InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
visitIntrinsicInstIRInstructionMapper::InstructionClassification549     InstrType visitIntrinsicInst(IntrinsicInst &II) {
550       // These are disabled due to complications in the CodeExtractor when
551       // outlining these instructions.  For instance, It is unclear what we
552       // should do when moving only the start or end lifetime instruction into
553       // an outlined function. Also, assume-like intrinsics could be removed
554       // from the region, removing arguments, causing discrepencies in the
555       // number of inputs between different regions.
556       if (II.isAssumeLikeIntrinsic())
557         return Illegal;
558       return EnableIntrinsics ? Legal : Illegal;
559     }
560     // We only allow call instructions where the function has a name and
561     // is not an indirect call.
visitCallInstIRInstructionMapper::InstructionClassification562     InstrType visitCallInst(CallInst &CI) {
563       Function *F = CI.getCalledFunction();
564       bool IsIndirectCall = CI.isIndirectCall();
565       if (IsIndirectCall && !EnableIndirectCalls)
566         return Illegal;
567       if (!F && !IsIndirectCall)
568         return Illegal;
569       // Functions marked with the swifttailcc and tailcc calling conventions
570       // require special handling when outlining musttail functions.  The
571       // calling convention must be passed down to the outlined function as
572       // well. Further, there is special handling for musttail calls as well,
573       // requiring a return call directly after.  For now, the outliner does not
574       // support this, so we do not handle matching this case either.
575       if ((CI.getCallingConv() == CallingConv::SwiftTail ||
576            CI.getCallingConv() == CallingConv::Tail) &&
577           !EnableMustTailCalls)
578         return Illegal;
579       if (CI.isMustTailCall() && !EnableMustTailCalls)
580         return Illegal;
581       return Legal;
582     }
583     // TODO: We do not current handle similarity that changes the control flow.
visitInvokeInstIRInstructionMapper::InstructionClassification584     InstrType visitInvokeInst(InvokeInst &II) { return Illegal; }
585     // TODO: We do not current handle similarity that changes the control flow.
visitCallBrInstIRInstructionMapper::InstructionClassification586     InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; }
587     // TODO: Handle interblock similarity.
visitTerminatorIRInstructionMapper::InstructionClassification588     InstrType visitTerminator(Instruction &I) { return Illegal; }
visitInstructionIRInstructionMapper::InstructionClassification589     InstrType visitInstruction(Instruction &I) { return Legal; }
590 
591     // The flag variable that lets the classifier know whether we should
592     // allow branches to be checked for similarity.
593     bool EnableBranches = false;
594 
595     // The flag variable that lets the classifier know whether we should
596     // allow indirect calls to be considered legal instructions.
597     bool EnableIndirectCalls = false;
598 
599     // Flag that lets the classifier know whether we should allow intrinsics to
600     // be checked for similarity.
601     bool EnableIntrinsics = false;
602 
603     // Flag that lets the classifier know whether we should allow tail calls to
604     // be checked for similarity.
605     bool EnableMustTailCalls = false;
606   };
607 
608   /// Maps an Instruction to a member of InstrType.
609   InstructionClassification InstClassifier;
610 };
611 
612 /// This is a class that wraps a range of IRInstructionData from one point to
613 /// another in the vector of IRInstructionData, which is a region of the
614 /// program.  It is also responsible for defining the structure within this
615 /// region of instructions.
616 ///
617 /// The structure of a region is defined through a value numbering system
618 /// assigned to each unique value in a region at the creation of the
619 /// IRSimilarityCandidate.
620 ///
621 /// For example, for each Instruction we add a mapping for each new
622 /// value seen in that Instruction.
623 /// IR:                    Mapping Added:
624 /// %add1 = add i32 %a, c1    %add1 -> 3, %a -> 1, c1 -> 2
625 /// %add2 = add i32 %a, %1    %add2 -> 4
626 /// %add3 = add i32 c2, c1    %add3 -> 6, c2 -> 5
627 ///
628 /// We can compare IRSimilarityCandidates against one another.
629 /// The \ref isSimilar function compares each IRInstructionData against one
630 /// another and if we have the same sequences of IRInstructionData that would
631 /// create the same hash, we have similar IRSimilarityCandidates.
632 ///
633 /// We can also compare the structure of IRSimilarityCandidates. If we can
634 /// create a mapping of registers in the region contained by one
635 /// IRSimilarityCandidate to the region contained by different
636 /// IRSimilarityCandidate, they can be considered structurally similar.
637 ///
638 /// IRSimilarityCandidate1:   IRSimilarityCandidate2:
639 /// %add1 = add i32 %a, %b    %add1 = add i32 %d, %e
640 /// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f
641 /// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4
642 ///
643 /// Can have the following mapping from candidate to candidate of:
644 /// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4
645 /// and can be considered similar.
646 ///
647 /// IRSimilarityCandidate1:   IRSimilarityCandidate2:
648 /// %add1 = add i32 %a, %b    %add1 = add i32 %d, c4
649 /// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f
650 /// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4
651 ///
652 /// We cannot create the same mapping since the use of c4 is not used in the
653 /// same way as %b or c2.
654 class IRSimilarityCandidate {
655 private:
656   /// The start index of this IRSimilarityCandidate in the instruction list.
657   unsigned StartIdx = 0;
658 
659   /// The number of instructions in this IRSimilarityCandidate.
660   unsigned Len = 0;
661 
662   /// The first instruction in this IRSimilarityCandidate.
663   IRInstructionData *FirstInst = nullptr;
664 
665   /// The last instruction in this IRSimilarityCandidate.
666   IRInstructionData *LastInst = nullptr;
667 
668   /// Global Value Numbering structures
669   /// @{
670   /// Stores the mapping of the value to the number assigned to it in the
671   /// IRSimilarityCandidate.
672   DenseMap<Value *, unsigned> ValueToNumber;
673   /// Stores the mapping of the number to the value assigned this number.
674   DenseMap<unsigned, Value *> NumberToValue;
675   /// Stores the mapping of a value's number to canonical numbering in the
676   /// candidate's respective similarity group.
677   DenseMap<unsigned, unsigned> NumberToCanonNum;
678   /// Stores the mapping of canonical number in the candidate's respective
679   /// similarity group to a value number.
680   DenseMap<unsigned, unsigned> CanonNumToNumber;
681   /// @}
682 
683 public:
684   /// \param StartIdx - The starting location of the region.
685   /// \param Len - The length of the region.
686   /// \param FirstInstIt - The starting IRInstructionData of the region.
687   /// \param LastInstIt - The ending IRInstructionData of the region.
688   IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
689                         IRInstructionData *FirstInstIt,
690                         IRInstructionData *LastInstIt);
691 
692   /// \param A - The first IRInstructionCandidate to compare.
693   /// \param B - The second IRInstructionCandidate to compare.
694   /// \returns True when every IRInstructionData in \p A is similar to every
695   /// IRInstructionData in \p B.
696   static bool isSimilar(const IRSimilarityCandidate &A,
697                         const IRSimilarityCandidate &B);
698 
699   /// \param [in] A - The first IRInstructionCandidate to compare.
700   /// \param [in] B - The second IRInstructionCandidate to compare.
701   /// \returns True when every IRInstructionData in \p A is structurally similar
702   /// to \p B.
703   static bool compareStructure(const IRSimilarityCandidate &A,
704                                const IRSimilarityCandidate &B);
705 
706   /// \param [in] A - The first IRInstructionCandidate to compare.
707   /// \param [in] B - The second IRInstructionCandidate to compare.
708   /// \param [in,out] ValueNumberMappingA - A mapping of value numbers from
709   /// candidate \p A to candidate \B.
710   /// \param [in,out] ValueNumberMappingB - A mapping of value numbers from
711   /// candidate \p B to candidate \A.
712   /// \returns True when every IRInstructionData in \p A is structurally similar
713   /// to \p B.
714   static bool
715   compareStructure(const IRSimilarityCandidate &A,
716                    const IRSimilarityCandidate &B,
717                    DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
718                    DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB);
719 
720   struct OperandMapping {
721     /// The IRSimilarityCandidate that holds the instruction the OperVals were
722     /// pulled from.
723     const IRSimilarityCandidate &IRSC;
724 
725     /// The operand values to be analyzed.
726     ArrayRef<Value *> &OperVals;
727 
728     /// The current mapping of global value numbers from one IRSimilarityCandidate
729     /// to another IRSimilarityCandidate.
730     DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping;
731   };
732 
733   /// A helper struct to hold the candidate, for a branch instruction, the
734   /// relative location of a label, and the label itself.  This is mostly to
735   /// group the values together before passing them as a bundle to a function.
736   struct RelativeLocMapping {
737     /// The IRSimilarityCandidate that holds the instruction the relative
738     /// location was pulled from.
739     const IRSimilarityCandidate &IRSC;
740 
741     /// The relative location to be analyzed.
742     int RelativeLocation;
743 
744     /// The corresponding value.
745     Value *OperVal;
746   };
747 
748   /// Compare the operands in \p A and \p B and check that the current mapping
749   /// of global value numbers from \p A to \p B and \p B to \A is consistent.
750   ///
751   /// \param A - The first IRInstructionCandidate, operand values, and current
752   /// operand mappings to compare.
753   /// \param B - The second IRInstructionCandidate, operand values, and current
754   /// operand mappings to compare.
755   /// \returns true if the IRSimilarityCandidates operands are compatible.
756   static bool compareNonCommutativeOperandMapping(OperandMapping A,
757                                                   OperandMapping B);
758 
759   /// Compare the operands in \p A and \p B and check that the current mapping
760   /// of global value numbers from \p A to \p B and \p B to \A is consistent
761   /// given that the operands are commutative.
762   ///
763   /// \param A - The first IRInstructionCandidate, operand values, and current
764   /// operand mappings to compare.
765   /// \param B - The second IRInstructionCandidate, operand values, and current
766   /// operand mappings to compare.
767   /// \returns true if the IRSimilarityCandidates operands are compatible.
768   static bool compareCommutativeOperandMapping(OperandMapping A,
769                                                OperandMapping B);
770 
771   /// Compare the GVN of the assignment value in corresponding instructions in
772   /// IRSimilarityCandidates \p A and \p B and check that there exists a mapping
773   /// between the values and replaces the mapping with a one-to-one value if
774   /// needed.
775   ///
776   /// \param InstValA - The assignment GVN from the first IRSimilarityCandidate.
777   /// \param InstValB - The assignment GVN from the second
778   /// IRSimilarityCandidate.
779   /// \param [in,out] ValueNumberMappingA - A mapping of value numbers from
780   /// candidate \p A to candidate \B.
781   /// \param [in,out] ValueNumberMappingB - A mapping of value numbers from
782   /// candidate \p B to candidate \A.
783   /// \returns true if the IRSimilarityCandidates assignments are compatible.
784   static bool compareAssignmentMapping(
785       const unsigned InstValA, const unsigned &InstValB,
786       DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
787       DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB);
788 
789   /// Compare the relative locations in \p A and \p B and check that the
790   /// distances match if both locations are contained in the region, and that
791   /// the branches both point outside the region if they do not.
792   /// Example Region:
793   /// \code
794   /// entry:
795   ///   br i1 %0, label %block_1, label %block_3
796   /// block_0:
797   ///   br i1 %0, label %block_1, label %block_2
798   /// block_1:
799   ///   br i1 %0, label %block_2, label %block_3
800   /// block_2:
801   ///   br i1 %1, label %block_1, label %block_4
802   /// block_3:
803   ///   br i1 %2, label %block_2, label %block_5
804   /// \endcode
805   /// If we compare the branches in block_0 and block_1 the relative values are
806   /// 1 and 2 for both, so we consider this a match.
807   ///
808   /// If we compare the branches in entry and block_0 the relative values are
809   /// 2 and 3, and 1 and 2 respectively.  Since these are not the same we do not
810   /// consider them a match.
811   ///
812   /// If we compare the branches in block_1 and block_2 the relative values are
813   /// 1 and 2, and -1 and None respectively.  As a result we do not consider
814   /// these to be the same
815   ///
816   /// If we compare the branches in block_2 and block_3 the relative values are
817   /// -1 and None for both.  We do consider these to be a match.
818   ///
819   /// \param A - The first IRInstructionCandidate, relative location value,
820   /// and incoming block.
821   /// \param B - The second IRInstructionCandidate, relative location value,
822   /// and incoming block.
823   /// \returns true if the relative locations match.
824   static bool checkRelativeLocations(RelativeLocMapping A,
825                                      RelativeLocMapping B);
826 
827   /// Create a mapping from the value numbering to a different separate set of
828   /// numbers. This will serve as a guide for relating one candidate to another.
829   /// The canonical number gives use the ability identify which global value
830   /// number in one candidate relates to the global value number in the other.
831   ///
832   /// \param [in, out] CurrCand - The IRSimilarityCandidate to create a
833   /// canonical numbering for.
834   static void createCanonicalMappingFor(IRSimilarityCandidate &CurrCand);
835 
836   /// Create a mapping for the value numbering of the calling
837   /// IRSimilarityCandidate, to a different separate set of numbers, based on
838   /// the canonical ordering in \p SourceCand. These are defined based on the
839   /// found mappings in \p ToSourceMapping and \p FromSourceMapping.  Both of
840   /// these relationships should have the same information, just in opposite
841   /// directions.
842   ///
843   /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a
844   /// canonical numbering from.
845   /// \param ToSourceMapping - The mapping of value numbers from this candidate
846   /// to \p SourceCand.
847   /// \param FromSourceMapping - The mapping of value numbers from \p SoureCand
848   /// to this candidate.
849   void createCanonicalRelationFrom(
850       IRSimilarityCandidate &SourceCand,
851       DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping,
852       DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping);
853 
854   /// Create a mapping for the value numbering of the calling
855   /// IRSimilarityCandidate, to a different separate set of numbers, based on
856   /// the canonical ordering in \p SourceCand. These are defined based on the
857   /// found mappings in \p ToSourceMapping and \p FromSourceMapping.  Both of
858   /// these relationships should have the same information, just in opposite
859   /// directions.  Uses the \p OneToOne mapping from target candidate to \p
860   /// SourceCand GVNs to determine the mapping first for values with multiple
861   /// mappings.  This mapping is created by the ordering of operands in the
862   /// instruction they are first seen in the candidates.
863   ///
864   /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a
865   /// canonical numbering from.
866   /// \param [in,out] OneToOne - A mapping of value numbers from candidate
867   /// \p A to candidate \B using the structure of the original instructions.
868   /// \param ToSourceMapping - The mapping of value numbers from this candidate
869   /// to \p SourceCand.
870   /// \param FromSourceMapping - The mapping of value numbers from \p SoureCand
871   /// to this candidate.
872   void createCanonicalRelationFrom(
873       IRSimilarityCandidate &SourceCand,
874       DenseMap<unsigned, unsigned> &OneToOne,
875       DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping,
876       DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping);
877 
878   /// Create a mapping for the value numbering of the calling
879   /// IRSimilarityCandidate, to a different separate set of numbers, based on
880   /// the canonical ordering in \p SourceCand. These are defined based on the
881   /// canonical mapping defined between \p SoureCandLarge and
882   /// \p TargetCandLarge.  These IRSimilarityCandidates are already structurally
883   /// similar, and fully encapsulate the IRSimilarityCandidates in question.
884   /// These are used as a "bridge" from the \p SourceCand to the target.
885   ///
886   /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a
887   /// canonical numbering from.
888   /// \param SoureCandLarge - The IRSimilarityCandidate fully containing
889   /// \p SourceCand.
890   /// \param TargetCandLarge -  The IRSimilarityCandidate fully containing
891   /// this Candidate.
892   void createCanonicalRelationFrom(
893       IRSimilarityCandidate &SourceCand,
894       IRSimilarityCandidate &SourceCandLarge,
895       IRSimilarityCandidate &TargetCandLarge);
896 
897   /// \param [in,out] BBSet - The set to track the basic blocks.
getBasicBlocks(DenseSet<BasicBlock * > & BBSet)898   void getBasicBlocks(DenseSet<BasicBlock *> &BBSet) const {
899     for (IRInstructionData &ID : *this) {
900       BasicBlock *BB = ID.Inst->getParent();
901       BBSet.insert(BB);
902     }
903   }
904 
905   /// \param [in,out] BBSet - The set to track the basic blocks.
906   /// \param [in,out] BBList - A list in order of use to track the basic blocks.
getBasicBlocks(DenseSet<BasicBlock * > & BBSet,SmallVector<BasicBlock * > & BBList)907   void getBasicBlocks(DenseSet<BasicBlock *> &BBSet,
908                       SmallVector<BasicBlock *> &BBList) const {
909     for (IRInstructionData &ID : *this) {
910       BasicBlock *BB = ID.Inst->getParent();
911       if (BBSet.insert(BB).second)
912         BBList.push_back(BB);
913     }
914   }
915 
916   /// Compare the start and end indices of the two IRSimilarityCandidates for
917   /// whether they overlap. If the start instruction of one
918   /// IRSimilarityCandidate is less than the end instruction of the other, and
919   /// the start instruction of one is greater than the start instruction of the
920   /// other, they overlap.
921   ///
922   /// \returns true if the IRSimilarityCandidates do not have overlapping
923   /// instructions.
924   static bool overlap(const IRSimilarityCandidate &A,
925                       const IRSimilarityCandidate &B);
926 
927   /// \returns the number of instructions in this Candidate.
getLength()928   unsigned getLength() const { return Len; }
929 
930   /// \returns the start index of this IRSimilarityCandidate.
getStartIdx()931   unsigned getStartIdx() const { return StartIdx; }
932 
933   /// \returns the end index of this IRSimilarityCandidate.
getEndIdx()934   unsigned getEndIdx() const { return StartIdx + Len - 1; }
935 
936   /// \returns The first IRInstructionData.
front()937   IRInstructionData *front() const { return FirstInst; }
938   /// \returns The last IRInstructionData.
back()939   IRInstructionData *back() const { return LastInst; }
940 
941   /// \returns The first Instruction.
frontInstruction()942   Instruction *frontInstruction() { return FirstInst->Inst; }
943   /// \returns The last Instruction
backInstruction()944   Instruction *backInstruction() { return LastInst->Inst; }
945 
946   /// \returns The BasicBlock the IRSimilarityCandidate starts in.
getStartBB()947   BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); }
948   /// \returns The BasicBlock the IRSimilarityCandidate ends in.
getEndBB()949   BasicBlock *getEndBB() { return LastInst->Inst->getParent(); }
950 
951   /// \returns The Function that the IRSimilarityCandidate is located in.
getFunction()952   Function *getFunction() { return getStartBB()->getParent(); }
953 
954   /// Finds the positive number associated with \p V if it has been mapped.
955   /// \param [in] V - the Value to find.
956   /// \returns The positive number corresponding to the value.
957   /// \returns std::nullopt if not present.
getGVN(Value * V)958   std::optional<unsigned> getGVN(Value *V) {
959     assert(V != nullptr && "Value is a nullptr?");
960     DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V);
961     if (VNIt == ValueToNumber.end())
962       return std::nullopt;
963     return VNIt->second;
964   }
965 
966   /// Finds the Value associate with \p Num if it exists.
967   /// \param [in] Num - the number to find.
968   /// \returns The Value associated with the number.
969   /// \returns std::nullopt if not present.
fromGVN(unsigned Num)970   std::optional<Value *> fromGVN(unsigned Num) {
971     DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num);
972     if (VNIt == NumberToValue.end())
973       return std::nullopt;
974     assert(VNIt->second != nullptr && "Found value is a nullptr!");
975     return VNIt->second;
976   }
977 
978   /// Find the canonical number from the global value number \p N stored in the
979   /// candidate.
980   ///
981   /// \param N - The global value number to find the canonical number for.
982   /// \returns An optional containing the value, and std::nullopt if it could
983   /// not be found.
getCanonicalNum(unsigned N)984   std::optional<unsigned> getCanonicalNum(unsigned N) {
985     DenseMap<unsigned, unsigned>::iterator NCIt = NumberToCanonNum.find(N);
986     if (NCIt == NumberToCanonNum.end())
987       return std::nullopt;
988     return NCIt->second;
989   }
990 
991   /// Find the global value number from the canonical number \p N stored in the
992   /// candidate.
993   ///
994   /// \param N - The canonical number to find the global vlaue number for.
995   /// \returns An optional containing the value, and std::nullopt if it could
996   /// not be found.
fromCanonicalNum(unsigned N)997   std::optional<unsigned> fromCanonicalNum(unsigned N) {
998     DenseMap<unsigned, unsigned>::iterator CNIt = CanonNumToNumber.find(N);
999     if (CNIt == CanonNumToNumber.end())
1000       return std::nullopt;
1001     return CNIt->second;
1002   }
1003 
1004   /// \param RHS -The IRSimilarityCandidate to compare against
1005   /// \returns true if the IRSimilarityCandidate is occurs after the
1006   /// IRSimilarityCandidate in the program.
1007   bool operator<(const IRSimilarityCandidate &RHS) const {
1008     return getStartIdx() > RHS.getStartIdx();
1009   }
1010 
1011   using iterator = IRInstructionDataList::iterator;
begin()1012   iterator begin() const { return iterator(front()); }
end()1013   iterator end() const { return std::next(iterator(back())); }
1014 };
1015 
1016 typedef DenseMap<IRSimilarityCandidate *,
1017                  DenseMap<unsigned, DenseSet<unsigned>>>
1018     CandidateGVNMapping;
1019 typedef std::vector<IRSimilarityCandidate> SimilarityGroup;
1020 typedef std::vector<SimilarityGroup> SimilarityGroupList;
1021 
1022 /// This class puts all the pieces of the IRInstructionData,
1023 /// IRInstructionMapper, IRSimilarityCandidate together.
1024 ///
1025 /// It first feeds the Module or vector of Modules into the IRInstructionMapper,
1026 /// and puts all the mapped instructions into a single long list of
1027 /// IRInstructionData.
1028 ///
1029 /// The list of unsigned integers is given to the Suffix Tree or similar data
1030 /// structure to find repeated subsequences.  We construct an
1031 /// IRSimilarityCandidate for each instance of the subsequence.  We compare them
1032 /// against one another since  These repeated subsequences can have different
1033 /// structure.  For each different kind of structure found, we create a
1034 /// similarity group.
1035 ///
1036 /// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are
1037 /// structurally similar to one another, while C is different we would have two
1038 /// SimilarityGroups:
1039 ///
1040 /// SimilarityGroup 1:  SimilarityGroup 2
1041 /// A, B, D             C
1042 ///
1043 /// A list of the different similarity groups is then returned after
1044 /// analyzing the module.
1045 class IRSimilarityIdentifier {
1046 public:
1047   IRSimilarityIdentifier(bool MatchBranches = true,
1048                          bool MatchIndirectCalls = true,
1049                          bool MatchCallsWithName = false,
1050                          bool MatchIntrinsics = true,
1051                          bool MatchMustTailCalls = true)
1052       : Mapper(&InstDataAllocator, &InstDataListAllocator),
1053         EnableBranches(MatchBranches), EnableIndirectCalls(MatchIndirectCalls),
1054         EnableMatchingCallsByName(MatchCallsWithName),
1055         EnableIntrinsics(MatchIntrinsics),
1056         EnableMustTailCalls(MatchMustTailCalls) {}
1057 
1058 private:
1059   /// Map the instructions in the module to unsigned integers, using mapping
1060   /// already present in the Mapper if possible.
1061   ///
1062   /// \param [in] M Module - To map to integers.
1063   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
1064   /// \param [in,out] IntegerMapping - The vector to append integers to.
1065   void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList,
1066                       std::vector<unsigned> &IntegerMapping);
1067 
1068   /// Map the instructions in the modules vector to unsigned integers, using
1069   /// mapping already present in the mapper if possible.
1070   ///
1071   /// \param [in] Modules - The list of modules to use to populate the mapper
1072   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
1073   /// \param [in,out] IntegerMapping - The vector to append integers to.
1074   void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules,
1075                       std::vector<IRInstructionData *> &InstrList,
1076                       std::vector<unsigned> &IntegerMapping);
1077 
1078   /// Find the similarity candidates in \p InstrList and corresponding
1079   /// \p UnsignedVec
1080   ///
1081   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
1082   /// \param [in,out] IntegerMapping - The vector to append integers to.
1083   /// candidates found in the program.
1084   void findCandidates(std::vector<IRInstructionData *> &InstrList,
1085                       std::vector<unsigned> &IntegerMapping);
1086 
1087 public:
1088   // Find the IRSimilarityCandidates in the \p Modules and group by structural
1089   // similarity in a SimilarityGroup, each group is returned in a
1090   // SimilarityGroupList.
1091   //
1092   // \param [in] Modules - the modules to analyze.
1093   // \returns The groups of similarity ranges found in the modules.
1094   SimilarityGroupList &
1095   findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules);
1096 
1097   // Find the IRSimilarityCandidates in the given Module grouped by structural
1098   // similarity in a SimilarityGroup, contained inside a SimilarityGroupList.
1099   //
1100   // \param [in] M - the module to analyze.
1101   // \returns The groups of similarity ranges found in the module.
1102   SimilarityGroupList &findSimilarity(Module &M);
1103 
1104   // Clears \ref SimilarityCandidates if it is already filled by a previous run.
resetSimilarityCandidates()1105   void resetSimilarityCandidates() {
1106     // If we've already analyzed a Module or set of Modules, so we must clear
1107     // the SimilarityCandidates to make sure we do not have only old values
1108     // hanging around.
1109     if (SimilarityCandidates)
1110       SimilarityCandidates->clear();
1111     else
1112       SimilarityCandidates = SimilarityGroupList();
1113   }
1114 
1115   // \returns The groups of similarity ranges found in the most recently passed
1116   // set of modules.
getSimilarity()1117   std::optional<SimilarityGroupList> &getSimilarity() {
1118     return SimilarityCandidates;
1119   }
1120 
1121 private:
1122   /// The allocator for IRInstructionData.
1123   SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator;
1124 
1125   /// The allocator for IRInstructionDataLists.
1126   SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator;
1127 
1128   /// Map Instructions to unsigned integers and wraps the Instruction in an
1129   /// instance of IRInstructionData.
1130   IRInstructionMapper Mapper;
1131 
1132   /// The flag variable that marks whether we should check branches for
1133   /// similarity, or only look within basic blocks.
1134   bool EnableBranches = true;
1135 
1136   /// The flag variable that marks whether we allow indirect calls to be checked
1137   /// for similarity, or exclude them as a legal instruction.
1138   bool EnableIndirectCalls = true;
1139 
1140   /// The flag variable that marks whether we allow calls to be marked as
1141   /// similar if they do not have the same name, only the same calling
1142   /// convention, attributes and type signature.
1143   bool EnableMatchingCallsByName = true;
1144 
1145   /// The flag variable that marks whether we should check intrinsics for
1146   /// similarity.
1147   bool EnableIntrinsics = true;
1148 
1149   // The flag variable that marks whether we should allow tailcalls
1150   // to be checked for similarity.
1151   bool EnableMustTailCalls = false;
1152 
1153   /// The SimilarityGroups found with the most recent run of \ref
1154   /// findSimilarity. std::nullopt if there is no recent run.
1155   std::optional<SimilarityGroupList> SimilarityCandidates;
1156 };
1157 
1158 } // end namespace IRSimilarity
1159 
1160 /// An analysis pass based on legacy pass manager that runs and returns
1161 /// IRSimilarityIdentifier run on the Module.
1162 class IRSimilarityIdentifierWrapperPass : public ModulePass {
1163   std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI;
1164 
1165 public:
1166   static char ID;
1167   IRSimilarityIdentifierWrapperPass();
1168 
getIRSI()1169   IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; }
getIRSI()1170   const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; }
1171 
1172   bool doInitialization(Module &M) override;
1173   bool doFinalization(Module &M) override;
1174   bool runOnModule(Module &M) override;
getAnalysisUsage(AnalysisUsage & AU)1175   void getAnalysisUsage(AnalysisUsage &AU) const override {
1176     AU.setPreservesAll();
1177   }
1178 };
1179 
1180 /// An analysis pass that runs and returns the IRSimilarityIdentifier run on the
1181 /// Module.
1182 class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> {
1183 public:
1184   typedef IRSimilarity::IRSimilarityIdentifier Result;
1185 
1186   Result run(Module &M, ModuleAnalysisManager &);
1187 
1188 private:
1189   friend AnalysisInfoMixin<IRSimilarityAnalysis>;
1190   static AnalysisKey Key;
1191 };
1192 
1193 /// Printer pass that uses \c IRSimilarityAnalysis.
1194 class IRSimilarityAnalysisPrinterPass
1195     : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> {
1196   raw_ostream &OS;
1197 
1198 public:
IRSimilarityAnalysisPrinterPass(raw_ostream & OS)1199   explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {}
1200   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
isRequired()1201   static bool isRequired() { return true; }
1202 };
1203 
1204 } // end namespace llvm
1205 
1206 #endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
1207