1 //===- IRSimilarityIdentifier.h - Find similarity in a module --------------==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // \file 10 // Interface file for the IRSimilarityIdentifier for identifying similarities in 11 // IR including the IRInstructionMapper, which maps an Instruction to unsigned 12 // integers. 13 // 14 // Two sequences of instructions are called "similar" if they perform the same 15 // series of operations for all inputs. 16 // 17 // \code 18 // %1 = add i32 %a, 10 19 // %2 = add i32 %a, %1 20 // %3 = icmp slt icmp %1, %2 21 // \endcode 22 // 23 // and 24 // 25 // \code 26 // %1 = add i32 11, %a 27 // %2 = sub i32 %a, %1 28 // %3 = icmp sgt icmp %2, %1 29 // \endcode 30 // 31 // ultimately have the same result, even if the inputs, and structure are 32 // slightly different. 33 // 34 // For instructions, we do not worry about operands that do not have fixed 35 // semantic meaning to the program. We consider the opcode that the instruction 36 // has, the types, parameters, and extra information such as the function name, 37 // or comparison predicate. These are used to create a hash to map instructions 38 // to integers to be used in similarity matching in sequences of instructions 39 // 40 // Terminology: 41 // An IRSimilarityCandidate is a region of IRInstructionData (wrapped 42 // Instructions), usually used to denote a region of similarity has been found. 43 // 44 // A SimilarityGroup is a set of IRSimilarityCandidates that are structurally 45 // similar to one another. 46 // 47 //===----------------------------------------------------------------------===// 48 49 #ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H 50 #define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H 51 52 #include "llvm/IR/InstVisitor.h" 53 #include "llvm/IR/Instructions.h" 54 #include "llvm/IR/Module.h" 55 #include "llvm/IR/PassManager.h" 56 #include "llvm/Pass.h" 57 #include "llvm/Support/Allocator.h" 58 59 namespace llvm { 60 namespace IRSimilarity { 61 62 struct IRInstructionDataList; 63 64 /// This represents what is and is not supported when finding similarity in 65 /// Instructions. 66 /// 67 /// Legal Instructions are considered when looking at similarity between 68 /// Instructions. 69 /// 70 /// Illegal Instructions cannot be considered when looking for similarity 71 /// between Instructions. They act as boundaries between similarity regions. 72 /// 73 /// Invisible Instructions are skipped over during analysis. 74 // TODO: Shared with MachineOutliner 75 enum InstrType { Legal, Illegal, Invisible }; 76 77 /// This provides the utilities for hashing an Instruction to an unsigned 78 /// integer. Two IRInstructionDatas produce the same hash value when their 79 /// underlying Instructions perform the same operation (even if they don't have 80 /// the same input operands.) 81 /// As a more concrete example, consider the following: 82 /// 83 /// \code 84 /// %add1 = add i32 %a, %b 85 /// %add2 = add i32 %c, %d 86 /// %add3 = add i64 %e, %f 87 /// \endcode 88 /// 89 // Then the IRInstructionData wrappers for these Instructions may be hashed like 90 /// so: 91 /// 92 /// \code 93 /// ; These two adds have the same types and operand types, so they hash to the 94 /// ; same number. 95 /// %add1 = add i32 %a, %b ; Hash: 1 96 /// %add2 = add i32 %c, %d ; Hash: 1 97 /// ; This add produces an i64. This differentiates it from %add1 and %add2. So, 98 /// ; it hashes to a different number. 99 /// %add3 = add i64 %e, %f; Hash: 2 100 /// \endcode 101 /// 102 /// 103 /// This hashing scheme will be used to represent the program as a very long 104 /// string. This string can then be placed in a data structure which can be used 105 /// for similarity queries. 106 /// 107 /// TODO: Handle types of Instructions which can be equal even with different 108 /// operands. (E.g. comparisons with swapped predicates.) 109 /// TODO: Handle CallInsts, which are only checked for function type 110 /// by \ref isSameOperationAs. 111 /// TODO: Handle GetElementPtrInsts, as some of the operands have to be the 112 /// exact same, and some do not. 113 struct IRInstructionData : ilist_node<IRInstructionData> { 114 115 /// The source Instruction that is being wrapped. 116 Instruction *Inst = nullptr; 117 /// The values of the operands in the Instruction. 118 SmallVector<Value *, 4> OperVals; 119 /// The legality of the wrapped instruction. This is informed by InstrType, 120 /// and is used when checking when two instructions are considered similar. 121 /// If either instruction is not legal, the instructions are automatically not 122 /// considered similar. 123 bool Legal; 124 125 /// This is only relevant if we are wrapping a CmpInst where we needed to 126 /// change the predicate of a compare instruction from a greater than form 127 /// to a less than form. It is None otherwise. 128 Optional<CmpInst::Predicate> RevisedPredicate; 129 130 /// Gather the information that is difficult to gather for an Instruction, or 131 /// is changed. i.e. the operands of an Instruction and the Types of those 132 /// operands. This extra information allows for similarity matching to make 133 /// assertions that allow for more flexibility when checking for whether an 134 /// Instruction performs the same operation. 135 IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL); 136 137 /// Get the predicate that the compare instruction is using for hashing the 138 /// instruction. the IRInstructionData must be wrapping a CmpInst. 139 CmpInst::Predicate getPredicate() const; 140 141 /// A function that swaps the predicates to their less than form if they are 142 /// in a greater than form. Otherwise, the predicate is unchanged. 143 /// 144 /// \param CI - The comparison operation to find a consistent preidcate for. 145 /// \return the consistent comparison predicate. 146 static CmpInst::Predicate predicateForConsistency(CmpInst *CI); 147 148 /// Hashes \p Value based on its opcode, types, and operand types. 149 /// Two IRInstructionData instances produce the same hash when they perform 150 /// the same operation. 151 /// 152 /// As a simple example, consider the following instructions. 153 /// 154 /// \code 155 /// %add1 = add i32 %x1, %y1 156 /// %add2 = add i32 %x2, %y2 157 /// 158 /// %sub = sub i32 %x1, %y1 159 /// 160 /// %add_i64 = add i64 %x2, %y2 161 /// \endcode 162 /// 163 /// Because the first two adds operate the same types, and are performing the 164 /// same action, they will be hashed to the same value. 165 /// 166 /// However, the subtraction instruction is not the same as an addition, and 167 /// will be hashed to a different value. 168 /// 169 /// Finally, the last add has a different type compared to the first two add 170 /// instructions, so it will also be hashed to a different value that any of 171 /// the previous instructions. 172 /// 173 /// \param [in] ID - The IRInstructionData instance to be hashed. 174 /// \returns A hash_value of the IRInstructionData. hash_valueIRInstructionData175 friend hash_code hash_value(const IRInstructionData &ID) { 176 SmallVector<Type *, 4> OperTypes; 177 for (Value *V : ID.OperVals) 178 OperTypes.push_back(V->getType()); 179 180 if (isa<CmpInst>(ID.Inst)) 181 return llvm::hash_combine( 182 llvm::hash_value(ID.Inst->getOpcode()), 183 llvm::hash_value(ID.Inst->getType()), 184 llvm::hash_value(ID.getPredicate()), 185 llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); 186 else if (CallInst *CI = dyn_cast<CallInst>(ID.Inst)) 187 return llvm::hash_combine( 188 llvm::hash_value(ID.Inst->getOpcode()), 189 llvm::hash_value(ID.Inst->getType()), 190 llvm::hash_value(CI->getCalledFunction()->getName().str()), 191 llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); 192 return llvm::hash_combine( 193 llvm::hash_value(ID.Inst->getOpcode()), 194 llvm::hash_value(ID.Inst->getType()), 195 llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); 196 } 197 198 IRInstructionDataList *IDL = nullptr; 199 }; 200 201 struct IRInstructionDataList : simple_ilist<IRInstructionData> {}; 202 203 /// Compare one IRInstructionData class to another IRInstructionData class for 204 /// whether they are performing a the same operation, and can mapped to the 205 /// same value. For regular instructions if the hash value is the same, then 206 /// they will also be close. 207 /// 208 /// \param A - The first IRInstructionData class to compare 209 /// \param B - The second IRInstructionData class to compare 210 /// \returns true if \p A and \p B are similar enough to be mapped to the same 211 /// value. 212 bool isClose(const IRInstructionData &A, const IRInstructionData &B); 213 214 struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> { getEmptyKeyIRInstructionDataTraits215 static inline IRInstructionData *getEmptyKey() { return nullptr; } getTombstoneKeyIRInstructionDataTraits216 static inline IRInstructionData *getTombstoneKey() { 217 return reinterpret_cast<IRInstructionData *>(-1); 218 } 219 getHashValueIRInstructionDataTraits220 static unsigned getHashValue(const IRInstructionData *E) { 221 using llvm::hash_value; 222 assert(E && "IRInstructionData is a nullptr?"); 223 return hash_value(*E); 224 } 225 isEqualIRInstructionDataTraits226 static bool isEqual(const IRInstructionData *LHS, 227 const IRInstructionData *RHS) { 228 if (RHS == getEmptyKey() || RHS == getTombstoneKey() || 229 LHS == getEmptyKey() || LHS == getTombstoneKey()) 230 return LHS == RHS; 231 232 assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?"); 233 return isClose(*LHS, *RHS); 234 } 235 }; 236 237 /// Helper struct for converting the Instructions in a Module into a vector of 238 /// unsigned integers. This vector of unsigned integers can be thought of as a 239 /// "numeric string". This numeric string can then be queried by, for example, 240 /// data structures that find repeated substrings. 241 /// 242 /// This hashing is done per BasicBlock in the module. To hash Instructions 243 /// based off of their operations, each Instruction is wrapped in an 244 /// IRInstructionData struct. The unsigned integer for an IRInstructionData 245 /// depends on: 246 /// - The hash provided by the IRInstructionData. 247 /// - Which member of InstrType the IRInstructionData is classified as. 248 // See InstrType for more details on the possible classifications, and how they 249 // manifest in the numeric string. 250 /// 251 /// The numeric string for an individual BasicBlock is terminated by an unique 252 /// unsigned integer. This prevents data structures which rely on repetition 253 /// from matching across BasicBlocks. (For example, the SuffixTree.) 254 /// As a concrete example, if we have the following two BasicBlocks: 255 /// \code 256 /// bb0: 257 /// %add1 = add i32 %a, %b 258 /// %add2 = add i32 %c, %d 259 /// %add3 = add i64 %e, %f 260 /// bb1: 261 /// %sub = sub i32 %c, %d 262 /// \endcode 263 /// We may hash the Instructions like this (via IRInstructionData): 264 /// \code 265 /// bb0: 266 /// %add1 = add i32 %a, %b ; Hash: 1 267 /// %add2 = add i32 %c, %d; Hash: 1 268 /// %add3 = add i64 %e, %f; Hash: 2 269 /// bb1: 270 /// %sub = sub i32 %c, %d; Hash: 3 271 /// %add4 = add i32 %c, %d ; Hash: 1 272 /// \endcode 273 /// And produce a "numeric string representation" like so: 274 /// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2 275 /// 276 /// TODO: This is very similar to the MachineOutliner, and should be 277 /// consolidated into the same interface. 278 struct IRInstructionMapper { 279 /// The starting illegal instruction number to map to. 280 /// 281 /// Set to -3 for compatibility with DenseMapInfo<unsigned>. 282 unsigned IllegalInstrNumber = static_cast<unsigned>(-3); 283 284 /// The next available integer to assign to a legal Instruction to. 285 unsigned LegalInstrNumber = 0; 286 287 /// Correspondence from IRInstructionData to unsigned integers. 288 DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits> 289 InstructionIntegerMap; 290 291 /// Set if we added an illegal number in the previous step. 292 /// Since each illegal number is unique, we only need one of them between 293 /// each range of legal numbers. This lets us make sure we don't add more 294 /// than one illegal number per range. 295 bool AddedIllegalLastTime = false; 296 297 /// Marks whether we found a illegal instruction in the previous step. 298 bool CanCombineWithPrevInstr = false; 299 300 /// Marks whether we have found a set of instructions that is long enough 301 /// to be considered for similarity. 302 bool HaveLegalRange = false; 303 304 /// This allocator pointer is in charge of holding on to the IRInstructionData 305 /// so it is not deallocated until whatever external tool is using it is done 306 /// with the information. 307 SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr; 308 309 /// This allocator pointer is in charge of creating the IRInstructionDataList 310 /// so it is not deallocated until whatever external tool is using it is done 311 /// with the information. 312 SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr; 313 314 /// Get an allocated IRInstructionData struct using the InstDataAllocator. 315 /// 316 /// \param I - The Instruction to wrap with IRInstructionData. 317 /// \param Legality - A boolean value that is true if the instruction is to 318 /// be considered for similarity, and false if not. 319 /// \param IDL - The InstructionDataList that the IRInstructionData is 320 /// inserted into. 321 /// \returns An allocated IRInstructionData struct. 322 IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality, 323 IRInstructionDataList &IDL); 324 325 /// Get an allocated IRInstructionDataList object using the IDLAllocator. 326 /// 327 /// \returns An allocated IRInstructionDataList object. 328 IRInstructionDataList *allocateIRInstructionDataList(); 329 330 IRInstructionDataList *IDL = nullptr; 331 332 /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers 333 /// determined by \p InstrType. Two Instructions are mapped to the same value 334 /// if they are close as defined by the InstructionData class above. 335 /// 336 /// \param [in] BB - The BasicBlock to be mapped to integers. 337 /// \param [in,out] InstrList - Vector of IRInstructionData to append to. 338 /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to. 339 void convertToUnsignedVec(BasicBlock &BB, 340 std::vector<IRInstructionData *> &InstrList, 341 std::vector<unsigned> &IntegerMapping); 342 343 /// Maps an Instruction to a legal integer. 344 /// 345 /// \param [in] It - The Instruction to be mapped to an integer. 346 /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to 347 /// append to. 348 /// \param [in,out] InstrListForBB - Vector of InstructionData to append to. 349 /// \returns The integer \p It was mapped to. 350 unsigned mapToLegalUnsigned(BasicBlock::iterator &It, 351 std::vector<unsigned> &IntegerMappingForBB, 352 std::vector<IRInstructionData *> &InstrListForBB); 353 354 /// Maps an Instruction to an illegal integer. 355 /// 356 /// \param [in] It - The \p Instruction to be mapped to an integer. 357 /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to 358 /// append to. 359 /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to. 360 /// \param End - true if creating a dummy IRInstructionData at the end of a 361 /// basic block. 362 /// \returns The integer \p It was mapped to. 363 unsigned mapToIllegalUnsigned( 364 BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB, 365 std::vector<IRInstructionData *> &InstrListForBB, bool End = false); 366 IRInstructionMapperIRInstructionMapper367 IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA, 368 SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA) 369 : InstDataAllocator(IDA), IDLAllocator(IDLA) { 370 // Make sure that the implementation of DenseMapInfo<unsigned> hasn't 371 // changed. 372 assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) && 373 "DenseMapInfo<unsigned>'s empty key isn't -1!"); 374 assert(DenseMapInfo<unsigned>::getTombstoneKey() == 375 static_cast<unsigned>(-2) && 376 "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); 377 378 IDL = new (IDLAllocator->Allocate()) 379 IRInstructionDataList(); 380 } 381 382 /// Custom InstVisitor to classify different instructions for whether it can 383 /// be analyzed for similarity. 384 struct InstructionClassification 385 : public InstVisitor<InstructionClassification, InstrType> { InstructionClassificationIRInstructionMapper::InstructionClassification386 InstructionClassification() {} 387 388 // TODO: Determine a scheme to resolve when the label is similar enough. visitBranchInstIRInstructionMapper::InstructionClassification389 InstrType visitBranchInst(BranchInst &BI) { return Illegal; } 390 // TODO: Determine a scheme to resolve when the labels are similar enough. visitPHINodeIRInstructionMapper::InstructionClassification391 InstrType visitPHINode(PHINode &PN) { return Illegal; } 392 // TODO: Handle allocas. visitAllocaInstIRInstructionMapper::InstructionClassification393 InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; } 394 // We exclude variable argument instructions since variable arguments 395 // requires extra checking of the argument list. visitVAArgInstIRInstructionMapper::InstructionClassification396 InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; } 397 // We exclude all exception handling cases since they are so context 398 // dependent. visitLandingPadInstIRInstructionMapper::InstructionClassification399 InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; } visitFuncletPadInstIRInstructionMapper::InstructionClassification400 InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; } 401 // DebugInfo should be included in the regions, but should not be 402 // analyzed for similarity as it has no bearing on the outcome of the 403 // program. visitDbgInfoIntrinsicIRInstructionMapper::InstructionClassification404 InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; } 405 // TODO: Handle specific intrinsics. visitIntrinsicInstIRInstructionMapper::InstructionClassification406 InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; } 407 // We only allow call instructions where the function has a name and 408 // is not an indirect call. visitCallInstIRInstructionMapper::InstructionClassification409 InstrType visitCallInst(CallInst &CI) { 410 Function *F = CI.getCalledFunction(); 411 if (!F || CI.isIndirectCall() || !F->hasName()) 412 return Illegal; 413 return Legal; 414 } 415 // TODO: We do not current handle similarity that changes the control flow. visitInvokeInstIRInstructionMapper::InstructionClassification416 InstrType visitInvokeInst(InvokeInst &II) { return Illegal; } 417 // TODO: We do not current handle similarity that changes the control flow. visitCallBrInstIRInstructionMapper::InstructionClassification418 InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; } 419 // TODO: Handle interblock similarity. visitTerminatorIRInstructionMapper::InstructionClassification420 InstrType visitTerminator(Instruction &I) { return Illegal; } visitInstructionIRInstructionMapper::InstructionClassification421 InstrType visitInstruction(Instruction &I) { return Legal; } 422 }; 423 424 /// Maps an Instruction to a member of InstrType. 425 InstructionClassification InstClassifier; 426 }; 427 428 /// This is a class that wraps a range of IRInstructionData from one point to 429 /// another in the vector of IRInstructionData, which is a region of the 430 /// program. It is also responsible for defining the structure within this 431 /// region of instructions. 432 /// 433 /// The structure of a region is defined through a value numbering system 434 /// assigned to each unique value in a region at the creation of the 435 /// IRSimilarityCandidate. 436 /// 437 /// For example, for each Instruction we add a mapping for each new 438 /// value seen in that Instruction. 439 /// IR: Mapping Added: 440 /// %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2 441 /// %add2 = add i32 %a, %1 %add2 -> 4 442 /// %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5 443 /// 444 /// We can compare IRSimilarityCandidates against one another. 445 /// The \ref isSimilar function compares each IRInstructionData against one 446 /// another and if we have the same sequences of IRInstructionData that would 447 /// create the same hash, we have similar IRSimilarityCandidates. 448 /// 449 /// We can also compare the structure of IRSimilarityCandidates. If we can 450 /// create a mapping of registers in the region contained by one 451 /// IRSimilarityCandidate to the region contained by different 452 /// IRSimilarityCandidate, they can be considered structurally similar. 453 /// 454 /// IRSimilarityCandidate1: IRSimilarityCandidate2: 455 /// %add1 = add i32 %a, %b %add1 = add i32 %d, %e 456 /// %add2 = add i32 %a, %c %add2 = add i32 %d, %f 457 /// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 458 /// 459 /// Can have the following mapping from candidate to candidate of: 460 /// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4 461 /// and can be considered similar. 462 /// 463 /// IRSimilarityCandidate1: IRSimilarityCandidate2: 464 /// %add1 = add i32 %a, %b %add1 = add i32 %d, c4 465 /// %add2 = add i32 %a, %c %add2 = add i32 %d, %f 466 /// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 467 /// 468 /// We cannot create the same mapping since the use of c4 is not used in the 469 /// same way as %b or c2. 470 class IRSimilarityCandidate { 471 private: 472 /// The start index of this IRSimilarityCandidate in the instruction list. 473 unsigned StartIdx = 0; 474 475 /// The number of instructions in this IRSimilarityCandidate. 476 unsigned Len = 0; 477 478 /// The first instruction in this IRSimilarityCandidate. 479 IRInstructionData *FirstInst = nullptr; 480 481 /// The last instruction in this IRSimilarityCandidate. 482 IRInstructionData *LastInst = nullptr; 483 484 /// Global Value Numbering structures 485 /// @{ 486 /// Stores the mapping of the value to the number assigned to it in the 487 /// IRSimilarityCandidate. 488 DenseMap<Value *, unsigned> ValueToNumber; 489 /// Stores the mapping of the number to the value assigned this number. 490 DenseMap<unsigned, Value *> NumberToValue; 491 /// @} 492 493 public: 494 /// \param StartIdx - The starting location of the region. 495 /// \param Len - The length of the region. 496 /// \param FirstInstIt - The starting IRInstructionData of the region. 497 /// \param LastInstIt - The ending IRInstructionData of the region. 498 IRSimilarityCandidate(unsigned StartIdx, unsigned Len, 499 IRInstructionData *FirstInstIt, 500 IRInstructionData *LastInstIt); 501 502 /// \param A - The first IRInstructionCandidate to compare. 503 /// \param B - The second IRInstructionCandidate to compare. 504 /// \returns True when every IRInstructionData in \p A is similar to every 505 /// IRInstructionData in \p B. 506 static bool isSimilar(const IRSimilarityCandidate &A, 507 const IRSimilarityCandidate &B); 508 509 /// \param A - The first IRInstructionCandidate to compare. 510 /// \param B - The second IRInstructionCandidate to compare. 511 /// \returns True when every IRInstructionData in \p A is structurally similar 512 /// to \p B. 513 static bool compareStructure(const IRSimilarityCandidate &A, 514 const IRSimilarityCandidate &B); 515 516 struct OperandMapping { 517 /// The IRSimilarityCandidate that holds the instruction the OperVals were 518 /// pulled from. 519 const IRSimilarityCandidate &IRSC; 520 521 /// The operand values to be analyzed. 522 ArrayRef<Value *> &OperVals; 523 524 /// The current mapping of global value numbers from one IRSimilarityCandidate 525 /// to another IRSimilarityCandidate. 526 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping; 527 }; 528 529 /// Compare the operands in \p A and \p B and check that the current mapping 530 /// of global value numbers from \p A to \p B and \p B to \A is consistent. 531 /// 532 /// \param A - The first IRInstructionCandidate, operand values, and current 533 /// operand mappings to compare. 534 /// \param B - The second IRInstructionCandidate, operand values, and current 535 /// operand mappings to compare. 536 /// \returns true if the IRSimilarityCandidates operands are compatible. 537 static bool compareNonCommutativeOperandMapping(OperandMapping A, 538 OperandMapping B); 539 540 /// Compare the operands in \p A and \p B and check that the current mapping 541 /// of global value numbers from \p A to \p B and \p B to \A is consistent 542 /// given that the operands are commutative. 543 /// 544 /// \param A - The first IRInstructionCandidate, operand values, and current 545 /// operand mappings to compare. 546 /// \param B - The second IRInstructionCandidate, operand values, and current 547 /// operand mappings to compare. 548 /// \returns true if the IRSimilarityCandidates operands are compatible. 549 static bool compareCommutativeOperandMapping(OperandMapping A, 550 OperandMapping B); 551 552 /// Compare the start and end indices of the two IRSimilarityCandidates for 553 /// whether they overlap. If the start instruction of one 554 /// IRSimilarityCandidate is less than the end instruction of the other, and 555 /// the start instruction of one is greater than the start instruction of the 556 /// other, they overlap. 557 /// 558 /// \returns true if the IRSimilarityCandidates do not have overlapping 559 /// instructions. 560 static bool overlap(const IRSimilarityCandidate &A, 561 const IRSimilarityCandidate &B); 562 563 /// \returns the number of instructions in this Candidate. getLength()564 unsigned getLength() const { return Len; } 565 566 /// \returns the start index of this IRSimilarityCandidate. getStartIdx()567 unsigned getStartIdx() const { return StartIdx; } 568 569 /// \returns the end index of this IRSimilarityCandidate. getEndIdx()570 unsigned getEndIdx() const { return StartIdx + Len - 1; } 571 572 /// \returns The first IRInstructionData. front()573 IRInstructionData *front() const { return FirstInst; } 574 /// \returns The last IRInstructionData. back()575 IRInstructionData *back() const { return LastInst; } 576 577 /// \returns The first Instruction. frontInstruction()578 Instruction *frontInstruction() { return FirstInst->Inst; } 579 /// \returns The last Instruction backInstruction()580 Instruction *backInstruction() { return LastInst->Inst; } 581 582 /// \returns The BasicBlock the IRSimilarityCandidate starts in. getStartBB()583 BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); } 584 /// \returns The BasicBlock the IRSimilarityCandidate ends in. getEndBB()585 BasicBlock *getEndBB() { return LastInst->Inst->getParent(); } 586 587 /// \returns The Function that the IRSimilarityCandidate is located in. getFunction()588 Function *getFunction() { return getStartBB()->getParent(); } 589 590 /// Finds the positive number associated with \p V if it has been mapped. 591 /// \param [in] V - the Value to find. 592 /// \returns The positive number corresponding to the value. 593 /// \returns None if not present. getGVN(Value * V)594 Optional<unsigned> getGVN(Value *V) { 595 assert(V != nullptr && "Value is a nullptr?"); 596 DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V); 597 if (VNIt == ValueToNumber.end()) 598 return None; 599 return VNIt->second; 600 } 601 602 /// Finds the Value associate with \p Num if it exists. 603 /// \param [in] Num - the number to find. 604 /// \returns The Value associated with the number. 605 /// \returns None if not present. fromGVN(unsigned Num)606 Optional<Value *> fromGVN(unsigned Num) { 607 DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num); 608 if (VNIt == NumberToValue.end()) 609 return None; 610 assert(VNIt->second != nullptr && "Found value is a nullptr!"); 611 return VNIt->second; 612 } 613 614 /// \param RHS -The IRSimilarityCandidate to compare against 615 /// \returns true if the IRSimilarityCandidate is occurs after the 616 /// IRSimilarityCandidate in the program. 617 bool operator<(const IRSimilarityCandidate &RHS) const { 618 return getStartIdx() > RHS.getStartIdx(); 619 } 620 621 using iterator = IRInstructionDataList::iterator; begin()622 iterator begin() const { return iterator(front()); } end()623 iterator end() const { return std::next(iterator(back())); } 624 }; 625 626 typedef std::vector<IRSimilarityCandidate> SimilarityGroup; 627 typedef std::vector<SimilarityGroup> SimilarityGroupList; 628 629 /// This class puts all the pieces of the IRInstructionData, 630 /// IRInstructionMapper, IRSimilarityCandidate together. 631 /// 632 /// It first feeds the Module or vector of Modules into the IRInstructionMapper, 633 /// and puts all the mapped instructions into a single long list of 634 /// IRInstructionData. 635 /// 636 /// The list of unsigned integers is given to the Suffix Tree or similar data 637 /// structure to find repeated subsequences. We construct an 638 /// IRSimilarityCandidate for each instance of the subsequence. We compare them 639 /// against one another since These repeated subsequences can have different 640 /// structure. For each different kind of structure found, we create a 641 /// similarity group. 642 /// 643 /// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are 644 /// structurally similar to one another, while C is different we would have two 645 /// SimilarityGroups: 646 /// 647 /// SimilarityGroup 1: SimilarityGroup 2 648 /// A, B, D C 649 /// 650 /// A list of the different similarity groups is then returned after 651 /// analyzing the module. 652 class IRSimilarityIdentifier { 653 public: IRSimilarityIdentifier()654 IRSimilarityIdentifier() 655 : Mapper(&InstDataAllocator, &InstDataListAllocator) {} 656 657 private: 658 /// Map the instructions in the module to unsigned integers, using mapping 659 /// already present in the Mapper if possible. 660 /// 661 /// \param [in] M Module - To map to integers. 662 /// \param [in,out] InstrList - The vector to append IRInstructionData to. 663 /// \param [in,out] IntegerMapping - The vector to append integers to. 664 void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList, 665 std::vector<unsigned> &IntegerMapping); 666 667 /// Map the instructions in the modules vector to unsigned integers, using 668 /// mapping already present in the mapper if possible. 669 /// 670 /// \param [in] Modules - The list of modules to use to populate the mapper 671 /// \param [in,out] InstrList - The vector to append IRInstructionData to. 672 /// \param [in,out] IntegerMapping - The vector to append integers to. 673 void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules, 674 std::vector<IRInstructionData *> &InstrList, 675 std::vector<unsigned> &IntegerMapping); 676 677 /// Find the similarity candidates in \p InstrList and corresponding 678 /// \p UnsignedVec 679 /// 680 /// \param [in,out] InstrList - The vector to append IRInstructionData to. 681 /// \param [in,out] IntegerMapping - The vector to append integers to. 682 /// candidates found in the program. 683 void findCandidates(std::vector<IRInstructionData *> &InstrList, 684 std::vector<unsigned> &IntegerMapping); 685 686 public: 687 // Find the IRSimilarityCandidates in the \p Modules and group by structural 688 // similarity in a SimilarityGroup, each group is returned in a 689 // SimilarityGroupList. 690 // 691 // \param [in] Modules - the modules to analyze. 692 // \returns The groups of similarity ranges found in the modules. 693 SimilarityGroupList & 694 findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules); 695 696 // Find the IRSimilarityCandidates in the given Module grouped by structural 697 // similarity in a SimilarityGroup, contained inside a SimilarityGroupList. 698 // 699 // \param [in] M - the module to analyze. 700 // \returns The groups of similarity ranges found in the module. 701 SimilarityGroupList &findSimilarity(Module &M); 702 703 // Clears \ref SimilarityCandidates if it is already filled by a previous run. resetSimilarityCandidates()704 void resetSimilarityCandidates() { 705 // If we've already analyzed a Module or set of Modules, so we must clear 706 // the SimilarityCandidates to make sure we do not have only old values 707 // hanging around. 708 if (SimilarityCandidates.hasValue()) 709 SimilarityCandidates->clear(); 710 else 711 SimilarityCandidates = SimilarityGroupList(); 712 } 713 714 // \returns The groups of similarity ranges found in the most recently passed 715 // set of modules. getSimilarity()716 Optional<SimilarityGroupList> &getSimilarity() { 717 return SimilarityCandidates; 718 } 719 720 private: 721 /// The allocator for IRInstructionData. 722 SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator; 723 724 /// The allocator for IRInstructionDataLists. 725 SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator; 726 727 /// Map Instructions to unsigned integers and wraps the Instruction in an 728 /// instance of IRInstructionData. 729 IRInstructionMapper Mapper; 730 731 /// The SimilarityGroups found with the most recent run of \ref 732 /// findSimilarity. None if there is no recent run. 733 Optional<SimilarityGroupList> SimilarityCandidates; 734 }; 735 736 } // end namespace IRSimilarity 737 738 /// An analysis pass based on legacy pass manager that runs and returns 739 /// IRSimilarityIdentifier run on the Module. 740 class IRSimilarityIdentifierWrapperPass : public ModulePass { 741 std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI; 742 743 public: 744 static char ID; 745 IRSimilarityIdentifierWrapperPass(); 746 getIRSI()747 IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; } getIRSI()748 const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; } 749 750 bool doInitialization(Module &M) override; 751 bool doFinalization(Module &M) override; 752 bool runOnModule(Module &M) override; getAnalysisUsage(AnalysisUsage & AU)753 void getAnalysisUsage(AnalysisUsage &AU) const override { 754 AU.setPreservesAll(); 755 } 756 }; 757 758 /// An analysis pass that runs and returns the IRSimilarityIdentifier run on the 759 /// Module. 760 class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> { 761 public: 762 typedef IRSimilarity::IRSimilarityIdentifier Result; 763 764 Result run(Module &M, ModuleAnalysisManager &); 765 766 private: 767 friend AnalysisInfoMixin<IRSimilarityAnalysis>; 768 static AnalysisKey Key; 769 }; 770 771 /// Printer pass that uses \c IRSimilarityAnalysis. 772 class IRSimilarityAnalysisPrinterPass 773 : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> { 774 raw_ostream &OS; 775 776 public: IRSimilarityAnalysisPrinterPass(raw_ostream & OS)777 explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} 778 PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); 779 }; 780 781 } // end namespace llvm 782 783 #endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H 784