1 //===-- TrigramIndex.cpp - a heuristic for SpecialCaseList ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // TrigramIndex implements a heuristic for SpecialCaseList that allows to 10 // filter out ~99% incoming queries when all regular expressions in the 11 // SpecialCaseList are simple wildcards with '*' and '.'. If rules are more 12 // complicated, the check is defeated and it will always pass the queries to a 13 // full regex. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/Support/TrigramIndex.h" 18 #include "llvm/ADT/SmallVector.h" 19 20 #include <set> 21 #include <string> 22 #include <unordered_map> 23 24 using namespace llvm; 25 26 static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}"; 27 28 static bool isAdvancedMetachar(unsigned Char) { 29 return strchr(RegexAdvancedMetachars, Char) != nullptr; 30 } 31 32 void TrigramIndex::insert(std::string Regex) { 33 if (Defeated) return; 34 std::set<unsigned> Was; 35 unsigned Cnt = 0; 36 unsigned Tri = 0; 37 unsigned Len = 0; 38 bool Escaped = false; 39 for (unsigned Char : Regex) { 40 if (!Escaped) { 41 // Regular expressions allow escaping symbols by preceding it with '\'. 42 if (Char == '\\') { 43 Escaped = true; 44 continue; 45 } 46 if (isAdvancedMetachar(Char)) { 47 // This is a more complicated regex than we can handle here. 48 Defeated = true; 49 return; 50 } 51 if (Char == '.' || Char == '*') { 52 Tri = 0; 53 Len = 0; 54 continue; 55 } 56 } 57 if (Escaped && Char >= '1' && Char <= '9') { 58 Defeated = true; 59 return; 60 } 61 // We have already handled escaping and can reset the flag. 62 Escaped = false; 63 Tri = ((Tri << 8) + Char) & 0xFFFFFF; 64 Len++; 65 if (Len < 3) 66 continue; 67 // We don't want the index to grow too much for the popular trigrams, 68 // as they are weak signals. It's ok to still require them for the 69 // rules we have already processed. It's just a small additional 70 // computational cost. 71 if (Index[Tri].size() >= 4) 72 continue; 73 Cnt++; 74 if (!Was.count(Tri)) { 75 // Adding the current rule to the index. 76 Index[Tri].push_back(Counts.size()); 77 Was.insert(Tri); 78 } 79 } 80 if (!Cnt) { 81 // This rule does not have remarkable trigrams to rely on. 82 // We have to always call the full regex chain. 83 Defeated = true; 84 return; 85 } 86 Counts.push_back(Cnt); 87 } 88 89 bool TrigramIndex::isDefinitelyOut(StringRef Query) const { 90 if (Defeated) 91 return false; 92 std::vector<unsigned> CurCounts(Counts.size()); 93 unsigned Tri = 0; 94 for (size_t I = 0; I < Query.size(); I++) { 95 Tri = ((Tri << 8) + Query[I]) & 0xFFFFFF; 96 if (I < 2) 97 continue; 98 const auto &II = Index.find(Tri); 99 if (II == Index.end()) 100 continue; 101 for (size_t J : II->second) { 102 CurCounts[J]++; 103 // If we have reached a desired limit, we have to look at the query 104 // more closely by running a full regex. 105 if (CurCounts[J] >= Counts[J]) 106 return false; 107 } 108 } 109 return true; 110 } 111