1 //===-- TrigramIndex.cpp - a heuristic for SpecialCaseList ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // TrigramIndex implements a heuristic for SpecialCaseList that allows to 10 // filter out ~99% incoming queries when all regular expressions in the 11 // SpecialCaseList are simple wildcards with '*' and '.'. If rules are more 12 // complicated, the check is defeated and it will always pass the queries to a 13 // full regex. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/Support/TrigramIndex.h" 18 #include <set> 19 20 using namespace llvm; 21 22 static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}"; 23 isAdvancedMetachar(unsigned Char)24static bool isAdvancedMetachar(unsigned Char) { 25 return strchr(RegexAdvancedMetachars, Char) != nullptr; 26 } 27 insert(const std::string & Regex)28void TrigramIndex::insert(const std::string &Regex) { 29 if (Defeated) return; 30 std::set<unsigned> Was; 31 unsigned Cnt = 0; 32 unsigned Tri = 0; 33 unsigned Len = 0; 34 bool Escaped = false; 35 for (unsigned Char : Regex) { 36 if (!Escaped) { 37 // Regular expressions allow escaping symbols by preceding it with '\'. 38 if (Char == '\\') { 39 Escaped = true; 40 continue; 41 } 42 if (isAdvancedMetachar(Char)) { 43 // This is a more complicated regex than we can handle here. 44 Defeated = true; 45 return; 46 } 47 if (Char == '.' || Char == '*') { 48 Tri = 0; 49 Len = 0; 50 continue; 51 } 52 } 53 if (Escaped && Char >= '1' && Char <= '9') { 54 Defeated = true; 55 return; 56 } 57 // We have already handled escaping and can reset the flag. 58 Escaped = false; 59 Tri = ((Tri << 8) + Char) & 0xFFFFFF; 60 Len++; 61 if (Len < 3) 62 continue; 63 // We don't want the index to grow too much for the popular trigrams, 64 // as they are weak signals. It's ok to still require them for the 65 // rules we have already processed. It's just a small additional 66 // computational cost. 67 if (Index[Tri].size() >= 4) 68 continue; 69 Cnt++; 70 if (!Was.count(Tri)) { 71 // Adding the current rule to the index. 72 Index[Tri].push_back(Counts.size()); 73 Was.insert(Tri); 74 } 75 } 76 if (!Cnt) { 77 // This rule does not have remarkable trigrams to rely on. 78 // We have to always call the full regex chain. 79 Defeated = true; 80 return; 81 } 82 Counts.push_back(Cnt); 83 } 84 isDefinitelyOut(StringRef Query) const85bool TrigramIndex::isDefinitelyOut(StringRef Query) const { 86 if (Defeated) 87 return false; 88 std::vector<unsigned> CurCounts(Counts.size()); 89 unsigned Tri = 0; 90 for (size_t I = 0; I < Query.size(); I++) { 91 Tri = ((Tri << 8) + Query[I]) & 0xFFFFFF; 92 if (I < 2) 93 continue; 94 const auto &II = Index.find(Tri); 95 if (II == Index.end()) 96 continue; 97 for (size_t J : II->second) { 98 CurCounts[J]++; 99 // If we have reached a desired limit, we have to look at the query 100 // more closely by running a full regex. 101 if (CurCounts[J] >= Counts[J]) 102 return false; 103 } 104 } 105 return true; 106 } 107