1 //===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a glob pattern matcher.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Support/GlobPattern.h"
14 #include "llvm/ADT/ArrayRef.h"
15 #include "llvm/ADT/StringRef.h"
16 #include "llvm/Support/Errc.h"
17 
18 using namespace llvm;
19 
20 static bool hasWildcard(StringRef S) {
21   return S.find_first_of("?*[\\") != StringRef::npos;
22 }
23 
24 // Expands character ranges and returns a bitmap.
25 // For example, "a-cf-hz" is expanded to "abcfghz".
26 static Expected<BitVector> expand(StringRef S, StringRef Original) {
27   BitVector BV(256, false);
28 
29   // Expand X-Y.
30   for (;;) {
31     if (S.size() < 3)
32       break;
33 
34     uint8_t Start = S[0];
35     uint8_t End = S[2];
36 
37     // If it doesn't start with something like X-Y,
38     // consume the first character and proceed.
39     if (S[1] != '-') {
40       BV[Start] = true;
41       S = S.substr(1);
42       continue;
43     }
44 
45     // It must be in the form of X-Y.
46     // Validate it and then interpret the range.
47     if (Start > End)
48       return make_error<StringError>("invalid glob pattern: " + Original,
49                                      errc::invalid_argument);
50 
51     for (int C = Start; C <= End; ++C)
52       BV[(uint8_t)C] = true;
53     S = S.substr(3);
54   }
55 
56   for (char C : S)
57     BV[(uint8_t)C] = true;
58   return BV;
59 }
60 
61 // This is a scanner for the glob pattern.
62 // A glob pattern token is one of "*", "?", "\", "[<chars>]", "[^<chars>]"
63 // (which is a negative form of "[<chars>]"), "[!<chars>]" (which is
64 // equivalent to "[^<chars>]"), or a non-meta character.
65 // This function returns the first token in S.
66 static Expected<BitVector> scan(StringRef &S, StringRef Original) {
67   switch (S[0]) {
68   case '*':
69     S = S.substr(1);
70     // '*' is represented by an empty bitvector.
71     // All other bitvectors are 256-bit long.
72     return BitVector();
73   case '?':
74     S = S.substr(1);
75     return BitVector(256, true);
76   case '[': {
77     // ']' is allowed as the first character of a character class. '[]' is
78     // invalid. So, just skip the first character.
79     size_t End = S.find(']', 2);
80     if (End == StringRef::npos)
81       return make_error<StringError>("invalid glob pattern: " + Original,
82                                      errc::invalid_argument);
83 
84     StringRef Chars = S.substr(1, End - 1);
85     S = S.substr(End + 1);
86     if (Chars.startswith("^") || Chars.startswith("!")) {
87       Expected<BitVector> BV = expand(Chars.substr(1), Original);
88       if (!BV)
89         return BV.takeError();
90       return BV->flip();
91     }
92     return expand(Chars, Original);
93   }
94   case '\\':
95     // Eat this character and fall through below to treat it like a non-meta
96     // character.
97     S = S.substr(1);
98     [[fallthrough]];
99   default:
100     BitVector BV(256, false);
101     BV[(uint8_t)S[0]] = true;
102     S = S.substr(1);
103     return BV;
104   }
105 }
106 
107 Expected<GlobPattern> GlobPattern::create(StringRef S) {
108   GlobPattern Pat;
109 
110   // S doesn't contain any metacharacter,
111   // so the regular string comparison should work.
112   if (!hasWildcard(S)) {
113     Pat.Exact = S;
114     return Pat;
115   }
116 
117   // S is something like "foo*", and the "* is not escaped. We can use
118   // startswith().
119   if (S.endswith("*") && !S.endswith("\\*") && !hasWildcard(S.drop_back())) {
120     Pat.Prefix = S.drop_back();
121     return Pat;
122   }
123 
124   // S is something like "*foo". We can use endswith().
125   if (S.startswith("*") && !hasWildcard(S.drop_front())) {
126     Pat.Suffix = S.drop_front();
127     return Pat;
128   }
129 
130   // Otherwise, we need to do real glob pattern matching.
131   // Parse the pattern now.
132   StringRef Original = S;
133   while (!S.empty()) {
134     Expected<BitVector> BV = scan(S, Original);
135     if (!BV)
136       return BV.takeError();
137     Pat.Tokens.push_back(*BV);
138   }
139   return Pat;
140 }
141 
142 bool GlobPattern::match(StringRef S) const {
143   if (Exact)
144     return S == *Exact;
145   if (Prefix)
146     return S.startswith(*Prefix);
147   if (Suffix)
148     return S.endswith(*Suffix);
149   return matchOne(Tokens, S);
150 }
151 
152 // Runs glob pattern Pats against string S.
153 bool GlobPattern::matchOne(ArrayRef<BitVector> Pats, StringRef S) const {
154   for (;;) {
155     if (Pats.empty())
156       return S.empty();
157 
158     // If Pats[0] is '*', try to match Pats[1..] against all possible
159     // tail strings of S to see at least one pattern succeeds.
160     if (Pats[0].size() == 0) {
161       Pats = Pats.slice(1);
162       if (Pats.empty())
163         // Fast path. If a pattern is '*', it matches anything.
164         return true;
165       for (size_t I = 0, E = S.size(); I < E; ++I)
166         if (matchOne(Pats, S.substr(I)))
167           return true;
168       return false;
169     }
170 
171     // If Pats[0] is not '*', it must consume one character.
172     if (S.empty() || !Pats[0][(uint8_t)S[0]])
173       return false;
174     Pats = Pats.slice(1);
175     S = S.substr(1);
176   }
177 }
178