1 //===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a glob pattern matcher.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Support/GlobPattern.h"
14 #include "llvm/ADT/ArrayRef.h"
15 #include "llvm/ADT/Optional.h"
16 #include "llvm/ADT/StringRef.h"
17 #include "llvm/Support/Errc.h"
18 
19 using namespace llvm;
20 
hasWildcard(StringRef S)21 static bool hasWildcard(StringRef S) {
22   return S.find_first_of("?*[\\") != StringRef::npos;
23 }
24 
25 // Expands character ranges and returns a bitmap.
26 // For example, "a-cf-hz" is expanded to "abcfghz".
expand(StringRef S,StringRef Original)27 static Expected<BitVector> expand(StringRef S, StringRef Original) {
28   BitVector BV(256, false);
29 
30   // Expand X-Y.
31   for (;;) {
32     if (S.size() < 3)
33       break;
34 
35     uint8_t Start = S[0];
36     uint8_t End = S[2];
37 
38     // If it doesn't start with something like X-Y,
39     // consume the first character and proceed.
40     if (S[1] != '-') {
41       BV[Start] = true;
42       S = S.substr(1);
43       continue;
44     }
45 
46     // It must be in the form of X-Y.
47     // Validate it and then interpret the range.
48     if (Start > End)
49       return make_error<StringError>("invalid glob pattern: " + Original,
50                                      errc::invalid_argument);
51 
52     for (int C = Start; C <= End; ++C)
53       BV[(uint8_t)C] = true;
54     S = S.substr(3);
55   }
56 
57   for (char C : S)
58     BV[(uint8_t)C] = true;
59   return BV;
60 }
61 
62 // This is a scanner for the glob pattern.
63 // A glob pattern token is one of "*", "?", "\", "[<chars>]", "[^<chars>]"
64 // (which is a negative form of "[<chars>]"), "[!<chars>]" (which is
65 // equivalent to "[^<chars>]"), or a non-meta character.
66 // This function returns the first token in S.
scan(StringRef & S,StringRef Original)67 static Expected<BitVector> scan(StringRef &S, StringRef Original) {
68   switch (S[0]) {
69   case '*':
70     S = S.substr(1);
71     // '*' is represented by an empty bitvector.
72     // All other bitvectors are 256-bit long.
73     return BitVector();
74   case '?':
75     S = S.substr(1);
76     return BitVector(256, true);
77   case '[': {
78     // ']' is allowed as the first character of a character class. '[]' is
79     // invalid. So, just skip the first character.
80     size_t End = S.find(']', 2);
81     if (End == StringRef::npos)
82       return make_error<StringError>("invalid glob pattern: " + Original,
83                                      errc::invalid_argument);
84 
85     StringRef Chars = S.substr(1, End - 1);
86     S = S.substr(End + 1);
87     if (Chars.startswith("^") || Chars.startswith("!")) {
88       Expected<BitVector> BV = expand(Chars.substr(1), Original);
89       if (!BV)
90         return BV.takeError();
91       return BV->flip();
92     }
93     return expand(Chars, Original);
94   }
95   case '\\':
96     // Eat this character and fall through below to treat it like a non-meta
97     // character.
98     S = S.substr(1);
99     LLVM_FALLTHROUGH;
100   default:
101     BitVector BV(256, false);
102     BV[(uint8_t)S[0]] = true;
103     S = S.substr(1);
104     return BV;
105   }
106 }
107 
create(StringRef S)108 Expected<GlobPattern> GlobPattern::create(StringRef S) {
109   GlobPattern Pat;
110 
111   // S doesn't contain any metacharacter,
112   // so the regular string comparison should work.
113   if (!hasWildcard(S)) {
114     Pat.Exact = S;
115     return Pat;
116   }
117 
118   // S is something like "foo*", and the "* is not escaped. We can use
119   // startswith().
120   if (S.endswith("*") && !S.endswith("\\*") && !hasWildcard(S.drop_back())) {
121     Pat.Prefix = S.drop_back();
122     return Pat;
123   }
124 
125   // S is something like "*foo". We can use endswith().
126   if (S.startswith("*") && !hasWildcard(S.drop_front())) {
127     Pat.Suffix = S.drop_front();
128     return Pat;
129   }
130 
131   // Otherwise, we need to do real glob pattern matching.
132   // Parse the pattern now.
133   StringRef Original = S;
134   while (!S.empty()) {
135     Expected<BitVector> BV = scan(S, Original);
136     if (!BV)
137       return BV.takeError();
138     Pat.Tokens.push_back(*BV);
139   }
140   return Pat;
141 }
142 
match(StringRef S) const143 bool GlobPattern::match(StringRef S) const {
144   if (Exact)
145     return S == *Exact;
146   if (Prefix)
147     return S.startswith(*Prefix);
148   if (Suffix)
149     return S.endswith(*Suffix);
150   return matchOne(Tokens, S);
151 }
152 
153 // Runs glob pattern Pats against string S.
matchOne(ArrayRef<BitVector> Pats,StringRef S) const154 bool GlobPattern::matchOne(ArrayRef<BitVector> Pats, StringRef S) const {
155   for (;;) {
156     if (Pats.empty())
157       return S.empty();
158 
159     // If Pats[0] is '*', try to match Pats[1..] against all possible
160     // tail strings of S to see at least one pattern succeeds.
161     if (Pats[0].size() == 0) {
162       Pats = Pats.slice(1);
163       if (Pats.empty())
164         // Fast path. If a pattern is '*', it matches anything.
165         return true;
166       for (size_t I = 0, E = S.size(); I < E; ++I)
167         if (matchOne(Pats, S.substr(I)))
168           return true;
169       return false;
170     }
171 
172     // If Pats[0] is not '*', it must consume one character.
173     if (S.empty() || !Pats[0][(uint8_t)S[0]])
174       return false;
175     Pats = Pats.slice(1);
176     S = S.substr(1);
177   }
178 }
179