1 //===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a glob pattern matcher.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Support/GlobPattern.h"
14 #include "llvm/ADT/StringRef.h"
15 #include "llvm/Support/Errc.h"
16 
17 using namespace llvm;
18 
19 // Expands character ranges and returns a bitmap.
20 // For example, "a-cf-hz" is expanded to "abcfghz".
expand(StringRef S,StringRef Original)21 static Expected<BitVector> expand(StringRef S, StringRef Original) {
22   BitVector BV(256, false);
23 
24   // Expand X-Y.
25   for (;;) {
26     if (S.size() < 3)
27       break;
28 
29     uint8_t Start = S[0];
30     uint8_t End = S[2];
31 
32     // If it doesn't start with something like X-Y,
33     // consume the first character and proceed.
34     if (S[1] != '-') {
35       BV[Start] = true;
36       S = S.substr(1);
37       continue;
38     }
39 
40     // It must be in the form of X-Y.
41     // Validate it and then interpret the range.
42     if (Start > End)
43       return make_error<StringError>("invalid glob pattern: " + Original,
44                                      errc::invalid_argument);
45 
46     for (int C = Start; C <= End; ++C)
47       BV[(uint8_t)C] = true;
48     S = S.substr(3);
49   }
50 
51   for (char C : S)
52     BV[(uint8_t)C] = true;
53   return BV;
54 }
55 
56 // Identify brace expansions in S and return the list of patterns they expand
57 // into.
58 static Expected<SmallVector<std::string, 1>>
parseBraceExpansions(StringRef S,std::optional<size_t> MaxSubPatterns)59 parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
60   SmallVector<std::string> SubPatterns = {S.str()};
61   if (!MaxSubPatterns || !S.contains('{'))
62     return std::move(SubPatterns);
63 
64   struct BraceExpansion {
65     size_t Start;
66     size_t Length;
67     SmallVector<StringRef, 2> Terms;
68   };
69   SmallVector<BraceExpansion, 0> BraceExpansions;
70 
71   BraceExpansion *CurrentBE = nullptr;
72   size_t TermBegin;
73   for (size_t I = 0, E = S.size(); I != E; ++I) {
74     if (S[I] == '[') {
75       I = S.find(']', I + 2);
76       if (I == std::string::npos)
77         return make_error<StringError>("invalid glob pattern, unmatched '['",
78                                        errc::invalid_argument);
79     } else if (S[I] == '{') {
80       if (CurrentBE)
81         return make_error<StringError>(
82             "nested brace expansions are not supported",
83             errc::invalid_argument);
84       CurrentBE = &BraceExpansions.emplace_back();
85       CurrentBE->Start = I;
86       TermBegin = I + 1;
87     } else if (S[I] == ',') {
88       if (!CurrentBE)
89         continue;
90       CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
91       TermBegin = I + 1;
92     } else if (S[I] == '}') {
93       if (!CurrentBE)
94         continue;
95       if (CurrentBE->Terms.empty())
96         return make_error<StringError>(
97             "empty or singleton brace expansions are not supported",
98             errc::invalid_argument);
99       CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
100       CurrentBE->Length = I - CurrentBE->Start + 1;
101       CurrentBE = nullptr;
102     } else if (S[I] == '\\') {
103       if (++I == E)
104         return make_error<StringError>("invalid glob pattern, stray '\\'",
105                                        errc::invalid_argument);
106     }
107   }
108   if (CurrentBE)
109     return make_error<StringError>("incomplete brace expansion",
110                                    errc::invalid_argument);
111 
112   size_t NumSubPatterns = 1;
113   for (auto &BE : BraceExpansions) {
114     if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) {
115       NumSubPatterns = std::numeric_limits<size_t>::max();
116       break;
117     }
118     NumSubPatterns *= BE.Terms.size();
119   }
120   if (NumSubPatterns > *MaxSubPatterns)
121     return make_error<StringError>("too many brace expansions",
122                                    errc::invalid_argument);
123   // Replace brace expansions in reverse order so that we don't invalidate
124   // earlier start indices
125   for (auto &BE : reverse(BraceExpansions)) {
126     SmallVector<std::string> OrigSubPatterns;
127     std::swap(SubPatterns, OrigSubPatterns);
128     for (StringRef Term : BE.Terms)
129       for (StringRef Orig : OrigSubPatterns)
130         SubPatterns.emplace_back(Orig).replace(BE.Start, BE.Length, Term);
131   }
132   return std::move(SubPatterns);
133 }
134 
135 Expected<GlobPattern>
create(StringRef S,std::optional<size_t> MaxSubPatterns)136 GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
137   GlobPattern Pat;
138 
139   // Store the prefix that does not contain any metacharacter.
140   size_t PrefixSize = S.find_first_of("?*[{\\");
141   Pat.Prefix = S.substr(0, PrefixSize);
142   if (PrefixSize == std::string::npos)
143     return Pat;
144   S = S.substr(PrefixSize);
145 
146   SmallVector<std::string, 1> SubPats;
147   if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(SubPats))
148     return std::move(Err);
149   for (StringRef SubPat : SubPats) {
150     auto SubGlobOrErr = SubGlobPattern::create(SubPat);
151     if (!SubGlobOrErr)
152       return SubGlobOrErr.takeError();
153     Pat.SubGlobs.push_back(*SubGlobOrErr);
154   }
155 
156   return Pat;
157 }
158 
159 Expected<GlobPattern::SubGlobPattern>
create(StringRef S)160 GlobPattern::SubGlobPattern::create(StringRef S) {
161   SubGlobPattern Pat;
162 
163   // Parse brackets.
164   Pat.Pat.assign(S.begin(), S.end());
165   for (size_t I = 0, E = S.size(); I != E; ++I) {
166     if (S[I] == '[') {
167       // ']' is allowed as the first character of a character class. '[]' is
168       // invalid. So, just skip the first character.
169       ++I;
170       size_t J = S.find(']', I + 1);
171       if (J == StringRef::npos)
172         return make_error<StringError>("invalid glob pattern, unmatched '['",
173                                        errc::invalid_argument);
174       StringRef Chars = S.substr(I, J - I);
175       bool Invert = S[I] == '^' || S[I] == '!';
176       Expected<BitVector> BV =
177           Invert ? expand(Chars.substr(1), S) : expand(Chars, S);
178       if (!BV)
179         return BV.takeError();
180       if (Invert)
181         BV->flip();
182       Pat.Brackets.push_back(Bracket{J + 1, std::move(*BV)});
183       I = J;
184     } else if (S[I] == '\\') {
185       if (++I == E)
186         return make_error<StringError>("invalid glob pattern, stray '\\'",
187                                        errc::invalid_argument);
188     }
189   }
190   return Pat;
191 }
192 
match(StringRef S) const193 bool GlobPattern::match(StringRef S) const {
194   if (!S.consume_front(Prefix))
195     return false;
196   if (SubGlobs.empty() && S.empty())
197     return true;
198   for (auto &Glob : SubGlobs)
199     if (Glob.match(S))
200       return true;
201   return false;
202 }
203 
204 // Factor the pattern into segments split by '*'. The segment is matched
205 // sequentianlly by finding the first occurrence past the end of the previous
206 // match.
match(StringRef Str) const207 bool GlobPattern::SubGlobPattern::match(StringRef Str) const {
208   const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(),
209              *SavedS = S;
210   const char *const PEnd = P + Pat.size(), *const End = S + Str.size();
211   size_t B = 0, SavedB = 0;
212   while (S != End) {
213     if (P == PEnd)
214       ;
215     else if (*P == '*') {
216       // The non-* substring on the left of '*' matches the tail of S. Save the
217       // positions to be used by backtracking if we see a mismatch later.
218       SegmentBegin = ++P;
219       SavedS = S;
220       SavedB = B;
221       continue;
222     } else if (*P == '[') {
223       if (Brackets[B].Bytes[uint8_t(*S)]) {
224         P = Pat.data() + Brackets[B++].NextOffset;
225         ++S;
226         continue;
227       }
228     } else if (*P == '\\') {
229       if (*++P == *S) {
230         ++P;
231         ++S;
232         continue;
233       }
234     } else if (*P == *S || *P == '?') {
235       ++P;
236       ++S;
237       continue;
238     }
239     if (!SegmentBegin)
240       return false;
241     // We have seen a '*'. Backtrack to the saved positions. Shift the S
242     // position to probe the next starting position in the segment.
243     P = SegmentBegin;
244     S = ++SavedS;
245     B = SavedB;
246   }
247   // All bytes in Str have been matched. Return true if the rest part of Pat is
248   // empty or contains only '*'.
249   return getPat().find_first_not_of('*', P - Pat.data()) == std::string::npos;
250 }
251