1 //===-- GlobPattern.h - glob pattern matcher implementation -*- C++ -*-----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a glob pattern matcher.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef LLVM_SUPPORT_GLOBPATTERN_H
14 #define LLVM_SUPPORT_GLOBPATTERN_H
15 
16 #include "llvm/ADT/BitVector.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/StringRef.h"
19 #include "llvm/Support/Error.h"
20 #include <optional>
21 
22 namespace llvm {
23 
24 /// This class implements a glob pattern matcher similar to the one found in
25 /// bash, but with some key differences. Namely, that \p "*" matches all
26 /// characters and does not exclude path separators.
27 ///
28 /// * \p "?" matches a single character.
29 /// * \p "*" matches zero or more characters.
30 /// * \p "[<chars>]" matches one character in the bracket. Character ranges,
31 ///   e.g., \p "[a-z]", and negative sets via \p "[^ab]" or \p "[!ab]" are also
32 ///   supported.
33 /// * \p "{<glob>,...}" matches one of the globs in the list. Nested brace
34 ///   expansions are not supported. If \p MaxSubPatterns is empty then
35 ///   brace expansions are not supported and characters \p "{,}" are treated as
36 ///   literals.
37 /// * \p "\" escapes the next character so it is treated as a literal.
38 ///
39 ///
40 /// Some known edge cases are:
41 /// * \p "]" is allowed as the first character in a character class, i.e.,
42 ///   \p "[]]" is valid and matches the literal \p "]".
43 /// * The empty character class, i.e., \p "[]", is invalid.
44 /// * Empty or singleton brace expansions, e.g., \p "{}", \p "{a}", are invalid.
45 /// * \p "}" and \p "," that are not inside a brace expansion are taken as
46 ///   literals, e.g., \p ",}" is valid but \p "{" is not.
47 ///
48 ///
49 /// For example, \p "*[/\\]foo.{c,cpp}" will match (unix or windows) paths to
50 /// all files named \p "foo.c" or \p "foo.cpp".
51 class GlobPattern {
52 public:
53   /// \param Pat the pattern to match against
54   /// \param MaxSubPatterns if provided limit the number of allowed subpatterns
55   ///                       created from expanding braces otherwise disable
56   ///                       brace expansion
57   static Expected<GlobPattern>
58   create(StringRef Pat, std::optional<size_t> MaxSubPatterns = {});
59   /// \returns \p true if \p S matches this glob pattern
60   bool match(StringRef S) const;
61 
62   // Returns true for glob pattern "*". Can be used to avoid expensive
63   // preparation/acquisition of the input for match().
isTrivialMatchAll()64   bool isTrivialMatchAll() const {
65     if (!Prefix.empty())
66       return false;
67     if (SubGlobs.size() != 1)
68       return false;
69     return SubGlobs[0].getPat() == "*";
70   }
71 
72 private:
73   StringRef Prefix;
74 
75   struct SubGlobPattern {
76     /// \param Pat the pattern to match against
77     static Expected<SubGlobPattern> create(StringRef Pat);
78     /// \returns \p true if \p S matches this glob pattern
79     bool match(StringRef S) const;
getPatSubGlobPattern80     StringRef getPat() const { return StringRef(Pat.data(), Pat.size()); }
81 
82     // Brackets with their end position and matched bytes.
83     struct Bracket {
84       size_t NextOffset;
85       BitVector Bytes;
86     };
87     SmallVector<Bracket, 0> Brackets;
88     SmallVector<char, 0> Pat;
89   };
90   SmallVector<SubGlobPattern, 1> SubGlobs;
91 };
92 }
93 
94 #endif // LLVM_SUPPORT_GLOBPATTERN_H
95