10b57cec5SDimitry Andric //===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric // This file implements a glob pattern matcher.
100b57cec5SDimitry Andric //
110b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
120b57cec5SDimitry Andric 
130b57cec5SDimitry Andric #include "llvm/Support/GlobPattern.h"
140b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h"
150b57cec5SDimitry Andric #include "llvm/Support/Errc.h"
160b57cec5SDimitry Andric 
170b57cec5SDimitry Andric using namespace llvm;
180b57cec5SDimitry Andric 
190b57cec5SDimitry Andric // Expands character ranges and returns a bitmap.
200b57cec5SDimitry Andric // For example, "a-cf-hz" is expanded to "abcfghz".
expand(StringRef S,StringRef Original)210b57cec5SDimitry Andric static Expected<BitVector> expand(StringRef S, StringRef Original) {
220b57cec5SDimitry Andric   BitVector BV(256, false);
230b57cec5SDimitry Andric 
240b57cec5SDimitry Andric   // Expand X-Y.
250b57cec5SDimitry Andric   for (;;) {
260b57cec5SDimitry Andric     if (S.size() < 3)
270b57cec5SDimitry Andric       break;
280b57cec5SDimitry Andric 
290b57cec5SDimitry Andric     uint8_t Start = S[0];
300b57cec5SDimitry Andric     uint8_t End = S[2];
310b57cec5SDimitry Andric 
320b57cec5SDimitry Andric     // If it doesn't start with something like X-Y,
330b57cec5SDimitry Andric     // consume the first character and proceed.
340b57cec5SDimitry Andric     if (S[1] != '-') {
350b57cec5SDimitry Andric       BV[Start] = true;
360b57cec5SDimitry Andric       S = S.substr(1);
370b57cec5SDimitry Andric       continue;
380b57cec5SDimitry Andric     }
390b57cec5SDimitry Andric 
400b57cec5SDimitry Andric     // It must be in the form of X-Y.
410b57cec5SDimitry Andric     // Validate it and then interpret the range.
420b57cec5SDimitry Andric     if (Start > End)
430b57cec5SDimitry Andric       return make_error<StringError>("invalid glob pattern: " + Original,
440b57cec5SDimitry Andric                                      errc::invalid_argument);
450b57cec5SDimitry Andric 
460b57cec5SDimitry Andric     for (int C = Start; C <= End; ++C)
470b57cec5SDimitry Andric       BV[(uint8_t)C] = true;
480b57cec5SDimitry Andric     S = S.substr(3);
490b57cec5SDimitry Andric   }
500b57cec5SDimitry Andric 
510b57cec5SDimitry Andric   for (char C : S)
520b57cec5SDimitry Andric     BV[(uint8_t)C] = true;
530b57cec5SDimitry Andric   return BV;
540b57cec5SDimitry Andric }
550b57cec5SDimitry Andric 
565f757f3fSDimitry Andric // Identify brace expansions in S and return the list of patterns they expand
575f757f3fSDimitry Andric // into.
585f757f3fSDimitry Andric static Expected<SmallVector<std::string, 1>>
parseBraceExpansions(StringRef S,std::optional<size_t> MaxSubPatterns)595f757f3fSDimitry Andric parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
605f757f3fSDimitry Andric   SmallVector<std::string> SubPatterns = {S.str()};
615f757f3fSDimitry Andric   if (!MaxSubPatterns || !S.contains('{'))
625f757f3fSDimitry Andric     return std::move(SubPatterns);
635f757f3fSDimitry Andric 
645f757f3fSDimitry Andric   struct BraceExpansion {
655f757f3fSDimitry Andric     size_t Start;
665f757f3fSDimitry Andric     size_t Length;
675f757f3fSDimitry Andric     SmallVector<StringRef, 2> Terms;
685f757f3fSDimitry Andric   };
695f757f3fSDimitry Andric   SmallVector<BraceExpansion, 0> BraceExpansions;
705f757f3fSDimitry Andric 
715f757f3fSDimitry Andric   BraceExpansion *CurrentBE = nullptr;
725f757f3fSDimitry Andric   size_t TermBegin;
735f757f3fSDimitry Andric   for (size_t I = 0, E = S.size(); I != E; ++I) {
745f757f3fSDimitry Andric     if (S[I] == '[') {
755f757f3fSDimitry Andric       I = S.find(']', I + 2);
765f757f3fSDimitry Andric       if (I == std::string::npos)
775f757f3fSDimitry Andric         return make_error<StringError>("invalid glob pattern, unmatched '['",
785f757f3fSDimitry Andric                                        errc::invalid_argument);
795f757f3fSDimitry Andric     } else if (S[I] == '{') {
805f757f3fSDimitry Andric       if (CurrentBE)
815f757f3fSDimitry Andric         return make_error<StringError>(
825f757f3fSDimitry Andric             "nested brace expansions are not supported",
835f757f3fSDimitry Andric             errc::invalid_argument);
845f757f3fSDimitry Andric       CurrentBE = &BraceExpansions.emplace_back();
855f757f3fSDimitry Andric       CurrentBE->Start = I;
865f757f3fSDimitry Andric       TermBegin = I + 1;
875f757f3fSDimitry Andric     } else if (S[I] == ',') {
885f757f3fSDimitry Andric       if (!CurrentBE)
895f757f3fSDimitry Andric         continue;
905f757f3fSDimitry Andric       CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
915f757f3fSDimitry Andric       TermBegin = I + 1;
925f757f3fSDimitry Andric     } else if (S[I] == '}') {
935f757f3fSDimitry Andric       if (!CurrentBE)
945f757f3fSDimitry Andric         continue;
955f757f3fSDimitry Andric       if (CurrentBE->Terms.empty())
965f757f3fSDimitry Andric         return make_error<StringError>(
975f757f3fSDimitry Andric             "empty or singleton brace expansions are not supported",
985f757f3fSDimitry Andric             errc::invalid_argument);
995f757f3fSDimitry Andric       CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
1005f757f3fSDimitry Andric       CurrentBE->Length = I - CurrentBE->Start + 1;
1015f757f3fSDimitry Andric       CurrentBE = nullptr;
1025f757f3fSDimitry Andric     } else if (S[I] == '\\') {
1035f757f3fSDimitry Andric       if (++I == E)
1045f757f3fSDimitry Andric         return make_error<StringError>("invalid glob pattern, stray '\\'",
1055f757f3fSDimitry Andric                                        errc::invalid_argument);
1065f757f3fSDimitry Andric     }
1075f757f3fSDimitry Andric   }
1085f757f3fSDimitry Andric   if (CurrentBE)
1095f757f3fSDimitry Andric     return make_error<StringError>("incomplete brace expansion",
1100b57cec5SDimitry Andric                                    errc::invalid_argument);
1110b57cec5SDimitry Andric 
1125f757f3fSDimitry Andric   size_t NumSubPatterns = 1;
1135f757f3fSDimitry Andric   for (auto &BE : BraceExpansions) {
1145f757f3fSDimitry Andric     if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) {
1155f757f3fSDimitry Andric       NumSubPatterns = std::numeric_limits<size_t>::max();
1165f757f3fSDimitry Andric       break;
1170b57cec5SDimitry Andric     }
1185f757f3fSDimitry Andric     NumSubPatterns *= BE.Terms.size();
1190b57cec5SDimitry Andric   }
1205f757f3fSDimitry Andric   if (NumSubPatterns > *MaxSubPatterns)
1215f757f3fSDimitry Andric     return make_error<StringError>("too many brace expansions",
1225f757f3fSDimitry Andric                                    errc::invalid_argument);
1235f757f3fSDimitry Andric   // Replace brace expansions in reverse order so that we don't invalidate
1245f757f3fSDimitry Andric   // earlier start indices
1255f757f3fSDimitry Andric   for (auto &BE : reverse(BraceExpansions)) {
1265f757f3fSDimitry Andric     SmallVector<std::string> OrigSubPatterns;
1275f757f3fSDimitry Andric     std::swap(SubPatterns, OrigSubPatterns);
1285f757f3fSDimitry Andric     for (StringRef Term : BE.Terms)
1295f757f3fSDimitry Andric       for (StringRef Orig : OrigSubPatterns)
1305f757f3fSDimitry Andric         SubPatterns.emplace_back(Orig).replace(BE.Start, BE.Length, Term);
1310b57cec5SDimitry Andric   }
1325f757f3fSDimitry Andric   return std::move(SubPatterns);
1330b57cec5SDimitry Andric }
1340b57cec5SDimitry Andric 
1355f757f3fSDimitry Andric Expected<GlobPattern>
create(StringRef S,std::optional<size_t> MaxSubPatterns)1365f757f3fSDimitry Andric GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
1370b57cec5SDimitry Andric   GlobPattern Pat;
1380b57cec5SDimitry Andric 
1395f757f3fSDimitry Andric   // Store the prefix that does not contain any metacharacter.
1405f757f3fSDimitry Andric   size_t PrefixSize = S.find_first_of("?*[{\\");
1415f757f3fSDimitry Andric   Pat.Prefix = S.substr(0, PrefixSize);
1425f757f3fSDimitry Andric   if (PrefixSize == std::string::npos)
1435f757f3fSDimitry Andric     return Pat;
1445f757f3fSDimitry Andric   S = S.substr(PrefixSize);
1455f757f3fSDimitry Andric 
1465f757f3fSDimitry Andric   SmallVector<std::string, 1> SubPats;
1475f757f3fSDimitry Andric   if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(SubPats))
1485f757f3fSDimitry Andric     return std::move(Err);
1495f757f3fSDimitry Andric   for (StringRef SubPat : SubPats) {
1505f757f3fSDimitry Andric     auto SubGlobOrErr = SubGlobPattern::create(SubPat);
1515f757f3fSDimitry Andric     if (!SubGlobOrErr)
1525f757f3fSDimitry Andric       return SubGlobOrErr.takeError();
1535f757f3fSDimitry Andric     Pat.SubGlobs.push_back(*SubGlobOrErr);
1545f757f3fSDimitry Andric   }
1555f757f3fSDimitry Andric 
1560b57cec5SDimitry Andric   return Pat;
1570b57cec5SDimitry Andric }
1580b57cec5SDimitry Andric 
1595f757f3fSDimitry Andric Expected<GlobPattern::SubGlobPattern>
create(StringRef S)1605f757f3fSDimitry Andric GlobPattern::SubGlobPattern::create(StringRef S) {
1615f757f3fSDimitry Andric   SubGlobPattern Pat;
1620b57cec5SDimitry Andric 
1635f757f3fSDimitry Andric   // Parse brackets.
1645f757f3fSDimitry Andric   Pat.Pat.assign(S.begin(), S.end());
1655f757f3fSDimitry Andric   for (size_t I = 0, E = S.size(); I != E; ++I) {
1665f757f3fSDimitry Andric     if (S[I] == '[') {
1675f757f3fSDimitry Andric       // ']' is allowed as the first character of a character class. '[]' is
1685f757f3fSDimitry Andric       // invalid. So, just skip the first character.
1695f757f3fSDimitry Andric       ++I;
1705f757f3fSDimitry Andric       size_t J = S.find(']', I + 1);
1715f757f3fSDimitry Andric       if (J == StringRef::npos)
1725f757f3fSDimitry Andric         return make_error<StringError>("invalid glob pattern, unmatched '['",
1735f757f3fSDimitry Andric                                        errc::invalid_argument);
1745f757f3fSDimitry Andric       StringRef Chars = S.substr(I, J - I);
1755f757f3fSDimitry Andric       bool Invert = S[I] == '^' || S[I] == '!';
1765f757f3fSDimitry Andric       Expected<BitVector> BV =
1775f757f3fSDimitry Andric           Invert ? expand(Chars.substr(1), S) : expand(Chars, S);
1780b57cec5SDimitry Andric       if (!BV)
1790b57cec5SDimitry Andric         return BV.takeError();
1805f757f3fSDimitry Andric       if (Invert)
1815f757f3fSDimitry Andric         BV->flip();
1825f757f3fSDimitry Andric       Pat.Brackets.push_back(Bracket{J + 1, std::move(*BV)});
1835f757f3fSDimitry Andric       I = J;
1845f757f3fSDimitry Andric     } else if (S[I] == '\\') {
1855f757f3fSDimitry Andric       if (++I == E)
1865f757f3fSDimitry Andric         return make_error<StringError>("invalid glob pattern, stray '\\'",
1875f757f3fSDimitry Andric                                        errc::invalid_argument);
1885f757f3fSDimitry Andric     }
1890b57cec5SDimitry Andric   }
1900b57cec5SDimitry Andric   return Pat;
1910b57cec5SDimitry Andric }
1920b57cec5SDimitry Andric 
match(StringRef S) const1930b57cec5SDimitry Andric bool GlobPattern::match(StringRef S) const {
1945f757f3fSDimitry Andric   if (!S.consume_front(Prefix))
1955f757f3fSDimitry Andric     return false;
1965f757f3fSDimitry Andric   if (SubGlobs.empty() && S.empty())
1970b57cec5SDimitry Andric     return true;
1985f757f3fSDimitry Andric   for (auto &Glob : SubGlobs)
1995f757f3fSDimitry Andric     if (Glob.match(S))
2000b57cec5SDimitry Andric       return true;
2010b57cec5SDimitry Andric   return false;
2020b57cec5SDimitry Andric }
2030b57cec5SDimitry Andric 
2045f757f3fSDimitry Andric // Factor the pattern into segments split by '*'. The segment is matched
2055f757f3fSDimitry Andric // sequentianlly by finding the first occurrence past the end of the previous
2065f757f3fSDimitry Andric // match.
match(StringRef Str) const2075f757f3fSDimitry Andric bool GlobPattern::SubGlobPattern::match(StringRef Str) const {
2085f757f3fSDimitry Andric   const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(),
2095f757f3fSDimitry Andric              *SavedS = S;
2105f757f3fSDimitry Andric   const char *const PEnd = P + Pat.size(), *const End = S + Str.size();
2115f757f3fSDimitry Andric   size_t B = 0, SavedB = 0;
2125f757f3fSDimitry Andric   while (S != End) {
2135f757f3fSDimitry Andric     if (P == PEnd)
2145f757f3fSDimitry Andric       ;
2155f757f3fSDimitry Andric     else if (*P == '*') {
2165f757f3fSDimitry Andric       // The non-* substring on the left of '*' matches the tail of S. Save the
2175f757f3fSDimitry Andric       // positions to be used by backtracking if we see a mismatch later.
2185f757f3fSDimitry Andric       SegmentBegin = ++P;
2195f757f3fSDimitry Andric       SavedS = S;
2205f757f3fSDimitry Andric       SavedB = B;
2215f757f3fSDimitry Andric       continue;
2225f757f3fSDimitry Andric     } else if (*P == '[') {
2235f757f3fSDimitry Andric       if (Brackets[B].Bytes[uint8_t(*S)]) {
2245f757f3fSDimitry Andric         P = Pat.data() + Brackets[B++].NextOffset;
2255f757f3fSDimitry Andric         ++S;
2265f757f3fSDimitry Andric         continue;
2270b57cec5SDimitry Andric       }
2285f757f3fSDimitry Andric     } else if (*P == '\\') {
2295f757f3fSDimitry Andric       if (*++P == *S) {
2305f757f3fSDimitry Andric         ++P;
2315f757f3fSDimitry Andric         ++S;
2325f757f3fSDimitry Andric         continue;
2335f757f3fSDimitry Andric       }
2345f757f3fSDimitry Andric     } else if (*P == *S || *P == '?') {
2355f757f3fSDimitry Andric       ++P;
2365f757f3fSDimitry Andric       ++S;
2375f757f3fSDimitry Andric       continue;
2385f757f3fSDimitry Andric     }
2395f757f3fSDimitry Andric     if (!SegmentBegin)
2405f757f3fSDimitry Andric       return false;
2415f757f3fSDimitry Andric     // We have seen a '*'. Backtrack to the saved positions. Shift the S
2425f757f3fSDimitry Andric     // position to probe the next starting position in the segment.
2435f757f3fSDimitry Andric     P = SegmentBegin;
2445f757f3fSDimitry Andric     S = ++SavedS;
2455f757f3fSDimitry Andric     B = SavedB;
2465f757f3fSDimitry Andric   }
2475f757f3fSDimitry Andric   // All bytes in Str have been matched. Return true if the rest part of Pat is
2485f757f3fSDimitry Andric   // empty or contains only '*'.
2495f757f3fSDimitry Andric   return getPat().find_first_not_of('*', P - Pat.data()) == std::string::npos;
2500b57cec5SDimitry Andric }
251