xref: /openbsd/gnu/llvm/llvm/utils/lit/lit/ShUtil.py (revision 09467b48)
1*09467b48Spatrickfrom __future__ import absolute_import
2*09467b48Spatrickimport itertools
3*09467b48Spatrick
4*09467b48Spatrickimport lit.util
5*09467b48Spatrickfrom lit.ShCommands import Command, GlobItem, Pipeline, Seq
6*09467b48Spatrick
class ShLexer:
    """
    Lexer for the small 'sh'-like command language.

    Tokens produced by lex() are one of:
      * a plain string (an argument),
      * a GlobItem (an argument containing unquoted '*' or '?'),
      * a 1-tuple holding an operator, e.g. ('|',) or ('>>',),
      * a 2-tuple (operator, fd) for redirections written as '2>'.
    """

    def __init__(self, data, win32Escapes = False):
        """
        data - the command line text to lex.
        win32Escapes - when true, a backslash outside of quotes is an ordinary
        character rather than an escape.
        """
        self.data = data
        self.pos = 0
        self.end = len(data)
        self.win32Escapes = win32Escapes

    def eat(self):
        """
        eat - Consume and return the next character.  Callers must ensure the
        lexer is not at the end of the input. """
        c = self.data[self.pos]
        self.pos += 1
        return c

    def look(self):
        """
        look - Return the next character without consuming it.  Callers must
        ensure the lexer is not at the end of the input. """
        return self.data[self.pos]

    def maybe_eat(self, c):
        """
        maybe_eat(c) - Consume the character c if it is the next character,
        returning True if a character was consumed. """
        # An operator may be the very last character of the input (e.g.
        # "a |"); there is nothing to consume then, which is not an error
        # at this level.
        if self.pos != self.end and self.data[self.pos] == c:
            self.pos += 1
            return True
        return False

    def lex_arg_fast(self, c):
        """
        lex_arg_fast(c) - Try to lex an argument which contains no quoting,
        escapes or operator characters.  c is the already consumed first
        character.  Returns the argument, or None (leaving the position
        untouched) when the slow path is required. """
        # Get the leading whitespace free section.
        chunk = self.data[self.pos - 1:].split(None, 1)[0]

        # If it has special characters, the fast path failed.
        if any(special in chunk for special in '|&<>\'";\\'):
            return None

        self.pos = self.pos - 1 + len(chunk)
        if '*' in chunk or '?' in chunk:
            return GlobItem(chunk)
        return chunk

    def lex_arg_slow(self, c):
        """
        lex_arg_slow(c) - Lex an argument one character at a time, handling
        quotes, escapes and the 'N>' redirection form.  c is the already
        consumed first character.  Returns a string, a GlobItem, or an
        (operator, fd) tuple. """
        if c in "'\"":
            text = self.lex_arg_quoted(c)
        else:
            text = c
        unquoted_glob_char = False
        quoted_glob_char = False
        while self.pos != self.end:
            c = self.look()
            if c.isspace() or c in "|&;":
                break
            elif c in '><':
                # This is an annoying case; we treat '2>' as a single token so
                # we don't have to track whitespace tokens.

                # If the parse string isn't an integer, do the usual thing.
                if not text.isdigit():
                    break

                # Otherwise, lex the operator and convert to a redirection
                # token.
                num = int(text)
                tok = self.lex_one_token()
                assert isinstance(tok, tuple) and len(tok) == 1
                return (tok[0], num)
            elif c == '"' or c == "'":
                self.eat()
                quoted_arg = self.lex_arg_quoted(c)
                if '*' in quoted_arg or '?' in quoted_arg:
                    quoted_glob_char = True
                text += quoted_arg
            elif not self.win32Escapes and c == '\\':
                # Outside of a string, '\\' escapes everything.
                self.eat()
                if self.pos == self.end:
                    lit.util.warning(
                        "escape at end of quoted argument in: %r" % self.data)
                    return text
                text += self.eat()
            elif c in '*?':
                unquoted_glob_char = True
                text += self.eat()
            else:
                text += self.eat()
        # If a quote character is present, lex_arg_quoted will remove the quotes
        # and append the argument directly.  This causes a problem when the
        # quoted portion contains a glob character, as the character will no
        # longer be treated literally.  If glob characters occur *only* inside
        # of quotes, then we can handle this by not globbing at all, and if
        # glob characters occur *only* outside of quotes, we can still glob just
        # fine.  But if a glob character occurs both inside and outside of
        # quotes this presents a problem.  In practice this is such an obscure
        # edge case that it doesn't seem worth the added complexity to support.
        # By adding an assertion, it means some bot somewhere will catch this
        # and flag the user of a non-portable test (which could almost certainly
        # be re-written to work correctly without triggering this).
        assert not (quoted_glob_char and unquoted_glob_char)
        return GlobItem(text) if unquoted_glob_char else text

    def lex_arg_quoted(self, delim):
        """
        lex_arg_quoted(delim) - Lex the remainder of a quoted string whose
        opening delim has already been consumed.  Returns the contents without
        the quotes; warns and returns what was read if the string is not
        terminated. """
        text = ''
        while self.pos != self.end:
            c = self.eat()
            if c == delim:
                return text
            elif c == '\\' and delim == '"':
                # Inside a '"' quoted string, '\\' only escapes the quote
                # character and backslash, otherwise it is preserved.
                if self.pos == self.end:
                    lit.util.warning(
                        "escape at end of quoted argument in: %r" % self.data)
                    return text
                c = self.eat()
                if c == '"':
                    text += '"'
                elif c == '\\':
                    text += '\\'
                else:
                    text += '\\' + c
            else:
                text += c
        lit.util.warning("missing quote character in %r" % self.data)
        return text

    def lex_arg_checked(self, c):
        """
        lex_arg_checked(c) - Debugging variant of lex_arg which runs both the
        fast and slow paths and raises ValueError if they disagree on either
        the result or the final position. """
        pos = self.pos
        res = self.lex_arg_fast(c)
        end = self.pos

        self.pos = pos
        reference = self.lex_arg_slow(c)
        if res is not None:
            if res != reference:
                raise ValueError("Fast path failure: %r != %r" % (
                        res, reference))
            if self.pos != end:
                raise ValueError("Fast path failure: %r != %r" % (
                        self.pos, end))
        return reference

    def lex_arg(self, c):
        """
        lex_arg(c) - Lex an argument whose first character c has already been
        consumed, using the fast path when possible. """
        res = self.lex_arg_fast(c)
        if res is None:
            res = self.lex_arg_slow(c)
        return res

    def lex_one_token(self):
        """
        lex_one_token - Lex a single 'sh' token. """

        c = self.eat()
        if c == ';':
            return (c,)
        if c == '|':
            if self.maybe_eat('|'):
                return ('||',)
            return (c,)
        if c == '&':
            if self.maybe_eat('&'):
                return ('&&',)
            if self.maybe_eat('>'):
                return ('&>',)
            return (c,)
        if c == '>':
            if self.maybe_eat('&'):
                return ('>&',)
            if self.maybe_eat('>'):
                return ('>>',)
            return (c,)
        if c == '<':
            if self.maybe_eat('&'):
                return ('<&',)
            # Report the operator that was actually written: '<<' for a
            # doubled '<' and '<>' for '<' followed by '>'.
            if self.maybe_eat('<'):
                return ('<<',)
            if self.maybe_eat('>'):
                return ('<>',)
            return (c,)

        return self.lex_arg(c)

    def lex(self):
        """
        lex - Generate the tokens of the input, skipping whitespace between
        them. """
        while self.pos != self.end:
            if self.look().isspace():
                self.eat()
            else:
                yield self.lex_one_token()
186*09467b48Spatrick
187*09467b48Spatrick###
188*09467b48Spatrick
class ShParser:
    """
    Parser which turns the token stream of ShLexer into Command, Pipeline
    and Seq objects.
    """

    def __init__(self, data, win32Escapes = False, pipefail = False):
        """
        data - the command line text to parse.
        win32Escapes - forwarded to ShLexer.
        pipefail - forwarded to every Pipeline this parser creates.
        """
        self.data = data
        self.pipefail = pipefail
        self.tokens = ShLexer(data, win32Escapes = win32Escapes).lex()

    def lex(self):
        """
        lex - Consume and return the next token, or None when the token
        stream is exhausted. """
        return next(iter(self.tokens), None)

    def look(self):
        """
        look - Return the next token without consuming it, or None when the
        token stream is exhausted. """
        upcoming = self.lex()
        if upcoming is None:
            return None
        # Push the token back in front of the remaining stream.
        self.tokens = itertools.chain([upcoming], self.tokens)
        return upcoming

    def parse_command(self):
        """
        parse_command - Parse one command: its arguments together with any
        redirections, stopping before a terminating operator or at the end
        of the input.  Raises ValueError on an empty command, on a leading
        operator, or on a redirection without a target. """
        head = self.lex()
        if not head:
            raise ValueError("empty command!")
        if isinstance(head, tuple):
            raise ValueError("syntax error near unexpected token %r" % head[0])

        terminators = ('|', ';', '&', '||', '&&')
        args = [head]
        redirects = []

        tok = self.look()
        while tok is not None:
            if isinstance(tok, (str, GlobItem)):
                # A plain argument belongs to the current command.
                args.append(self.lex())
            else:
                assert isinstance(tok, tuple)
                # A terminator ends this command and is left unconsumed.
                if tok[0] in terminators:
                    break

                # Any other operator is a redirection and needs a target.
                op = self.lex()
                target = self.lex()
                if not target:
                    raise ValueError("syntax error near token %r" % op[0])
                redirects.append((op, target))
            tok = self.look()

        return Command(args, redirects)

    def parse_pipeline(self):
        """
        parse_pipeline - Parse one or more commands joined by '|' into a
        Pipeline.  The pipeline is never negated. """
        commands = [self.parse_command()]
        while self.look() == ('|',):
            # Drop the '|' and parse the next stage.
            self.lex()
            commands.append(self.parse_command())
        return Pipeline(commands, False, self.pipefail)

    def parse(self):
        """
        parse - Parse the whole input.  Returns a single Pipeline, or a
        left-nested chain of Seq objects when pipelines are joined by
        operators.  Raises ValueError when an operator has no right-hand
        side. """
        result = self.parse_pipeline()

        while self.look():
            op_token = self.lex()
            assert isinstance(op_token, tuple) and len(op_token) == 1

            if not self.look():
                raise ValueError(
                    "missing argument to operator %r" % op_token[0])

            # FIXME: Operator precedence!!
            result = Seq(result, op_token[0], self.parse_pipeline())

        return result
265*09467b48Spatrick
266