xref: /openbsd/gnu/llvm/llvm/utils/lit/lit/ShUtil.py (revision 09467b48)
from __future__ import absolute_import
import itertools

import lit.util
from lit.ShCommands import Command, GlobItem, Pipeline, Seq

class ShLexer:
    def __init__(self, data, win32Escapes = False):
        self.data = data
        self.pos = 0
        self.end = len(data)
        self.win32Escapes = win32Escapes

    def eat(self):
        c = self.data[self.pos]
        self.pos += 1
        return c

    def look(self):
        return self.data[self.pos]

    def maybe_eat(self, c):
        """
        maybe_eat(c) - Consume the character c if it is the next character,
        returning True if a character was consumed. """
        if self.data[self.pos] == c:
            self.pos += 1
            return True
        return False
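
    # For example (an illustrative sketch, not in the original file): with
    # data == "&&" and pos == 0, eat() returns '&' and maybe_eat('&') then
    # consumes the second '&' and returns True.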

    def lex_arg_fast(self, c):
        # Get the leading whitespace-free section.
        chunk = self.data[self.pos - 1:].split(None, 1)[0]

        # If it has special characters, the fast path fails and the caller
        # falls back to lex_arg_slow.
        if ('|' in chunk or '&' in chunk or
            '<' in chunk or '>' in chunk or
            "'" in chunk or '"' in chunk or
            ';' in chunk or '\\' in chunk):
            return None

        self.pos = self.pos - 1 + len(chunk)
        return GlobItem(chunk) if '*' in chunk or '?' in chunk else chunk
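
    # Example for lex_arg_fast (an illustrative sketch, not in the original
    # file): after eating the leading 'e' of "echo hi", the whitespace-free
    # chunk is "echo", which has no special characters, so the fast path
    # returns the plain string 'echo'.  "*.txt" would come back wrapped in a
    # GlobItem, and "2>out" makes the fast path return None because of '>'.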

    def lex_arg_slow(self, c):
        if c in "'\"":
            str = self.lex_arg_quoted(c)
        else:
            str = c
        unquoted_glob_char = False
        quoted_glob_char = False
        while self.pos != self.end:
            c = self.look()
            if c.isspace() or c in "|&;":
                break
            elif c in '><':
                # This is an annoying case; we treat '2>' as a single token so
                # we don't have to track whitespace tokens.

                # If the string parsed so far isn't an integer, do the usual
                # thing.
                if not str.isdigit():
                    break

                # Otherwise, lex the operator and convert to a redirection
                # token.
                num = int(str)
                tok = self.lex_one_token()
                assert isinstance(tok, tuple) and len(tok) == 1
                return (tok[0], num)
            elif c == '"' or c == "'":
                self.eat()
                quoted_arg = self.lex_arg_quoted(c)
                if '*' in quoted_arg or '?' in quoted_arg:
                    quoted_glob_char = True
                str += quoted_arg
            elif not self.win32Escapes and c == '\\':
                # Outside of a string, '\\' escapes everything.
                self.eat()
                if self.pos == self.end:
                    lit.util.warning(
                        "escape at end of quoted argument in: %r" % self.data)
                    return str
                str += self.eat()
            elif c in '*?':
                unquoted_glob_char = True
                str += self.eat()
            else:
                str += self.eat()
        # If a quote character is present, lex_arg_quoted will remove the quotes
        # and append the argument directly.  This causes a problem when the
        # quoted portion contains a glob character, as the character will no
        # longer be treated literally.  If glob characters occur *only* inside
        # of quotes, then we can handle this by not globbing at all, and if
        # glob characters occur *only* outside of quotes, we can still glob just
        # fine.  But if a glob character occurs both inside and outside of
        # quotes this presents a problem.  In practice this is such an obscure
        # edge case that it doesn't seem worth the added complexity to support.
        # Adding an assertion means some bot somewhere will catch this and
        # flag the use of a non-portable test (which could almost certainly
        # be re-written to work correctly without triggering it).
        assert not (quoted_glob_char and unquoted_glob_char)
        return GlobItem(str) if unquoted_glob_char else str
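
    # Examples for lex_arg_slow (illustrative sketches, not in the original
    # file): in "echo hi 2> errs" the argument starting with '2' becomes the
    # redirection token ('>', 2), since the leading digits are folded into
    # the operator.  An unquoted *.tmp argument is returned as a GlobItem,
    # while a quoted '*.tmp' is returned as the plain string '*.tmp', so it
    # is never globbed.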

    def lex_arg_quoted(self, delim):
        str = ''
        while self.pos != self.end:
            c = self.eat()
            if c == delim:
                return str
            elif c == '\\' and delim == '"':
                # Inside a '"' quoted string, '\\' only escapes the quote
                # character and backslash, otherwise it is preserved.
                if self.pos == self.end:
                    lit.util.warning(
                        "escape at end of quoted argument in: %r" % self.data)
                    return str
                c = self.eat()
                if c == '"':
                    str += '"'
                elif c == '\\':
                    str += '\\'
                else:
                    str += '\\' + c
            else:
                str += c
        lit.util.warning("missing quote character in %r" % self.data)
        return str
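
    # Examples for lex_arg_quoted (illustrative sketches, not in the original
    # file): inside double quotes, \" becomes '"' and \\ becomes a single
    # backslash, but \n is preserved as the two characters backslash + 'n';
    # inside single quotes nothing is escaped and the text is taken literally
    # up to the closing quote.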

    def lex_arg_checked(self, c):
        # Debugging aid: run both the fast path and the slow path and verify
        # that they agree on the result and the final position.
        pos = self.pos
        res = self.lex_arg_fast(c)
        end = self.pos

        self.pos = pos
        reference = self.lex_arg_slow(c)
        if res is not None:
            if res != reference:
                raise ValueError("Fast path failure: %r != %r" % (
                        res, reference))
            if self.pos != end:
                raise ValueError("Fast path failure: %r != %r" % (
                        self.pos, end))
        return reference

    def lex_arg(self, c):
        return self.lex_arg_fast(c) or self.lex_arg_slow(c)

    def lex_one_token(self):
        """
        lex_one_token - Lex a single 'sh' token. """

        c = self.eat()
        if c == ';':
            return (c,)
        if c == '|':
            if self.maybe_eat('|'):
                return ('||',)
            return (c,)
        if c == '&':
            if self.maybe_eat('&'):
                return ('&&',)
            if self.maybe_eat('>'):
                return ('&>',)
            return (c,)
        if c == '>':
            if self.maybe_eat('&'):
                return ('>&',)
            if self.maybe_eat('>'):
                return ('>>',)
            return (c,)
        if c == '<':
            if self.maybe_eat('&'):
                return ('<&',)
            if self.maybe_eat('>'):
                return ('<<',)
            return (c,)

        return self.lex_arg(c)
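
    # For example (an illustrative sketch, not in the original file):
    # lex_one_token on ">>" yields ('>>',); on "2>&1" it yields ('>&', 2)
    # followed by the separate argument '1'; a bare word falls through to
    # lex_arg and comes back as a plain string (or a GlobItem).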

    def lex(self):
        while self.pos != self.end:
            if self.look().isspace():
                self.eat()
            else:
                yield self.lex_one_token()
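
    # A quick end-to-end sketch (not in the original file): assuming this
    # module is importable as lit.ShUtil,
    #   list(ShLexer("echo hi 2> errs | wc").lex())
    # produces ['echo', 'hi', ('>', 2), 'errs', ('|',), 'wc'].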

###

class ShParser:
    def __init__(self, data, win32Escapes = False, pipefail = False):
        self.data = data
        self.pipefail = pipefail
        self.tokens = ShLexer(data, win32Escapes = win32Escapes).lex()

    def lex(self):
        # Return the next token, or None once the lexer is exhausted.
        for item in self.tokens:
            return item
        return None

    def look(self):
        # One-token lookahead: fetch the next token and push it back onto the
        # front of the token stream.
        token = self.lex()
        if token is not None:
            self.tokens = itertools.chain([token], self.tokens)
        return token

    def parse_command(self):
        tok = self.lex()
        if not tok:
            raise ValueError("empty command!")
        if isinstance(tok, tuple):
            raise ValueError("syntax error near unexpected token %r" % tok[0])

        args = [tok]
        redirects = []
        while 1:
            tok = self.look()

            # EOF?
            if tok is None:
                break

            # If this is an argument, just add it to the current command.
            if isinstance(tok, (str, GlobItem)):
                args.append(self.lex())
                continue

            # Otherwise see if it is a terminator.
            assert isinstance(tok, tuple)
            if tok[0] in ('|',';','&','||','&&'):
                break

            # Otherwise it must be a redirection.
            op = self.lex()
            arg = self.lex()
            if not arg:
                raise ValueError("syntax error near token %r" % op[0])
            redirects.append((op, arg))

        return Command(args, redirects)
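
    # Example for parse_command (an illustrative sketch, not in the original
    # file): parsing "echo hi > out" yields
    #   Command(['echo', 'hi'], [(('>',), 'out')])
    # i.e. the argument list plus a list of (operator-token, target) pairs.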

    def parse_pipeline(self):
        negate = False

        commands = [self.parse_command()]
        while self.look() == ('|',):
            self.lex()
            commands.append(self.parse_command())
        return Pipeline(commands, negate, self.pipefail)
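
    # Example for parse_pipeline (an illustrative sketch, not in the original
    # file): "cat x | wc -l" becomes a Pipeline of two Commands.  Note that
    # negate is always False here; nothing in this parser ever sets it.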

    def parse(self):
        lhs = self.parse_pipeline()

        while self.look():
            operator = self.lex()
            assert isinstance(operator, tuple) and len(operator) == 1

            if not self.look():
                raise ValueError(
                    "missing argument to operator %r" % operator[0])

            # FIXME: Operator precedence!!
            lhs = Seq(lhs, operator[0], self.parse_pipeline())

        return lhs

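
# A small, self-contained demo of the parser (an illustrative sketch, not part
# of the original file).  It only runs when this module is executed directly,
# and assumes the lit package (lit.util, lit.ShCommands) is importable.  Note
# that parse() folds '&&', '||', ';' and '&' left-associatively into nested
# Seq nodes, with no operator precedence (see the FIXME above), so
# "true && echo ok || echo bad" becomes Seq(Seq(..., '&&', ...), '||', ...).
if __name__ == '__main__':
    for line in ['echo hello | wc',
                 'echo hi 2> errs && cat errs',
                 'true && echo ok || echo bad']:
        print('%r -> %r' % (line, ShParser(line, pipefail=True).parse()))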