from __future__ import absolute_import
import itertools

import lit.util
from lit.ShCommands import Command, GlobItem, Pipeline, Seq


class ShLexer:
    """Lexer for a small 'sh'-like command language.

    Yields a stream of tokens in which plain arguments are plain ``str``
    (or ``GlobItem`` when they contain unquoted glob characters) and
    operators/redirections are tuples such as ``('|',)``, ``('>>',)`` or
    ``('>', 2)`` (the integer is the file descriptor of a ``2>``-style
    redirection).
    """

    def __init__(self, data, win32Escapes=False):
        self.data = data
        self.pos = 0
        self.end = len(data)
        # When true, '\\' is NOT an escape character outside of quotes
        # (so Windows-style paths survive unmangled).
        self.win32Escapes = win32Escapes

    def eat(self):
        """Consume and return the next character."""
        c = self.data[self.pos]
        self.pos += 1
        return c

    def look(self):
        """Return the next character without consuming it."""
        return self.data[self.pos]

    def maybe_eat(self, c):
        """
        maybe_eat(c) - Consume the character c if it is the next character,
        returning True if a character was consumed. """
        if self.data[self.pos] == c:
            self.pos += 1
            return True
        return False

    def lex_arg_fast(self, c):
        """Lex an argument on the assumption that it contains no special
        characters; return None when that assumption fails and the slow
        path must be used instead."""
        # Get the leading whitespace free section.
        chunk = self.data[self.pos - 1:].split(None, 1)[0]

        # If it has special characters, the fast path failed.
        if ('|' in chunk or '&' in chunk or
            '<' in chunk or '>' in chunk or
            "'" in chunk or '"' in chunk or
            ';' in chunk or '\\' in chunk):
            return None

        self.pos = self.pos - 1 + len(chunk)
        return GlobItem(chunk) if '*' in chunk or '?' in chunk else chunk

    def lex_arg_slow(self, c):
        """Fully lex one argument, handling quoting, escapes, glob
        characters and fd-prefixed redirections (e.g. ``2>``).

        ``c`` is the already-consumed first character of the argument.
        """
        if c in "'\"":
            arg = self.lex_arg_quoted(c)
        else:
            arg = c
        unquoted_glob_char = False
        quoted_glob_char = False
        while self.pos != self.end:
            c = self.look()
            if c.isspace() or c in "|&;":
                break
            elif c in '><':
                # This is an annoying case; we treat '2>' as a single token so
                # we don't have to track whitespace tokens.

                # If the parse string isn't an integer, do the usual thing.
                if not arg.isdigit():
                    break

                # Otherwise, lex the operator and convert to a redirection
                # token.
                num = int(arg)
                tok = self.lex_one_token()
                assert isinstance(tok, tuple) and len(tok) == 1
                return (tok[0], num)
            elif c == '"' or c == "'":
                self.eat()
                quoted_arg = self.lex_arg_quoted(c)
                if '*' in quoted_arg or '?' in quoted_arg:
                    quoted_glob_char = True
                arg += quoted_arg
            elif not self.win32Escapes and c == '\\':
                # Outside of a string, '\\' escapes everything.
                self.eat()
                if self.pos == self.end:
                    lit.util.warning(
                        "escape at end of quoted argument in: %r" % self.data)
                    return arg
                arg += self.eat()
            elif c in '*?':
                unquoted_glob_char = True
                arg += self.eat()
            else:
                arg += self.eat()
        # If a quote character is present, lex_arg_quoted will remove the quotes
        # and append the argument directly.  This causes a problem when the
        # quoted portion contains a glob character, as the character will no
        # longer be treated literally.  If glob characters occur *only* inside
        # of quotes, then we can handle this by not globbing at all, and if
        # glob characters occur *only* outside of quotes, we can still glob just
        # fine.  But if a glob character occurs both inside and outside of
        # quotes this presents a problem.  In practice this is such an obscure
        # edge case that it doesn't seem worth the added complexity to support.
        # By adding an assertion, it means some bot somewhere will catch this
        # and flag the user of a non-portable test (which could almost certainly
        # be re-written to work correctly without triggering this).
        assert not (quoted_glob_char and unquoted_glob_char)
        return GlobItem(arg) if unquoted_glob_char else arg

    def lex_arg_quoted(self, delim):
        """Lex the remainder of a quoted string (the opening ``delim`` has
        already been consumed), returning its contents without the quotes.
        Inside double quotes, backslash escapes only ``"`` and ``\\``."""
        result = ''
        while self.pos != self.end:
            c = self.eat()
            if c == delim:
                return result
            elif c == '\\' and delim == '"':
                # Inside a '"' quoted string, '\\' only escapes the quote
                # character and backslash, otherwise it is preserved.
                if self.pos == self.end:
                    lit.util.warning(
                        "escape at end of quoted argument in: %r" % self.data)
                    return result
                c = self.eat()
                if c == '"':
                    result += '"'
                elif c == '\\':
                    result += '\\'
                else:
                    result += '\\' + c
            else:
                result += c
        # Ran off the end of the input without seeing the closing quote.
        lit.util.warning("missing quote character in %r" % self.data)
        return result

    def lex_arg_checked(self, c):
        """Debugging helper: run both the fast and slow argument lexers and
        raise if they disagree; returns the slow path's result."""
        pos = self.pos
        res = self.lex_arg_fast(c)
        end = self.pos

        self.pos = pos
        reference = self.lex_arg_slow(c)
        if res is not None:
            if res != reference:
                raise ValueError("Fast path failure: %r != %r" % (
                    res, reference))
            if self.pos != end:
                raise ValueError("Fast path failure: %r != %r" % (
                    self.pos, end))
        return reference

    def lex_arg(self, c):
        """Lex one argument starting with the already-consumed character c."""
        return self.lex_arg_fast(c) or self.lex_arg_slow(c)

    def lex_one_token(self):
        """
        lex_one_token - Lex a single 'sh' token. """

        c = self.eat()
        if c == ';':
            return (c,)
        if c == '|':
            if self.maybe_eat('|'):
                return ('||',)
            return (c,)
        if c == '&':
            if self.maybe_eat('&'):
                return ('&&',)
            if self.maybe_eat('>'):
                return ('&>',)
            return (c,)
        if c == '>':
            if self.maybe_eat('&'):
                return ('>&',)
            if self.maybe_eat('>'):
                return ('>>',)
            return (c,)
        if c == '<':
            if self.maybe_eat('&'):
                return ('<&',)
            # BUGFIX: this previously consumed '>' (so the input '<>' was
            # mis-tokenized as '<<', and a real '<<' was never recognized).
            # A '<<' token is two '<' characters.
            if self.maybe_eat('<'):
                return ('<<',)
            return (c,)

        return self.lex_arg(c)

    def lex(self):
        """Generate all tokens in the input, skipping whitespace."""
        while self.pos != self.end:
            if self.look().isspace():
                self.eat()
            else:
                yield self.lex_one_token()

###

class ShParser:
    """Parser producing a Command/Pipeline/Seq tree from an 'sh' string."""

    def __init__(self, data, win32Escapes=False, pipefail=False):
        self.data = data
        self.pipefail = pipefail
        self.tokens = ShLexer(data, win32Escapes=win32Escapes).lex()

    def lex(self):
        """Consume and return the next token, or None at end of input."""
        for item in self.tokens:
            return item
        return None

    def look(self):
        """Return the next token without consuming it (None at EOF)."""
        token = self.lex()
        if token is not None:
            # Push the token back onto the front of the stream.
            self.tokens = itertools.chain([token], self.tokens)
        return token

    def parse_command(self):
        """Parse a single command: arguments plus redirections.

        Raises ValueError on an empty command or a stray operator token.
        """
        tok = self.lex()
        if not tok:
            raise ValueError("empty command!")
        if isinstance(tok, tuple):
            raise ValueError("syntax error near unexpected token %r" % tok[0])

        args = [tok]
        redirects = []
        while True:
            tok = self.look()

            # EOF?
            if tok is None:
                break

            # If this is an argument, just add it to the current command.
            if isinstance(tok, (str, GlobItem)):
                args.append(self.lex())
                continue

            # Otherwise see if it is a terminator.
            assert isinstance(tok, tuple)
            if tok[0] in ('|', ';', '&', '||', '&&'):
                break

            # Otherwise it must be a redirection.
            op = self.lex()
            arg = self.lex()
            if not arg:
                raise ValueError("syntax error near token %r" % op[0])
            redirects.append((op, arg))

        return Command(args, redirects)

    def parse_pipeline(self):
        """Parse a '|'-separated sequence of commands into a Pipeline."""
        negate = False

        commands = [self.parse_command()]
        while self.look() == ('|',):
            self.lex()
            commands.append(self.parse_command())
        return Pipeline(commands, negate, self.pipefail)

    def parse(self):
        """Parse the whole input into a tree of Seq/Pipeline nodes.

        Operators (';', '&', '&&', '||') are folded left-associatively;
        raises ValueError when an operator has no right-hand side.
        """
        lhs = self.parse_pipeline()

        while self.look():
            operator = self.lex()
            assert isinstance(operator, tuple) and len(operator) == 1

            if not self.look():
                raise ValueError(
                    "missing argument to operator %r" % operator[0])

            # FIXME: Operator precedence!!
            lhs = Seq(lhs, operator[0], self.parse_pipeline())

        return lhs