from __future__ import absolute_import
import itertools

import lit.util
from lit.ShCommands import Command, GlobItem, Pipeline, Seq


class ShLexer:
    """Tokenizer for the 'sh'-like command lines handled by this module.

    Tokens produced by lex() are one of:
      * a plain string argument,
      * a GlobItem (argument containing an unquoted '*' or '?'),
      * a 1-tuple ``(op,)`` for an operator such as '|', '&&' or '>>',
      * a 2-tuple ``(op, fd)`` for a redirection prefixed by a file
        descriptor number, e.g. '2>' lexes as ``('>', 2)``.
    """

    def __init__(self, data, win32Escapes = False):
        """
        data - The command line text to tokenize.
        win32Escapes - When true, a backslash outside of quotes is an
        ordinary character instead of an escape.
        """
        self.data = data
        self.pos = 0
        self.end = len(data)
        self.win32Escapes = win32Escapes

    def eat(self):
        """
        eat() - Consume and return the next character. The caller must
        ensure the lexer is not at the end of the input. """
        c = self.data[self.pos]
        self.pos += 1
        return c

    def look(self):
        """
        look() - Return the next character without consuming it. The caller
        must ensure the lexer is not at the end of the input. """
        return self.data[self.pos]

    def maybe_eat(self, c):
        """
        maybe_eat(c) - Consume the character c if it is the next character,
        returning True if a character was consumed. Returns False at the end
        of the input. """
        # The bounds check matters: operators are lexed by eating one
        # character and then probing for a second one, and the operator may
        # be the very last character of the input (e.g. "echo a >").
        if self.pos < self.end and self.data[self.pos] == c:
            self.pos += 1
            return True
        return False

    def lex_arg_fast(self, c):
        """
        lex_arg_fast(c) - Try to lex an argument that contains no quoting,
        escaping or operator characters. c is the already consumed first
        character. Returns the argument (a string or GlobItem), or None
        without consuming anything if the fast path does not apply. """
        # Get the leading whitespace free section.
        chunk = self.data[self.pos - 1:].split(None, 1)[0]

        # If it has special characters, the fast path failed.
        if any(special in chunk for special in '|&<>\'";\\'):
            return None

        # c was already consumed, hence the '- 1'.
        self.pos = self.pos - 1 + len(chunk)
        if '*' in chunk or '?' in chunk:
            return GlobItem(chunk)
        return chunk

    def lex_arg_slow(self, c):
        """
        lex_arg_slow(c) - Lex an argument character by character, handling
        quotes, escapes and fd-prefixed redirections. c is the already
        consumed first character. Returns a string, a GlobItem, or an
        ``(op, fd)`` tuple for input such as '2>'. """
        unquoted_glob_char = False
        quoted_glob_char = False
        if c in "'\"":
            arg = self.lex_arg_quoted(c)
        else:
            arg = c
            # A leading unquoted glob character counts too; otherwise this
            # path would disagree with lex_arg_fast on inputs like '*'.
            unquoted_glob_char = c in '*?'

        while self.pos != self.end:
            c = self.look()
            if c.isspace() or c in "|&;":
                break
            elif c in '><':
                # This is an annoying case; we treat '2>' as a single token so
                # we don't have to track whitespace tokens.

                # If the parse string isn't an integer, do the usual thing.
                if not arg.isdigit():
                    break

                # Otherwise, lex the operator and convert to a redirection
                # token.
                num = int(arg)
                tok = self.lex_one_token()
                assert isinstance(tok, tuple) and len(tok) == 1
                return (tok[0], num)
            elif c == '"' or c == "'":
                self.eat()
                quoted_arg = self.lex_arg_quoted(c)
                if '*' in quoted_arg or '?' in quoted_arg:
                    quoted_glob_char = True
                arg += quoted_arg
            elif not self.win32Escapes and c == '\\':
                # Outside of a string, '\\' escapes everything.
                self.eat()
                if self.pos == self.end:
                    lit.util.warning(
                        "escape at end of quoted argument in: %r" % self.data)
                    return arg
                arg += self.eat()
            elif c in '*?':
                unquoted_glob_char = True
                arg += self.eat()
            else:
                arg += self.eat()

        # If a quote character is present, lex_arg_quoted will remove the
        # quotes and append the argument directly. This causes a problem when
        # the quoted portion contains a glob character, as the character will
        # no longer be treated literally. If glob characters occur *only*
        # inside of quotes, then we can handle this by not globbing at all,
        # and if glob characters occur *only* outside of quotes, we can still
        # glob just fine. But if a glob character occurs both inside and
        # outside of quotes this presents a problem. In practice this is such
        # an obscure edge case that it doesn't seem worth the added complexity
        # to support. By adding an assertion, it means some bot somewhere will
        # catch this and flag the user of a non-portable test (which could
        # almost certainly be re-written to work correctly without triggering
        # this).
        assert not (quoted_glob_char and unquoted_glob_char)
        return GlobItem(arg) if unquoted_glob_char else arg

    def lex_arg_quoted(self, delim):
        """
        lex_arg_quoted(delim) - Lex the body of a quoted string whose opening
        quote (delim) was already consumed. Returns the contents without the
        quotes. Warns and returns what was read so far if the closing quote
        is missing. """
        text = ''
        while self.pos != self.end:
            c = self.eat()
            if c == delim:
                return text
            elif c == '\\' and delim == '"':
                # Inside a '"' quoted string, '\\' only escapes the quote
                # character and backslash, otherwise it is preserved.
                if self.pos == self.end:
                    lit.util.warning(
                        "escape at end of quoted argument in: %r" % self.data)
                    return text
                c = self.eat()
                if c == '"':
                    text += '"'
                elif c == '\\':
                    text += '\\'
                else:
                    text += '\\' + c
            else:
                text += c
        lit.util.warning("missing quote character in %r" % self.data)
        return text

    def lex_arg_checked(self, c):
        """
        lex_arg_checked(c) - Debugging aid: lex with both the fast and the
        slow path and raise ValueError if the fast path succeeded but
        produced a different result or end position. Returns the slow path
        result. """
        pos = self.pos
        res = self.lex_arg_fast(c)
        end = self.pos

        # Rewind and lex again with the reference implementation.
        self.pos = pos
        reference = self.lex_arg_slow(c)
        if res is not None:
            if res != reference:
                raise ValueError("Fast path failure: %r != %r" % (
                        res, reference))
            if self.pos != end:
                raise ValueError("Fast path failure: %r != %r" % (
                        self.pos, end))
        return reference

    def lex_arg(self, c):
        """
        lex_arg(c) - Lex an argument whose first character c was already
        consumed, using the fast path when it applies. """
        res = self.lex_arg_fast(c)
        if res is not None:
            return res
        return self.lex_arg_slow(c)

    def lex_one_token(self):
        """
        lex_one_token - Lex a single 'sh' token. """

        c = self.eat()
        if c == ';':
            return (c,)
        if c == '|':
            if self.maybe_eat('|'):
                return ('||',)
            return (c,)
        if c == '&':
            if self.maybe_eat('&'):
                return ('&&',)
            if self.maybe_eat('>'):
                return ('&>',)
            return (c,)
        if c == '>':
            if self.maybe_eat('&'):
                return ('>&',)
            if self.maybe_eat('>'):
                return ('>>',)
            return (c,)
        if c == '<':
            if self.maybe_eat('&'):
                return ('<&',)
            # '<<' is '<' followed by '<' (not by '>').
            if self.maybe_eat('<'):
                return ('<<',)
            return (c,)

        return self.lex_arg(c)

    def lex(self):
        """
        lex() - Generate the tokens of the input, skipping whitespace
        between them. """
        while self.pos != self.end:
            if self.look().isspace():
                self.eat()
            else:
                yield self.lex_one_token()

###

class ShParser:
    """Parser turning the ShLexer token stream into Command, Pipeline and
    Seq objects."""

    def __init__(self, data, win32Escapes = False, pipefail = False):
        """
        data - The command line text to parse.
        win32Escapes - Forwarded to ShLexer.
        pipefail - Forwarded to every Pipeline that is created.
        """
        self.data = data
        self.pipefail = pipefail
        self.tokens = ShLexer(data, win32Escapes = win32Escapes).lex()

    def lex(self):
        """
        lex() - Consume and return the next token, or None when the input is
        exhausted. """
        return next(self.tokens, None)

    def look(self):
        """
        look() - Return the next token without consuming it, or None when
        the input is exhausted. """
        token = self.lex()
        if token is not None:
            # Push the token back in front of the remaining stream.
            self.tokens = itertools.chain([token], self.tokens)
        return token

    def parse_command(self):
        """
        parse_command() - Parse one command: its arguments and redirections,
        up to (but not including) the next '|', ';', '&', '||' or '&&'.
        Raises ValueError on an empty command, on a command starting with an
        operator, or on a redirection without a target. """
        tok = self.lex()
        if not tok:
            raise ValueError("empty command!")
        if isinstance(tok, tuple):
            raise ValueError("syntax error near unexpected token %r" % tok[0])

        args = [tok]
        redirects = []
        while 1:
            tok = self.look()

            # EOF?
            if tok is None:
                break

            # If this is an argument, just add it to the current command.
            if isinstance(tok, (str, GlobItem)):
                args.append(self.lex())
                continue

            # Otherwise see if it is a terminator.
            assert isinstance(tok, tuple)
            if tok[0] in ('|',';','&','||','&&'):
                break

            # Otherwise it must be a redirection.
            op = self.lex()
            arg = self.lex()
            # The target must be an argument; reject both a missing target
            # and an operator token in its place (e.g. "echo > | cat").
            if not arg or isinstance(arg, tuple):
                raise ValueError("syntax error near token %r" % op[0])
            redirects.append((op, arg))

        return Command(args, redirects)

    def parse_pipeline(self):
        """
        parse_pipeline() - Parse one or more commands separated by '|' into
        a Pipeline. """
        negate = False

        commands = [self.parse_command()]
        while self.look() == ('|',):
            self.lex()
            commands.append(self.parse_command())
        return Pipeline(commands, negate, self.pipefail)

    def parse(self):
        """
        parse() - Parse the whole input. Returns a single Pipeline, or a
        left-nested Seq when pipelines are joined by operators. Raises
        ValueError when an operator has no right-hand side. """
        lhs = self.parse_pipeline()

        while self.look():
            operator = self.lex()
            assert isinstance(operator, tuple) and len(operator) == 1

            if not self.look():
                raise ValueError(
                    "missing argument to operator %r" % operator[0])

            # FIXME: Operator precedence!!
            lhs = Seq(lhs, operator[0], self.parse_pipeline())

        return lhs