1#------------------------------------------------------------------------------- 2# Parser for ASDL [1] definition files. Reads in an ASDL description and parses 3# it into an AST that describes it. 4# 5# The EBNF we're parsing here: Figure 1 of the paper [1]. Extended to support 6# modules and attributes after a product. Words starting with Capital letters 7# are terminals. Literal tokens are in "double quotes". Others are 8# non-terminals. Id is either TokenId or ConstructorId. 9# 10# module ::= "module" Id "{" [definitions] "}" 11# definitions ::= { TypeId "=" type } 12# type ::= product | sum 13# product ::= fields ["attributes" fields] 14# fields ::= "(" { field, "," } field ")" 15# field ::= TypeId ["?" | "*"] [Id] 16# sum ::= constructor { "|" constructor } ["attributes" fields] 17# constructor ::= ConstructorId [fields] 18# 19# [1] "The Zephyr Abstract Syntax Description Language" by Wang, et. al. See 20# http://asdl.sourceforge.net/ 21#------------------------------------------------------------------------------- 22from collections import namedtuple 23import re 24 25__all__ = [ 26 'builtin_types', 'parse', 'AST', 'Module', 'Type', 'Constructor', 27 'Field', 'Sum', 'Product', 'VisitorBase', 'Check', 'check'] 28 29# The following classes define nodes into which the ASDL description is parsed. 30# Note: this is a "meta-AST". ASDL files (such as Python.asdl) describe the AST 31# structure used by a programming language. But ASDL files themselves need to be 32# parsed. This module parses ASDL files and uses a simple AST to represent them. 33# See the EBNF at the top of the file to understand the logical connection 34# between the various node types. 35 36builtin_types = {'identifier', 'string', 'bytes', 'int', 'object', 'singleton', 37 'constant'} 38 39class AST: 40 def __repr__(self): 41 raise NotImplementedError 42 43class Module(AST): 44 def __init__(self, name, dfns): 45 self.name = name 46 self.dfns = dfns 47 self.types = {type.name: type.value for type in dfns} 48 49 def __repr__(self): 50 return 'Module({0.name}, {0.dfns})'.format(self) 51 52class Type(AST): 53 def __init__(self, name, value): 54 self.name = name 55 self.value = value 56 57 def __repr__(self): 58 return 'Type({0.name}, {0.value})'.format(self) 59 60class Constructor(AST): 61 def __init__(self, name, fields=None): 62 self.name = name 63 self.fields = fields or [] 64 65 def __repr__(self): 66 return 'Constructor({0.name}, {0.fields})'.format(self) 67 68class Field(AST): 69 def __init__(self, type, name=None, seq=False, opt=False): 70 self.type = type 71 self.name = name 72 self.seq = seq 73 self.opt = opt 74 75 def __repr__(self): 76 if self.seq: 77 extra = ", seq=True" 78 elif self.opt: 79 extra = ", opt=True" 80 else: 81 extra = "" 82 if self.name is None: 83 return 'Field({0.type}{1})'.format(self, extra) 84 else: 85 return 'Field({0.type}, {0.name}{1})'.format(self, extra) 86 87class Sum(AST): 88 def __init__(self, types, attributes=None): 89 self.types = types 90 self.attributes = attributes or [] 91 92 def __repr__(self): 93 if self.attributes: 94 return 'Sum({0.types}, {0.attributes})'.format(self) 95 else: 96 return 'Sum({0.types})'.format(self) 97 98class Product(AST): 99 def __init__(self, fields, attributes=None): 100 self.fields = fields 101 self.attributes = attributes or [] 102 103 def __repr__(self): 104 if self.attributes: 105 return 'Product({0.fields}, {0.attributes})'.format(self) 106 else: 107 return 'Product({0.fields})'.format(self) 108 109# A generic visitor for the meta-AST that describes ASDL. This can be used by 110# emitters. Note that this visitor does not provide a generic visit method, so a 111# subclass needs to define visit methods from visitModule to as deep as the 112# interesting node. 113# We also define a Check visitor that makes sure the parsed ASDL is well-formed. 114 115class VisitorBase(object): 116 """Generic tree visitor for ASTs.""" 117 def __init__(self): 118 self.cache = {} 119 120 def visit(self, obj, *args): 121 klass = obj.__class__ 122 meth = self.cache.get(klass) 123 if meth is None: 124 methname = "visit" + klass.__name__ 125 meth = getattr(self, methname, None) 126 self.cache[klass] = meth 127 if meth: 128 try: 129 meth(obj, *args) 130 except Exception as e: 131 print("Error visiting %r: %s" % (obj, e)) 132 raise 133 134class Check(VisitorBase): 135 """A visitor that checks a parsed ASDL tree for correctness. 136 137 Errors are printed and accumulated. 138 """ 139 def __init__(self): 140 super(Check, self).__init__() 141 self.cons = {} 142 self.errors = 0 143 self.types = {} 144 145 def visitModule(self, mod): 146 for dfn in mod.dfns: 147 self.visit(dfn) 148 149 def visitType(self, type): 150 self.visit(type.value, str(type.name)) 151 152 def visitSum(self, sum, name): 153 for t in sum.types: 154 self.visit(t, name) 155 156 def visitConstructor(self, cons, name): 157 key = str(cons.name) 158 conflict = self.cons.get(key) 159 if conflict is None: 160 self.cons[key] = name 161 else: 162 print('Redefinition of constructor {}'.format(key)) 163 print('Defined in {} and {}'.format(conflict, name)) 164 self.errors += 1 165 for f in cons.fields: 166 self.visit(f, key) 167 168 def visitField(self, field, name): 169 key = str(field.type) 170 l = self.types.setdefault(key, []) 171 l.append(name) 172 173 def visitProduct(self, prod, name): 174 for f in prod.fields: 175 self.visit(f, name) 176 177def check(mod): 178 """Check the parsed ASDL tree for correctness. 179 180 Return True if success. For failure, the errors are printed out and False 181 is returned. 182 """ 183 v = Check() 184 v.visit(mod) 185 186 for t in v.types: 187 if t not in mod.types and not t in builtin_types: 188 v.errors += 1 189 uses = ", ".join(v.types[t]) 190 print('Undefined type {}, used in {}'.format(t, uses)) 191 return not v.errors 192 193# The ASDL parser itself comes next. The only interesting external interface 194# here is the top-level parse function. 195 196def parse(filename): 197 """Parse ASDL from the given file and return a Module node describing it.""" 198 with open(filename) as f: 199 parser = ASDLParser() 200 return parser.parse(f.read()) 201 202# Types for describing tokens in an ASDL specification. 203class TokenKind: 204 """TokenKind is provides a scope for enumerated token kinds.""" 205 (ConstructorId, TypeId, Equals, Comma, Question, Pipe, Asterisk, 206 LParen, RParen, LBrace, RBrace) = range(11) 207 208 operator_table = { 209 '=': Equals, ',': Comma, '?': Question, '|': Pipe, '(': LParen, 210 ')': RParen, '*': Asterisk, '{': LBrace, '}': RBrace} 211 212Token = namedtuple('Token', 'kind value lineno') 213 214class ASDLSyntaxError(Exception): 215 def __init__(self, msg, lineno=None): 216 self.msg = msg 217 self.lineno = lineno or '<unknown>' 218 219 def __str__(self): 220 return 'Syntax error on line {0.lineno}: {0.msg}'.format(self) 221 222def tokenize_asdl(buf): 223 """Tokenize the given buffer. Yield Token objects.""" 224 for lineno, line in enumerate(buf.splitlines(), 1): 225 for m in re.finditer(r'\s*(\w+|--.*|.)', line.strip()): 226 c = m.group(1) 227 if c[0].isalpha(): 228 # Some kind of identifier 229 if c[0].isupper(): 230 yield Token(TokenKind.ConstructorId, c, lineno) 231 else: 232 yield Token(TokenKind.TypeId, c, lineno) 233 elif c[:2] == '--': 234 # Comment 235 break 236 else: 237 # Operators 238 try: 239 op_kind = TokenKind.operator_table[c] 240 except KeyError: 241 raise ASDLSyntaxError('Invalid operator %s' % c, lineno) 242 yield Token(op_kind, c, lineno) 243 244class ASDLParser: 245 """Parser for ASDL files. 246 247 Create, then call the parse method on a buffer containing ASDL. 248 This is a simple recursive descent parser that uses tokenize_asdl for the 249 lexing. 250 """ 251 def __init__(self): 252 self._tokenizer = None 253 self.cur_token = None 254 255 def parse(self, buf): 256 """Parse the ASDL in the buffer and return an AST with a Module root. 257 """ 258 self._tokenizer = tokenize_asdl(buf) 259 self._advance() 260 return self._parse_module() 261 262 def _parse_module(self): 263 if self._at_keyword('module'): 264 self._advance() 265 else: 266 raise ASDLSyntaxError( 267 'Expected "module" (found {})'.format(self.cur_token.value), 268 self.cur_token.lineno) 269 name = self._match(self._id_kinds) 270 self._match(TokenKind.LBrace) 271 defs = self._parse_definitions() 272 self._match(TokenKind.RBrace) 273 return Module(name, defs) 274 275 def _parse_definitions(self): 276 defs = [] 277 while self.cur_token.kind == TokenKind.TypeId: 278 typename = self._advance() 279 self._match(TokenKind.Equals) 280 type = self._parse_type() 281 defs.append(Type(typename, type)) 282 return defs 283 284 def _parse_type(self): 285 if self.cur_token.kind == TokenKind.LParen: 286 # If we see a (, it's a product 287 return self._parse_product() 288 else: 289 # Otherwise it's a sum. Look for ConstructorId 290 sumlist = [Constructor(self._match(TokenKind.ConstructorId), 291 self._parse_optional_fields())] 292 while self.cur_token.kind == TokenKind.Pipe: 293 # More constructors 294 self._advance() 295 sumlist.append(Constructor( 296 self._match(TokenKind.ConstructorId), 297 self._parse_optional_fields())) 298 return Sum(sumlist, self._parse_optional_attributes()) 299 300 def _parse_product(self): 301 return Product(self._parse_fields(), self._parse_optional_attributes()) 302 303 def _parse_fields(self): 304 fields = [] 305 self._match(TokenKind.LParen) 306 while self.cur_token.kind == TokenKind.TypeId: 307 typename = self._advance() 308 is_seq, is_opt = self._parse_optional_field_quantifier() 309 id = (self._advance() if self.cur_token.kind in self._id_kinds 310 else None) 311 fields.append(Field(typename, id, seq=is_seq, opt=is_opt)) 312 if self.cur_token.kind == TokenKind.RParen: 313 break 314 elif self.cur_token.kind == TokenKind.Comma: 315 self._advance() 316 self._match(TokenKind.RParen) 317 return fields 318 319 def _parse_optional_fields(self): 320 if self.cur_token.kind == TokenKind.LParen: 321 return self._parse_fields() 322 else: 323 return None 324 325 def _parse_optional_attributes(self): 326 if self._at_keyword('attributes'): 327 self._advance() 328 return self._parse_fields() 329 else: 330 return None 331 332 def _parse_optional_field_quantifier(self): 333 is_seq, is_opt = False, False 334 if self.cur_token.kind == TokenKind.Asterisk: 335 is_seq = True 336 self._advance() 337 elif self.cur_token.kind == TokenKind.Question: 338 is_opt = True 339 self._advance() 340 return is_seq, is_opt 341 342 def _advance(self): 343 """ Return the value of the current token and read the next one into 344 self.cur_token. 345 """ 346 cur_val = None if self.cur_token is None else self.cur_token.value 347 try: 348 self.cur_token = next(self._tokenizer) 349 except StopIteration: 350 self.cur_token = None 351 return cur_val 352 353 _id_kinds = (TokenKind.ConstructorId, TokenKind.TypeId) 354 355 def _match(self, kind): 356 """The 'match' primitive of RD parsers. 357 358 * Verifies that the current token is of the given kind (kind can 359 be a tuple, in which the kind must match one of its members). 360 * Returns the value of the current token 361 * Reads in the next token 362 """ 363 if (isinstance(kind, tuple) and self.cur_token.kind in kind or 364 self.cur_token.kind == kind 365 ): 366 value = self.cur_token.value 367 self._advance() 368 return value 369 else: 370 raise ASDLSyntaxError( 371 'Unmatched {} (found {})'.format(kind, self.cur_token.kind), 372 self.cur_token.lineno) 373 374 def _at_keyword(self, keyword): 375 return (self.cur_token.kind == TokenKind.TypeId and 376 self.cur_token.value == keyword) 377