1#!/usr/bin/env python3 2# A tool to parse ASTMatchers.h and update the documentation in 3# ../LibASTMatchersReference.html automatically. Run from the 4# directory in which this file is located to update the docs. 5 6import collections 7import re 8try: 9 from urllib.request import urlopen 10except ImportError: 11 from urllib2 import urlopen 12 13CLASS_INDEX_PAGE_URL = 'https://clang.llvm.org/doxygen/classes.html' 14try: 15 CLASS_INDEX_PAGE = urlopen(CLASS_INDEX_PAGE_URL).read().decode('utf-8') 16except Exception as e: 17 raise Exception('Unable to get %s: %s' % (CLASS_INDEX_PAGE_URL, e)) 18 19MATCHERS_FILE = '../../include/clang/ASTMatchers/ASTMatchers.h' 20 21# Each matcher is documented in one row of the form: 22# result | name | argA 23# The subsequent row contains the documentation and is hidden by default, 24# becoming visible via javascript when the user clicks the matcher name. 25TD_TEMPLATE=""" 26<tr><td>%(result)s</td><td class="name" onclick="toggle('%(id)s')"><a name="%(id)sAnchor">%(name)s</a></td><td>%(args)s</td></tr> 27<tr><td colspan="4" class="doc" id="%(id)s"><pre>%(comment)s</pre></td></tr> 28""" 29 30# We categorize the matchers into these three categories in the reference: 31node_matchers = {} 32narrowing_matchers = {} 33traversal_matchers = {} 34 35# We output multiple rows per matcher if the matcher can be used on multiple 36# node types. Thus, we need a new id per row to control the documentation 37# pop-up. ids[name] keeps track of those ids. 38ids = collections.defaultdict(int) 39 40# Cache for doxygen urls we have already verified. 41doxygen_probes = {} 42 43def esc(text): 44 """Escape any html in the given text.""" 45 text = re.sub(r'&', '&', text) 46 text = re.sub(r'<', '<', text) 47 text = re.sub(r'>', '>', text) 48 def link_if_exists(m): 49 """Wrap a likely AST node name in a link to its clang docs. 50 51 We want to do this only if the page exists, in which case it will be 52 referenced from the class index page. 53 """ 54 name = m.group(1) 55 url = 'https://clang.llvm.org/doxygen/classclang_1_1%s.html' % name 56 if url not in doxygen_probes: 57 search_str = 'href="classclang_1_1%s.html"' % name 58 doxygen_probes[url] = search_str in CLASS_INDEX_PAGE 59 if not doxygen_probes[url]: 60 print('Did not find %s in class index page' % name) 61 if doxygen_probes[url]: 62 return r'Matcher<<a href="%s">%s</a>>' % (url, name) 63 else: 64 return m.group(0) 65 text = re.sub( 66 r'Matcher<([^\*&]+)>', link_if_exists, text) 67 return text 68 69def extract_result_types(comment): 70 """Extracts a list of result types from the given comment. 71 72 We allow annotations in the comment of the matcher to specify what 73 nodes a matcher can match on. Those comments have the form: 74 Usable as: Any Matcher | (Matcher<T1>[, Matcher<t2>[, ...]]) 75 76 Returns ['*'] in case of 'Any Matcher', or ['T1', 'T2', ...]. 77 Returns the empty list if no 'Usable as' specification could be 78 parsed. 79 """ 80 result_types = [] 81 m = re.search(r'Usable as: Any Matcher[\s\n]*$', comment, re.S) 82 if m: 83 return ['*'] 84 while True: 85 m = re.match(r'^(.*)Matcher<([^>]+)>\s*,?[\s\n]*$', comment, re.S) 86 if not m: 87 if re.search(r'Usable as:\s*$', comment): 88 return result_types 89 else: 90 return None 91 result_types += [m.group(2)] 92 comment = m.group(1) 93 94def strip_doxygen(comment): 95 """Returns the given comment without \-escaped words.""" 96 # If there is only a doxygen keyword in the line, delete the whole line. 97 comment = re.sub(r'^\\[^\s]+\n', r'', comment, flags=re.M) 98 99 # If there is a doxygen \see command, change the \see prefix into "See also:". 100 # FIXME: it would be better to turn this into a link to the target instead. 101 comment = re.sub(r'\\see', r'See also:', comment) 102 103 # Delete the doxygen command and the following whitespace. 104 comment = re.sub(r'\\[^\s]+\s+', r'', comment) 105 return comment 106 107def unify_arguments(args): 108 """Gets rid of anything the user doesn't care about in the argument list.""" 109 args = re.sub(r'internal::', r'', args) 110 args = re.sub(r'extern const\s+(.*)&', r'\1 ', args) 111 args = re.sub(r'&', r' ', args) 112 args = re.sub(r'(^|\s)M\d?(\s)', r'\1Matcher<*>\2', args) 113 args = re.sub(r'BindableMatcher', r'Matcher', args) 114 args = re.sub(r'const Matcher', r'Matcher', args) 115 return args 116 117def unify_type(result_type): 118 """Gets rid of anything the user doesn't care about in the type name.""" 119 result_type = re.sub(r'^internal::(Bindable)?Matcher<([a-zA-Z_][a-zA-Z0-9_]*)>$', r'\2', result_type) 120 return result_type 121 122def add_matcher(result_type, name, args, comment, is_dyncast=False): 123 """Adds a matcher to one of our categories.""" 124 if name == 'id': 125 # FIXME: Figure out whether we want to support the 'id' matcher. 126 return 127 matcher_id = '%s%d' % (name, ids[name]) 128 ids[name] += 1 129 args = unify_arguments(args) 130 result_type = unify_type(result_type) 131 132 docs_result_type = esc('Matcher<%s>' % result_type); 133 134 if name == 'mapAnyOf': 135 args = "nodeMatcherFunction..." 136 docs_result_type = "<em>unspecified</em>" 137 138 matcher_html = TD_TEMPLATE % { 139 'result': docs_result_type, 140 'name': name, 141 'args': esc(args), 142 'comment': esc(strip_doxygen(comment)), 143 'id': matcher_id, 144 } 145 if is_dyncast: 146 dict = node_matchers 147 lookup = result_type + name 148 # Use a heuristic to figure out whether a matcher is a narrowing or 149 # traversal matcher. By default, matchers that take other matchers as 150 # arguments (and are not node matchers) do traversal. We specifically 151 # exclude known narrowing matchers that also take other matchers as 152 # arguments. 153 elif ('Matcher<' not in args or 154 name in ['allOf', 'anyOf', 'anything', 'unless', 'mapAnyOf']): 155 dict = narrowing_matchers 156 lookup = result_type + name + esc(args) 157 else: 158 dict = traversal_matchers 159 lookup = result_type + name + esc(args) 160 161 if dict.get(lookup) is None or len(dict.get(lookup)) < len(matcher_html): 162 dict[lookup] = matcher_html 163 164def act_on_decl(declaration, comment, allowed_types): 165 """Parse the matcher out of the given declaration and comment. 166 167 If 'allowed_types' is set, it contains a list of node types the matcher 168 can match on, as extracted from the static type asserts in the matcher 169 definition. 170 """ 171 if declaration.strip(): 172 173 if re.match(r'^\s?(#|namespace|using)', declaration): return 174 175 # Node matchers are defined by writing: 176 # VariadicDynCastAllOfMatcher<ResultType, ArgumentType> name; 177 m = re.match(r""".*Variadic(?:DynCast)?AllOfMatcher\s*< 178 \s*([^\s,]+)\s*(?:, 179 \s*([^\s>]+)\s*)?> 180 \s*([^\s;]+)\s*;\s*$""", declaration, flags=re.X) 181 if m: 182 result, inner, name = m.groups() 183 if not inner: 184 inner = result 185 add_matcher(result, name, 'Matcher<%s>...' % inner, 186 comment, is_dyncast=True) 187 return 188 189 # Special case of type matchers: 190 # AstTypeMatcher<ArgumentType> name 191 m = re.match(r""".*AstTypeMatcher\s*< 192 \s*([^\s>]+)\s*> 193 \s*([^\s;]+)\s*;\s*$""", declaration, flags=re.X) 194 if m: 195 inner, name = m.groups() 196 add_matcher('Type', name, 'Matcher<%s>...' % inner, 197 comment, is_dyncast=True) 198 # FIXME: re-enable once we have implemented casting on the TypeLoc 199 # hierarchy. 200 # add_matcher('TypeLoc', '%sLoc' % name, 'Matcher<%sLoc>...' % inner, 201 # comment, is_dyncast=True) 202 return 203 204 # Parse the various matcher definition macros. 205 m = re.match(""".*AST_TYPE(LOC)?_TRAVERSE_MATCHER(?:_DECL)?\( 206 \s*([^\s,]+\s*), 207 \s*(?:[^\s,]+\s*), 208 \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\) 209 \)\s*;\s*$""", declaration, flags=re.X) 210 if m: 211 loc, name, results = m.groups()[0:3] 212 result_types = [r.strip() for r in results.split(',')] 213 214 comment_result_types = extract_result_types(comment) 215 if (comment_result_types and 216 sorted(result_types) != sorted(comment_result_types)): 217 raise Exception('Inconsistent documentation for: %s' % name) 218 for result_type in result_types: 219 add_matcher(result_type, name, 'Matcher<Type>', comment) 220 # if loc: 221 # add_matcher('%sLoc' % result_type, '%sLoc' % name, 'Matcher<TypeLoc>', 222 # comment) 223 return 224 225 m = re.match(r"""^\s*AST_POLYMORPHIC_MATCHER(_P)?(.?)(?:_OVERLOAD)?\( 226 \s*([^\s,]+)\s*, 227 \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\) 228 (?:,\s*([^\s,]+)\s* 229 ,\s*([^\s,]+)\s*)? 230 (?:,\s*([^\s,]+)\s* 231 ,\s*([^\s,]+)\s*)? 232 (?:,\s*\d+\s*)? 233 \)\s*{\s*$""", declaration, flags=re.X) 234 235 if m: 236 p, n, name, results = m.groups()[0:4] 237 args = m.groups()[4:] 238 result_types = [r.strip() for r in results.split(',')] 239 if allowed_types and allowed_types != result_types: 240 raise Exception('Inconsistent documentation for: %s' % name) 241 if n not in ['', '2']: 242 raise Exception('Cannot parse "%s"' % declaration) 243 args = ', '.join('%s %s' % (args[i], args[i+1]) 244 for i in range(0, len(args), 2) if args[i]) 245 for result_type in result_types: 246 add_matcher(result_type, name, args, comment) 247 return 248 249 m = re.match(r"""^\s*AST_POLYMORPHIC_MATCHER_REGEX(?:_OVERLOAD)?\( 250 \s*([^\s,]+)\s*, 251 \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\), 252 \s*([^\s,]+)\s* 253 (?:,\s*\d+\s*)? 254 \)\s*{\s*$""", declaration, flags=re.X) 255 256 if m: 257 name, results, arg_name = m.groups()[0:3] 258 result_types = [r.strip() for r in results.split(',')] 259 if allowed_types and allowed_types != result_types: 260 raise Exception('Inconsistent documentation for: %s' % name) 261 arg = "StringRef %s, Regex::RegexFlags Flags = NoFlags" % arg_name 262 comment += """ 263If the matcher is used in clang-query, RegexFlags parameter 264should be passed as a quoted string. e.g: "NoFlags". 265Flags can be combined with '|' example \"IgnoreCase | BasicRegex\" 266""" 267 for result_type in result_types: 268 add_matcher(result_type, name, arg, comment) 269 return 270 271 m = re.match(r"""^\s*AST_MATCHER_FUNCTION(_P)?(.?)(?:_OVERLOAD)?\( 272 (?:\s*([^\s,]+)\s*,)? 273 \s*([^\s,]+)\s* 274 (?:,\s*([^\s,]+)\s* 275 ,\s*([^\s,]+)\s*)? 276 (?:,\s*([^\s,]+)\s* 277 ,\s*([^\s,]+)\s*)? 278 (?:,\s*\d+\s*)? 279 \)\s*{\s*$""", declaration, flags=re.X) 280 if m: 281 p, n, result, name = m.groups()[0:4] 282 args = m.groups()[4:] 283 if n not in ['', '2']: 284 raise Exception('Cannot parse "%s"' % declaration) 285 args = ', '.join('%s %s' % (args[i], args[i+1]) 286 for i in range(0, len(args), 2) if args[i]) 287 add_matcher(result, name, args, comment) 288 return 289 290 m = re.match(r"""^\s*AST_MATCHER(_P)?(.?)(?:_OVERLOAD)?\( 291 (?:\s*([^\s,]+)\s*,)? 292 \s*([^\s,]+)\s* 293 (?:,\s*([^,]+)\s* 294 ,\s*([^\s,]+)\s*)? 295 (?:,\s*([^\s,]+)\s* 296 ,\s*([^\s,]+)\s*)? 297 (?:,\s*\d+\s*)? 298 \)\s*{""", declaration, flags=re.X) 299 if m: 300 p, n, result, name = m.groups()[0:4] 301 args = m.groups()[4:] 302 if not result: 303 if not allowed_types: 304 raise Exception('Did not find allowed result types for: %s' % name) 305 result_types = allowed_types 306 else: 307 result_types = [result] 308 if n not in ['', '2']: 309 raise Exception('Cannot parse "%s"' % declaration) 310 args = ', '.join('%s %s' % (args[i], args[i+1]) 311 for i in range(0, len(args), 2) if args[i]) 312 for result_type in result_types: 313 add_matcher(result_type, name, args, comment) 314 return 315 316 m = re.match(r"""^\s*AST_MATCHER_REGEX(?:_OVERLOAD)?\( 317 \s*([^\s,]+)\s*, 318 \s*([^\s,]+)\s*, 319 \s*([^\s,]+)\s* 320 (?:,\s*\d+\s*)? 321 \)\s*{""", declaration, flags=re.X) 322 if m: 323 result, name, arg_name = m.groups()[0:3] 324 if not result: 325 if not allowed_types: 326 raise Exception('Did not find allowed result types for: %s' % name) 327 result_types = allowed_types 328 else: 329 result_types = [result] 330 arg = "StringRef %s, Regex::RegexFlags Flags = NoFlags" % arg_name 331 comment += """ 332If the matcher is used in clang-query, RegexFlags parameter 333should be passed as a quoted string. e.g: "NoFlags". 334Flags can be combined with '|' example \"IgnoreCase | BasicRegex\" 335""" 336 337 for result_type in result_types: 338 add_matcher(result_type, name, arg, comment) 339 return 340 341 # Parse ArgumentAdapting matchers. 342 m = re.match( 343 r"""^.*ArgumentAdaptingMatcherFunc<.*>\s* 344 ([a-zA-Z]*);$""", 345 declaration, flags=re.X) 346 if m: 347 name = m.groups()[0] 348 add_matcher('*', name, 'Matcher<*>', comment) 349 return 350 351 # Parse Variadic functions. 352 m = re.match( 353 r"""^.*internal::VariadicFunction\s*<\s*([^,]+),\s*([^,]+),\s*[^>]+>\s* 354 ([a-zA-Z]*);$""", 355 declaration, flags=re.X) 356 if m: 357 result, arg, name = m.groups()[:3] 358 add_matcher(result, name, '%s, ..., %s' % (arg, arg), comment) 359 return 360 361 m = re.match( 362 r"""^.*internal::VariadicFunction\s*<\s* 363 internal::PolymorphicMatcher<[\S\s]+ 364 AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\),\s*(.*);$""", 365 declaration, flags=re.X) 366 367 if m: 368 results, trailing = m.groups() 369 trailing, name = trailing.rsplit(">", 1) 370 name = name.strip() 371 trailing, _ = trailing.rsplit(",", 1) 372 _, arg = trailing.rsplit(",", 1) 373 arg = arg.strip() 374 375 result_types = [r.strip() for r in results.split(',')] 376 for result_type in result_types: 377 add_matcher(result_type, name, '%s, ..., %s' % (arg, arg), comment) 378 return 379 380 381 # Parse Variadic operator matchers. 382 m = re.match( 383 r"""^.*VariadicOperatorMatcherFunc\s*<\s*([^,]+),\s*([^\s]+)\s*>\s* 384 ([a-zA-Z]*);$""", 385 declaration, flags=re.X) 386 if m: 387 min_args, max_args, name = m.groups()[:3] 388 if max_args == '1': 389 add_matcher('*', name, 'Matcher<*>', comment) 390 return 391 elif max_args == 'std::numeric_limits<unsigned>::max()': 392 add_matcher('*', name, 'Matcher<*>, ..., Matcher<*>', comment) 393 return 394 395 m = re.match( 396 r"""^.*MapAnyOfMatcher<.*>\s* 397 ([a-zA-Z]*);$""", 398 declaration, flags=re.X) 399 if m: 400 name = m.groups()[0] 401 add_matcher('*', name, 'Matcher<*>...Matcher<*>', comment) 402 return 403 404 # Parse free standing matcher functions, like: 405 # Matcher<ResultType> Name(Matcher<ArgumentType> InnerMatcher) { 406 m = re.match(r"""^\s*(?:template\s+<\s*(?:class|typename)\s+(.+)\s*>\s+)? 407 (.*)\s+ 408 ([^\s\(]+)\s*\( 409 (.*) 410 \)\s*{""", declaration, re.X) 411 if m: 412 template_name, result, name, args = m.groups() 413 if template_name: 414 matcherTemplateArgs = re.findall(r'Matcher<\s*(%s)\s*>' % template_name, args) 415 templateArgs = re.findall(r'(?:^|[\s,<])(%s)(?:$|[\s,>])' % template_name, args) 416 if len(matcherTemplateArgs) < len(templateArgs): 417 # The template name is used naked, so don't replace with `*`` later on 418 template_name = None 419 else : 420 args = re.sub(r'(^|[\s,<])%s($|[\s,>])' % template_name, r'\1*\2', args) 421 args = ', '.join(p.strip() for p in args.split(',')) 422 m = re.match(r'(?:^|.*\s+)internal::(?:Bindable)?Matcher<([^>]+)>$', result) 423 if m: 424 result_types = [m.group(1)] 425 if template_name and len(result_types) == 1 and result_types[0] == template_name: 426 result_types = ['*'] 427 else: 428 result_types = extract_result_types(comment) 429 if not result_types: 430 if not comment: 431 # Only overloads don't have their own doxygen comments; ignore those. 432 print('Ignoring "%s"' % name) 433 else: 434 print('Cannot determine result type for "%s"' % name) 435 else: 436 for result_type in result_types: 437 add_matcher(result_type, name, args, comment) 438 else: 439 print('*** Unparsable: "' + declaration + '" ***') 440 441def sort_table(matcher_type, matcher_map): 442 """Returns the sorted html table for the given row map.""" 443 table = '' 444 for key in sorted(matcher_map.keys()): 445 table += matcher_map[key] + '\n' 446 return ('<!-- START_%(type)s_MATCHERS -->\n' + 447 '%(table)s' + 448 '<!--END_%(type)s_MATCHERS -->') % { 449 'type': matcher_type, 450 'table': table, 451 } 452 453# Parse the ast matchers. 454# We alternate between two modes: 455# body = True: We parse the definition of a matcher. We need 456# to parse the full definition before adding a matcher, as the 457# definition might contain static asserts that specify the result 458# type. 459# body = False: We parse the comments and declaration of the matcher. 460comment = '' 461declaration = '' 462allowed_types = [] 463body = False 464for line in open(MATCHERS_FILE).read().splitlines(): 465 if body: 466 if line.strip() and line[0] == '}': 467 if declaration: 468 act_on_decl(declaration, comment, allowed_types) 469 comment = '' 470 declaration = '' 471 allowed_types = [] 472 body = False 473 else: 474 m = re.search(r'is_base_of<([^,]+), NodeType>', line) 475 if m and m.group(1): 476 allowed_types += [m.group(1)] 477 continue 478 if line.strip() and line.lstrip()[0] == '/': 479 comment += re.sub(r'^/+\s?', '', line) + '\n' 480 else: 481 declaration += ' ' + line 482 if ((not line.strip()) or 483 line.rstrip()[-1] == ';' or 484 (line.rstrip()[-1] == '{' and line.rstrip()[-3:] != '= {')): 485 if line.strip() and line.rstrip()[-1] == '{': 486 body = True 487 else: 488 act_on_decl(declaration, comment, allowed_types) 489 comment = '' 490 declaration = '' 491 allowed_types = [] 492 493node_matcher_table = sort_table('DECL', node_matchers) 494narrowing_matcher_table = sort_table('NARROWING', narrowing_matchers) 495traversal_matcher_table = sort_table('TRAVERSAL', traversal_matchers) 496 497reference = open('../LibASTMatchersReference.html').read() 498reference = re.sub(r'<!-- START_DECL_MATCHERS.*END_DECL_MATCHERS -->', 499 node_matcher_table, reference, flags=re.S) 500reference = re.sub(r'<!-- START_NARROWING_MATCHERS.*END_NARROWING_MATCHERS -->', 501 narrowing_matcher_table, reference, flags=re.S) 502reference = re.sub(r'<!-- START_TRAVERSAL_MATCHERS.*END_TRAVERSAL_MATCHERS -->', 503 traversal_matcher_table, reference, flags=re.S) 504 505with open('../LibASTMatchersReference.html', 'w', newline='\n') as output: 506 output.write(reference) 507 508