1#!/usr/bin/env python3
2# A tool to parse ASTMatchers.h and update the documentation in
3# ../LibASTMatchersReference.html automatically. Run from the
4# directory in which this file is located to update the docs.
5
6import collections
7import re
8try:
9    from urllib.request import urlopen
10except ImportError:
11    from urllib2 import urlopen
12
13CLASS_INDEX_PAGE_URL = 'https://clang.llvm.org/doxygen/classes.html'
14try:
15  CLASS_INDEX_PAGE = urlopen(CLASS_INDEX_PAGE_URL).read().decode('utf-8')
16except Exception as e:
17  raise Exception('Unable to get %s: %s' % (CLASS_INDEX_PAGE_URL, e))
18
19MATCHERS_FILE = '../../include/clang/ASTMatchers/ASTMatchers.h'
20
21# Each matcher is documented in one row of the form:
22#   result | name | argA
23# The subsequent row contains the documentation and is hidden by default,
24# becoming visible via javascript when the user clicks the matcher name.
25TD_TEMPLATE="""
26<tr><td>%(result)s</td><td class="name" onclick="toggle('%(id)s')"><a name="%(id)sAnchor">%(name)s</a></td><td>%(args)s</td></tr>
27<tr><td colspan="4" class="doc" id="%(id)s"><pre>%(comment)s</pre></td></tr>
28"""
29
30# We categorize the matchers into these three categories in the reference:
31node_matchers = {}
32narrowing_matchers = {}
33traversal_matchers = {}
34
35# We output multiple rows per matcher if the matcher can be used on multiple
36# node types. Thus, we need a new id per row to control the documentation
37# pop-up. ids[name] keeps track of those ids.
38ids = collections.defaultdict(int)
39
40# Cache for doxygen urls we have already verified.
41doxygen_probes = {}
42
43def esc(text):
44  """Escape any html in the given text."""
45  text = re.sub(r'&', '&amp;', text)
46  text = re.sub(r'<', '&lt;', text)
47  text = re.sub(r'>', '&gt;', text)
48  def link_if_exists(m):
49    """Wrap a likely AST node name in a link to its clang docs.
50
51       We want to do this only if the page exists, in which case it will be
52       referenced from the class index page.
53    """
54    name = m.group(1)
55    url = 'https://clang.llvm.org/doxygen/classclang_1_1%s.html' % name
56    if url not in doxygen_probes:
57      search_str = 'href="classclang_1_1%s.html"' % name
58      doxygen_probes[url] = search_str in CLASS_INDEX_PAGE
59      if not doxygen_probes[url]:
60        print('Did not find %s in class index page' % name)
61    if doxygen_probes[url]:
62      return r'Matcher&lt;<a href="%s">%s</a>&gt;' % (url, name)
63    else:
64      return m.group(0)
65  text = re.sub(
66    r'Matcher&lt;([^\*&]+)&gt;', link_if_exists, text)
67  return text
68
69def extract_result_types(comment):
70  """Extracts a list of result types from the given comment.
71
72     We allow annotations in the comment of the matcher to specify what
73     nodes a matcher can match on. Those comments have the form:
74       Usable as: Any Matcher | (Matcher<T1>[, Matcher<t2>[, ...]])
75
76     Returns ['*'] in case of 'Any Matcher', or ['T1', 'T2', ...].
77     Returns the empty list if no 'Usable as' specification could be
78     parsed.
79  """
80  result_types = []
81  m = re.search(r'Usable as: Any Matcher[\s\n]*$', comment, re.S)
82  if m:
83    return ['*']
84  while True:
85    m = re.match(r'^(.*)Matcher<([^>]+)>\s*,?[\s\n]*$', comment, re.S)
86    if not m:
87      if re.search(r'Usable as:\s*$', comment):
88        return result_types
89      else:
90        return None
91    result_types += [m.group(2)]
92    comment = m.group(1)
93
94def strip_doxygen(comment):
95  """Returns the given comment without \-escaped words."""
96  # If there is only a doxygen keyword in the line, delete the whole line.
97  comment = re.sub(r'^\\[^\s]+\n', r'', comment, flags=re.M)
98
99  # If there is a doxygen \see command, change the \see prefix into "See also:".
100  # FIXME: it would be better to turn this into a link to the target instead.
101  comment = re.sub(r'\\see', r'See also:', comment)
102
103  # Delete the doxygen command and the following whitespace.
104  comment = re.sub(r'\\[^\s]+\s+', r'', comment)
105  return comment
106
107def unify_arguments(args):
108  """Gets rid of anything the user doesn't care about in the argument list."""
109  args = re.sub(r'internal::', r'', args)
110  args = re.sub(r'extern const\s+(.*)&', r'\1 ', args)
111  args = re.sub(r'&', r' ', args)
112  args = re.sub(r'(^|\s)M\d?(\s)', r'\1Matcher<*>\2', args)
113  args = re.sub(r'BindableMatcher', r'Matcher', args)
114  args = re.sub(r'const Matcher', r'Matcher', args)
115  return args
116
117def unify_type(result_type):
118  """Gets rid of anything the user doesn't care about in the type name."""
119  result_type = re.sub(r'^internal::(Bindable)?Matcher<([a-zA-Z_][a-zA-Z0-9_]*)>$', r'\2', result_type)
120  return result_type
121
122def add_matcher(result_type, name, args, comment, is_dyncast=False):
123  """Adds a matcher to one of our categories."""
124  if name == 'id':
125     # FIXME: Figure out whether we want to support the 'id' matcher.
126     return
127  matcher_id = '%s%d' % (name, ids[name])
128  ids[name] += 1
129  args = unify_arguments(args)
130  result_type = unify_type(result_type)
131
132  docs_result_type = esc('Matcher<%s>' % result_type);
133
134  if name == 'mapAnyOf':
135    args = "nodeMatcherFunction..."
136    docs_result_type = "<em>unspecified</em>"
137
138  matcher_html = TD_TEMPLATE % {
139    'result': docs_result_type,
140    'name': name,
141    'args': esc(args),
142    'comment': esc(strip_doxygen(comment)),
143    'id': matcher_id,
144  }
145  if is_dyncast:
146    dict = node_matchers
147    lookup = result_type + name
148  # Use a heuristic to figure out whether a matcher is a narrowing or
149  # traversal matcher. By default, matchers that take other matchers as
150  # arguments (and are not node matchers) do traversal. We specifically
151  # exclude known narrowing matchers that also take other matchers as
152  # arguments.
153  elif ('Matcher<' not in args or
154        name in ['allOf', 'anyOf', 'anything', 'unless', 'mapAnyOf']):
155    dict = narrowing_matchers
156    lookup = result_type + name + esc(args)
157  else:
158    dict = traversal_matchers
159    lookup = result_type + name + esc(args)
160
161  if dict.get(lookup) is None or len(dict.get(lookup)) < len(matcher_html):
162    dict[lookup] = matcher_html
163
164def act_on_decl(declaration, comment, allowed_types):
165  """Parse the matcher out of the given declaration and comment.
166
167     If 'allowed_types' is set, it contains a list of node types the matcher
168     can match on, as extracted from the static type asserts in the matcher
169     definition.
170  """
171  if declaration.strip():
172
173    if re.match(r'^\s?(#|namespace|using)', declaration): return
174
175    # Node matchers are defined by writing:
176    #   VariadicDynCastAllOfMatcher<ResultType, ArgumentType> name;
177    m = re.match(r""".*Variadic(?:DynCast)?AllOfMatcher\s*<
178                       \s*([^\s,]+)\s*(?:,
179                       \s*([^\s>]+)\s*)?>
180                       \s*([^\s;]+)\s*;\s*$""", declaration, flags=re.X)
181    if m:
182      result, inner, name = m.groups()
183      if not inner:
184        inner = result
185      add_matcher(result, name, 'Matcher<%s>...' % inner,
186                  comment, is_dyncast=True)
187      return
188
189    # Special case of type matchers:
190    #   AstTypeMatcher<ArgumentType> name
191    m = re.match(r""".*AstTypeMatcher\s*<
192                       \s*([^\s>]+)\s*>
193                       \s*([^\s;]+)\s*;\s*$""", declaration, flags=re.X)
194    if m:
195      inner, name = m.groups()
196      add_matcher('Type', name, 'Matcher<%s>...' % inner,
197                  comment, is_dyncast=True)
198      # FIXME: re-enable once we have implemented casting on the TypeLoc
199      # hierarchy.
200      # add_matcher('TypeLoc', '%sLoc' % name, 'Matcher<%sLoc>...' % inner,
201      #             comment, is_dyncast=True)
202      return
203
204    # Parse the various matcher definition macros.
205    m = re.match(""".*AST_TYPE(LOC)?_TRAVERSE_MATCHER(?:_DECL)?\(
206                       \s*([^\s,]+\s*),
207                       \s*(?:[^\s,]+\s*),
208                       \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\)
209                     \)\s*;\s*$""", declaration, flags=re.X)
210    if m:
211      loc, name, results = m.groups()[0:3]
212      result_types = [r.strip() for r in results.split(',')]
213
214      comment_result_types = extract_result_types(comment)
215      if (comment_result_types and
216          sorted(result_types) != sorted(comment_result_types)):
217        raise Exception('Inconsistent documentation for: %s' % name)
218      for result_type in result_types:
219        add_matcher(result_type, name, 'Matcher<Type>', comment)
220        # if loc:
221        #   add_matcher('%sLoc' % result_type, '%sLoc' % name, 'Matcher<TypeLoc>',
222        #               comment)
223      return
224
225    m = re.match(r"""^\s*AST_POLYMORPHIC_MATCHER(_P)?(.?)(?:_OVERLOAD)?\(
226                          \s*([^\s,]+)\s*,
227                          \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\)
228                       (?:,\s*([^\s,]+)\s*
229                          ,\s*([^\s,]+)\s*)?
230                       (?:,\s*([^\s,]+)\s*
231                          ,\s*([^\s,]+)\s*)?
232                       (?:,\s*\d+\s*)?
233                      \)\s*{\s*$""", declaration, flags=re.X)
234
235    if m:
236      p, n, name, results = m.groups()[0:4]
237      args = m.groups()[4:]
238      result_types = [r.strip() for r in results.split(',')]
239      if allowed_types and allowed_types != result_types:
240        raise Exception('Inconsistent documentation for: %s' % name)
241      if n not in ['', '2']:
242        raise Exception('Cannot parse "%s"' % declaration)
243      args = ', '.join('%s %s' % (args[i], args[i+1])
244                       for i in range(0, len(args), 2) if args[i])
245      for result_type in result_types:
246        add_matcher(result_type, name, args, comment)
247      return
248
249    m = re.match(r"""^\s*AST_POLYMORPHIC_MATCHER_REGEX(?:_OVERLOAD)?\(
250                          \s*([^\s,]+)\s*,
251                          \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\),
252                          \s*([^\s,]+)\s*
253                       (?:,\s*\d+\s*)?
254                      \)\s*{\s*$""", declaration, flags=re.X)
255
256    if m:
257      name, results, arg_name = m.groups()[0:3]
258      result_types = [r.strip() for r in results.split(',')]
259      if allowed_types and allowed_types != result_types:
260        raise Exception('Inconsistent documentation for: %s' % name)
261      arg = "StringRef %s, Regex::RegexFlags Flags = NoFlags" % arg_name
262      comment += """
263If the matcher is used in clang-query, RegexFlags parameter
264should be passed as a quoted string. e.g: "NoFlags".
265Flags can be combined with '|' example \"IgnoreCase | BasicRegex\"
266"""
267      for result_type in result_types:
268        add_matcher(result_type, name, arg, comment)
269      return
270
271    m = re.match(r"""^\s*AST_MATCHER_FUNCTION(_P)?(.?)(?:_OVERLOAD)?\(
272                       (?:\s*([^\s,]+)\s*,)?
273                          \s*([^\s,]+)\s*
274                       (?:,\s*([^\s,]+)\s*
275                          ,\s*([^\s,]+)\s*)?
276                       (?:,\s*([^\s,]+)\s*
277                          ,\s*([^\s,]+)\s*)?
278                       (?:,\s*\d+\s*)?
279                      \)\s*{\s*$""", declaration, flags=re.X)
280    if m:
281      p, n, result, name = m.groups()[0:4]
282      args = m.groups()[4:]
283      if n not in ['', '2']:
284        raise Exception('Cannot parse "%s"' % declaration)
285      args = ', '.join('%s %s' % (args[i], args[i+1])
286                       for i in range(0, len(args), 2) if args[i])
287      add_matcher(result, name, args, comment)
288      return
289
290    m = re.match(r"""^\s*AST_MATCHER(_P)?(.?)(?:_OVERLOAD)?\(
291                       (?:\s*([^\s,]+)\s*,)?
292                          \s*([^\s,]+)\s*
293                       (?:,\s*([^,]+)\s*
294                          ,\s*([^\s,]+)\s*)?
295                       (?:,\s*([^\s,]+)\s*
296                          ,\s*([^\s,]+)\s*)?
297                       (?:,\s*\d+\s*)?
298                      \)\s*{""", declaration, flags=re.X)
299    if m:
300      p, n, result, name = m.groups()[0:4]
301      args = m.groups()[4:]
302      if not result:
303        if not allowed_types:
304          raise Exception('Did not find allowed result types for: %s' % name)
305        result_types = allowed_types
306      else:
307        result_types = [result]
308      if n not in ['', '2']:
309        raise Exception('Cannot parse "%s"' % declaration)
310      args = ', '.join('%s %s' % (args[i], args[i+1])
311                       for i in range(0, len(args), 2) if args[i])
312      for result_type in result_types:
313        add_matcher(result_type, name, args, comment)
314      return
315
316    m = re.match(r"""^\s*AST_MATCHER_REGEX(?:_OVERLOAD)?\(
317                       \s*([^\s,]+)\s*,
318                       \s*([^\s,]+)\s*,
319                       \s*([^\s,]+)\s*
320                       (?:,\s*\d+\s*)?
321                      \)\s*{""", declaration, flags=re.X)
322    if m:
323      result, name, arg_name = m.groups()[0:3]
324      if not result:
325        if not allowed_types:
326          raise Exception('Did not find allowed result types for: %s' % name)
327        result_types = allowed_types
328      else:
329        result_types = [result]
330      arg = "StringRef %s, Regex::RegexFlags Flags = NoFlags" % arg_name
331      comment += """
332If the matcher is used in clang-query, RegexFlags parameter
333should be passed as a quoted string. e.g: "NoFlags".
334Flags can be combined with '|' example \"IgnoreCase | BasicRegex\"
335"""
336
337      for result_type in result_types:
338        add_matcher(result_type, name, arg, comment)
339      return
340
341    # Parse ArgumentAdapting matchers.
342    m = re.match(
343        r"""^.*ArgumentAdaptingMatcherFunc<.*>\s*
344              ([a-zA-Z]*);$""",
345        declaration, flags=re.X)
346    if m:
347      name = m.groups()[0]
348      add_matcher('*', name, 'Matcher<*>', comment)
349      return
350
351    # Parse Variadic functions.
352    m = re.match(
353        r"""^.*internal::VariadicFunction\s*<\s*([^,]+),\s*([^,]+),\s*[^>]+>\s*
354              ([a-zA-Z]*);$""",
355        declaration, flags=re.X)
356    if m:
357      result, arg, name = m.groups()[:3]
358      add_matcher(result, name, '%s, ..., %s' % (arg, arg), comment)
359      return
360
361    m = re.match(
362        r"""^.*internal::VariadicFunction\s*<\s*
363              internal::PolymorphicMatcher<[\S\s]+
364              AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\),\s*(.*);$""",
365        declaration, flags=re.X)
366
367    if m:
368      results, trailing = m.groups()
369      trailing, name = trailing.rsplit(">", 1)
370      name = name.strip()
371      trailing, _ = trailing.rsplit(",", 1)
372      _, arg = trailing.rsplit(",", 1)
373      arg = arg.strip()
374
375      result_types = [r.strip() for r in results.split(',')]
376      for result_type in result_types:
377        add_matcher(result_type, name, '%s, ..., %s' % (arg, arg), comment)
378      return
379
380
381    # Parse Variadic operator matchers.
382    m = re.match(
383        r"""^.*VariadicOperatorMatcherFunc\s*<\s*([^,]+),\s*([^\s]+)\s*>\s*
384              ([a-zA-Z]*);$""",
385        declaration, flags=re.X)
386    if m:
387      min_args, max_args, name = m.groups()[:3]
388      if max_args == '1':
389        add_matcher('*', name, 'Matcher<*>', comment)
390        return
391      elif max_args == 'std::numeric_limits<unsigned>::max()':
392        add_matcher('*', name, 'Matcher<*>, ..., Matcher<*>', comment)
393        return
394
395    m = re.match(
396        r"""^.*MapAnyOfMatcher<.*>\s*
397              ([a-zA-Z]*);$""",
398        declaration, flags=re.X)
399    if m:
400      name = m.groups()[0]
401      add_matcher('*', name, 'Matcher<*>...Matcher<*>', comment)
402      return
403
404    # Parse free standing matcher functions, like:
405    #   Matcher<ResultType> Name(Matcher<ArgumentType> InnerMatcher) {
406    m = re.match(r"""^\s*(?:template\s+<\s*(?:class|typename)\s+(.+)\s*>\s+)?
407                     (.*)\s+
408                     ([^\s\(]+)\s*\(
409                     (.*)
410                     \)\s*{""", declaration, re.X)
411    if m:
412      template_name, result, name, args = m.groups()
413      if template_name:
414        matcherTemplateArgs = re.findall(r'Matcher<\s*(%s)\s*>' % template_name, args)
415        templateArgs = re.findall(r'(?:^|[\s,<])(%s)(?:$|[\s,>])' % template_name, args)
416        if len(matcherTemplateArgs) < len(templateArgs):
417          # The template name is used naked, so don't replace with `*`` later on
418          template_name = None
419        else :
420          args = re.sub(r'(^|[\s,<])%s($|[\s,>])' % template_name, r'\1*\2', args)
421      args = ', '.join(p.strip() for p in args.split(','))
422      m = re.match(r'(?:^|.*\s+)internal::(?:Bindable)?Matcher<([^>]+)>$', result)
423      if m:
424        result_types = [m.group(1)]
425        if template_name and len(result_types) == 1 and result_types[0] == template_name:
426          result_types = ['*']
427      else:
428        result_types = extract_result_types(comment)
429      if not result_types:
430        if not comment:
431          # Only overloads don't have their own doxygen comments; ignore those.
432          print('Ignoring "%s"' % name)
433        else:
434          print('Cannot determine result type for "%s"' % name)
435      else:
436        for result_type in result_types:
437          add_matcher(result_type, name, args, comment)
438    else:
439      print('*** Unparsable: "' + declaration + '" ***')
440
441def sort_table(matcher_type, matcher_map):
442  """Returns the sorted html table for the given row map."""
443  table = ''
444  for key in sorted(matcher_map.keys()):
445    table += matcher_map[key] + '\n'
446  return ('<!-- START_%(type)s_MATCHERS -->\n' +
447          '%(table)s' +
448          '<!--END_%(type)s_MATCHERS -->') % {
449    'type': matcher_type,
450    'table': table,
451  }
452
453# Parse the ast matchers.
454# We alternate between two modes:
455# body = True: We parse the definition of a matcher. We need
456#   to parse the full definition before adding a matcher, as the
457#   definition might contain static asserts that specify the result
458#   type.
459# body = False: We parse the comments and declaration of the matcher.
460comment = ''
461declaration = ''
462allowed_types = []
463body = False
464for line in open(MATCHERS_FILE).read().splitlines():
465  if body:
466    if line.strip() and line[0] == '}':
467      if declaration:
468        act_on_decl(declaration, comment, allowed_types)
469        comment = ''
470        declaration = ''
471        allowed_types = []
472      body = False
473    else:
474      m = re.search(r'is_base_of<([^,]+), NodeType>', line)
475      if m and m.group(1):
476        allowed_types += [m.group(1)]
477    continue
478  if line.strip() and line.lstrip()[0] == '/':
479    comment += re.sub(r'^/+\s?', '', line) + '\n'
480  else:
481    declaration += ' ' + line
482    if ((not line.strip()) or
483        line.rstrip()[-1] == ';' or
484        (line.rstrip()[-1] == '{' and line.rstrip()[-3:] != '= {')):
485      if line.strip() and line.rstrip()[-1] == '{':
486        body = True
487      else:
488        act_on_decl(declaration, comment, allowed_types)
489        comment = ''
490        declaration = ''
491        allowed_types = []
492
493node_matcher_table = sort_table('DECL', node_matchers)
494narrowing_matcher_table = sort_table('NARROWING', narrowing_matchers)
495traversal_matcher_table = sort_table('TRAVERSAL', traversal_matchers)
496
497reference = open('../LibASTMatchersReference.html').read()
498reference = re.sub(r'<!-- START_DECL_MATCHERS.*END_DECL_MATCHERS -->',
499                   node_matcher_table, reference, flags=re.S)
500reference = re.sub(r'<!-- START_NARROWING_MATCHERS.*END_NARROWING_MATCHERS -->',
501                   narrowing_matcher_table, reference, flags=re.S)
502reference = re.sub(r'<!-- START_TRAVERSAL_MATCHERS.*END_TRAVERSAL_MATCHERS -->',
503                   traversal_matcher_table, reference, flags=re.S)
504
505with open('../LibASTMatchersReference.html', 'w', newline='\n') as output:
506  output.write(reference)
507
508