1*12c85518Srobert#!/usr/bin/env python
2*12c85518Srobert#===- cppreference_parser.py -  ------------------------------*- python -*--===#
3*12c85518Srobert#
4*12c85518Srobert# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5*12c85518Srobert# See https://llvm.org/LICENSE.txt for license information.
6*12c85518Srobert# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7*12c85518Srobert#
8*12c85518Srobert#===------------------------------------------------------------------------===#
9*12c85518Srobert
10*12c85518Srobertfrom bs4 import BeautifulSoup, NavigableString
11*12c85518Srobert
12*12c85518Srobertimport collections
13*12c85518Srobertimport multiprocessing
14*12c85518Srobertimport os
15*12c85518Srobertimport re
16*12c85518Srobertimport signal
17*12c85518Srobertimport sys
18*12c85518Srobert
19*12c85518Srobert
class Symbol:
  """A symbol together with the headers that provide it.

  Attributes:
    name: unqualified symbol name, e.g. "move".
    namespace: namespace of the symbol (with trailing "::"), e.g. "std::",
      or "" for the global scope. None for C symbols.
    headers: a list of corresponding headers.
  """

  def __init__(self, name, namespace, headers):
    # Values are stored exactly as given; no copying or validation is done.
    self.name, self.namespace, self.headers = name, namespace, headers
30*12c85518Srobert
31*12c85518Srobert
32*12c85518Srobertdef _HasClass(tag, *classes):
33*12c85518Srobert  for c in tag.get('class', []):
34*12c85518Srobert    if c in classes:
35*12c85518Srobert      return True
36*12c85518Srobert  return False
37*12c85518Srobert
38*12c85518Srobert
def _ParseSymbolPage(symbol_page_html, symbol_name):
  """Parse symbol page and retrieve the include headers defined in this page.
  The symbol page provides headers for the symbol, specifically in
  "Defined in header <header>" sections. An example:

  <tr class="t-dsc-header">
    <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
  </td></tr>

  Args:
    symbol_page_html: HTML text of the symbol page.
    symbol_name: unqualified symbol name to look for in declaration rows.

  Returns a set of headers (the text of the <code> elements). These are the
  headers preceding a declaration row whose first cell names symbol_name; if
  no row names the symbol, every header named on the page is returned.
  """
  headers = set()
  all_headers = set()

  soup = BeautifulSoup(symbol_page_html, "html.parser")
  # Rows in table are like:
  #   Defined in header <foo>      .t-dsc-header
  #   Defined in header <bar>      .t-dsc-header
  #   decl1                        .t-dcl
  #   Defined in header <baz>      .t-dsc-header
  #   decl2                        .t-dcl
  for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
    current_headers = []
    was_decl = False
    for row in table.select('tr'):
      if _HasClass(row, 't-dcl', 't-dsc'):
        was_decl = True
        # Symbols are in the first cell. A declaration row without any <td>
        # names no symbol, so skip it instead of failing on None.
        first_cell = row.find('td')
        if first_cell is None:
          continue
        if symbol_name in first_cell.stripped_strings:
          headers.update(current_headers)
      elif _HasClass(row, 't-dsc-header'):
        # If we saw a decl since the last header, this is a new block of
        # headers for a new block of decls.
        if was_decl:
          current_headers = []
        was_decl = False
        # There are also .t-dsc-header rows for "defined in namespace".
        if "Defined in header " not in row.text:
          continue
        # The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
        for header_code in row.find_all("code"):
          current_headers.append(header_code.text)
          all_headers.add(header_code.text)
  # If the symbol was never named, consider all named headers.
  return headers or all_headers
86*12c85518Srobert
87*12c85518Srobert
def _ParseIndexPage(index_page_html):
  """Parse the symbol index page.

  The index page lists all std symbols together with hrefs to their detail
  pages (which contain the defining header). An example:

  <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
  <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

  Returns a list of tuples (symbol_name, relative_path_to_symbol_page,
  variant), where variant is None when the link has no trailing "(...)" text.
  """
  soup = BeautifulSoup(index_page_html, "html.parser")
  entries = []
  for link in soup.select("a[title]"):
    # Text directly following the link, such as " (std::complex) " after
    # "acos<>()", annotates a variant of the symbol. Variants tend to be
    # overloads, and the primary is more useful, so the annotation is recorded
    # to let callers filter on it.
    # Only a plain-text sibling counts: begin/end are therefore accepted as
    # non-variants despite their (iterator) caption, because the
    # (since C++11) note comes first. They are good symbols, so this is
    # left as is.
    trailing = link.next_sibling
    is_annotated = isinstance(trailing, NavigableString) and "(" in trailing
    variant = trailing.text.strip(" ()") if is_annotated else None
    name_tag = link.find("tt")
    if not name_tag:
      continue
    # Strip any trailing <>() from the displayed name.
    bare_name = name_tag.text.rstrip("<>()")
    entries.append((bare_name, link["href"], variant))
  return entries
114*12c85518Srobert
115*12c85518Srobert
def _ReadSymbolPage(path, name):
  """Read the symbol page at path and return the headers found for name.

  Args:
    path: file system path of the symbol's HTML page.
    name: unqualified symbol name, forwarded to _ParseSymbolPage.

  Returns whatever _ParseSymbolPage returns for the page contents.
  """
  # Decode explicitly as UTF-8 so the result does not depend on the platform's
  # default encoding (assumes the pages are UTF-8 encoded).
  with open(path, encoding="utf-8") as f:
    return _ParseSymbolPage(f.read(), name)
119*12c85518Srobert
120*12c85518Srobert
def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
  """Get all symbols listed in the index page. All symbols should be in the
  given namespace.

  Args:
    pool: worker pool whose apply_async() is used to parse the symbol pages
      in parallel.
    root_dir: directory that the index page and the symbol page hrefs are
      resolved against.
    index_page_name: path of the index page, relative to root_dir.
    namespace: namespace stored on every returned Symbol; None is treated as
      "" when building the key looked up in variants_to_accept.
    variants_to_accept: dict mapping a qualified symbol name to the variant
      captions that should be accepted for it.

  Returns a list of Symbols sorted by name, each with a sorted list of headers.
  """

  # Workflow steps:
  #   1. Parse index page which lists all symbols to get symbol
  #      name (unqualified name) and its href link to the symbol page which
  #      contains the defined header.
  #   2. Parse the symbol page to get the defined header.
  index_page_path = os.path.join(root_dir, index_page_name)
  # Decode explicitly as UTF-8 so the result does not depend on the platform's
  # default encoding (assumes the pages are UTF-8 encoded). The file is closed
  # as soon as it has been parsed rather than held open while workers run.
  with open(index_page_path, "r", encoding="utf-8") as f:
    index_entries = _ParseIndexPage(f.read())

  # Read each symbol page in parallel.
  results = [] # (symbol_name, promise of [header...])
  for symbol_name, symbol_page_path, variant in index_entries:
    # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
    # FIXME: use these as a fallback rather than ignoring entirely.
    variants_for_symbol = variants_to_accept.get(
        (namespace or "") + symbol_name, ())
    if variant and variant not in variants_for_symbol:
      continue
    path = os.path.join(root_dir, symbol_page_path)
    if os.path.isfile(path):
      results.append((symbol_name,
                      pool.apply_async(_ReadSymbolPage, (path, symbol_name))))
    else:
      sys.stderr.write("Discarding information for symbol: %s. Page %s does not exist.\n"
        % (symbol_name, path))

  # Build map from symbol name to a set of headers. A symbol may be listed
  # more than once in the index; its headers are merged.
  symbol_headers = collections.defaultdict(set)
  for symbol_name, lazy_headers in results:
    symbol_headers[symbol_name].update(lazy_headers.get())

  # Sort the headers as well as the names: iterating a set of strings has no
  # stable order across runs, which would make the output non-deterministic.
  return [Symbol(name, namespace, sorted(symbol_headers[name]))
          for name in sorted(symbol_headers)]
161*12c85518Srobert
162*12c85518Srobert
def _IgnoreSigint():
  """Pool worker initializer: make the worker process ignore SIGINT."""
  signal.signal(signal.SIGINT, signal.SIG_IGN)


def GetSymbols(parse_pages):
  """Get all symbols by parsing the given pages.

  Args:
    parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

  Returns a list of Symbols, in the order the pages were given.
  """
  # By default we prefer the non-variant versions, as they're more common. But
  # there are some symbols, whose variant is more common. This list describes
  # those symbols.
  variants_to_accept = {
      # std::remove<> has variant algorithm.
      # The trailing comma matters: without it the value is a plain string and
      # the membership test on it becomes a substring match.
      "std::remove": ("algorithm",),
  }
  symbols = []
  # Run many workers to process individual symbol pages under the symbol index.
  # Don't allow workers to capture Ctrl-C. The initializer is a module-level
  # function (not a lambda) so that it can be pickled when worker processes
  # are spawned rather than forked.
  pool = multiprocessing.Pool(initializer=_IgnoreSigint)
  try:
    for root_dir, page_name, namespace in parse_pages:
      symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace,
                                 variants_to_accept))
  finally:
    # Always tear the workers down, even if parsing a page raised.
    pool.terminate()
    pool.join()
  return symbols
189