#!/usr/bin/env python
#===- cppreference_parser.py - ------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#

from bs4 import BeautifulSoup, NavigableString

import collections
import multiprocessing
import os
import re
import signal
import sys


class Symbol:
    """A library symbol together with the headers that provide it."""

    def __init__(self, name, namespace, headers):
        # unqualified symbol name, e.g. "move"
        self.name = name
        # namespace of the symbol (with trailing "::"), e.g. "std::", "" (global
        # scope). None for C symbols.
        self.namespace = namespace
        # a list of corresponding headers
        self.headers = headers


def _HasClass(tag, *classes):
    """Return True if `tag` carries at least one of the given CSS classes."""
    return any(c in classes for c in tag.get("class", []))


def _ParseSymbolPage(symbol_page_html, symbol_name):
    """Parse symbol page and retrieve the include header defined in this page.

    The symbol page provides header for the symbol, specifically in
    "Defined in header <header>" section. An example:

    <tr class="t-dsc-header">
      <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
    </td></tr>

    Args:
      symbol_page_html: HTML text of the symbol page.
      symbol_name: unqualified symbol name to look for in declaration rows.

    Returns a set of headers: those attached to declaration rows naming the
    symbol, or every header named on the page if no row names the symbol.
    """
    headers = set()
    all_headers = set()

    soup = BeautifulSoup(symbol_page_html, "html.parser")
    # Rows in table are like:
    #   Defined in header <foo>      .t-dsc-header
    #   Defined in header <bar>      .t-dsc-header
    #   decl1                        .t-dcl
    #   Defined in header <baz>      .t-dsc-header
    #   decl2                        .t-dcl
    for table in soup.select("table.t-dcl-begin, table.t-dsc-begin"):
        current_headers = []
        was_decl = False
        for row in table.select("tr"):
            if _HasClass(row, "t-dcl", "t-dsc"):
                was_decl = True
                # Symbols are in the first cell. A declaration row without any
                # cell cannot name the symbol, so skip it instead of crashing.
                first_cell = row.find("td")
                if first_cell is None:
                    continue
                if symbol_name not in first_cell.stripped_strings:
                    continue
                headers.update(current_headers)
            elif _HasClass(row, "t-dsc-header"):
                # If we saw a decl since the last header, this is a new block
                # of headers for a new block of decls.
                if was_decl:
                    current_headers = []
                    was_decl = False
                # There are also .t-dsc-header for "defined in namespace".
                if "Defined in header " not in row.text:
                    continue
                # The interesting header content (e.g. <cstdlib>) is wrapped in
                # <code>.
                for header_code in row.find_all("code"):
                    current_headers.append(header_code.text)
                    all_headers.add(header_code.text)
    # If the symbol was never named, consider all named headers.
    return headers or all_headers


def _ParseIndexPage(index_page_html):
    """Parse index page.

    The index page lists all std symbols and hrefs to their detailed pages
    (which contain the defined header). An example:

    <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
    <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

    Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
    `variant` is the parenthesised caption following the link (e.g. "int"), or
    None when there is no such caption.
    """
    symbols = []
    soup = BeautifulSoup(index_page_html, "html.parser")
    for symbol_href in soup.select("a[title]"):
        # Record annotations like the "(std::complex)" in
        # "acos<>() (std::complex)" as the variant. These tend to be overloads,
        # and the primary is more useful; the caller decides which variants to
        # keep.
        # This accidentally accepts begin/end despite the (iterator) caption:
        # the (since C++11) note is first. They are good symbols, so the bug is
        # unfixed.
        caption = symbol_href.next_sibling
        variant = None
        if isinstance(caption, NavigableString) and "(" in caption:
            variant = caption.text.strip(" ()")
        symbol_tt = symbol_href.find("tt")
        if symbol_tt:
            symbols.append((symbol_tt.text.rstrip("<>()"),  # strip any trailing <>()
                            symbol_href["href"], variant))
    return symbols


def _ReadSymbolPage(path, name):
    """Read the symbol page at `path` and return the headers found for `name`."""
    with open(path) as f:
        return _ParseSymbolPage(f.read(), name)


def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
    """Get all symbols listed in the index page. All symbols should be in the
    given namespace.

    Args:
      pool: multiprocessing pool used to parse the symbol pages in parallel.
      root_dir: directory that the index page and symbol hrefs are relative to.
      index_page_name: path of the index page, relative to root_dir.
      namespace: namespace of the symbols (see Symbol.namespace).
      variants_to_accept: map from qualified symbol name to the collection of
        variant captions that should be kept for that symbol.

    Returns a list of Symbols.
    """

    # Workflow steps:
    #   1. Parse index page which lists all symbols to get symbol
    #      name (unqualified name) and its href link to the symbol page which
    #      contains the defined header.
    #   2. Parse the symbol page to get the defined header.
    index_page_path = os.path.join(root_dir, index_page_name)
    with open(index_page_path, "r") as f:
        index_page_html = f.read()

    # Read each symbol page in parallel.
    results = []  # (symbol_name, promise of [header...])
    for symbol_name, symbol_page_path, variant in _ParseIndexPage(index_page_html):
        # Variant symbols (e.g. the std::locale version of isalpha) add
        # ambiguity.
        # FIXME: use these as a fallback rather than ignoring entirely.
        variants_for_symbol = variants_to_accept.get(
            (namespace or "") + symbol_name, ())
        if variant and variant not in variants_for_symbol:
            continue
        path = os.path.join(root_dir, symbol_page_path)
        if os.path.isfile(path):
            results.append((symbol_name,
                            pool.apply_async(_ReadSymbolPage, (path, symbol_name))))
        else:
            sys.stderr.write("Discarding information for symbol: %s. Page %s does not exist.\n"
                             % (symbol_name, path))

    # Build map from symbol name to a set of headers.
    symbol_headers = collections.defaultdict(set)
    for symbol_name, lazy_headers in results:
        symbol_headers[symbol_name].update(lazy_headers.get())

    symbols = []
    for name, headers in sorted(symbol_headers.items(), key=lambda t: t[0]):
        # Sort the headers so the result does not depend on set iteration order.
        symbols.append(Symbol(name, namespace, sorted(headers)))
    return symbols


def _IgnoreSigint():
    """Pool worker initializer: ignore SIGINT so workers don't capture Ctrl-C.

    This is a named module-level function rather than a lambda so that it can
    be pickled when multiprocessing uses the "spawn" start method.
    """
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def GetSymbols(parse_pages):
    """Get all symbols by parsing the given pages.

    Args:
      parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

    Returns a list of Symbols, in the order the pages were given.
    """
    # By default we prefer the non-variant versions, as they're more common. But
    # there are some symbols, whose variant is more common. This list describes
    # those symbols.
    variants_to_accept = {
        # std::remove<> has variant algorithm.
        # NOTE: the trailing comma matters; without it the value is a plain
        # string and the membership test becomes a substring match.
        "std::remove": ("algorithm",),
    }
    symbols = []
    # Run many workers to process individual symbol pages under the symbol index.
    # Don't allow workers to capture Ctrl-C.
    pool = multiprocessing.Pool(initializer=_IgnoreSigint)
    try:
        for root_dir, page_name, namespace in parse_pages:
            symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace,
                                       variants_to_accept))
    finally:
        pool.terminate()
        pool.join()
    return symbols