#!/usr/bin/env python
#===- cppreference_parser.py - ------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#

from bs4 import BeautifulSoup, NavigableString

import collections
import multiprocessing
import os
import re
import signal
import sys


class Symbol:
    """A symbol listed on a cppreference index page and the headers providing it."""

    def __init__(self, name, namespace, headers):
        # unqualified symbol name, e.g. "move"
        self.name = name
        # namespace of the symbol (with trailing "::"), e.g. "std::", "" (global
        # scope). None for C symbols.
        self.namespace = namespace
        # a list of corresponding headers
        self.headers = headers

    def __repr__(self):
        return "Symbol(name=%r, namespace=%r, headers=%r)" % (
            self.name, self.namespace, self.headers)


def _HasClass(tag, *classes):
    """Return True if `tag` carries at least one of the given CSS classes."""
    return any(c in classes for c in tag.get('class', []))


def _ParseSymbolPage(symbol_page_html, symbol_name):
    """Parse symbol page and retrieve the include header defined in this page.
    The symbol page provides header for the symbol, specifically in
    "Defined in header <header>" section. An example:

    <tr class="t-dsc-header">
      <td colspan="2"> <div>Defined in header <code><ratio></code> </div>
    </td></tr>

    Args:
      symbol_page_html: HTML text of the symbol page.
      symbol_name: unqualified name of the symbol to look for in the page.

    Returns a set of header strings (e.g. {"<ratio>"}). If no declaration row
    names the symbol, every header named in the scanned tables is returned.
    """
    headers = set()
    all_headers = set()

    soup = BeautifulSoup(symbol_page_html, "html.parser")
    # Rows in table are like:
    #   Defined in header <foo>      .t-dsc-header
    #   Defined in header <bar>      .t-dsc-header
    #   decl1                        .t-dcl
    #   Defined in header <baz>      .t-dsc-header
    #   decl2                        .t-dcl
    for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
        current_headers = []
        was_decl = False
        for row in table.select('tr'):
            if _HasClass(row, 't-dcl', 't-dsc'):
                was_decl = True
                # Symbols are in the first cell. A declaration row without any
                # cell cannot name the symbol, so skip it instead of crashing.
                first_cell = row.find('td')
                if first_cell is None:
                    continue
                if symbol_name not in first_cell.stripped_strings:
                    continue
                headers.update(current_headers)
            elif _HasClass(row, 't-dsc-header'):
                # If we saw a decl since the last header, this is a new block
                # of headers for a new block of decls.
                if was_decl:
                    current_headers = []
                was_decl = False
                # There are also .t-dsc-header for "defined in namespace".
                if "Defined in header " not in row.text:
                    continue
                # The interesting header content (e.g. <cstdlib>) is wrapped in
                # <code>.
                for header_code in row.find_all("code"):
                    current_headers.append(header_code.text)
                    all_headers.add(header_code.text)
    # If the symbol was never named, consider all named headers.
    return headers or all_headers


def _ParseIndexPage(index_page_html):
    """Parse index page.
    The index page lists all std symbols and hrefs to their detailed pages
    (which contain the defined header). An example:

    <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
    <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

    Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
    `variant` is the parenthesized caption following the link (e.g. "int"), or
    None when the link has no such caption.
    """
    symbols = []
    soup = BeautifulSoup(index_page_html, "html.parser")
    for symbol_href in soup.select("a[title]"):
        # Ignore annotated symbols like "acos<>() (std::complex)".
        # These tend to be overloads, and the primary is more useful.
        # This accidentally accepts begin/end despite the (iterator) caption: the
        # (since C++11) note is first. They are good symbols, so the bug is
        # unfixed.
        caption = symbol_href.next_sibling
        variant = None
        if isinstance(caption, NavigableString) and "(" in caption:
            variant = caption.text.strip(" ()")
        symbol_tt = symbol_href.find("tt")
        if symbol_tt:
            symbols.append((symbol_tt.text.rstrip("<>()"),  # strip any trailing <>()
                            symbol_href["href"], variant))
    return symbols


def _ReadSymbolPage(path, name):
    """Read the symbol page at `path` and return the set of headers for `name`."""
    # Decode explicitly as UTF-8 so the result does not depend on the locale
    # of the machine (or worker process) running the script.
    with open(path, encoding="utf-8") as f:
        return _ParseSymbolPage(f.read(), name)


def _IgnoreSigInt():
    """Pool worker initializer: make the worker ignore Ctrl-C (SIGINT).

    This is a module-level function rather than a lambda so that it can be
    pickled when multiprocessing uses the "spawn" start method.
    """
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
    """Get all symbols listed in the index page. All symbols should be in the
    given namespace.

    Args:
      pool: a multiprocessing pool used to parse symbol pages in parallel.
      root_dir: directory containing the index page; symbol page hrefs are
        resolved relative to it.
      index_page_name: file name of the index page inside root_dir.
      namespace: namespace prefix for all symbols (e.g. "std::"), or None.
      variants_to_accept: dict mapping a qualified symbol name to the variant
        captions that should be accepted for that symbol.

    Returns a list of Symbols, sorted by name, each with a sorted header list.
    """

    # Workflow steps:
    #   1. Parse index page which lists all symbols to get symbol
    #      name (unqualified name) and its href link to the symbol page which
    #      contains the defined header.
    #   2. Parse the symbol page to get the defined header.
    index_page_path = os.path.join(root_dir, index_page_name)
    with open(index_page_path, "r", encoding="utf-8") as f:
        index_entries = _ParseIndexPage(f.read())

    # Read each symbol page in parallel.
    results = []  # (symbol_name, promise of [header...])
    for symbol_name, symbol_page_path, variant in index_entries:
        # Variant symbols (e.g. the std::locale version of isalpha) add
        # ambiguity.
        # FIXME: use these as a fallback rather than ignoring entirely.
        variants_for_symbol = variants_to_accept.get(
            (namespace or "") + symbol_name, ())
        # A bare string would turn the membership test below into a substring
        # search; treat it as a single accepted variant instead.
        if isinstance(variants_for_symbol, str):
            variants_for_symbol = (variants_for_symbol,)
        if variant and variant not in variants_for_symbol:
            continue
        path = os.path.join(root_dir, symbol_page_path)
        if os.path.isfile(path):
            results.append((symbol_name,
                            pool.apply_async(_ReadSymbolPage,
                                             (path, symbol_name))))
        else:
            sys.stderr.write("Discarding information for symbol: %s. Page %s does not exist.\n"
                             % (symbol_name, path))

    # Build map from symbol name to a set of headers.
    symbol_headers = collections.defaultdict(set)
    for symbol_name, lazy_headers in results:
        symbol_headers[symbol_name].update(lazy_headers.get())

    # Sort the headers too, so the output does not depend on set iteration
    # order (which varies between runs for strings).
    return [Symbol(name, namespace, sorted(headers))
            for name, headers in sorted(symbol_headers.items(),
                                        key=lambda item: item[0])]


def GetSymbols(parse_pages):
    """Get all symbols by parsing the given pages.

    Args:
      parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

    Returns a list of Symbols, in the order the pages were given.
    """
    # By default we prefer the non-variant versions, as they're more common. But
    # there are some symbols, whose variant is more common. This list describes
    # those symbols.
    variants_to_accept = {
        # std::remove<> has variant algorithm.
        # Note the trailing comma: this must be a tuple, not a plain string.
        "std::remove": ("algorithm",),
    }
    symbols = []
    # Run many workers to process individual symbol pages under the symbol index.
    # Don't allow workers to capture Ctrl-C.
    pool = multiprocessing.Pool(initializer=_IgnoreSigInt)
    try:
        for root_dir, page_name, namespace in parse_pages:
            symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace,
                                       variants_to_accept))
    finally:
        pool.terminate()
        pool.join()
    return symbols