#!/usr/bin/env python
# -*- coding: utf-8 -*-

r"""
htmldocck.py is a custom checker script for Rustdoc HTML outputs.

# How and why?

The principle is simple: This script receives a path to generated HTML
documentation and a "template" script, which has a series of check
commands like `@has` or `@matches`. Each command is used to check if
some pattern is present or not present in the particular file or in
a particular node of the HTML tree. In many cases, the template script
happens to be the source code given to rustdoc.

While it indeed is possible to test in smaller portions, it has been
hard to construct tests in this fashion and major rendering errors were
discovered much later. This script is designed to make black-box and
regression testing of Rustdoc easy. This does not preclude the needs for
unit testing, but can be used to complement related tests by quickly
showing the expected renderings.

In order to avoid one-off dependencies for this task, this script uses
a reasonably working HTML parser and the existing XPath implementation
from Python's standard library. Hopefully, we won't render
non-well-formed HTML.

# Commands

Commands start with an `@` followed by a command name (letters and
hyphens), and zero or more arguments separated by one or more whitespace
characters and optionally delimited with single or double quotes. The `@`
mark cannot be preceded by a non-whitespace character. Other lines
(including every text up to the first `@`) are ignored, but it is
recommended to avoid the use of `@` in the template file.

There are a number of supported commands:

* `@has PATH` checks for the existence of the given file.

  `PATH` is relative to the output directory. It can be given as `-`
  which repeats the most recently used `PATH`.

* `@has PATH PATTERN` and `@matches PATH PATTERN` checks for
  the occurrence of the given pattern `PATTERN` in the specified file.
  Only one occurrence of the pattern is enough.

  For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
  whitespace being replaced by one single space character) string.
  The entire file is also whitespace-normalized including newlines.

  For `@matches`, `PATTERN` is a Python-supported regular expression.
  The file remains intact but the regexp is matched without the `MULTILINE`
  and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
  to override them, and `\A` and `\Z` for definitely matching
  the beginning and end of the file.

  (The same distinction goes to other variants of these commands.)

* `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` checks for
  the presence of the given XPath `XPATH` in the specified HTML file,
  and also the occurrence of the given pattern `PATTERN` in the matching
  node or attribute. Only one occurrence of the pattern in the match
  is enough.

  `PATH` should be a valid and well-formed HTML file. It does *not*
  accept arbitrary HTML5; it should have matching open and close tags
  and correct entity references at least.

  `XPATH` is an XPath expression to match. The XPath is fairly limited:
  `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
  `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
  and `@attr` (both as the last segment) are supported. Some examples:

  - `//pre` or `.//pre` matches any element with a name `pre`.
  - `//a[@href]` matches any element with an `href` attribute.
  - `//*[@class="impl"]//code` matches any element with a name `code`,
    which is a descendant of some element which `class` attr is `impl`.
  - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
    `class` attribute in the last `a` element (can be followed by more
    elements that are not `a`) inside the first `span` in the `h1` with
    a class of `fqn`. Note that there cannot be any additional elements
    between them due to the use of `/` instead of `//`.

  Do not try to use non-absolute paths, it won't work due to the flawed
  ElementTree implementation. The script rejects them.

  For the text matches (i.e. paths not ending with `@attr`), any
  subelements are flattened into one string; this is handy for ignoring
  highlights for example. If you want to simply check for the presence of
  a given node or attribute, use an empty string (`""`) as a `PATTERN`.

* `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
  in the specified file. The number of occurrences must match the given
  count.

* `@has-dir PATH` checks for the existence of the given directory.

All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
checks if the given file does not exist, for example.

Every occurrence of `{{channel}}` in an `XPATH` or `PATTERN` argument is
replaced by the value of the `DOC_RUST_LANG_ORG_CHANNEL` environment
variable before the check is performed.

"""

from __future__ import absolute_import, print_function, unicode_literals

import codecs
import io
import sys
import os
import os.path
import re
import shlex
from collections import namedtuple
try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser
try:
    from xml.etree import cElementTree as ET
except ImportError:
    from xml.etree import ElementTree as ET

try:
    from html.entities import name2codepoint
except ImportError:
    from htmlentitydefs import name2codepoint

# "void elements" (no closing tag) from the HTML Standard section 12.1.2
VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
                 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'}

# Python 2 -> 3 compatibility
try:
    unichr
except NameError:
    unichr = chr


# Name of the environment variable holding the value substituted for the
# `{{channel}}` placeholder.
CHANNEL_ENV_VAR = "DOC_RUST_LANG_ORG_CHANNEL"

# `None` when the variable is unset. Reading it leniently keeps the module
# importable (e.g. for testing individual helpers); the script entry point at
# the bottom of the file still refuses to run without it.
channel = os.environ.get(CHANNEL_ENV_VAR)


class CustomHTMLParser(HTMLParser):
    """simplified HTML parser.

    this is possible because we are dealing with very regular HTML from
    rustdoc; we only have to deal with i) void elements and ii) empty
    attributes.

    Parser events are forwarded to an ElementTree-style builder (`target`,
    or a fresh `ET.TreeBuilder()`); `close()` returns whatever the builder's
    `close()` returns, which lets instances be passed as the `parser`
    argument of `ET.fromstringlist`."""

    def __init__(self, target=None):
        HTMLParser.__init__(self)
        self.__builder = target or ET.TreeBuilder()

    def handle_starttag(self, tag, attrs):
        # valueless attributes are reported as `None`; store them as ''
        attrs = {k: v or '' for k, v in attrs}
        self.__builder.start(tag, attrs)
        if tag in VOID_ELEMENTS:
            # void elements never get an end tag, so close them right away
            self.__builder.end(tag)

    def handle_endtag(self, tag):
        self.__builder.end(tag)

    def handle_startendtag(self, tag, attrs):
        attrs = {k: v or '' for k, v in attrs}
        self.__builder.start(tag, attrs)
        self.__builder.end(tag)

    def handle_data(self, data):
        self.__builder.data(data)

    def handle_entityref(self, name):
        self.__builder.data(unichr(name2codepoint[name]))

    def handle_charref(self, name):
        # `name` is either decimal ("38") or hexadecimal ("x26" / "X26")
        code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
        self.__builder.data(unichr(code))

    def close(self):
        HTMLParser.close(self)
        return self.__builder.close()


# A parsed template command: whether it was negated with `!`, the command
# name, its argument list, the 1-based template line it started on and the
# (concatenated) source line used as context in error messages.
Command = namedtuple('Command', 'negated cmd args lineno context')


class FailedCheck(Exception):
    """A well-formed check whose condition did not hold."""


class InvalidCheck(Exception):
    """A check that cannot be evaluated (bad arguments, unknown command...)."""


def concat_multi_lines(f):
    r"""returns a generator out of the file object, which
    - removes a trailing backslash, then the line break, then a shared
      prefix with the previous line, then optional whitespace;
    - keeps a line number (starting from 0) of the first line being
      concatenated.

    `f` may be any iterable of text lines. Yields `(lineno, line)` pairs.
    A backslash left dangling on the last line is reported via `print_err`
    and the unfinished logical line is not yielded."""
    lastline = None  # set to the last line when the last line has a backslash
    firstlineno = None
    catenated = ''
    for lineno, line in enumerate(f):
        line = line.rstrip('\r\n')

        # strip the common prefix from the current line if needed
        if lastline is not None:
            common_prefix = os.path.commonprefix([line, lastline])
            line = line[len(common_prefix):].lstrip()

        # Must be an identity test: line number 0 is falsy, so `x or lineno`
        # would forget a continuation that starts on the very first line.
        if firstlineno is None:
            firstlineno = lineno

        if line.endswith('\\'):
            if lastline is None:
                lastline = line[:-1]
            catenated += line[:-1]
        else:
            yield firstlineno, catenated + line
            lastline = None
            firstlineno = None
            catenated = ''

    if lastline is not None:
        # `lineno` is bound here: `lastline` is only ever set inside the loop.
        # Reported 1-based, like the line numbers attached to commands.
        print_err(lineno + 1, line, 'Trailing backslash at the end of the file')


LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S))(?P<invalid>!?)@(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X | re.UNICODE)


def get_commands(template):
    """Parse the template file at path `template` and yield its `Command`s.

    Lines without a command are skipped. Malformed commands (`!@cmd`,
    arguments glued to the command name, unbalanced quotes) are reported
    through `print_err` and skipped. All reported line numbers are 1-based
    and refer to the first physical line of a backslash-continued line."""
    with io.open(template, encoding='utf-8') as f:
        for lineno, line in concat_multi_lines(f):
            m = LINE_PATTERN.search(line)
            if not m:
                continue

            display_lineno = lineno + 1  # concat_multi_lines counts from 0
            negated = (m.group('negated') == '!')
            cmd = m.group('cmd')
            if m.group('invalid') == '!':
                print_err(
                    display_lineno,
                    line,
                    'Invalid command: `!@{0}{1}`, (help: try with `@!{1}`)'.format(
                        '!' if negated else '',
                        cmd,
                    ),
                )
                continue
            args = m.group('args')
            if args and not args[:1].isspace():
                print_err(display_lineno, line, 'Invalid template syntax')
                continue
            try:
                try:
                    args = shlex.split(args)
                except UnicodeEncodeError:
                    # Python 2's shlex cannot handle non-ASCII unicode strings
                    args = [arg.decode('utf-8')
                            for arg in shlex.split(args.encode('utf-8'))]
            except ValueError as err:
                # e.g. "No closing quotation"
                print_err(display_lineno, line,
                          'Invalid template syntax: {}'.format(err))
                continue
            yield Command(negated=negated, cmd=cmd, args=args,
                          lineno=display_lineno, context=line)


def _flatten(node, acc):
    """Append the text of `node` and all its descendants to `acc` in
    document order (the tail of `node` itself is not included)."""
    if node.text:
        acc.append(node.text)
    for e in node:
        _flatten(e, acc)
        if e.tail:
            acc.append(e.tail)


def flatten(node):
    """Return the text content of `node` with all subelements flattened
    into one string."""
    acc = []
    _flatten(node, acc)
    return ''.join(acc)


def _substitute_channel(text):
    """Replace every `{{channel}}` placeholder in `text` by `channel`.

    Raises `InvalidCheck` if a placeholder is present but the channel is
    unknown (the environment variable was not set)."""
    if '{{channel}}' not in text:
        return text
    if channel is None:
        raise InvalidCheck(
            '`{{{{channel}}}}` used but {} is not set'.format(CHANNEL_ENV_VAR))
    return text.replace('{{channel}}', channel)


def normalize_xpath(path):
    """Substitute the channel placeholder in `path` and turn it into the
    `.//...` form.

    Raises `InvalidCheck` for paths starting with neither `//` nor `.//`."""
    path = _substitute_channel(path)
    if path.startswith('//'):
        return '.' + path  # avoid warnings
    elif path.startswith('.//'):
        return path
    else:
        raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')


class CachedFiles(object):
    """Reads files below `root` on demand and caches their raw contents and
    parsed trees. Also remembers the most recently used path so that `-`
    can refer back to it."""

    def __init__(self, root):
        self.root = root
        self.files = {}   # normalized relative path -> file contents
        self.trees = {}   # normalized relative path -> parsed tree
        self.last_path = None

    def resolve_path(self, path):
        """Normalize `path` and remember it, or return the remembered path
        when `path` is `-`. Raises `InvalidCheck` if there is none yet."""
        if path != '-':
            path = os.path.normpath(path)
            self.last_path = path
            return path
        elif self.last_path is None:
            raise InvalidCheck('Tried to use the previous path in the first command')
        else:
            return self.last_path

    def _existing_file(self, path):
        """Return the location of the (already resolved) `path` below the
        root, raising `FailedCheck` unless it is an existing regular file."""
        abspath = os.path.join(self.root, path)
        if not os.path.isfile(abspath):
            raise FailedCheck('File does not exist {!r}'.format(path))
        return abspath

    def get_file(self, path):
        """Return the UTF-8 decoded contents of `path`.

        Raises `FailedCheck` if the file does not exist."""
        path = self.resolve_path(path)
        if path in self.files:
            return self.files[path]

        with io.open(self._existing_file(path), encoding='utf-8') as f:
            data = f.read()
        self.files[path] = data
        return data

    def get_tree(self, path):
        """Return the tree parsed from `path` by `CustomHTMLParser`.

        Raises `FailedCheck` if the file does not exist and `RuntimeError`
        if it cannot be parsed."""
        path = self.resolve_path(path)
        if path in self.trees:
            return self.trees[path]

        with io.open(self._existing_file(path), encoding='utf-8') as f:
            try:
                tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
            except Exception as e:
                raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
        self.trees[path] = tree
        return tree

    def get_dir(self, path):
        """Raise `FailedCheck` unless `path` is an existing directory."""
        path = self.resolve_path(path)
        abspath = os.path.join(self.root, path)
        if not os.path.isdir(abspath):
            raise FailedCheck('Directory does not exist {!r}'.format(path))


def check_string(data, pat, regexp):
    """Return whether `pat` occurs in `data`.

    An empty `pat` always matches. With `regexp` set, `pat` is searched as
    a regular expression in the unmodified `data`; otherwise both strings
    are whitespace-normalized and a substring test is done."""
    pat = _substitute_channel(pat)
    if not pat:
        return True  # special case a presence testing
    elif regexp:
        return re.search(pat, data, flags=re.UNICODE) is not None
    else:
        data = ' '.join(data.split())
        pat = ' '.join(pat.split())
        return pat in data


def check_tree_attr(tree, path, attr, pat, regexp):
    """Return whether some element matched by `path` has an attribute
    `attr` whose value matches `pat` (see `check_string`)."""
    path = normalize_xpath(path)
    for e in tree.findall(path):
        if attr not in e.attrib:
            continue
        if check_string(e.attrib[attr], pat, regexp):
            return True
    return False


def check_tree_text(tree, path, pat, regexp):
    """Return whether the flattened text of some element matched by `path`
    matches `pat` (see `check_string`)."""
    path = normalize_xpath(path)
    try:
        for e in tree.findall(path):
            if check_string(flatten(e), pat, regexp):
                return True
    except Exception:
        print('Failed to get path "{}"'.format(path))
        raise
    return False


def get_tree_count(tree, path):
    """Return the number of elements in `tree` matched by `path`."""
    path = normalize_xpath(path)
    return len(tree.findall(path))


def stderr(*args):
    """`print` to standard error, encoding as UTF-8 on Python 2."""
    if sys.version_info.major < 3:
        file = codecs.getwriter('utf-8')(sys.stderr)
    else:
        file = sys.stderr

    print(*args, file=file)


def print_err(lineno, context, err, message=None):
    """Report one error on standard error and count it in `ERR_COUNT`.

    The headline is `message` if given, else `err`; when both are given
    `err` is printed as an indented detail line. `context`, if non-empty,
    is printed as a further indented line."""
    global ERR_COUNT
    ERR_COUNT += 1
    stderr("{}: {}".format(lineno, message or err))
    if message and err:
        stderr("\t{}".format(err))

    if context:
        stderr("\t{}".format(context))


# Number of errors reported through `print_err` so far.
ERR_COUNT = 0


def check_command(c, cache):
    """Evaluate the command `c` against the files served by `cache`.

    Nothing is returned: a failed or invalid check is reported through
    `print_err`. Other exceptions (e.g. the `RuntimeError` raised for an
    unparsable HTML file) propagate to the caller."""
    try:
        cerr = ""
        if c.cmd == 'has' or c.cmd == 'matches':  # string test
            regexp = (c.cmd == 'matches')
            if len(c.args) == 1 and not regexp:  # @has <path> = file existence
                try:
                    cache.get_file(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
                cerr = "`PATTERN` did not match"
                ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
            elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
                cerr = "`XPATH PATTERN` did not match"
                tree = cache.get_tree(c.args[0])
                pat, sep, attr = c.args[1].partition('/@')
                if sep:  # attribute
                    ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
                else:  # normalized text
                    pat = c.args[1]
                    if pat.endswith('/text()'):
                        pat = pat[:-7]
                    ret = check_tree_text(tree, pat, c.args[2], regexp)
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'count':  # count test
            if len(c.args) == 3:  # @count <path> <pat> <count> = count test
                try:
                    expected = int(c.args[2])
                except ValueError:
                    # report a template mistake instead of crashing the run
                    raise InvalidCheck(
                        '`COUNT` must be an integer, got `{}`'.format(c.args[2]))
                found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = expected == found
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'has-dir':  # has-dir test
            if len(c.args) == 1:  # @has-dir <path> = has-dir test
                try:
                    cache.get_dir(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'valid-html':
            raise InvalidCheck('Unimplemented @valid-html')

        elif c.cmd == 'valid-links':
            raise InvalidCheck('Unimplemented @valid-links')
        else:
            raise InvalidCheck('Unrecognized @{}'.format(c.cmd))

        # a plain check fails when the condition is false, a negated one
        # when it is true
        if ret == c.negated:
            raise FailedCheck(cerr)

    except FailedCheck as err:
        message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
        print_err(c.lineno, c.context, str(err), message)
    except InvalidCheck as err:
        print_err(c.lineno, c.context, str(err))


def check(target, commands):
    """Run every command of the iterable `commands` against the
    documentation directory `target`."""
    cache = CachedFiles(target)
    for c in commands:
        check_command(c, cache)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
        raise SystemExit(1)

    if channel is None:
        stderr('The {} environment variable must be set'.format(CHANNEL_ENV_VAR))
        raise SystemExit(1)

    check(sys.argv[1], get_commands(sys.argv[2]))
    if ERR_COUNT:
        stderr("\nEncountered {} errors".format(ERR_COUNT))
        raise SystemExit(1)