1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4r"""
5htmldocck.py is a custom checker script for Rustdoc HTML outputs.
6
7# How and why?
8
9The principle is simple: This script receives a path to generated HTML
10documentation and a "template" script, which has a series of check
11commands like `@has` or `@matches`. Each command is used to check if
12some pattern is present or not present in the particular file or in
13a particular node of the HTML tree. In many cases, the template script
14happens to be the source code given to rustdoc.
15
16While it indeed is possible to test in smaller portions, it has been
17hard to construct tests in this fashion and major rendering errors were
18discovered much later. This script is designed to make black-box and
regression testing of Rustdoc easy. This does not preclude the need for
20unit testing, but can be used to complement related tests by quickly
21showing the expected renderings.
22
23In order to avoid one-off dependencies for this task, this script uses
24a reasonably working HTML parser and the existing XPath implementation
25from Python's standard library. Hopefully, we won't render
26non-well-formed HTML.
27
28# Commands
29
30Commands start with an `@` followed by a command name (letters and
31hyphens), and zero or more arguments separated by one or more whitespace
32characters and optionally delimited with single or double quotes. The `@`
33mark cannot be preceded by a non-whitespace character. Other lines
34(including every text up to the first `@`) are ignored, but it is
35recommended to avoid the use of `@` in the template file.
36
37There are a number of supported commands:
38
39* `@has PATH` checks for the existence of the given file.
40
41  `PATH` is relative to the output directory. It can be given as `-`
42  which repeats the most recently used `PATH`.
43
* `@has PATH PATTERN` and `@matches PATH PATTERN` check for
45  the occurrence of the given pattern `PATTERN` in the specified file.
46  Only one occurrence of the pattern is enough.
47
48  For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
49  whitespace being replaced by one single space character) string.
50  The entire file is also whitespace-normalized including newlines.
51
52  For `@matches`, `PATTERN` is a Python-supported regular expression.
53  The file remains intact but the regexp is matched without the `MULTILINE`
54  and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
55  to override them, and `\A` and `\Z` for definitely matching
56  the beginning and end of the file.
57
58  (The same distinction goes to other variants of these commands.)
59
* `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` check for
61  the presence of the given XPath `XPATH` in the specified HTML file,
62  and also the occurrence of the given pattern `PATTERN` in the matching
63  node or attribute. Only one occurrence of the pattern in the match
64  is enough.
65
66  `PATH` should be a valid and well-formed HTML file. It does *not*
67  accept arbitrary HTML5; it should have matching open and close tags
68  and correct entity references at least.
69
70  `XPATH` is an XPath expression to match. The XPath is fairly limited:
71  `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
72  `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
73  and `@attr` (both as the last segment) are supported. Some examples:
74
75  - `//pre` or `.//pre` matches any element with a name `pre`.
76  - `//a[@href]` matches any element with an `href` attribute.
  - `//*[@class="impl"]//code` matches any element with a name `code`,
    which is a descendant of some element whose `class` attr is `impl`.
79  - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
80    `class` attribute in the last `a` element (can be followed by more
81    elements that are not `a`) inside the first `span` in the `h1` with
82    a class of `fqn`. Note that there cannot be any additional elements
83    between them due to the use of `/` instead of `//`.
84
85  Do not try to use non-absolute paths, it won't work due to the flawed
86  ElementTree implementation. The script rejects them.
87
88  For the text matches (i.e. paths not ending with `@attr`), any
89  subelements are flattened into one string; this is handy for ignoring
90  highlights for example. If you want to simply check for the presence of
91  a given node or attribute, use an empty string (`""`) as a `PATTERN`.
92
* `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
94  in the specified file. The number of occurrences must match the given
95  count.
96
97* `@has-dir PATH` checks for the existence of the given directory.
98
99All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
100checks if the given file does not exist, for example.
101
102"""
103
104from __future__ import absolute_import, print_function, unicode_literals
105
106import codecs
107import io
108import sys
109import os.path
110import re
111import shlex
112from collections import namedtuple
113try:
114    from html.parser import HTMLParser
115except ImportError:
116    from HTMLParser import HTMLParser
117try:
118    from xml.etree import cElementTree as ET
119except ImportError:
120    from xml.etree import ElementTree as ET
121
122try:
123    from html.entities import name2codepoint
124except ImportError:
125    from htmlentitydefs import name2codepoint
126
# "void elements" (no closing tag) from the HTML Standard section 12.1.2
# CustomHTMLParser.handle_starttag closes these right after opening them,
# because no end tag will follow in the input.
VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
                     'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'}

# Python 2 -> 3 compatibility
# If the name `unichr` is not defined, fall back to `chr`.
try:
    unichr
except NameError:
    unichr = chr


# Text substituted for the `{{channel}}` placeholder in XPaths and patterns.
# Looked up when the module is loaded; an unset variable raises KeyError here.
channel = os.environ["DOC_RUST_LANG_ORG_CHANNEL"]
139
class CustomHTMLParser(HTMLParser):
    """Minimal adapter from `HTMLParser` events to a tree builder.

    Every parser callback is forwarded to the builder given as `target`
    (a fresh `ET.TreeBuilder` by default). Rustdoc output is regular enough
    that only two quirks need care: void elements, which never receive an
    end tag, and valueless attributes, which are stored as empty strings.
    """

    def __init__(self, target=None):
        HTMLParser.__init__(self)
        self.__builder = target or ET.TreeBuilder()

    def _open_element(self, tag, attrs):
        # A valueless attribute is reported with a value of None; store ''
        # so that every attribute value in the tree is a string.
        normalized = dict((name, value or '') for name, value in attrs)
        self.__builder.start(tag, normalized)

    def handle_starttag(self, tag, attrs):
        self._open_element(tag, attrs)
        if tag in VOID_ELEMENTS:
            # no matching end tag will arrive, so close it right away
            self.__builder.end(tag)

    def handle_endtag(self, tag):
        self.__builder.end(tag)

    def handle_startendtag(self, tag, attrs):
        self._open_element(tag, attrs)
        self.__builder.end(tag)

    def handle_data(self, data):
        self.__builder.data(data)

    def handle_entityref(self, name):
        codepoint = name2codepoint[name]
        self.__builder.data(unichr(codepoint))

    def handle_charref(self, name):
        # `x`/`X` prefix marks a hexadecimal reference, otherwise decimal
        if name.startswith(('x', 'X')):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name, 10)
        self.__builder.data(unichr(codepoint))

    def close(self):
        """Finish parsing and return whatever the builder's `close()` returns."""
        HTMLParser.close(self)
        return self.__builder.close()
177
178
# One parsed template command, as built by get_commands:
#   negated - True for the `@!cmd` form
#   cmd     - command name matched by LINE_PATTERN
#   args    - argument list produced by shlex.split
#   lineno  - line index reported by concat_multi_lines, plus one
#   context - the (joined) template line the command was found on
Command = namedtuple('Command', 'negated cmd args lineno context')
180
181
class FailedCheck(Exception):
    """A check was well-formed but did not hold (e.g. a missing file)."""
184
185
class InvalidCheck(Exception):
    """A check could not be run because the command itself is malformed."""
188
189
def concat_multi_lines(f):
    r"""Join backslash-continued lines of `f` and yield them with line numbers.

    `f` is any iterable of text lines (typically an open file). The generator
    yields `(firstlineno, text)` pairs, where `firstlineno` is the 0-based
    index of the first physical line that contributed to `text`.

    A line ending in `\` is glued to the line(s) that follow it. Before
    gluing, each continuation line loses the prefix it shares with the first
    line of the group (for example a leading comment marker) and then any
    leading whitespace.

    If the input ends in the middle of a continuation, the pending text is
    not yielded and the problem is reported through `print_err`.
    """
    lastline = None  # first line of the current group, without its backslash
    firstlineno = None
    catenated = ''
    for lineno, line in enumerate(f):
        line = line.rstrip('\r\n')

        # strip the common prefix from the current line if needed
        if lastline is not None:
            common_prefix = os.path.commonprefix([line, lastline])
            line = line[len(common_prefix):].lstrip()

        # Compare against None explicitly: `firstlineno or lineno` would treat
        # a group that starts on line 0 as "not started yet" and report the
        # number of a later line instead.
        if firstlineno is None:
            firstlineno = lineno

        if line.endswith('\\'):
            if lastline is None:
                lastline = line[:-1]
            catenated += line[:-1]
        else:
            yield firstlineno, catenated + line
            lastline = None
            firstlineno = None
            catenated = ''

    if lastline is not None:
        # `lineno` is a 0-based index; report it 1-based, matching the line
        # numbers stored in Command.lineno.
        print_err(lineno + 1, line, 'Trailing backslash at the end of the file')
220
221
# Locates one command inside a (joined) template line.
# - The match may not start right after a non-whitespace character.
# - `invalid` captures a `!` written *before* the `@`; get_commands uses it
#   to report the misplaced negation.
# - `negated` captures the `!` of the `@!cmd` form.
# - `cmd` is one or more letter runs joined by single hyphens.
# - `args` is everything after the command name up to the end of the line.
LINE_PATTERN = re.compile(r'''
    (?<=(?<!\S))(?P<invalid>!?)@(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''', re.X | re.UNICODE)
227
228
def get_commands(template):
    """Parse the template file and yield a `Command` for each valid command.

    `template` is the path of a UTF-8 text file. Lines are first joined by
    `concat_multi_lines`; lines that `LINE_PATTERN` does not match are
    skipped silently. Malformed commands are reported through `print_err`
    and skipped, so they count as errors without stopping the scan.

    All line numbers leaving this function (in error reports and in
    `Command.lineno`) are 1-based.
    """
    with io.open(template, encoding='utf-8') as f:
        for lineno, line in concat_multi_lines(f):
            m = LINE_PATTERN.search(line)
            if not m:
                continue

            # concat_multi_lines counts from 0; use one 1-based number for
            # both the error reports below and the yielded Command.
            report_lineno = lineno + 1

            negated = (m.group('negated') == '!')
            cmd = m.group('cmd')
            if m.group('invalid') == '!':
                print_err(
                    report_lineno,
                    line,
                    'Invalid command: `!@{0}{1}`, (help: try with `@!{1}`)'.format(
                        '!' if negated else '',
                        cmd,
                    ),
                )
                continue
            args = m.group('args')
            if args and not args[:1].isspace():
                print_err(report_lineno, line, 'Invalid template syntax')
                continue
            try:
                try:
                    args = shlex.split(args)
                except UnicodeEncodeError:
                    # Fallback for a shlex that cannot take the text as-is
                    # (presumably Python 2): split UTF-8 bytes, decode back.
                    args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
            except ValueError as err:
                # shlex signals malformed quoting/escaping with ValueError;
                # report it like any other syntax error instead of crashing.
                print_err(report_lineno, line, 'Invalid template syntax: {}'.format(err))
                continue
            yield Command(negated=negated, cmd=cmd, args=args, lineno=report_lineno, context=line)
257
258
def _flatten(node, acc):
    """Append every non-empty text fragment below `node` to `acc`.

    Fragments are visited in document order: the element's own text, then
    for each child its flattened content followed by the child's tail.
    """
    def fragments(element):
        yield element.text
        for child in element:
            for piece in fragments(child):
                yield piece
            yield child.tail

    acc.extend(piece for piece in fragments(node) if piece)


def flatten(node):
    """Return all text inside `node` as one string, ignoring the markup."""
    pieces = []
    _flatten(node, pieces)
    return ''.join(pieces)
272
273
def normalize_xpath(path):
    """Expand `{{channel}}` in `path` and return it in `.//...` form.

    Only paths starting with `//` or `.//` are accepted; anything else
    raises `InvalidCheck`.
    """
    path = path.replace("{{channel}}", channel)
    if path.startswith('.//'):
        return path
    if path.startswith('//'):
        return '.' + path  # avoid warnings
    raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
282
283
class CachedFiles(object):
    """Reads files below a root directory once and remembers the last path.

    `files` caches raw text and `trees` caches parsed element trees, both
    keyed by the normalized relative path. `last_path` backs the `-`
    shorthand, which repeats the path used by the previous lookup.
    """

    def __init__(self, root):
        self.root = root
        self.last_path = None
        self.files = {}
        self.trees = {}

    def resolve_path(self, path):
        """Normalize `path`, or expand `-` to the most recently used path."""
        if path == '-':
            if self.last_path is None:
                raise InvalidCheck('Tried to use the previous path in the first command')
            return self.last_path
        self.last_path = os.path.normpath(path)
        return self.last_path

    def _require_file(self, path):
        """Return the location of `path` under the root; it must be a file."""
        abspath = os.path.join(self.root, path)
        if not os.path.exists(abspath) or not os.path.isfile(abspath):
            raise FailedCheck('File does not exist {!r}'.format(path))
        return abspath

    def get_file(self, path):
        """Return the UTF-8 decoded contents of `path`, reading it at most once."""
        path = self.resolve_path(path)
        if path not in self.files:
            with io.open(self._require_file(path), encoding='utf-8') as f:
                self.files[path] = f.read()
        return self.files[path]

    def get_tree(self, path):
        """Return the parsed element tree of `path`, parsing it at most once.

        Any parse failure is re-raised as `RuntimeError`.
        """
        path = self.resolve_path(path)
        if path not in self.trees:
            with io.open(self._require_file(path), encoding='utf-8') as f:
                try:
                    parsed = ET.fromstringlist(f.readlines(), CustomHTMLParser())
                except Exception as e:
                    raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
            self.trees[path] = parsed
        return self.trees[path]

    def get_dir(self, path):
        """Raise `FailedCheck` unless `path` is an existing directory."""
        path = self.resolve_path(path)
        abspath = os.path.join(self.root, path)
        if not os.path.exists(abspath) or not os.path.isdir(abspath):
            raise FailedCheck('Directory does not exist {!r}'.format(path))
337
338
def check_string(data, pat, regexp):
    """Tell whether `pat` occurs in `data`.

    `{{channel}}` in `pat` is expanded first. An empty pattern always
    matches. With `regexp` set, `pat` is searched as a regular expression;
    otherwise both strings are whitespace-normalized and `pat` must be a
    substring of `data`.
    """
    def squeeze(text):
        # collapse every whitespace run into a single space
        return ' '.join(text.split())

    pat = pat.replace("{{channel}}", channel)
    if not pat:
        return True  # special case a presence testing
    if regexp:
        found = re.search(pat, data, flags=re.UNICODE)
        return found is not None
    haystack = squeeze(data)
    needle = squeeze(pat)
    return needle in haystack
349
350
def check_tree_attr(tree, path, attr, pat, regexp):
    """Tell whether any element at `path` has an `attr` value matching `pat`.

    Elements without the attribute are skipped; the search stops at the
    first matching value.
    """
    path = normalize_xpath(path)
    return any(
        check_string(element.attrib[attr], pat, regexp)
        for element in tree.findall(path)
        if attr in element.attrib
    )
364
365
def check_tree_text(tree, path, pat, regexp):
    """Tell whether the flattened text of any element at `path` matches `pat`.

    Any exception raised while searching is re-raised after the offending
    path has been printed.
    """
    path = normalize_xpath(path)
    try:
        for element in tree.findall(path):
            try:
                text = flatten(element)
            except KeyError:
                continue
            if check_string(text, pat, regexp):
                return True
    except Exception:
        print('Failed to get path "{}"'.format(path))
        raise
    return False
383
384
def get_tree_count(tree, path):
    """Return how many elements of `tree` match the XPath `path`."""
    matches = tree.findall(normalize_xpath(path))
    return len(matches)
388
389
def stderr(*args):
    """Print `args` to standard error, like `print` does for stdout.

    On interpreters older than Python 3 the stream is wrapped in a UTF-8
    writer first.
    """
    stream = sys.stderr
    if sys.version_info.major < 3:
        stream = codecs.getwriter('utf-8')(stream)

    print(*args, file=stream)
397
398
# Number of errors reported so far; print_err increments it on every call.
ERR_COUNT = 0


def print_err(lineno, context, err, message=None):
    """Report one error on standard error and count it in `ERR_COUNT`.

    The first line is `lineno: message` (or `lineno: err` when no message is
    given). When both `message` and `err` are present, `err` follows on a
    tab-indented line; a non-empty `context` is printed the same way.
    """
    global ERR_COUNT
    ERR_COUNT += 1

    headline = message if message else err
    stderr("{}: {}".format(lineno, headline))

    details = []
    if message and err:
        details.append(err)
    if context:
        details.append(context)
    for detail in details:
        stderr("\t{}".format(detail))
411
412
def check_command(c, cache):
    """Run one template command against the generated docs and report failures.

    `c` is a `Command` and `cache` a `CachedFiles`. Nothing is returned.
    `FailedCheck` and `InvalidCheck` never escape: both are turned into
    `print_err` reports. Any other exception (such as the `RuntimeError`
    that `CachedFiles.get_tree` raises for unparsable HTML) propagates.

    A check fails when its outcome equals `c.negated`, i.e. a plain command
    evaluated to False or a `@!` command evaluated to True.
    """
    try:
        cerr = ""
        if c.cmd == 'has' or c.cmd == 'matches':  # string test
            regexp = (c.cmd == 'matches')
            if len(c.args) == 1 and not regexp:  # @has <path> = file existence
                try:
                    cache.get_file(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
                cerr = "`PATTERN` did not match"
                ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
            elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
                cerr = "`XPATH PATTERN` did not match"
                # Fetch the tree once; both branches below work on it.
                tree = cache.get_tree(c.args[0])
                xpath, sep, attr = c.args[1].partition('/@')
                if sep:  # attribute
                    ret = check_tree_attr(tree, xpath, attr, c.args[2], regexp)
                else:  # normalized text
                    # without a separator, `xpath` is the whole argument
                    if xpath.endswith('/text()'):
                        xpath = xpath[:-len('/text()')]
                    ret = check_tree_text(tree, xpath, c.args[2], regexp)
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'count':  # count test
            if len(c.args) == 3:  # @count <path> <pat> <count> = count test
                try:
                    expected = int(c.args[2])
                except ValueError:
                    # A malformed count is a template error; report it rather
                    # than letting the ValueError abort the whole run.
                    raise InvalidCheck(
                        'Invalid @{} count `{}` (expected an integer)'.format(c.cmd, c.args[2]))
                found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = expected == found
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'has-dir':  # has-dir test
            if len(c.args) == 1:  # @has-dir <path> = has-dir test
                try:
                    cache.get_dir(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'valid-html':
            raise InvalidCheck('Unimplemented @valid-html')

        elif c.cmd == 'valid-links':
            raise InvalidCheck('Unimplemented @valid-links')
        else:
            raise InvalidCheck('Unrecognized @{}'.format(c.cmd))

        if ret == c.negated:
            raise FailedCheck(cerr)

    except FailedCheck as err:
        message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
        print_err(c.lineno, c.context, str(err), message)
    except InvalidCheck as err:
        print_err(c.lineno, c.context, str(err))
477
478
def check(target, commands):
    """Run every command in `commands` against the docs rooted at `target`.

    A single `CachedFiles` instance is shared by all commands.
    """
    shared_cache = CachedFiles(target)
    for command in commands:
        check_command(command, shared_cache)
483
484
if __name__ == '__main__':
    # Expect exactly two arguments: the documentation directory and the
    # template file. Exit with status 1 on misuse or if any error was reported.
    if len(sys.argv) != 3:
        stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
        sys.exit(1)

    doc_dir, template = sys.argv[1:]
    check(doc_dir, get_commands(template))
    if ERR_COUNT:
        stderr("\nEncountered {} errors".format(ERR_COUNT))
        sys.exit(1)
494