1# -*- test-case-name: twisted.python.test.test_htmlizer -*-
2# Copyright (c) Twisted Matrix Laboratories.
3# See LICENSE for details.
4
5"""
6HTML rendering of Python source.
7"""
8
9import keyword
10import tokenize
11from html import escape
12from typing import List
13
14from . import reflect
15
16
class TokenPrinter:
    """
    Format a stream of tokens and intermediate whitespace, for pretty-printing.

    Tracks an output cursor (line and column) so the whitespace between
    tokens is reproduced, and classifies each token with a type name that
    a writer such as L{HTMLWriter} turns into a CSS class.
    """

    # Output cursor: column within the current line, and current line number.
    currentCol, currentLine = 0, 1
    # lastIdentifier: true when the previous token was def/class, so the next
    # NAME token is the identifier being defined.
    # parameters: true while inside the parameter list following such a
    # definition (cleared by the ":" that ends the header).
    lastIdentifier = parameters = 0
    # Encoding used to turn str tokens into bytes; updated from the
    # tokenize.ENCODING token when one is seen.
    encoding = "utf-8"

    def __init__(self, writer):
        """
        @param writer: A callable accepting C{(bytes)} or C{(bytes, type)},
            for example the bound C{write} method of an L{HTMLWriter}
            wrapping a file-like object opened in bytes mode.
        """
        self.writer = writer

    def printtoken(self, type, token, sCoordinates, eCoordinates, line):
        """
        Emit one token, preceded by whatever newlines and spaces are needed
        to reach its starting coordinates.

        @param type: A token type constant from L{tokenize}.
        @param token: The token text, as C{str} or C{bytes}.
        @param sCoordinates: C{(row, col)} at which the token starts.
        @param eCoordinates: C{(row, col)} at which the token ends.
        @param line: The full source line containing the token (unused).
        """
        if type == tokenize.ENCODING:
            # The first token from tokenize.tokenize() names the source
            # encoding; remember it and emit nothing.
            self.encoding = token
            return

        # Keep a str copy for keyword detection: keyword.iskeyword() only
        # matches str, so testing the encoded bytes would always be False
        # and keywords would be misclassified as variables.
        if isinstance(token, bytes):
            text = token.decode(self.encoding)
        else:
            text = token
            token = token.encode(self.encoding)

        (srow, scol) = sCoordinates
        (erow, ecol) = eCoordinates
        if self.currentLine < srow:
            # Catch up to the token's starting row.
            self.writer(b"\n" * (srow - self.currentLine))
            self.currentLine, self.currentCol = srow, 0
        # Pad with spaces to the token's starting column.
        self.writer(b" " * (scol - self.currentCol))
        if self.lastIdentifier:
            # The name immediately after def/class is the defined identifier.
            type = "identifier"
            self.parameters = 1
        elif type == tokenize.NAME:
            if keyword.iskeyword(text):
                type = "keyword"
            elif self.parameters:
                type = "parameter"
            else:
                type = "variable"
        else:
            type = tokenize.tok_name.get(type)
            assert type is not None
            type = type.lower()
        self.writer(token, type)
        self.currentCol = ecol
        self.currentLine += token.count(b"\n")
        if self.currentLine != erow:
            # The token's embedded newlines left us on a different row than
            # tokenize reported; restart column bookkeeping at 0.
            self.currentCol = 0
        self.lastIdentifier = token in (b"def", b"class")
        if token == b":":
            # End of a def/class parameter list.
            self.parameters = 0
69
70
class HTMLWriter:
    """
    Write the stream of tokens and whitespace from L{TokenPrinter},
    formatting tokens as HTML spans.
    """

    # Token type names to emit bare, without a surrounding <span>.
    noSpan: List[str] = []

    def __init__(self, writer):
        """
        @param writer: A callable accepting bytes, e.g. the C{write} method
            of a file-like object opened in bytes mode.
        """
        self.writer = writer
        accumulated: List[str] = []
        reflect.accumulateClassList(self.__class__, "noSpan", accumulated)
        self.noSpan = accumulated

    def write(self, token, type=None):
        """
        Emit one token as HTML-escaped UTF-8 bytes, wrapped in a
        C{<span class="py-src-...">} element unless C{type} is C{None} or
        listed in C{noSpan}.
        """
        text = token.decode("utf-8") if isinstance(token, bytes) else token
        encoded = escape(text).encode("utf-8")
        if type is None or type in self.noSpan:
            self.writer(encoded)
            return
        self.writer(
            b'<span class="py-src-%s">%s</span>'
            % (type.encode("utf-8"), encoded)
        )
100
101
class SmallerHTMLWriter(HTMLWriter):
    """
    HTMLWriter that doesn't generate spans for some junk.

    Results in much smaller HTML output.
    """

    # Structural token types (end marker, indentation changes, operators,
    # logical and physical newlines) are emitted bare, with no
    # py-src-* span around them.
    noSpan = ["endmarker", "indent", "dedent", "op", "newline", "nl"]
110
111
def filter(inp, out, writer=HTMLWriter):
    """
    Read Python source from C{inp} and write an HTML rendering of it,
    wrapped in a C{<pre>} element, to C{out}.

    @param inp: A file-like object opened in bytes mode containing Python
        source.
    @param out: A file-like object opened in bytes mode to receive the HTML.
    @param writer: The writer class used to format tokens; defaults to
        L{HTMLWriter}.
    """
    out.write(b"<pre>")
    emit = TokenPrinter(writer(out.write).write).printtoken
    try:
        # Each token is a 5-tuple: (type, string, start, end, line).
        for tok in tokenize.tokenize(inp.readline):
            emit(*tok)
    except tokenize.TokenError:
        # Tokenization failed partway (e.g. unterminated string or bracket
        # at EOF); emit whatever was rendered so far.
        pass
    out.write(b"</pre>\n")
122
123
def main():
    """
    Command-line entry point: render the file named by the first argument
    as HTML on standard output.
    """
    import sys

    # Prefer the binary buffer; fall back for streams that lack one.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    with open(sys.argv[1], "rb") as source:
        filter(source, out)


if __name__ == "__main__":
    main()
134