1import io
2import re
3import textwrap
4from typing import Iterable, Optional
5
6from mitmproxy.contentviews import base
7from mitmproxy.utils import sliding_window, strutils
8
9"""
10A custom XML/HTML prettifier. Compared to other prettifiers, its main features are:
11
12- Implemented in pure Python.
13- Modifies whitespace only.
14- Works with any input.
15- Lazy evaluation.
16
17The implementation is split into two main parts: tokenization and formatting of tokens.
18"""
19
# Pattern for a tag name. Loosely follows
# http://www.xml.com/pub/a/2001/07/25/namingparts.html,
# which is accurate enough for a prettifier.
REGEX_TAG = re.compile(r"[a-zA-Z0-9._:\-]+(?!=)")

# HTML void elements, see https://www.w3.org/TR/html5/syntax.html#void-elements
# Tag.is_self_closing treats these as self-closing even without a trailing "/>".
HTML_VOID_ELEMENTS = {
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "keygen",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
}

# Tags whose children are not indented any further (see ElementStack).
NO_INDENT_TAGS = {
    "xml",
    "doctype",
    "html",
}

# Number of spaces added per nesting level.
INDENT = 2
29
30
class Token:
    """
    Base class for the pieces a document is split into.

    The raw slice of the document that makes up the token is kept in `data`.
    """

    def __init__(self, data):
        self.data = data

    def __repr__(self):
        # Use the runtime class name so that subclasses get a matching repr for free.
        kind = type(self).__name__
        return f"{kind}({self.data})"
40
41
class Text(Token):
    """Character data that is not part of a tag."""

    @property
    def text(self):
        """The token's data without leading and trailing whitespace."""
        raw = self.data
        return raw.strip()
46
47
class Tag(Token):
    """
    A piece of markup in angle brackets, e.g. an element tag, a comment or a CDATA section.
    """

    @property
    def tag(self):
        """Lower-cased first match of REGEX_TAG in the markup, or "<empty>" if there is none."""
        found = REGEX_TAG.search(self.data)
        if found is None:
            return "<empty>"
        return found.group(0).lower()

    @property
    def is_comment(self) -> bool:
        """Whether the markup starts a comment."""
        return self.data.startswith("<!--")

    @property
    def is_cdata(self) -> bool:
        """Whether the markup starts a CDATA section."""
        return self.data.startswith("<![CDATA[")

    @property
    def is_closing(self):
        """Whether this is an end tag such as `</div>`."""
        return self.data.startswith("</")

    @property
    def is_self_closing(self):
        """Whether the tag neither opens nor closes a nesting level."""
        if self.is_comment or self.is_cdata:
            return True
        if self.data.endswith("/>"):
            return True
        return self.tag in HTML_VOID_ELEMENTS

    @property
    def is_opening(self):
        """Whether this is a start tag that is expected to be closed later on."""
        return not (self.is_closing or self.is_self_closing)

    @property
    def done(self):
        """Whether the data collected so far ends with the terminator for this kind of tag."""
        if self.is_comment:
            terminator = "-->"
        elif self.is_cdata:
            terminator = "]]>"
        else:
            # Known limitation: an unescaped ">" inside an attribute value
            # is mistaken for the end of the tag.
            terminator = ">"
        return self.data.endswith(terminator)
86
87
def tokenize(data: str) -> Iterable[Token]:
    """
    Lazily split a document into alternating Text and Tag tokens.

    Text tokens that consist of whitespace only are dropped. A tag that is still
    incomplete when the input ends is emitted as-is.
    """
    length = len(data)
    pos = 0
    current: Token = Text("")

    while pos < length:
        if isinstance(current, Text):
            # Text runs up to (but excluding) the next "<" or the end of the input.
            stop = data.find("<", pos)
            if stop == -1:
                stop = length
            current.data = data[pos:stop]
            pos = stop
            if current.text:
                yield current
            current = Tag("")
        elif isinstance(current, Tag):
            # Consume up to and including the next ">". Comments and CDATA sections may
            # contain ">" themselves, so keep appending until the tag reports it is done.
            stop = data.find(">", pos)
            if stop == -1:
                stop = length
            current.data += data[pos:stop + 1]
            pos = stop + 1
            if current.done:
                yield current
                current = Text("")

    # Whatever is left over is an unterminated tag.
    if current.data.strip():
        yield current
115
116
def indent_text(data: str, prefix: str) -> str:
    """
    Strip the common indentation from `data` and indent every line with `prefix`.

    At most the first 32 characters of `prefix` are used.
    """
    max_width = 32
    # The first line usually follows a tag directly and thus carries no indentation:
    #   <li>This is
    #       example text
    #       over multiple lines
    #   </li>
    # Padding it keeps it from forcing the common margin of the other lines to zero.
    padded = " " * max_width + data
    body = textwrap.dedent(padded).strip()
    return textwrap.indent(body, prefix[:max_width])
125
126
def is_inline_text(a: Optional[Token], b: Optional[Token], c: Optional[Token]) -> bool:
    """
    Check whether a, b, c form `<tag>single-line text</tag>`.

    That is: an opening tag, text without a line break, and the matching closing tag.
    """
    if not (isinstance(a, Tag) and isinstance(b, Text) and isinstance(c, Tag)):
        return False
    return bool(
        a.is_opening
        and "\n" not in b.data
        and c.is_closing
        and a.tag == c.tag
    )
132
133
def is_inline(
    prev2: Optional[Token],
    prev1: Optional[Token],
    t: Optional[Token],
    next1: Optional[Token],
    next2: Optional[Token],
) -> bool:
    """
    Decide whether token `t` stays on the same line as its neighbours.

    This is the case for every part of `<tag>single-line text</tag>` and for both
    tags of an empty element like `<div></div>`.
    """
    if isinstance(t, Text):
        return is_inline_text(prev1, t, next1)
    if not isinstance(t, Tag):
        return False

    # t is the closing or the opening tag of <tag>text</tag>
    if is_inline_text(prev2, prev1, t) or is_inline_text(t, next1, next2):
        return True

    # <div></div> (start tag)
    starts_empty = (
        isinstance(next1, Tag)
        and t.is_opening
        and next1.is_closing
        and t.tag == next1.tag
    )
    if starts_empty:
        return True

    # <div></div> (end tag)
    ends_empty = (
        isinstance(prev1, Tag)
        and prev1.is_opening
        and t.is_closing
        and prev1.tag == t.tag
    )
    return bool(ends_empty)
145
146
class ElementStack:
    """
    Keep track of how deeply nested our document is.

    `open_tags` lists the names of the currently open elements, innermost last.
    `indent` is the whitespace prefix for content at the current nesting level.
    """

    def __init__(self) -> None:
        self.open_tags = []
        self.indent = ""

    def push_tag(self, tag: str) -> None:
        """
        Record an opening tag and, unless it is listed in NO_INDENT_TAGS, indent one level deeper.

        Once more than 16 tags are open, further tags are not recorded, so the
        indentation stops growing.
        """
        if len(self.open_tags) > 16:
            return
        self.open_tags.append(tag)
        if tag not in NO_INDENT_TAGS:
            self.indent += " " * INDENT

    def pop_tag(self, tag: str) -> None:
        """
        Record a closing tag.

        All open tags up to and including the innermost `tag` are closed, which also
        covers elements that were never closed explicitly. A closing tag without a
        matching open tag leaves the indentation as-is.
        """
        if tag not in self.open_tags:
            return  # this closing tag has no start tag. let's keep indentation as-is.

        remove_indent = 0
        while True:
            t = self.open_tags.pop()
            if t not in NO_INDENT_TAGS:
                remove_indent += INDENT
            if t == tag:
                break

        # Guard against remove_indent == 0: `indent[:-0]` is `indent[:0]`, i.e. "",
        # which would throw away the indentation of all enclosing elements.
        if remove_indent:
            self.indent = self.indent[:-remove_indent]
175
176
def format_xml(tokens: Iterable[Token]) -> str:
    """
    Render tokens as indented text.

    Every token is put on its own line, indented according to its nesting depth,
    except for tokens that `is_inline` reports as belonging on a shared line.
    """
    buf = io.StringIO()
    stack = ElementStack()

    for prev2, prev1, token, next1, next2 in sliding_window.window(tokens, 2, 2):
        neighbourhood = (prev2, prev1, token, next1, next2)

        if isinstance(token, Text):
            if is_inline(*neighbourhood):
                buf.write(token.text)
            else:
                buf.write(indent_text(token.data, stack.indent))
                buf.write("\n")
        elif isinstance(token, Tag):
            if token.is_closing:
                # Leave the element first so the end tag lines up with its start tag.
                stack.pop_tag(token.tag)
                if is_inline(*neighbourhood):
                    rendered = token.data
                else:
                    rendered = indent_text(token.data, stack.indent)
                buf.write(rendered)
                buf.write("\n")
            elif token.is_opening:
                buf.write(indent_text(token.data, stack.indent))
                if not is_inline(*neighbourhood):
                    buf.write("\n")
                # Only what follows the start tag is nested one level deeper.
                stack.push_tag(token.tag)
            else:  # self-closing
                buf.write(indent_text(token.data, stack.indent))
                buf.write("\n")
        else:  # pragma: no cover
            raise RuntimeError()

    return buf.getvalue()
213
214
class ViewXmlHtml(base.View):
    """Content view that pretty-prints XML and HTML message bodies."""

    name = "XML/HTML"
    __content_types = ("text/xml", "text/html")

    def __call__(self, data, **metadata):
        """
        Prettify `data` (bytes) and return a `(description, formatted content)` tuple.

        The description is "HTML" if the decoded text contains "html" (case-insensitive),
        otherwise "XML".
        """
        # TODO:
        # We should really have the message text as str here,
        # not the message content as bytes.
        # https://github.com/mitmproxy/mitmproxy/issues/1662#issuecomment-266192578
        #
        # "xmlcharrefreplace" only works for encoding; when decoding, it raises a
        # TypeError on the first invalid byte. "backslashreplace" supports decoding
        # and keeps undecodable bytes visible as \xhh escapes.
        data = data.decode("utf8", "backslashreplace")
        tokens = tokenize(data)
        # TODO:
        # Performance: Don't render the whole document right away.
        # Let's wait with this until we have a sequence-like interface,
        # this thing is reasonably fast right now anyway.
        pretty = base.format_text(format_xml(tokens))
        t = "HTML" if "html" in data.lower() else "XML"
        return t, pretty

    def render_priority(self, data: bytes, *, content_type: Optional[str] = None, **metadata) -> float:
        """
        Rate how well this view fits the message.

        Returns 1 for the XML/HTML content types, 0.4 if `strutils.is_xml` accepts
        the data, and 0 otherwise.
        """
        if content_type in self.__content_types:
            return 1.0
        if strutils.is_xml(data):
            return 0.4
        # The content type did not match above, so there is nothing left to check.
        return 0.0
243