"""
A custom XML/HTML prettifier. Compared to other prettifiers, its main features are:

- Implemented in pure Python.
- Modifies whitespace only.
- Works with any input.
- Lazy evaluation.

The implementation is split into two main parts: tokenization and formatting of tokens.
"""
import io
import re
import textwrap
from typing import Iterable, Optional

from mitmproxy.contentviews import base
from mitmproxy.utils import sliding_window, strutils

# http://www.xml.com/pub/a/2001/07/25/namingparts.html - this is close enough for what we do.
REGEX_TAG = re.compile(r"[a-zA-Z0-9._:\-]+(?!=)")
# https://www.w3.org/TR/html5/syntax.html#void-elements
HTML_VOID_ELEMENTS = {
    "area", "base", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "meta", "param",
    "source", "track", "wbr"
}
# Tags that are tracked on the element stack but do not increase indentation.
NO_INDENT_TAGS = {"xml", "doctype", "html"}
# Number of spaces added per nesting level.
INDENT = 2


class Token:
    """Base class for tokens; ``data`` holds the raw source text of the token."""

    def __init__(self, data):
        self.data = data

    def __repr__(self):
        return f"{type(self).__name__}({self.data})"


class Text(Token):
    """A run of character data between two tags."""

    @property
    def text(self):
        """The raw data with leading and trailing whitespace removed."""
        return self.data.strip()


class Tag(Token):
    """Markup starting at ``<``: elements, comments, CDATA sections, declarations."""

    @property
    def tag(self):
        """The lower-cased first name-like match in the tag, or ``"<empty>"`` if none."""
        t = REGEX_TAG.search(self.data)
        if t is not None:
            return t.group(0).lower()
        return "<empty>"

    @property
    def is_comment(self) -> bool:
        return self.data.startswith("<!--")

    @property
    def is_cdata(self) -> bool:
        return self.data.startswith("<![CDATA[")

    @property
    def is_closing(self):
        return self.data.startswith("</")

    @property
    def is_self_closing(self):
        """True for comments, CDATA, ``<.../>`` tags and HTML void elements."""
        return (
            self.is_comment
            or self.is_cdata
            or self.data.endswith("/>")
            or self.tag in HTML_VOID_ELEMENTS
        )

    @property
    def is_opening(self):
        return not self.is_closing and not self.is_self_closing

    @property
    def done(self):
        """Whether ``data`` already ends with the terminator matching this kind of tag."""
        if self.is_comment:
            return self.data.endswith("-->")
        elif self.is_cdata:
            return self.data.endswith("]]>")
        else:
            # This fails for attributes that contain an unescaped ">"
            return self.data.endswith(">")


def tokenize(data: str) -> Iterable[Token]:
    """
    Lazily split ``data`` into alternating Text and Tag tokens.

    Text tokens that contain only whitespace are dropped. A trailing tag that
    is never terminated is still yielded so that no input is lost.
    """
    token: Token = Text("")

    i = 0

    def readuntil(char, start, include=1):
        # Consume data[i:] up to the next occurrence of `char` (searching from
        # `start`), or to the end of the input if there is none. `include`
        # controls whether the delimiter itself is part of the returned chunk.
        nonlocal i
        end = data.find(char, start)
        if end == -1:
            end = len(data)
        ret = data[i:end + include]
        i = end + include
        return ret

    while i < len(data):
        if isinstance(token, Text):
            token.data = readuntil("<", i, 0)
            if token.text:
                yield token
            token = Tag("")
        elif isinstance(token, Tag):
            # Comments and CDATA may contain ">", so keep appending chunks
            # until the tag reports that it has seen its real terminator.
            token.data += readuntil(">", i, 1)
            if token.done:
                yield token
                token = Text("")
    # Flush an unterminated tag left over at the end of the input.
    if token.data.strip():
        yield token


def indent_text(data: str, prefix: str) -> str:
    """Dedent ``data`` and re-indent every line with ``prefix`` (capped at 32 characters)."""
    # Add spacing to first line so that we dedent in cases like this:
    # <li>This is
    #     example text
    #     over multiple lines
    # </li>
    dedented = textwrap.dedent(" " * 32 + data).strip()
    return textwrap.indent(dedented, prefix[:32])


def is_inline_text(a: Optional[Token], b: Optional[Token], c: Optional[Token]) -> bool:
    """Return True if ``a, b, c`` form ``<tag>single-line text</tag>``."""
    if isinstance(a, Tag) and isinstance(b, Text) and isinstance(c, Tag):
        if a.is_opening and "\n" not in b.data and c.is_closing and a.tag == c.tag:
            return True
    return False


def is_inline(
    prev2: Optional[Token],
    prev1: Optional[Token],
    t: Optional[Token],
    next1: Optional[Token],
    next2: Optional[Token],
) -> bool:
    """
    Decide whether token ``t`` should stay on the same line as its neighbours.

    This is the case for every part of ``<tag>text</tag>`` (text without a
    newline) and for both halves of an empty element such as ``<div></div>``.
    """
    if isinstance(t, Text):
        return is_inline_text(prev1, t, next1)
    elif isinstance(t, Tag):
        if is_inline_text(prev2, prev1, t) or is_inline_text(t, next1, next2):
            return True
        if isinstance(next1, Tag) and t.is_opening and next1.is_closing and t.tag == next1.tag:
            return True  # <div></div> (start tag)
        if isinstance(prev1, Tag) and prev1.is_opening and t.is_closing and prev1.tag == t.tag:
            return True  # <div></div> (end tag)
    return False


class ElementStack:
    """
    Keep track of how deeply nested our document is.
    """

    def __init__(self):
        self.open_tags = []
        self.indent = ""

    def push_tag(self, tag: str):
        """Record an opening tag and grow the indentation unless it is a no-indent tag."""
        # Stop tracking once the stack is this deep; deeper tags are ignored.
        if len(self.open_tags) > 16:
            return
        self.open_tags.append(tag)
        if tag not in NO_INDENT_TAGS:
            self.indent += " " * INDENT

    def pop_tag(self, tag: str):
        """
        Close ``tag``, implicitly closing every tag opened after it.

        A closing tag without a matching open tag leaves the state untouched.
        """
        if tag not in self.open_tags:
            return  # this closing tag has no start tag. let's keep indentation as-is.

        remove_indent = 0
        while True:
            t = self.open_tags.pop()
            if t not in NO_INDENT_TAGS:
                remove_indent += INDENT
            if t == tag:
                break
        # Guard against remove_indent == 0: `indent[:-0]` is `indent[:0]`, i.e.
        # the empty string, which would wipe the indentation whenever only
        # no-indent tags (e.g. a nested <html>) were popped.
        if remove_indent:
            self.indent = self.indent[:-remove_indent]


def format_xml(tokens: Iterable[Token]) -> str:
    """Render ``tokens`` as a string with one element per line, indented by nesting depth."""
    out = io.StringIO()

    context = ElementStack()

    # Each token is examined together with two neighbours on either side.
    # NOTE(review): the Optional annotations on is_inline suggest the window
    # pads missing neighbours with None - confirm against sliding_window.window.
    for prev2, prev1, token, next1, next2 in sliding_window.window(tokens, 2, 2):
        if isinstance(token, Tag):
            if token.is_opening:
                out.write(indent_text(token.data, context.indent))

                if not is_inline(prev2, prev1, token, next1, next2):
                    out.write("\n")

                # Push after writing so the tag itself is at the outer level.
                context.push_tag(token.tag)
            elif token.is_closing:
                # Pop before writing so the tag lines up with its opening tag.
                context.pop_tag(token.tag)

                if is_inline(prev2, prev1, token, next1, next2):
                    out.write(token.data)
                else:
                    out.write(indent_text(token.data, context.indent))
                out.write("\n")

            else:  # self-closing
                out.write(indent_text(token.data, context.indent))
                out.write("\n")
        elif isinstance(token, Text):
            if is_inline(prev2, prev1, token, next1, next2):
                out.write(token.text)
            else:
                out.write(indent_text(token.data, context.indent))
                out.write("\n")
        else:  # pragma: no cover
            raise RuntimeError()

    return out.getvalue()


class ViewXmlHtml(base.View):
    """Content view that pretty-prints XML and HTML bodies."""

    name = "XML/HTML"
    __content_types = ("text/xml", "text/html")

    def __call__(self, data, **metadata):
        """Return a ``(description, formatted content)`` pair for the raw body ``data``."""
        # TODO:
        # We should really have the message text as str here,
        # not the message content as bytes.
        # https://github.com/mitmproxy/mitmproxy/issues/1662#issuecomment-266192578
        #
        # "xmlcharrefreplace" is an encode-only error handler: using it with
        # decode() raises TypeError on the first invalid byte. Use
        # "backslashreplace" so that malformed input is still rendered.
        data = data.decode("utf8", "backslashreplace")
        tokens = tokenize(data)
        # TODO:
        # Performance: Don't render the whole document right away.
        # Let's wait with this until we have a sequence-like interface,
        # this thing is reasonably fast right now anyway.
        pretty = base.format_text(format_xml(tokens))
        if "html" in data.lower():
            t = "HTML"
        else:
            t = "XML"
        return t, pretty

    def render_priority(self, data: bytes, *, content_type: Optional[str] = None, **metadata) -> float:
        """Return 1 for a known XML/HTML content type, 0.4 if the body looks like XML, else 0."""
        if content_type in self.__content_types:
            return 1
        elif strutils.is_xml(data):
            return 0.4
        return 0