"""Helpers for CSS parsing, Markdown escaping and table padding."""

import html.entities
from typing import Dict, List, Optional

from . import config

# Maps a Unicode code point to the value configured for its entity name in
# ``config.UNIFIABLE``; the "nbsp" entity is deliberately left out.
unifiable_n = {
    html.entities.name2codepoint[k]: v
    for k, v in config.UNIFIABLE.items()
    if k != "nbsp"
}


def hn(tag: str) -> int:
    """
    Return the heading level encoded in a tag name such as ``h1``.

    :type tag: str

    :returns: The level (1-9) for ``h1``..``h9``, otherwise 0
    :rtype: int
    """
    # Test the length first so that an empty tag name yields 0 instead of
    # raising IndexError on ``tag[0]``.
    if len(tag) == 2 and tag[0] == "h":
        n = tag[1]
        # String comparison: accepts "1".."9" and rejects "0".
        if "0" < n <= "9":
            return int(n)
    return 0


def dumb_property_dict(style: str) -> Dict[str, str]:
    """
    Parse a ``name: value; name: value`` declaration list.

    Declarations without a ":" are ignored. Names and values are stripped
    and lower-cased; a later duplicate name overrides an earlier one.

    :type style: str

    :returns: A hash of css attributes
    :rtype: dict
    """
    return {
        x.strip().lower(): y.strip().lower()
        for x, y in [z.split(":", 1) for z in style.split(";") if ":" in z]
    }


def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    """
    Parse a flat stylesheet into per-selector attribute hashes.

    :type data: str

    :returns: A hash of css selectors, each of which contains a hash of
        css attributes.
    :rtype: dict
    """
    # Remove @import statements. The appended ";" guarantees that the
    # search for the terminating ";" always succeeds.
    data += ";"
    import_index = data.find("@import")
    while import_index != -1:
        data = data[0:import_index] + data[data.find(";", import_index) + 1 :]
        import_index = data.find("@import")

    # Each "selector { declarations" chunk becomes one entry.
    pairs = [x.split("{") for x in data.split("}") if "{" in x.strip()]
    try:
        elements = {a.strip(): dumb_property_dict(b) for a, b in pairs}
    except ValueError:
        # A chunk with more than one "{" (e.g. a nested block) cannot be
        # unpacked into (selector, body); the whole stylesheet is then
        # ignored on purpose -- styling is best-effort.
        elements = {}  # not that important

    return elements


def element_style(
    attrs: Dict[str, Optional[str]],
    style_def: Dict[str, Dict[str, str]],
    parent_style: Dict[str, str],
) -> Dict[str, str]:
    """
    Compute the effective style of an element.

    Precedence, lowest to highest: the parent's style, the stylesheet
    rules of each class (in the order the classes are listed), and the
    inline ``style`` attribute. ``parent_style`` is not modified.

    :type attrs: dict
    :type style_def: dict
    :type parent_style: dict

    :returns: A hash of the 'final' style attributes of the element
    :rtype: dict
    """
    style = parent_style.copy()

    # Attribute values are Optional: a None value (e.g. an attribute written
    # without a value) is treated like a missing attribute rather than
    # tripping an assert.
    css_classes = attrs.get("class")
    if css_classes is not None:
        for css_class in css_classes.split():
            style.update(style_def.get("." + css_class, {}))

    inline_style = attrs.get("style")
    if inline_style is not None:
        style.update(dumb_property_dict(inline_style))

    return style


def google_list_style(style: Dict[str, str]) -> str:
    """
    Finds out whether this is an ordered or unordered list

    :type style: dict

    :returns: "ul" when ``list-style-type`` is one of disc, circle, square
        or none; "ol" otherwise (including when it is not set)
    :rtype: str
    """
    if style.get("list-style-type") in ("disc", "circle", "square", "none"):
        return "ul"

    return "ol"


def google_has_height(style: Dict[str, str]) -> bool:
    """
    Check if the style of the element has the 'height' attribute
    explicitly defined

    :type style: dict

    :rtype: bool
    """
    return "height" in style


def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    """
    Collect the values of the emphasis-related attributes that are set,
    in the order text-decoration, font-style, font-weight.

    :type style: dict

    :returns: A list of all emphasis modifiers of the element
    :rtype: list
    """
    return [
        style[key]
        for key in ("text-decoration", "font-style", "font-weight")
        if key in style
    ]


def google_fixed_width_font(style: Dict[str, str]) -> bool:
    """
    Check if the css of the current element defines a fixed width font

    Only the exact values "courier new" and "consolas" are recognised.

    :type style: dict

    :rtype: bool
    """
    return style.get("font-family", "") in ("courier new", "consolas")


def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    """
    Extract numbering from list element attributes

    :type attrs: dict

    :returns: The value of the ``start`` attribute minus one, or 0 when
        the attribute is missing, has no value or is not an integer
    :rtype: int
    """
    start = attrs.get("start")
    if start is None:
        # Missing attribute, or present without a value.
        return 0

    try:
        return int(start) - 1
    except ValueError:
        return 0


def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool:
    """
    Decide whether a paragraph must be left unwrapped.

    :type para: str
    :type wrap_links: bool
    :type wrap_list_items: bool

    :returns: True if the paragraph should not be wrapped
    :rtype: bool
    """
    # If it appears to contain a link
    # don't wrap
    if not wrap_links and config.RE_LINK.search(para):
        return True
    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap. ``startswith`` is also safe for an empty paragraph.
    if para.startswith(("    ", "\t")):
        return True

    # If the text begins with only two "--", possibly preceded by
    # whitespace, that's an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False

    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a <br>-inside-<span> case in one of the tests that
    # also depends upon it.
    if stripped[0:1] in ("-", "*") and stripped[0:2] != "**":
        return not wrap_list_items

    # If the text begins with a single -, *, or +, followed by a space,
    # or an integer, followed by a ., followed by a space (in either
    # case optionally preceded by whitespace), it's a list; don't wrap.
    return bool(
        config.RE_ORDERED_LIST_MATCHER.match(stripped)
        or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
    )


def escape_md(text: str) -> str:
    """
    Escapes markdown-sensitive characters within other markdown
    constructs.
    """
    return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)


def escape_md_section(text: str, snob: bool = False) -> str:
    """
    Escapes markdown-sensitive characters across whole document sections.

    Backslashes are escaped first so that the escapes added by the later
    substitutions are not themselves re-escaped. With ``snob`` set, the
    ``config.RE_MD_CHARS_MATCHER_ALL`` pattern is applied as well.
    """
    text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)

    if snob:
        text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)

    text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text)
    text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text)
    text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)

    return text


def reformat_table(lines: List[str], right_margin: int) -> List[str]:
    """
    Given the lines of a table
    pads the cells and returns the new lines

    Cells are separated by "|". Every column is padded to the width of its
    widest cell plus ``right_margin``. A line made up solely of "-" and "|"
    is padded with "-", any other line with spaces.

    :type lines: list
    :type right_margin: int

    :returns: The padded lines; an empty list for an empty table
    :rtype: list
    """
    # An empty table has no first row to take the column count from.
    if not lines:
        return []

    # find the maximum width of the columns
    max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
    max_cols = len(max_width)
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        num_cols = len(cols)

        # don't drop any data if colspan attributes result in unequal lengths
        if num_cols < max_cols:
            cols += [""] * (max_cols - num_cols)
        elif max_cols < num_cols:
            # This row introduces extra columns: start tracking their widths.
            max_width += [len(x) + right_margin for x in cols[max_cols:]]
            max_cols = num_cols

        max_width = [
            max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
        ]

    # reformat
    new_lines = []
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        filler = "-" if set(line.strip()) == set("-|") else " "
        new_cols = [
            x + filler * (width - len(x)) for x, width in zip(cols, max_width)
        ]
        new_lines.append("|".join(new_cols))
    return new_lines


def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
    """
    Provide padding for tables in the text

    Lines containing ``config.TABLE_MARKER_FOR_PAD`` delimit tables and are
    removed from the output. The lines between an opening and a closing
    marker are padded with ``reformat_table`` and followed by one empty
    line.

    :type text: str
    :type right_margin: int

    :rtype: str
    """
    lines = text.split("\n")
    table_buffer = []  # type: List[str]
    table_started = False
    new_lines = []
    for line in lines:
        # Toggle table started
        if config.TABLE_MARKER_FOR_PAD in line:
            table_started = not table_started
            if not table_started:
                table = reformat_table(table_buffer, right_margin)
                new_lines.extend(table)
                table_buffer = []
                new_lines.append("")
            continue
        # Process lines
        if table_started:
            table_buffer.append(line)
        else:
            new_lines.append(line)

    # An opening marker without a matching closing marker must not swallow
    # the rows collected so far.
    if table_buffer:
        new_lines.extend(reformat_table(table_buffer, right_margin))

    return "\n".join(new_lines)