"""html2text: Turn HTML into equivalent Markdown-structured text."""

import html.entities
import html.parser
import re
import urllib.parse as urlparse
from textwrap import wrap
from typing import Dict, List, Optional, Tuple, Union

from . import config
from .elements import AnchorElement, ListElement
from .typing import OutCallback
from .utils import (
    dumb_css_parser,
    element_style,
    escape_md,
    escape_md_section,
    google_fixed_width_font,
    google_has_height,
    google_list_style,
    google_text_emphasis,
    hn,
    list_numbering_start,
    pad_tables_in_text,
    skipwrap,
    unifiable_n,
)

__version__ = (2020, 1, 16)


# TODO:
# Support decoded entities with UNIFIABLE.


class HTML2Text(html.parser.HTMLParser):
    """HTML parser that emits the fed document as Markdown-structured text.

    Typical use is ``HTML2Text().handle(html)``.  Behaviour is tuned through
    the public attributes assigned in ``__init__`` (most default to values
    from ``config``).
    """

    def __init__(
        self,
        out: Optional[OutCallback] = None,
        baseurl: str = "",
        bodywidth: int = config.BODY_WIDTH,
    ) -> None:
        """
        Input parameters:
            out: possible custom replacement for self.outtextf (which
                 appends lines of text).
            baseurl: base URL of the document we process
            bodywidth: stored as self.body_width; optwrap() leaves the text
                 unwrapped when it is falsy
        """
        # Character/entity references are delivered to handle_charref /
        # handle_entityref instead of being converted by the base parser.
        super().__init__(convert_charrefs=False)

        # Config options
        self.split_next_td = False
        self.td_count = 0
        self.table_start = False
        self.unicode_snob = config.UNICODE_SNOB  # covered in cli
        self.escape_snob = config.ESCAPE_SNOB  # covered in cli
        self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
        self.body_width = bodywidth  # covered in cli
        self.skip_internal_links = config.SKIP_INTERNAL_LINKS  # covered in cli
        self.inline_links = config.INLINE_LINKS  # covered in cli
        self.protect_links = config.PROTECT_LINKS  # covered in cli
        self.google_list_indent = config.GOOGLE_LIST_INDENT  # covered in cli
        self.ignore_links = config.IGNORE_ANCHORS  # covered in cli
        self.ignore_images = config.IGNORE_IMAGES  # covered in cli
        self.images_as_html = config.IMAGES_AS_HTML  # covered in cli
        self.images_to_alt = config.IMAGES_TO_ALT  # covered in cli
        self.images_with_size = config.IMAGES_WITH_SIZE  # covered in cli
        self.ignore_emphasis = config.IGNORE_EMPHASIS  # covered in cli
        self.bypass_tables = config.BYPASS_TABLES  # covered in cli
        self.ignore_tables = config.IGNORE_TABLES  # covered in cli
        self.google_doc = False  # covered in cli
        self.ul_item_mark = "*"  # covered in cli
        self.emphasis_mark = "_"  # covered in cli
        self.strong_mark = "**"
        self.single_line_break = config.SINGLE_LINE_BREAK  # covered in cli
        self.use_automatic_links = config.USE_AUTOMATIC_LINKS  # covered in cli
        self.hide_strikethrough = False  # covered in cli
        self.mark_code = config.MARK_CODE
        self.wrap_list_items = config.WRAP_LIST_ITEMS  # covered in cli
        self.wrap_links = config.WRAP_LINKS  # covered in cli
        self.pad_tables = config.PAD_TABLES  # covered in cli
        self.default_image_alt = config.DEFAULT_IMAGE_ALT  # covered in cli
        self.tag_callback = None
        self.open_quote = config.OPEN_QUOTE  # covered in cli
        self.close_quote = config.CLOSE_QUOTE  # covered in cli

        if out is None:
            self.out = self.outtextf
        else:
            self.out = out

        # empty list to store output characters before they are "joined"
        self.outtextlist = []  # type: List[str]

        self.quiet = 0
        self.p_p = 0  # number of newline character to print before next output
        self.outcount = 0
        self.start = True
        self.space = False
        self.a = []  # type: List[AnchorElement]
        self.astack = []  # type: List[Optional[Dict[str, Optional[str]]]]
        self.maybe_automatic_link = None  # type: Optional[str]
        self.empty_link = False
        self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://")
        self.acount = 0
        self.list = []  # type: List[ListElement]
        self.blockquote = 0
        self.pre = False
        self.startpre = False
        self.code = False
        self.quote = False
        self.br_toggle = ""
        self.lastWasNL = False
        self.lastWasList = False
        self.style = 0
        self.style_def = {}  # type: Dict[str, Dict[str, str]]
        self.tag_stack = (
            []
        )  # type: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]]
        self.emphasis = 0
        self.drop_white_space = 0
        self.inheader = False
        # Current abbreviation definition
        self.abbr_title = None  # type: Optional[str]
        # Last inner HTML (for abbr being defined)
        self.abbr_data = None  # type: Optional[str]
        # Stack of abbreviations to write later
        self.abbr_list = {}  # type: Dict[str, str]
        self.baseurl = baseurl
        self.stressed = False
        self.preceding_stressed = False
        self.preceding_data = ""
        self.current_tag = ""

        # nbsp travels through the pipeline as a non-whitespace placeholder so
        # the whitespace normalisation in o() cannot swallow it; finish()
        # swaps the placeholder for the final character.
        # NOTE: this mutates the module-level config mapping, so it is shared
        # by every HTML2Text instance.
        config.UNIFIABLE["nbsp"] = "&nbsp_place_holder;"

    def feed(self, data: str) -> None:
        """Feed *data* to the parser after neutralising one script idiom.

        The literal text ``</' + 'script>`` is replaced with ``</ignore>``
        before parsing (presumably so inline JavaScript that assembles a
        closing script tag by concatenation does not disturb parsing).
        """
        data = data.replace("</' + 'script>", "</ignore>")
        super().feed(data)

    def handle(self, data: str) -> str:
        """Convert the HTML string *data* and return the resulting text.

        The output of finish() is passed through optwrap() and, when
        ``self.pad_tables`` is set, through pad_tables_in_text().
        """
        self.feed(data)
        self.feed("")
        markdown = self.optwrap(self.finish())
        if self.pad_tables:
            return pad_tables_in_text(markdown)
        else:
            return markdown

    def outtextf(self, s: str) -> None:
        """Default ``out`` callback: buffer *s* and track a trailing newline."""
        self.outtextlist.append(s)
        if s:
            self.lastWasNL = s[-1] == "\n"

    def finish(self) -> str:
        """Close the parser, flush pending output and return the joined text."""
        self.close()

        self.pbr()
        self.o("", force="end")

        outtext = "".join(self.outtextlist)

        if self.unicode_snob:
            nbsp = html.entities.html5["nbsp;"]
        else:
            nbsp = " "
        outtext = outtext.replace("&nbsp_place_holder;", nbsp)

        # Clear self.outtextlist to avoid memory leak of its content to
        # the next handling.
        self.outtextlist = []

        return outtext

    def handle_charref(self, c: str) -> None:
        """Emit the text for a numeric character reference such as ``&#62;``."""
        self.handle_data(self.charref(c), True)

    def handle_entityref(self, c: str) -> None:
        """Emit the text for a named entity reference such as ``&gt;``."""
        ref = self.entityref(c)

        # ref may be an empty string (e.g. for &lrm;/&rlm; markers that should
        # not contribute to the final output).
        # self.handle_data cannot handle a zero-length string right after a
        # stressed tag or mid-text within a stressed tag (text get split and
        # self.stressed/self.preceding_stressed gets switched after the first
        # part of that text).
        if ref:
            self.handle_data(ref, True)

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        """Forward an opening tag to handle_tag with its attributes as a dict."""
        self.handle_tag(tag, dict(attrs), start=True)

    def handle_endtag(self, tag: str) -> None:
        """Forward a closing tag to handle_tag (closing tags have no attrs)."""
        self.handle_tag(tag, {}, start=False)

    def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]:
        """
        :type attrs: dict

        :returns: The index of certain set of attributes (of a link) in the
        self.a list. If the set of attributes is not found, returns None
        :rtype: int
        """
        if "href" not in attrs:
            return None

        for i, a in enumerate(self.a):
            if "href" not in a.attrs or a.attrs["href"] != attrs["href"]:
                continue
            if "title" in a.attrs or "title" in attrs:
                # When either side carries a title, both must carry the
                # same one for the links to be considered identical.
                if (
                    "title" in a.attrs
                    and "title" in attrs
                    and a.attrs["title"] == attrs["title"]
                ):
                    return i
            else:
                return i
        return None

    def handle_emphasis(
        self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str, str]
    ) -> None:
        """
        Handles various text emphases

        Emits opening (start=True) or closing marks for the bold, italic and
        fixed-width emphasis that *tag_style* adds on top of *parent_style*,
        and mutes output for struck-through text when
        ``self.hide_strikethrough`` is set.
        """
        tag_emphasis = google_text_emphasis(tag_style)
        parent_emphasis = google_text_emphasis(parent_style)

        # handle Google's text emphasis
        strikethrough = "line-through" in tag_emphasis and self.hide_strikethrough

        # google and others may mark a font's weight as `bold` or `700`
        bold = False
        for bold_marker in config.BOLD_TEXT_STYLE_VALUES:
            bold = bold_marker in tag_emphasis and bold_marker not in parent_emphasis
            if bold:
                break

        italic = "italic" in tag_emphasis and "italic" not in parent_emphasis
        fixed = (
            google_fixed_width_font(tag_style)
            and not google_fixed_width_font(parent_style)
            and not self.pre
        )

        if start:
            # crossed-out text must be handled before other attributes
            # in order not to output qualifiers unnecessarily
            if bold or italic or fixed:
                self.emphasis += 1
            if strikethrough:
                self.quiet += 1
            if italic:
                self.o(self.emphasis_mark)
                self.drop_white_space += 1
            if bold:
                self.o(self.strong_mark)
                self.drop_white_space += 1
            if fixed:
                self.o("`")
                self.drop_white_space += 1
                self.code = True
        else:
            if bold or italic or fixed:
                # there must not be whitespace before closing emphasis mark
                self.emphasis -= 1
                self.space = False
            if fixed:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_white_space -= 1
                else:
                    self.o("`")
                self.code = False
            if bold:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_white_space -= 1
                else:
                    self.o(self.strong_mark)
            if italic:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_white_space -= 1
                else:
                    self.o(self.emphasis_mark)
            # space is only allowed after *all* emphasis marks
            if (bold or italic) and not self.emphasis:
                self.o(" ")
            if strikethrough:
                self.quiet -= 1

    def handle_tag(
        self, tag: str, attrs: Dict[str, Optional[str]], start: bool
    ) -> None:
        """Translate one opening (start=True) or closing tag into output.

        ``self.tag_callback``, when set, is consulted first; if it returns
        exactly ``True`` the tag is considered handled and nothing else runs.
        Attribute values may be ``None`` for attributes written without a
        value (e.g. ``<img src>``); such attributes are treated as absent
        where a string is required.
        """
        self.current_tag = tag

        if self.tag_callback is not None:
            if self.tag_callback(self, tag, attrs, start) is True:
                return

        # first thing inside the anchor tag is another tag
        # that produces some output
        if (
            start
            and self.maybe_automatic_link is not None
            and tag not in ["p", "div", "style", "dl", "dt"]
            and (tag != "img" or self.ignore_images)
        ):
            self.o("[")
            self.maybe_automatic_link = None
            self.empty_link = False

        if self.google_doc:
            # the attrs parameter is empty for a closing tag. in addition, we
            # need the attributes of the parent nodes in order to get a
            # complete style description for the current element. we assume
            # that google docs export well formed html.
            parent_style = {}  # type: Dict[str, str]
            if start:
                if self.tag_stack:
                    parent_style = self.tag_stack[-1][2]
                tag_style = element_style(attrs, self.style_def, parent_style)
                self.tag_stack.append((tag, attrs, tag_style))
            else:
                _, attrs, tag_style = (
                    self.tag_stack.pop() if self.tag_stack else (None, {}, {})
                )
                if self.tag_stack:
                    parent_style = self.tag_stack[-1][2]

        if hn(tag):
            self.p()
            if start:
                self.inheader = True
                self.o(hn(tag) * "#" + " ")
            else:
                self.inheader = False
                return  # prevent redundant emphasis marks on headers

        if tag in ["p", "div"]:
            if self.google_doc:
                if start and google_has_height(tag_style):
                    self.p()
                else:
                    self.soft_br()
            elif self.astack and tag == "div":
                pass
            else:
                self.p()

        if tag == "br" and start:
            # Two trailing spaces make a Markdown hard line break.
            if self.blockquote > 0:
                self.o("  \n> ")
            else:
                self.o("  \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        if tag in ["head", "style", "script"]:
            if start:
                self.quiet += 1
            else:
                self.quiet -= 1

        if tag == "style":
            if start:
                self.style += 1
            else:
                self.style -= 1

        if tag in ["body"]:
            self.quiet = 0  # sites like 9rules.com never close <head>

        if tag == "blockquote":
            if start:
                self.p()
                self.o("> ", force=True)
                self.start = True
                self.blockquote += 1
            else:
                self.blockquote -= 1
                self.p()

        def no_preceding_space(self: HTML2Text) -> bool:
            # True when the last emitted data ended in a non-space character.
            return bool(
                self.preceding_data and re.match(r"[^\s]", self.preceding_data[-1])
            )

        if tag in ["em", "i", "u"] and not self.ignore_emphasis:
            if start and no_preceding_space(self):
                emphasis = " " + self.emphasis_mark
            else:
                emphasis = self.emphasis_mark

            self.o(emphasis)
            if start:
                self.stressed = True

        if tag in ["strong", "b"] and not self.ignore_emphasis:
            if start and no_preceding_space(self):
                strong = " " + self.strong_mark
            else:
                strong = self.strong_mark

            self.o(strong)
            if start:
                self.stressed = True

        if tag in ["del", "strike", "s"]:
            if start and no_preceding_space(self):
                strike = " ~~"
            else:
                strike = "~~"

            self.o(strike)
            if start:
                self.stressed = True

        if self.google_doc:
            if not self.inheader:
                # handle some font attributes, but leave headers clean
                self.handle_emphasis(start, tag_style, parent_style)

        if tag in ["kbd", "code", "tt"] and not self.pre:
            self.o("`")  # TODO: `` `this` ``
            self.code = not self.code

        if tag == "abbr":
            if start:
                self.abbr_title = None
                self.abbr_data = ""
                if "title" in attrs:
                    self.abbr_title = attrs["title"]
            else:
                if self.abbr_title is not None:
                    assert self.abbr_data is not None
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                self.abbr_data = None

        if tag == "q":
            if not self.quote:
                self.o(self.open_quote)
            else:
                self.o(self.close_quote)
            self.quote = not self.quote

        def link_url(self: HTML2Text, link: str, title: str = "") -> None:
            # Emit the closing half of an inline link: "](url "title")".
            url = urlparse.urljoin(self.baseurl, link)
            title = ' "{}"'.format(title) if title.strip() else ""
            self.o("]({url}{title})".format(url=escape_md(url), title=title))

        if tag == "a" and not self.ignore_links:
            if start:
                if (
                    "href" in attrs
                    and attrs["href"] is not None
                    and not (self.skip_internal_links and attrs["href"].startswith("#"))
                ):
                    self.astack.append(attrs)
                    self.maybe_automatic_link = attrs["href"]
                    self.empty_link = True
                    if self.protect_links:
                        attrs["href"] = "<" + attrs["href"] + ">"
                else:
                    # Keep the stack balanced for anchors we do not render.
                    self.astack.append(None)
            else:
                if self.astack:
                    a = self.astack.pop()
                    if self.maybe_automatic_link and not self.empty_link:
                        self.maybe_automatic_link = None
                    elif a:
                        # Only anchors with a non-None href are pushed above.
                        assert a["href"] is not None
                        if self.empty_link:
                            self.o("[")
                            self.empty_link = False
                            self.maybe_automatic_link = None
                        if self.inline_links:
                            title = a.get("title") or ""
                            title = escape_md(title)
                            link_url(self, a["href"], title)
                        else:
                            i = self.previousIndex(a)
                            if i is not None:
                                a_props = self.a[i]
                            else:
                                self.acount += 1
                                a_props = AnchorElement(a, self.acount, self.outcount)
                                self.a.append(a_props)
                            self.o("][" + str(a_props.count) + "]")

        if tag == "img" and start and not self.ignore_images:
            # A valueless ``src`` attribute is parsed as None; treat it like a
            # missing one instead of failing on it.
            src = attrs.get("src")
            if src is not None:
                if not self.images_to_alt:
                    attrs["href"] = src
                alt = attrs.get("alt") or self.default_image_alt

                # If we have images_with_size, write raw html including width,
                # height, and alt attributes
                if self.images_as_html or (
                    self.images_with_size and ("width" in attrs or "height" in attrs)
                ):
                    self.o("<img src='" + src + "' ")
                    width = attrs.get("width")
                    if width is not None:
                        self.o("width='" + width + "' ")
                    height = attrs.get("height")
                    if height is not None:
                        self.o("height='" + height + "' ")
                    if alt:
                        self.o("alt='" + alt + "' ")
                    self.o("/>")
                    return

                # If we have a link to create, output the start
                if self.maybe_automatic_link is not None:
                    href = self.maybe_automatic_link
                    if (
                        self.images_to_alt
                        and escape_md(alt) == href
                        and self.absolute_url_matcher.match(href)
                    ):
                        self.o("<" + escape_md(alt) + ">")
                        self.empty_link = False
                        return
                    else:
                        self.o("[")
                        self.maybe_automatic_link = None
                        self.empty_link = False

                # If we have images_to_alt, we discard the image itself,
                # considering only the alt text.
                if self.images_to_alt:
                    self.o(escape_md(alt))
                else:
                    self.o("![" + escape_md(alt) + "]")
                    if self.inline_links:
                        href = attrs.get("href") or ""
                        self.o(
                            "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")"
                        )
                    else:
                        i = self.previousIndex(attrs)
                        if i is not None:
                            a_props = self.a[i]
                        else:
                            self.acount += 1
                            a_props = AnchorElement(attrs, self.acount, self.outcount)
                            self.a.append(a_props)
                        self.o("[" + str(a_props.count) + "]")

        if tag == "dl" and start:
            self.p()
        if tag == "dt" and not start:
            self.pbr()
        if tag == "dd" and start:
            self.o("    ")
        if tag == "dd" and not start:
            self.pbr()

        if tag in ["ol", "ul"]:
            # Google Docs create sub lists as top level lists
            if not self.list and not self.lastWasList:
                self.p()
            if start:
                if self.google_doc:
                    list_style = google_list_style(tag_style)
                else:
                    list_style = tag
                numbering_start = list_numbering_start(attrs)
                self.list.append(ListElement(list_style, numbering_start))
            else:
                if self.list:
                    self.list.pop()
                    if not self.google_doc and not self.list:
                        self.o("\n")
            self.lastWasList = True
        else:
            self.lastWasList = False

        if tag == "li":
            self.pbr()
            if start:
                if self.list:
                    li = self.list[-1]
                else:
                    # <li> outside any list: render it as a bullet item.
                    li = ListElement("ul", 0)
                if self.google_doc:
                    nest_count = self.google_nest_count(tag_style)
                else:
                    nest_count = len(self.list)
                # TODO: line up <ol><li>s > 9 correctly.
                self.o("  " * nest_count)
                if li.name == "ul":
                    self.o(self.ul_item_mark + " ")
                elif li.name == "ol":
                    li.num += 1
                    self.o(str(li.num) + ". ")
                self.start = True

        if tag in ["table", "tr", "td", "th"]:
            if self.ignore_tables:
                if tag == "tr":
                    if start:
                        pass
                    else:
                        self.soft_br()
                else:
                    pass

            elif self.bypass_tables:
                if start:
                    self.soft_br()
                if tag in ["td", "th"]:
                    if start:
                        self.o("<{}>\n\n".format(tag))
                    else:
                        self.o("\n</{}>".format(tag))
                else:
                    if start:
                        self.o("<{}>".format(tag))
                    else:
                        self.o("</{}>".format(tag))

            else:
                if tag == "table":
                    if start:
                        self.table_start = True
                        if self.pad_tables:
                            self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
                            self.o("  \n")
                    else:
                        if self.pad_tables:
                            self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
                            self.o("  \n")
                if tag in ["td", "th"] and start:
                    if self.split_next_td:
                        self.o("| ")
                    self.split_next_td = True

                if tag == "tr" and start:
                    self.td_count = 0
                if tag == "tr" and not start:
                    self.split_next_td = False
                    self.soft_br()
                if tag == "tr" and not start and self.table_start:
                    # Underline table header
                    self.o("|".join(["---"] * self.td_count))
                    self.soft_br()
                    self.table_start = False
                if tag in ["td", "th"] and start:
                    self.td_count += 1

        if tag == "pre":
            if start:
                self.startpre = True
                self.pre = True
            else:
                self.pre = False
                if self.mark_code:
                    self.out("\n[/code]")
            self.p()

    # TODO: Add docstring for these one letter functions
    def pbr(self) -> None:
        "Pretty print has a line break"
        # Request one pending newline unless some are already pending.
        if self.p_p == 0:
            self.p_p = 1

    def p(self) -> None:
        "Set pretty print to 1 or 2 lines"
        self.p_p = 1 if self.single_line_break else 2

    def soft_br(self) -> None:
        "Soft breaks"
        self.pbr()
        # Emitted before the pending newline by o(): two trailing spaces.
        self.br_toggle = "  "

    def o(
        self, data: str, puredata: bool = False, force: Union[bool, str] = False
    ) -> None:
        """
        Deal with indentation and whitespace

        Writes *data* through ``self.out`` after emitting pending newlines,
        blockquote / <pre> prefixes and a pending space.  Nothing is written
        while ``self.quiet`` is non-zero.

        puredata: *data* is document text; outside <pre> its whitespace runs
            are collapsed to single spaces.
        force: a truthy value makes empty *data* go through; the special
            value ``"end"`` additionally flushes the collected reference
            links and abbreviation definitions.
        """
        if self.abbr_data is not None:
            self.abbr_data += data

        if not self.quiet:
            if self.google_doc:
                # prevent white space immediately after 'begin emphasis'
                # marks ('**' and '_')
                lstripped_data = data.lstrip()
                if self.drop_white_space and not (self.pre or self.code):
                    data = lstripped_data
                if lstripped_data != "":
                    self.drop_white_space = 0

            if puredata and not self.pre:
                # This is a very dangerous call ... it could mess up
                # all handling of &nbsp; when not handled properly
                # (see entityref)
                data = re.sub(r"\s+", r" ", data)
                if data and data[0] == " ":
                    self.space = True
                    data = data[1:]
            if not data and not force:
                return

            if self.startpre:
                # self.out(" :") #TODO: not output when already one there
                if not data.startswith("\n") and not data.startswith("\r\n"):
                    # <pre>stuff...
                    data = "\n" + data
                if self.mark_code:
                    self.out("\n[code]")
                    self.p_p = 0

            bq = ">" * self.blockquote
            if not (force and data and data[0] == ">") and self.blockquote:
                bq += " "

            if self.pre:
                if not self.list:
                    bq += "    "
                # else: list content is already partially indented
                bq += "    " * len(self.list)
                data = data.replace("\n", "\n" + bq)

            if self.startpre:
                self.startpre = False
                if self.list:
                    # use existing initial indentation
                    data = data.lstrip("\n")

            if self.start:
                self.space = False
                self.p_p = 0
                self.start = False

            if force == "end":
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = False

            if self.p_p:
                self.out((self.br_toggle + "\n" + bq) * self.p_p)
                self.space = False
                self.br_toggle = ""

            if self.space:
                if not self.lastWasNL:
                    self.out(" ")
                self.space = False

            if self.a and (
                (self.p_p == 2 and self.links_each_paragraph) or force == "end"
            ):
                if force == "end":
                    self.out("\n")

                newa = []
                for link in self.a:
                    if self.outcount > link.outcount:
                        self.out(
                            "   ["
                            + str(link.count)
                            + "]: "
                            + urlparse.urljoin(self.baseurl, link.attrs["href"])
                        )
                        # A valueless title attribute is parsed as None; skip
                        # it rather than failing on the concatenation.
                        if "title" in link.attrs and link.attrs["title"] is not None:
                            self.out(" (" + link.attrs["title"] + ")")
                        self.out("\n")
                    else:
                        newa.append(link)

                # Don't need an extra line when nothing was done.
                if self.a != newa:
                    self.out("\n")

                self.a = newa

            if self.abbr_list and force == "end":
                for abbr, definition in self.abbr_list.items():
                    self.out("  *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.outcount += 1

    def handle_data(self, data: str, entity_char: bool = False) -> None:
        """Process a run of text.

        entity_char is True when *data* came from a character or entity
        reference; such text is not passed through escape_md_section().
        """
        if not data:
            # Data may be empty for some HTML entities. For example,
            # LEFT-TO-RIGHT MARK.
            return

        if self.stressed:
            data = data.strip()
            self.stressed = False
            self.preceding_stressed = True
        elif self.preceding_stressed:
            if (
                re.match(r"[^\s.!?]", data[0])
                and not hn(self.current_tag)
                and self.current_tag not in ["a", "code", "pre"]
            ):
                # should match a letter or common punctuation
                data = " " + data
            self.preceding_stressed = False

        if self.style:
            # Inside <style>: collect the CSS rules for later style lookups.
            self.style_def.update(dumb_css_parser(data))

        if self.maybe_automatic_link is not None:
            href = self.maybe_automatic_link
            if (
                href == data
                and self.absolute_url_matcher.match(href)
                and self.use_automatic_links
            ):
                self.o("<" + data + ">")
                self.empty_link = False
                return
            else:
                self.o("[")
                self.maybe_automatic_link = None
                self.empty_link = False

        if not self.code and not self.pre and not entity_char:
            data = escape_md_section(data, snob=self.escape_snob)
        self.preceding_data = data
        self.o(data, puredata=True)

    def charref(self, name: str) -> str:
        """Return the text for numeric character reference *name*.

        *name* is the reference without ``&#`` and ``;``: decimal digits, or
        ``x``/``X`` followed by hexadecimal digits.  Code points that cannot
        be represented yield an empty string.
        """
        if name[0] in ["x", "X"]:
            c = int(name[1:], 16)
        else:
            c = int(name)

        if not self.unicode_snob and c in unifiable_n:
            return unifiable_n[c]
        else:
            try:
                return chr(c)
            except (ValueError, OverflowError):  # invalid unicode
                # chr() raises ValueError just above the Unicode range and
                # OverflowError for values too large for a C integer.
                return ""

    def entityref(self, c: str) -> str:
        """Return the text for named entity *c* (given without ``&``/``;``).

        Unknown entities are returned verbatim as ``&name;``.
        """
        if not self.unicode_snob and c in config.UNIFIABLE:
            return config.UNIFIABLE[c]
        try:
            ch = html.entities.html5[c + ";"]
        except KeyError:
            return "&" + c + ";"
        # nbsp always goes through its placeholder (resolved in finish()).
        return config.UNIFIABLE[c] if c == "nbsp" else ch

    def google_nest_count(self, style: Dict[str, str]) -> int:
        """
        Calculate the nesting count of google doc lists

        The ``margin-left`` value is read as an integer followed by a
        two-character unit; any other form counts as nesting level 0.

        :type style: dict

        :rtype: int
        """
        nest_count = 0
        if "margin-left" in style:
            try:
                nest_count = int(style["margin-left"][:-2]) // self.google_list_indent
            except ValueError:
                # e.g. "0", "auto" or a fractional value: not "<int><unit>"
                nest_count = 0

        return nest_count

    def optwrap(self, text: str) -> str:
        """
        Wrap all paragraphs in the provided text.

        Returns *text* unchanged when ``self.body_width`` is falsy.

        :type text: str

        :rtype: str
        """
        if not self.body_width:
            return text

        chunks = []  # type: List[str]
        newlines = 0
        # I cannot think of a better solution for now.
        # To avoid the non-wrap behaviour for entire paras
        # because of the presence of a link in it
        if not self.wrap_links:
            self.inline_links = False
        for para in text.split("\n"):
            if len(para) > 0:
                if not skipwrap(para, self.wrap_links, self.wrap_list_items):
                    indent = ""
                    if para.startswith("  " + self.ul_item_mark):
                        # list item continuation: add a double indent to the
                        # new lines
                        indent = "    "
                    elif para.startswith("> "):
                        # blockquote continuation: add the greater than symbol
                        # to the new lines
                        indent = "> "
                    wrapped = wrap(
                        para,
                        self.body_width,
                        break_long_words=False,
                        subsequent_indent=indent,
                    )
                    chunks.append("\n".join(wrapped))
                    if para.endswith("  "):
                        # keep the hard line break (two trailing spaces)
                        chunks.append("  \n")
                        newlines = 1
                    elif indent:
                        chunks.append("\n")
                        newlines = 1
                    else:
                        chunks.append("\n\n")
                        newlines = 2
                else:
                    # Warning for the tempted!!!
                    # Be aware that obvious replacement of this with
                    # line.isspace()
                    # DOES NOT work! Explanations are welcome.
                    if not config.RE_SPACE.match(para):
                        chunks.append(para + "\n")
                        newlines = 1
            else:
                if newlines < 2:
                    chunks.append("\n")
                    newlines += 1
        return "".join(chunks)


def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
    """Convert the HTML string *html* to Markdown-structured text.

    baseurl: base URL of the document, used to resolve relative links.
    bodywidth: wrap width; ``None`` selects ``config.BODY_WIDTH``.
    """
    if bodywidth is None:
        bodywidth = config.BODY_WIDTH
    h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)

    return h.handle(html)