1"""html2text: Turn HTML into equivalent Markdown-structured text."""
2
3import html.entities
4import html.parser
5import re
6import urllib.parse as urlparse
7from textwrap import wrap
8from typing import Dict, List, Optional, Tuple, Union
9
10from . import config
11from .elements import AnchorElement, ListElement
12from .typing import OutCallback
13from .utils import (
14    dumb_css_parser,
15    element_style,
16    escape_md,
17    escape_md_section,
18    google_fixed_width_font,
19    google_has_height,
20    google_list_style,
21    google_text_emphasis,
22    hn,
23    list_numbering_start,
24    pad_tables_in_text,
25    skipwrap,
26    unifiable_n,
27)
28
29__version__ = (2020, 1, 16)
30
31
32# TODO:
33# Support decoded entities with UNIFIABLE.
34
35
36class HTML2Text(html.parser.HTMLParser):
37    def __init__(
38        self,
39        out: Optional[OutCallback] = None,
40        baseurl: str = "",
41        bodywidth: int = config.BODY_WIDTH,
42    ) -> None:
43        """
44        Input parameters:
45            out: possible custom replacement for self.outtextf (which
46                 appends lines of text).
47            baseurl: base URL of the document we process
48        """
49        super().__init__(convert_charrefs=False)
50
51        # Config options
52        self.split_next_td = False
53        self.td_count = 0
54        self.table_start = False
55        self.unicode_snob = config.UNICODE_SNOB  # covered in cli
56        self.escape_snob = config.ESCAPE_SNOB  # covered in cli
57        self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
58        self.body_width = bodywidth  # covered in cli
59        self.skip_internal_links = config.SKIP_INTERNAL_LINKS  # covered in cli
60        self.inline_links = config.INLINE_LINKS  # covered in cli
61        self.protect_links = config.PROTECT_LINKS  # covered in cli
62        self.google_list_indent = config.GOOGLE_LIST_INDENT  # covered in cli
63        self.ignore_links = config.IGNORE_ANCHORS  # covered in cli
64        self.ignore_images = config.IGNORE_IMAGES  # covered in cli
65        self.images_as_html = config.IMAGES_AS_HTML  # covered in cli
66        self.images_to_alt = config.IMAGES_TO_ALT  # covered in cli
67        self.images_with_size = config.IMAGES_WITH_SIZE  # covered in cli
68        self.ignore_emphasis = config.IGNORE_EMPHASIS  # covered in cli
69        self.bypass_tables = config.BYPASS_TABLES  # covered in cli
70        self.ignore_tables = config.IGNORE_TABLES  # covered in cli
71        self.google_doc = False  # covered in cli
72        self.ul_item_mark = "*"  # covered in cli
73        self.emphasis_mark = "_"  # covered in cli
74        self.strong_mark = "**"
75        self.single_line_break = config.SINGLE_LINE_BREAK  # covered in cli
76        self.use_automatic_links = config.USE_AUTOMATIC_LINKS  # covered in cli
77        self.hide_strikethrough = False  # covered in cli
78        self.mark_code = config.MARK_CODE
79        self.wrap_list_items = config.WRAP_LIST_ITEMS  # covered in cli
80        self.wrap_links = config.WRAP_LINKS  # covered in cli
81        self.pad_tables = config.PAD_TABLES  # covered in cli
82        self.default_image_alt = config.DEFAULT_IMAGE_ALT  # covered in cli
83        self.tag_callback = None
84        self.open_quote = config.OPEN_QUOTE  # covered in cli
85        self.close_quote = config.CLOSE_QUOTE  # covered in cli
86
87        if out is None:
88            self.out = self.outtextf
89        else:
90            self.out = out
91
92        # empty list to store output characters before they are "joined"
93        self.outtextlist = []  # type: List[str]
94
95        self.quiet = 0
96        self.p_p = 0  # number of newline character to print before next output
97        self.outcount = 0
98        self.start = True
99        self.space = False
100        self.a = []  # type: List[AnchorElement]
101        self.astack = []  # type: List[Optional[Dict[str, Optional[str]]]]
102        self.maybe_automatic_link = None  # type: Optional[str]
103        self.empty_link = False
104        self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://")
105        self.acount = 0
106        self.list = []  # type: List[ListElement]
107        self.blockquote = 0
108        self.pre = False
109        self.startpre = False
110        self.code = False
111        self.quote = False
112        self.br_toggle = ""
113        self.lastWasNL = False
114        self.lastWasList = False
115        self.style = 0
116        self.style_def = {}  # type: Dict[str, Dict[str, str]]
117        self.tag_stack = (
118            []
119        )  # type: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]]
120        self.emphasis = 0
121        self.drop_white_space = 0
122        self.inheader = False
123        # Current abbreviation definition
124        self.abbr_title = None  # type: Optional[str]
125        # Last inner HTML (for abbr being defined)
126        self.abbr_data = None  # type: Optional[str]
127        # Stack of abbreviations to write later
128        self.abbr_list = {}  # type: Dict[str, str]
129        self.baseurl = baseurl
130        self.stressed = False
131        self.preceding_stressed = False
132        self.preceding_data = ""
133        self.current_tag = ""
134
135        config.UNIFIABLE["nbsp"] = "&nbsp_place_holder;"
136
137    def feed(self, data: str) -> None:
138        data = data.replace("</' + 'script>", "</ignore>")
139        super().feed(data)
140
141    def handle(self, data: str) -> str:
142        self.feed(data)
143        self.feed("")
144        markdown = self.optwrap(self.finish())
145        if self.pad_tables:
146            return pad_tables_in_text(markdown)
147        else:
148            return markdown
149
150    def outtextf(self, s: str) -> None:
151        self.outtextlist.append(s)
152        if s:
153            self.lastWasNL = s[-1] == "\n"
154
155    def finish(self) -> str:
156        self.close()
157
158        self.pbr()
159        self.o("", force="end")
160
161        outtext = "".join(self.outtextlist)
162
163        if self.unicode_snob:
164            nbsp = html.entities.html5["nbsp;"]
165        else:
166            nbsp = " "
167        outtext = outtext.replace("&nbsp_place_holder;", nbsp)
168
169        # Clear self.outtextlist to avoid memory leak of its content to
170        # the next handling.
171        self.outtextlist = []
172
173        return outtext
174
175    def handle_charref(self, c: str) -> None:
176        self.handle_data(self.charref(c), True)
177
178    def handle_entityref(self, c: str) -> None:
179        ref = self.entityref(c)
180
181        # ref may be an empty string (e.g. for &lrm;/&rlm; markers that should
182        # not contribute to the final output).
183        # self.handle_data cannot handle a zero-length string right after a
184        # stressed tag or mid-text within a stressed tag (text get split and
185        # self.stressed/self.preceding_stressed gets switched after the first
186        # part of that text).
187        if ref:
188            self.handle_data(ref, True)
189
190    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
191        self.handle_tag(tag, dict(attrs), start=True)
192
193    def handle_endtag(self, tag: str) -> None:
194        self.handle_tag(tag, {}, start=False)
195
196    def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]:
197        """
198        :type attrs: dict
199
200        :returns: The index of certain set of attributes (of a link) in the
201        self.a list. If the set of attributes is not found, returns None
202        :rtype: int
203        """
204        if "href" not in attrs:
205            return None
206
207        match = False
208        for i, a in enumerate(self.a):
209            if "href" in a.attrs and a.attrs["href"] == attrs["href"]:
210                if "title" in a.attrs or "title" in attrs:
211                    if (
212                        "title" in a.attrs
213                        and "title" in attrs
214                        and a.attrs["title"] == attrs["title"]
215                    ):
216                        match = True
217                else:
218                    match = True
219
220            if match:
221                return i
222        return None
223
224    def handle_emphasis(
225        self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str, str]
226    ) -> None:
227        """
228        Handles various text emphases
229        """
230        tag_emphasis = google_text_emphasis(tag_style)
231        parent_emphasis = google_text_emphasis(parent_style)
232
233        # handle Google's text emphasis
234        strikethrough = "line-through" in tag_emphasis and self.hide_strikethrough
235
236        # google and others may mark a font's weight as `bold` or `700`
237        bold = False
238        for bold_marker in config.BOLD_TEXT_STYLE_VALUES:
239            bold = bold_marker in tag_emphasis and bold_marker not in parent_emphasis
240            if bold:
241                break
242
243        italic = "italic" in tag_emphasis and "italic" not in parent_emphasis
244        fixed = (
245            google_fixed_width_font(tag_style)
246            and not google_fixed_width_font(parent_style)
247            and not self.pre
248        )
249
250        if start:
251            # crossed-out text must be handled before other attributes
252            # in order not to output qualifiers unnecessarily
253            if bold or italic or fixed:
254                self.emphasis += 1
255            if strikethrough:
256                self.quiet += 1
257            if italic:
258                self.o(self.emphasis_mark)
259                self.drop_white_space += 1
260            if bold:
261                self.o(self.strong_mark)
262                self.drop_white_space += 1
263            if fixed:
264                self.o("`")
265                self.drop_white_space += 1
266                self.code = True
267        else:
268            if bold or italic or fixed:
269                # there must not be whitespace before closing emphasis mark
270                self.emphasis -= 1
271                self.space = False
272            if fixed:
273                if self.drop_white_space:
274                    # empty emphasis, drop it
275                    self.drop_white_space -= 1
276                else:
277                    self.o("`")
278                self.code = False
279            if bold:
280                if self.drop_white_space:
281                    # empty emphasis, drop it
282                    self.drop_white_space -= 1
283                else:
284                    self.o(self.strong_mark)
285            if italic:
286                if self.drop_white_space:
287                    # empty emphasis, drop it
288                    self.drop_white_space -= 1
289                else:
290                    self.o(self.emphasis_mark)
291            # space is only allowed after *all* emphasis marks
292            if (bold or italic) and not self.emphasis:
293                self.o(" ")
294            if strikethrough:
295                self.quiet -= 1
296
297    def handle_tag(
298        self, tag: str, attrs: Dict[str, Optional[str]], start: bool
299    ) -> None:
300        self.current_tag = tag
301
302        if self.tag_callback is not None:
303            if self.tag_callback(self, tag, attrs, start) is True:
304                return
305
306        # first thing inside the anchor tag is another tag
307        # that produces some output
308        if (
309            start
310            and self.maybe_automatic_link is not None
311            and tag not in ["p", "div", "style", "dl", "dt"]
312            and (tag != "img" or self.ignore_images)
313        ):
314            self.o("[")
315            self.maybe_automatic_link = None
316            self.empty_link = False
317
318        if self.google_doc:
319            # the attrs parameter is empty for a closing tag. in addition, we
320            # need the attributes of the parent nodes in order to get a
321            # complete style description for the current element. we assume
322            # that google docs export well formed html.
323            parent_style = {}  # type: Dict[str, str]
324            if start:
325                if self.tag_stack:
326                    parent_style = self.tag_stack[-1][2]
327                tag_style = element_style(attrs, self.style_def, parent_style)
328                self.tag_stack.append((tag, attrs, tag_style))
329            else:
330                dummy, attrs, tag_style = (
331                    self.tag_stack.pop() if self.tag_stack else (None, {}, {})
332                )
333                if self.tag_stack:
334                    parent_style = self.tag_stack[-1][2]
335
336        if hn(tag):
337            self.p()
338            if start:
339                self.inheader = True
340                self.o(hn(tag) * "#" + " ")
341            else:
342                self.inheader = False
343                return  # prevent redundant emphasis marks on headers
344
345        if tag in ["p", "div"]:
346            if self.google_doc:
347                if start and google_has_height(tag_style):
348                    self.p()
349                else:
350                    self.soft_br()
351            elif self.astack and tag == "div":
352                pass
353            else:
354                self.p()
355
356        if tag == "br" and start:
357            if self.blockquote > 0:
358                self.o("  \n> ")
359            else:
360                self.o("  \n")
361
362        if tag == "hr" and start:
363            self.p()
364            self.o("* * *")
365            self.p()
366
367        if tag in ["head", "style", "script"]:
368            if start:
369                self.quiet += 1
370            else:
371                self.quiet -= 1
372
373        if tag == "style":
374            if start:
375                self.style += 1
376            else:
377                self.style -= 1
378
379        if tag in ["body"]:
380            self.quiet = 0  # sites like 9rules.com never close <head>
381
382        if tag == "blockquote":
383            if start:
384                self.p()
385                self.o("> ", force=True)
386                self.start = True
387                self.blockquote += 1
388            else:
389                self.blockquote -= 1
390                self.p()
391
392        def no_preceding_space(self: HTML2Text) -> bool:
393            return bool(
394                self.preceding_data and re.match(r"[^\s]", self.preceding_data[-1])
395            )
396
397        if tag in ["em", "i", "u"] and not self.ignore_emphasis:
398            if start and no_preceding_space(self):
399                emphasis = " " + self.emphasis_mark
400            else:
401                emphasis = self.emphasis_mark
402
403            self.o(emphasis)
404            if start:
405                self.stressed = True
406
407        if tag in ["strong", "b"] and not self.ignore_emphasis:
408            if start and no_preceding_space(self):
409                strong = " " + self.strong_mark
410            else:
411                strong = self.strong_mark
412
413            self.o(strong)
414            if start:
415                self.stressed = True
416
417        if tag in ["del", "strike", "s"]:
418            if start and no_preceding_space(self):
419                strike = " ~~"
420            else:
421                strike = "~~"
422
423            self.o(strike)
424            if start:
425                self.stressed = True
426
427        if self.google_doc:
428            if not self.inheader:
429                # handle some font attributes, but leave headers clean
430                self.handle_emphasis(start, tag_style, parent_style)
431
432        if tag in ["kbd", "code", "tt"] and not self.pre:
433            self.o("`")  # TODO: `` `this` ``
434            self.code = not self.code
435
436        if tag == "abbr":
437            if start:
438                self.abbr_title = None
439                self.abbr_data = ""
440                if "title" in attrs:
441                    self.abbr_title = attrs["title"]
442            else:
443                if self.abbr_title is not None:
444                    assert self.abbr_data is not None
445                    self.abbr_list[self.abbr_data] = self.abbr_title
446                    self.abbr_title = None
447                self.abbr_data = None
448
449        if tag == "q":
450            if not self.quote:
451                self.o(self.open_quote)
452            else:
453                self.o(self.close_quote)
454            self.quote = not self.quote
455
456        def link_url(self: HTML2Text, link: str, title: str = "") -> None:
457            url = urlparse.urljoin(self.baseurl, link)
458            title = ' "{}"'.format(title) if title.strip() else ""
459            self.o("]({url}{title})".format(url=escape_md(url), title=title))
460
461        if tag == "a" and not self.ignore_links:
462            if start:
463                if (
464                    "href" in attrs
465                    and attrs["href"] is not None
466                    and not (self.skip_internal_links and attrs["href"].startswith("#"))
467                ):
468                    self.astack.append(attrs)
469                    self.maybe_automatic_link = attrs["href"]
470                    self.empty_link = True
471                    if self.protect_links:
472                        attrs["href"] = "<" + attrs["href"] + ">"
473                else:
474                    self.astack.append(None)
475            else:
476                if self.astack:
477                    a = self.astack.pop()
478                    if self.maybe_automatic_link and not self.empty_link:
479                        self.maybe_automatic_link = None
480                    elif a:
481                        assert a["href"] is not None
482                        if self.empty_link:
483                            self.o("[")
484                            self.empty_link = False
485                            self.maybe_automatic_link = None
486                        if self.inline_links:
487                            title = a.get("title") or ""
488                            title = escape_md(title)
489                            link_url(self, a["href"], title)
490                        else:
491                            i = self.previousIndex(a)
492                            if i is not None:
493                                a_props = self.a[i]
494                            else:
495                                self.acount += 1
496                                a_props = AnchorElement(a, self.acount, self.outcount)
497                                self.a.append(a_props)
498                            self.o("][" + str(a_props.count) + "]")
499
500        if tag == "img" and start and not self.ignore_images:
501            if "src" in attrs:
502                assert attrs["src"] is not None
503                if not self.images_to_alt:
504                    attrs["href"] = attrs["src"]
505                alt = attrs.get("alt") or self.default_image_alt
506
507                # If we have images_with_size, write raw html including width,
508                # height, and alt attributes
509                if self.images_as_html or (
510                    self.images_with_size and ("width" in attrs or "height" in attrs)
511                ):
512                    self.o("<img src='" + attrs["src"] + "' ")
513                    if "width" in attrs:
514                        assert attrs["width"] is not None
515                        self.o("width='" + attrs["width"] + "' ")
516                    if "height" in attrs:
517                        assert attrs["height"] is not None
518                        self.o("height='" + attrs["height"] + "' ")
519                    if alt:
520                        self.o("alt='" + alt + "' ")
521                    self.o("/>")
522                    return
523
524                # If we have a link to create, output the start
525                if self.maybe_automatic_link is not None:
526                    href = self.maybe_automatic_link
527                    if (
528                        self.images_to_alt
529                        and escape_md(alt) == href
530                        and self.absolute_url_matcher.match(href)
531                    ):
532                        self.o("<" + escape_md(alt) + ">")
533                        self.empty_link = False
534                        return
535                    else:
536                        self.o("[")
537                        self.maybe_automatic_link = None
538                        self.empty_link = False
539
540                # If we have images_to_alt, we discard the image itself,
541                # considering only the alt text.
542                if self.images_to_alt:
543                    self.o(escape_md(alt))
544                else:
545                    self.o("![" + escape_md(alt) + "]")
546                    if self.inline_links:
547                        href = attrs.get("href") or ""
548                        self.o(
549                            "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")"
550                        )
551                    else:
552                        i = self.previousIndex(attrs)
553                        if i is not None:
554                            a_props = self.a[i]
555                        else:
556                            self.acount += 1
557                            a_props = AnchorElement(attrs, self.acount, self.outcount)
558                            self.a.append(a_props)
559                        self.o("[" + str(a_props.count) + "]")
560
561        if tag == "dl" and start:
562            self.p()
563        if tag == "dt" and not start:
564            self.pbr()
565        if tag == "dd" and start:
566            self.o("    ")
567        if tag == "dd" and not start:
568            self.pbr()
569
570        if tag in ["ol", "ul"]:
571            # Google Docs create sub lists as top level lists
572            if not self.list and not self.lastWasList:
573                self.p()
574            if start:
575                if self.google_doc:
576                    list_style = google_list_style(tag_style)
577                else:
578                    list_style = tag
579                numbering_start = list_numbering_start(attrs)
580                self.list.append(ListElement(list_style, numbering_start))
581            else:
582                if self.list:
583                    self.list.pop()
584                    if not self.google_doc and not self.list:
585                        self.o("\n")
586            self.lastWasList = True
587        else:
588            self.lastWasList = False
589
590        if tag == "li":
591            self.pbr()
592            if start:
593                if self.list:
594                    li = self.list[-1]
595                else:
596                    li = ListElement("ul", 0)
597                if self.google_doc:
598                    nest_count = self.google_nest_count(tag_style)
599                else:
600                    nest_count = len(self.list)
601                # TODO: line up <ol><li>s > 9 correctly.
602                self.o("  " * nest_count)
603                if li.name == "ul":
604                    self.o(self.ul_item_mark + " ")
605                elif li.name == "ol":
606                    li.num += 1
607                    self.o(str(li.num) + ". ")
608                self.start = True
609
610        if tag in ["table", "tr", "td", "th"]:
611            if self.ignore_tables:
612                if tag == "tr":
613                    if start:
614                        pass
615                    else:
616                        self.soft_br()
617                else:
618                    pass
619
620            elif self.bypass_tables:
621                if start:
622                    self.soft_br()
623                if tag in ["td", "th"]:
624                    if start:
625                        self.o("<{}>\n\n".format(tag))
626                    else:
627                        self.o("\n</{}>".format(tag))
628                else:
629                    if start:
630                        self.o("<{}>".format(tag))
631                    else:
632                        self.o("</{}>".format(tag))
633
634            else:
635                if tag == "table":
636                    if start:
637                        self.table_start = True
638                        if self.pad_tables:
639                            self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
640                            self.o("  \n")
641                    else:
642                        if self.pad_tables:
643                            self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
644                            self.o("  \n")
645                if tag in ["td", "th"] and start:
646                    if self.split_next_td:
647                        self.o("| ")
648                    self.split_next_td = True
649
650                if tag == "tr" and start:
651                    self.td_count = 0
652                if tag == "tr" and not start:
653                    self.split_next_td = False
654                    self.soft_br()
655                if tag == "tr" and not start and self.table_start:
656                    # Underline table header
657                    self.o("|".join(["---"] * self.td_count))
658                    self.soft_br()
659                    self.table_start = False
660                if tag in ["td", "th"] and start:
661                    self.td_count += 1
662
663        if tag == "pre":
664            if start:
665                self.startpre = True
666                self.pre = True
667            else:
668                self.pre = False
669                if self.mark_code:
670                    self.out("\n[/code]")
671            self.p()
672
673    # TODO: Add docstring for these one letter functions
674    def pbr(self) -> None:
675        "Pretty print has a line break"
676        if self.p_p == 0:
677            self.p_p = 1
678
679    def p(self) -> None:
680        "Set pretty print to 1 or 2 lines"
681        self.p_p = 1 if self.single_line_break else 2
682
683    def soft_br(self) -> None:
684        "Soft breaks"
685        self.pbr()
686        self.br_toggle = "  "
687
688    def o(
689        self, data: str, puredata: bool = False, force: Union[bool, str] = False
690    ) -> None:
691        """
692        Deal with indentation and whitespace
693        """
694        if self.abbr_data is not None:
695            self.abbr_data += data
696
697        if not self.quiet:
698            if self.google_doc:
699                # prevent white space immediately after 'begin emphasis'
700                # marks ('**' and '_')
701                lstripped_data = data.lstrip()
702                if self.drop_white_space and not (self.pre or self.code):
703                    data = lstripped_data
704                if lstripped_data != "":
705                    self.drop_white_space = 0
706
707            if puredata and not self.pre:
708                # This is a very dangerous call ... it could mess up
709                # all handling of &nbsp; when not handled properly
710                # (see entityref)
711                data = re.sub(r"\s+", r" ", data)
712                if data and data[0] == " ":
713                    self.space = True
714                    data = data[1:]
715            if not data and not force:
716                return
717
718            if self.startpre:
719                # self.out(" :") #TODO: not output when already one there
720                if not data.startswith("\n") and not data.startswith("\r\n"):
721                    # <pre>stuff...
722                    data = "\n" + data
723                if self.mark_code:
724                    self.out("\n[code]")
725                    self.p_p = 0
726
727            bq = ">" * self.blockquote
728            if not (force and data and data[0] == ">") and self.blockquote:
729                bq += " "
730
731            if self.pre:
732                if not self.list:
733                    bq += "    "
734                # else: list content is already partially indented
735                bq += "    " * len(self.list)
736                data = data.replace("\n", "\n" + bq)
737
738            if self.startpre:
739                self.startpre = False
740                if self.list:
741                    # use existing initial indentation
742                    data = data.lstrip("\n")
743
744            if self.start:
745                self.space = False
746                self.p_p = 0
747                self.start = False
748
749            if force == "end":
750                # It's the end.
751                self.p_p = 0
752                self.out("\n")
753                self.space = False
754
755            if self.p_p:
756                self.out((self.br_toggle + "\n" + bq) * self.p_p)
757                self.space = False
758                self.br_toggle = ""
759
760            if self.space:
761                if not self.lastWasNL:
762                    self.out(" ")
763                self.space = False
764
765            if self.a and (
766                (self.p_p == 2 and self.links_each_paragraph) or force == "end"
767            ):
768                if force == "end":
769                    self.out("\n")
770
771                newa = []
772                for link in self.a:
773                    if self.outcount > link.outcount:
774                        self.out(
775                            "   ["
776                            + str(link.count)
777                            + "]: "
778                            + urlparse.urljoin(self.baseurl, link.attrs["href"])
779                        )
780                        if "title" in link.attrs:
781                            assert link.attrs["title"] is not None
782                            self.out(" (" + link.attrs["title"] + ")")
783                        self.out("\n")
784                    else:
785                        newa.append(link)
786
787                # Don't need an extra line when nothing was done.
788                if self.a != newa:
789                    self.out("\n")
790
791                self.a = newa
792
793            if self.abbr_list and force == "end":
794                for abbr, definition in self.abbr_list.items():
795                    self.out("  *[" + abbr + "]: " + definition + "\n")
796
797            self.p_p = 0
798            self.out(data)
799            self.outcount += 1
800
801    def handle_data(self, data: str, entity_char: bool = False) -> None:
802        if not data:
803            # Data may be empty for some HTML entities. For example,
804            # LEFT-TO-RIGHT MARK.
805            return
806
807        if self.stressed:
808            data = data.strip()
809            self.stressed = False
810            self.preceding_stressed = True
811        elif self.preceding_stressed:
812            if (
813                re.match(r"[^\s.!?]", data[0])
814                and not hn(self.current_tag)
815                and self.current_tag not in ["a", "code", "pre"]
816            ):
817                # should match a letter or common punctuation
818                data = " " + data
819            self.preceding_stressed = False
820
821        if self.style:
822            self.style_def.update(dumb_css_parser(data))
823
824        if self.maybe_automatic_link is not None:
825            href = self.maybe_automatic_link
826            if (
827                href == data
828                and self.absolute_url_matcher.match(href)
829                and self.use_automatic_links
830            ):
831                self.o("<" + data + ">")
832                self.empty_link = False
833                return
834            else:
835                self.o("[")
836                self.maybe_automatic_link = None
837                self.empty_link = False
838
839        if not self.code and not self.pre and not entity_char:
840            data = escape_md_section(data, snob=self.escape_snob)
841        self.preceding_data = data
842        self.o(data, puredata=True)
843
844    def charref(self, name: str) -> str:
845        if name[0] in ["x", "X"]:
846            c = int(name[1:], 16)
847        else:
848            c = int(name)
849
850        if not self.unicode_snob and c in unifiable_n:
851            return unifiable_n[c]
852        else:
853            try:
854                return chr(c)
855            except ValueError:  # invalid unicode
856                return ""
857
858    def entityref(self, c: str) -> str:
859        if not self.unicode_snob and c in config.UNIFIABLE:
860            return config.UNIFIABLE[c]
861        try:
862            ch = html.entities.html5[c + ";"]
863        except KeyError:
864            return "&" + c + ";"
865        return config.UNIFIABLE[c] if c == "nbsp" else ch
866
867    def google_nest_count(self, style: Dict[str, str]) -> int:
868        """
869        Calculate the nesting count of google doc lists
870
871        :type style: dict
872
873        :rtype: int
874        """
875        nest_count = 0
876        if "margin-left" in style:
877            nest_count = int(style["margin-left"][:-2]) // self.google_list_indent
878
879        return nest_count
880
881    def optwrap(self, text: str) -> str:
882        """
883        Wrap all paragraphs in the provided text.
884
885        :type text: str
886
887        :rtype: str
888        """
889        if not self.body_width:
890            return text
891
892        result = ""
893        newlines = 0
894        # I cannot think of a better solution for now.
895        # To avoid the non-wrap behaviour for entire paras
896        # because of the presence of a link in it
897        if not self.wrap_links:
898            self.inline_links = False
899        for para in text.split("\n"):
900            if len(para) > 0:
901                if not skipwrap(para, self.wrap_links, self.wrap_list_items):
902                    indent = ""
903                    if para.startswith("  " + self.ul_item_mark):
904                        # list item continuation: add a double indent to the
905                        # new lines
906                        indent = "    "
907                    elif para.startswith("> "):
908                        # blockquote continuation: add the greater than symbol
909                        # to the new lines
910                        indent = "> "
911                    wrapped = wrap(
912                        para,
913                        self.body_width,
914                        break_long_words=False,
915                        subsequent_indent=indent,
916                    )
917                    result += "\n".join(wrapped)
918                    if para.endswith("  "):
919                        result += "  \n"
920                        newlines = 1
921                    elif indent:
922                        result += "\n"
923                        newlines = 1
924                    else:
925                        result += "\n\n"
926                        newlines = 2
927                else:
928                    # Warning for the tempted!!!
929                    # Be aware that obvious replacement of this with
930                    # line.isspace()
931                    # DOES NOT work! Explanations are welcome.
932                    if not config.RE_SPACE.match(para):
933                        result += para + "\n"
934                        newlines = 1
935            else:
936                if newlines < 2:
937                    result += "\n"
938                    newlines += 1
939        return result
940
941
def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
    """
    Convert an HTML string by running it through a fresh ``HTML2Text``.

    :param html: the HTML source to convert
    :param baseurl: forwarded to ``HTML2Text`` as ``baseurl``
    :param bodywidth: forwarded to ``HTML2Text`` as ``bodywidth``;
        ``config.BODY_WIDTH`` is used when this is ``None``
    :returns: the result of ``HTML2Text.handle`` for ``html``
    """
    width = config.BODY_WIDTH if bodywidth is None else bodywidth
    converter = HTML2Text(baseurl=baseurl, bodywidth=width)
    return converter.handle(html)
948