1#!/usr/bin/env python
2# coding: utf-8
3"""html2text: Turn HTML into equivalent Markdown-structured text."""
4from __future__ import division
5from __future__ import unicode_literals
6import re
7import sys
8
9try:
10    from textwrap import wrap
11except ImportError:  # pragma: no cover
12    pass
13
14from html2text.compat import urlparse, HTMLParser
15from html2text import config
16
17from html2text.utils import (
18    name2cp,
19    unifiable_n,
20    google_text_emphasis,
21    google_fixed_width_font,
22    element_style,
23    hn,
24    google_has_height,
25    escape_md,
26    google_list_style,
27    list_numbering_start,
28    dumb_css_parser,
29    escape_md_section,
30    skipwrap,
31    pad_tables_in_text
32)
33
try:
    # Python 2: rebind ``chr`` to ``unichr`` so it yields unicode characters,
    # and build the empty join separator as a unicode string.
    chr = unichr
    nochr = unicode('')
except NameError:
    # python3 uses chr
    nochr = str('')

# Version of this module, as a tuple of integers.
__version__ = (2018, 1, 9)
42
43
44# TODO:
45# Support decoded entities with UNIFIABLE.
46
47
48class HTML2Text(HTMLParser.HTMLParser):
49    def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH):
50        """
51        Input parameters:
52            out: possible custom replacement for self.outtextf (which
53                 appends lines of text).
54            baseurl: base URL of the document we process
55        """
56        kwargs = {}
57        if sys.version_info >= (3, 4):
58            kwargs['convert_charrefs'] = False
59        HTMLParser.HTMLParser.__init__(self, **kwargs)
60
61        # Config options
62        self.split_next_td = False
63        self.td_count = 0
64        self.table_start = False
65        self.unicode_snob = config.UNICODE_SNOB  # covered in cli
66        self.escape_snob = config.ESCAPE_SNOB  # covered in cli
67        self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
68        self.body_width = bodywidth  # covered in cli
69        self.skip_internal_links = config.SKIP_INTERNAL_LINKS  # covered in cli
70        self.inline_links = config.INLINE_LINKS  # covered in cli
71        self.protect_links = config.PROTECT_LINKS  # covered in cli
72        self.google_list_indent = config.GOOGLE_LIST_INDENT  # covered in cli
73        self.ignore_links = config.IGNORE_ANCHORS  # covered in cli
74        self.ignore_images = config.IGNORE_IMAGES  # covered in cli
75        self.images_to_alt = config.IMAGES_TO_ALT  # covered in cli
76        self.images_with_size = config.IMAGES_WITH_SIZE  # covered in cli
77        self.ignore_emphasis = config.IGNORE_EMPHASIS  # covered in cli
78        self.bypass_tables = config.BYPASS_TABLES  # covered in cli
79        self.ignore_tables = config.IGNORE_TABLES  # covered in cli
80        self.google_doc = False  # covered in cli
81        self.ul_item_mark = '*'  # covered in cli
82        self.emphasis_mark = '_'  # covered in cli
83        self.strong_mark = '**'
84        self.single_line_break = config.SINGLE_LINE_BREAK  # covered in cli
85        self.use_automatic_links = config.USE_AUTOMATIC_LINKS  # covered in cli
86        self.hide_strikethrough = False  # covered in cli
87        self.mark_code = config.MARK_CODE
88        self.wrap_links = config.WRAP_LINKS  # covered in cli
89        self.pad_tables = config.PAD_TABLES  # covered in cli
90        self.default_image_alt = config.DEFAULT_IMAGE_ALT  # covered in cli
91        self.tag_callback = None
92        self.open_quote = config.OPEN_QUOTE  # covered in cli
93        self.close_quote = config.CLOSE_QUOTE  # covered in cli
94
95        if out is None:  # pragma: no cover
96            self.out = self.outtextf
97        else:  # pragma: no cover
98            self.out = out
99
100        # empty list to store output characters before they are "joined"
101        self.outtextlist = []
102
103        self.quiet = 0
104        self.p_p = 0  # number of newline character to print before next output
105        self.outcount = 0
106        self.start = 1
107        self.space = 0
108        self.a = []
109        self.astack = []
110        self.maybe_automatic_link = None
111        self.empty_link = False
112        self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
113        self.acount = 0
114        self.list = []
115        self.blockquote = 0
116        self.pre = 0
117        self.startpre = 0
118        self.code = False
119        self.quote = False
120        self.br_toggle = ''
121        self.lastWasNL = 0
122        self.lastWasList = False
123        self.style = 0
124        self.style_def = {}
125        self.tag_stack = []
126        self.emphasis = 0
127        self.drop_white_space = 0
128        self.inheader = False
129        self.abbr_title = None  # current abbreviation definition
130        self.abbr_data = None  # last inner HTML (for abbr being defined)
131        self.abbr_list = {}  # stack of abbreviations to write later
132        self.baseurl = baseurl
133        self.stressed = False
134        self.preceding_stressed = False
135        self.preceding_data = None
136        self.current_tag = None
137
138        try:
139            del unifiable_n[name2cp('nbsp')]
140        except KeyError:
141            pass
142        config.UNIFIABLE['nbsp'] = '&nbsp_place_holder;'
143
144    def feed(self, data):
145        data = data.replace("</' + 'script>", "</ignore>")
146        HTMLParser.HTMLParser.feed(self, data)
147
148    def handle(self, data):
149        self.feed(data)
150        self.feed("")
151        markdown = self.optwrap(self.close())
152        if self.pad_tables:
153            return pad_tables_in_text(markdown)
154        else:
155            return markdown
156
157    def outtextf(self, s):
158        self.outtextlist.append(s)
159        if s:
160            self.lastWasNL = s[-1] == '\n'
161
162    def close(self):
163        HTMLParser.HTMLParser.close(self)
164
165        self.pbr()
166        self.o('', 0, 'end')
167
168        outtext = nochr.join(self.outtextlist)
169
170        if self.unicode_snob:
171            nbsp = chr(name2cp('nbsp'))
172        else:
173            nbsp = chr(32)
174        try:
175            outtext = outtext.replace(unicode('&nbsp_place_holder;'), nbsp)
176        except NameError:
177            outtext = outtext.replace('&nbsp_place_holder;', nbsp)
178
179        # Clear self.outtextlist to avoid memory leak of its content to
180        # the next handling.
181        self.outtextlist = []
182
183        return outtext
184
185    def handle_charref(self, c):
186        self.handle_data(self.charref(c), True)
187
188    def handle_entityref(self, c):
189        self.handle_data(self.entityref(c), True)
190
191    def handle_starttag(self, tag, attrs):
192        self.handle_tag(tag, attrs, 1)
193
194    def handle_endtag(self, tag):
195        self.handle_tag(tag, None, 0)
196
197    def previousIndex(self, attrs):
198        """
199        :type attrs: dict
200
201        :returns: The index of certain set of attributes (of a link) in the
202        self.a list. If the set of attributes is not found, returns None
203        :rtype: int
204        """
205        if 'href' not in attrs:  # pragma: no cover
206            return None
207        i = -1
208        for a in self.a:
209            i += 1
210            match = 0
211
212            if 'href' in a and a['href'] == attrs['href']:
213                if 'title' in a or 'title' in attrs:
214                    if 'title' in a and \
215                        'title' in attrs and \
216                            a['title'] == attrs['title']:
217                        match = True
218                else:
219                    match = True
220
221            if match:
222                return i
223
    def handle_emphasis(self, start, tag_style, parent_style):
        """
        Handles various text emphases

        Emits emphasis marks derived from an element's style compared with
        its parent's style. It is called from handle_tag only when
        self.google_doc is set and the parser is not inside a header.

        :param start: truthy for an opening tag, falsy for a closing tag
        :param tag_style: style of the current element, as passed to
            google_text_emphasis / google_fixed_width_font
        :param parent_style: style of the parent element, same form
        """
        tag_emphasis = google_text_emphasis(tag_style)
        parent_emphasis = google_text_emphasis(parent_style)

        # handle Google's text emphasis
        strikethrough = 'line-through' in \
                        tag_emphasis and self.hide_strikethrough

        # google and others may mark a font's weight as `bold` or `700`
        bold = False
        for bold_marker in config.BOLD_TEXT_STYLE_VALUES:
            # only count emphasis that the parent does not already apply
            bold = (bold_marker in tag_emphasis
                    and bold_marker not in parent_emphasis)
            if bold:
                break

        italic = 'italic' in tag_emphasis and 'italic' not in parent_emphasis
        fixed = google_fixed_width_font(tag_style) and not \
            google_fixed_width_font(parent_style) and not self.pre

        if start:
            # crossed-out text must be handled before other attributes
            # in order not to output qualifiers unnecessarily
            if bold or italic or fixed:
                self.emphasis += 1
            if strikethrough:
                # suppress all output until the matching closing tag
                self.quiet += 1
            if italic:
                self.o(self.emphasis_mark)
                self.drop_white_space += 1
            if bold:
                self.o(self.strong_mark)
                self.drop_white_space += 1
            if fixed:
                self.o('`')
                self.drop_white_space += 1
                self.code = True
        else:
            # closing marks are emitted in the reverse order of the
            # opening ones: fixed, bold, italic
            if bold or italic or fixed:
                # there must not be whitespace before closing emphasis mark
                self.emphasis -= 1
                self.space = 0
            if fixed:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_white_space -= 1
                else:
                    self.o('`')
                self.code = False
            if bold:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_white_space -= 1
                else:
                    self.o(self.strong_mark)
            if italic:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_white_space -= 1
                else:
                    self.o(self.emphasis_mark)
            # space is only allowed after *all* emphasis marks
            if (bold or italic) and not self.emphasis:
                self.o(" ")
            if strikethrough:
                self.quiet -= 1
293
    def handle_tag(self, tag, attrs, start):
        """
        Handle a start or end tag and emit the matching Markdown.

        Called by handle_starttag (start=1) and by handle_endtag
        (attrs=None, start=0).

        :param tag: tag name as reported by the parser
        :param attrs: iterable of (name, value) pairs, or None for end tags
        :param start: truthy for an opening tag, falsy for a closing tag
        """
        # remembered so that handle_data can tell which tag came last
        self.current_tag = tag
        # attrs is None for endtags
        if attrs is None:
            attrs = {}
        else:
            attrs = dict(attrs)

        # a user callback that returns True takes over this tag completely
        if self.tag_callback is not None:
            if self.tag_callback(self, tag, attrs, start) is True:
                return

        # first thing inside the anchor tag is another tag
        # that produces some output
        if (start and self.maybe_automatic_link is not None and
                tag not in ['p', 'div', 'style', 'dl', 'dt'] and
                (tag != "img" or self.ignore_images)):
            self.o("[")
            self.maybe_automatic_link = None
            self.empty_link = False

        if self.google_doc:
            # the attrs parameter is empty for a closing tag. in addition, we
            # need the attributes of the parent nodes in order to get a
            # complete style description for the current element. we assume
            # that google docs export well formed html.
            parent_style = {}
            if start:
                if self.tag_stack:
                    parent_style = self.tag_stack[-1][2]
                tag_style = element_style(attrs, self.style_def, parent_style)
                self.tag_stack.append((tag, attrs, tag_style))
            else:
                # restore the attributes and style recorded for the
                # matching opening tag
                dummy, attrs, tag_style = self.tag_stack.pop() \
                    if self.tag_stack else (None, {}, {})
                if self.tag_stack:
                    parent_style = self.tag_stack[-1][2]

        # headers: hn(tag) is used as the number of '#' characters
        if hn(tag):
            self.p()
            if start:
                self.inheader = True
                self.o(hn(tag) * "#" + ' ')
            else:
                self.inheader = False
                return  # prevent redundant emphasis marks on headers

        if tag in ['p', 'div']:
            if self.google_doc:
                if start and google_has_height(tag_style):
                    self.p()
                else:
                    self.soft_br()
            elif self.astack and tag == 'div':
                # a <div> inside an open link does not break the paragraph
                pass
            else:
                self.p()

        if tag == "br" and start:
            if self.blockquote > 0:
                self.o("  \n> ")
            else:
                self.o("  \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        # content of these elements is not written (see self.quiet in o())
        if tag in ["head", "style", 'script']:
            if start:
                self.quiet += 1
            else:
                self.quiet -= 1

        # while self.style is non-zero, handle_data feeds the text to
        # dumb_css_parser
        if tag == "style":
            if start:
                self.style += 1
            else:
                self.style -= 1

        if tag in ["body"]:
            self.quiet = 0  # sites like 9rules.com never close <head>

        if tag == "blockquote":
            if start:
                self.p()
                self.o('> ', 0, 1)
                self.start = 1
                self.blockquote += 1
            else:
                self.blockquote -= 1
                self.p()

        # Truthy when the previously emitted data ended with a
        # non-whitespace character.
        def no_preceding_space(self):
            return (self.preceding_data
                    and re.match(r'[^\s]', self.preceding_data[-1]))

        if tag in ['em', 'i', 'u'] and not self.ignore_emphasis:
            # separate an opening mark from directly preceding text
            if start and no_preceding_space(self):
                emphasis = ' ' + self.emphasis_mark
            else:
                emphasis = self.emphasis_mark

            self.o(emphasis)
            if start:
                self.stressed = True

        if tag in ['strong', 'b'] and not self.ignore_emphasis:
            if start and no_preceding_space(self):
                strong = ' ' + self.strong_mark
            else:
                strong = self.strong_mark

            self.o(strong)
            if start:
                self.stressed = True

        # NOTE: unlike the two blocks above, this one does not check
        # self.ignore_emphasis
        if tag in ['del', 'strike', 's']:
            if start and no_preceding_space(self):
                strike = ' ~~'
            else:
                strike = '~~'

            self.o(strike)
            if start:
                self.stressed = True

        if self.google_doc:
            if not self.inheader:
                # handle some font attributes, but leave headers clean
                self.handle_emphasis(start, tag_style, parent_style)

        if tag in ["kbd", "code", "tt"] and not self.pre:
            self.o('`')  # TODO: `` `this` ``
            self.code = not self.code

        if tag == "abbr":
            if start:
                self.abbr_title = None
                # from now on o() accumulates the element's text here
                self.abbr_data = ''
                if ('title' in attrs):
                    self.abbr_title = attrs['title']
            else:
                if self.abbr_title is not None:
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                self.abbr_data = ''

        # <q> toggles between the opening and the closing quote mark
        if tag == "q":
            if not self.quote:
                self.o(self.open_quote)
            else:
                self.o(self.close_quote)
            self.quote = not self.quote

        # Emit the closing half of an inline link: "](url "title")",
        # with the url resolved against self.baseurl.
        def link_url(self, link, title=""):
            url = urlparse.urljoin(self.baseurl, link)
            title = ' "{0}"'.format(title) if title.strip() else ''
            self.o(']({url}{title})'.format(url=escape_md(url),
                                            title=title))

        if tag == "a" and not self.ignore_links:
            if start:
                if 'href' in attrs and \
                    attrs['href'] is not None and not \
                        (self.skip_internal_links and
                            attrs['href'].startswith('#')):
                    self.astack.append(attrs)
                    # output of "[" is deferred until the link content
                    # is known (see handle_data)
                    self.maybe_automatic_link = attrs['href']
                    self.empty_link = True
                    if self.protect_links:
                        attrs['href'] = '<' + attrs['href'] + '>'
                else:
                    # keep the stack balanced for anchors that are skipped
                    self.astack.append(None)
            else:
                if self.astack:
                    a = self.astack.pop()
                    if self.maybe_automatic_link and not self.empty_link:
                        # the link was already written as <url>
                        self.maybe_automatic_link = None
                    elif a:
                        if self.empty_link:
                            self.o("[")
                            self.empty_link = False
                            self.maybe_automatic_link = None
                        if self.inline_links:
                            try:
                                title = a['title'] if a['title'] else ''
                                title = escape_md(title)
                            except KeyError:
                                link_url(self, a['href'], '')
                            else:
                                link_url(self, a['href'], title)
                        else:
                            # reference style: reuse the number of an
                            # identical link or assign the next one
                            i = self.previousIndex(a)
                            if i is not None:
                                a = self.a[i]
                            else:
                                self.acount += 1
                                a['count'] = self.acount
                                a['outcount'] = self.outcount
                                self.a.append(a)
                            self.o("][" + str(a['count']) + "]")

        if tag == "img" and start and not self.ignore_images:
            if 'src' in attrs:
                if not self.images_to_alt:
                    attrs['href'] = attrs['src']
                alt = attrs.get('alt') or self.default_image_alt

                # If we have images_with_size, write raw html including width,
                # height, and alt attributes
                if self.images_with_size and \
                        ("width" in attrs or "height" in attrs):
                    self.o("<img src='" + attrs["src"] + "' ")
                    if "width" in attrs:
                        self.o("width='" + attrs["width"] + "' ")
                    if "height" in attrs:
                        self.o("height='" + attrs["height"] + "' ")
                    if alt:
                        self.o("alt='" + alt + "' ")
                    self.o("/>")
                    return

                # If we have a link to create, output the start
                if self.maybe_automatic_link is not None:
                    href = self.maybe_automatic_link
                    if self.images_to_alt and escape_md(alt) == href and \
                            self.absolute_url_matcher.match(href):
                        self.o("<" + escape_md(alt) + ">")
                        self.empty_link = False
                        return
                    else:
                        self.o("[")
                        self.maybe_automatic_link = None
                        self.empty_link = False

                # If we have images_to_alt, we discard the image itself,
                # considering only the alt text.
                if self.images_to_alt:
                    self.o(escape_md(alt))
                else:
                    self.o("![" + escape_md(alt) + "]")
                    if self.inline_links:
                        href = attrs.get('href') or ''
                        self.o(
                            "(" +
                            escape_md(
                                urlparse.urljoin(
                                    self.baseurl,
                                    href
                                )
                            ) +
                            ")"
                        )
                    else:
                        # reference style, sharing the numbering with links
                        i = self.previousIndex(attrs)
                        if i is not None:
                            attrs = self.a[i]
                        else:
                            self.acount += 1
                            attrs['count'] = self.acount
                            attrs['outcount'] = self.outcount
                            self.a.append(attrs)
                        self.o("[" + str(attrs['count']) + "]")

        # definition lists
        if tag == 'dl' and start:
            self.p()
        if tag == 'dt' and not start:
            self.pbr()
        if tag == 'dd' and start:
            self.o('    ')
        if tag == 'dd' and not start:
            self.pbr()

        if tag in ["ol", "ul"]:
            # Google Docs create sub lists as top level lists
            if (not self.list) and (not self.lastWasList):
                self.p()
            if start:
                if self.google_doc:
                    list_style = google_list_style(tag_style)
                else:
                    list_style = tag
                numbering_start = list_numbering_start(attrs)
                self.list.append({
                    'name': list_style,
                    'num': numbering_start
                })
            else:
                if self.list:
                    self.list.pop()
                    if (not self.google_doc) and (not self.list):
                        self.o('\n')
            self.lastWasList = True
        else:
            self.lastWasList = False

        if tag == 'li':
            self.pbr()
            if start:
                if self.list:
                    li = self.list[-1]
                else:
                    # <li> outside any list: treat as an unordered item
                    li = {'name': 'ul', 'num': 0}
                if self.google_doc:
                    nest_count = self.google_nest_count(tag_style)
                else:
                    nest_count = len(self.list)
                # TODO: line up <ol><li>s > 9 correctly.
                self.o("  " * nest_count)
                if li['name'] == "ul":
                    self.o(self.ul_item_mark + " ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(str(li['num']) + ". ")
                self.start = 1

        if tag in ["table", "tr", "td", "th"]:
            if self.ignore_tables:
                # only row ends produce output: a soft line break
                if tag == 'tr':
                    if start:
                        pass
                    else:
                        self.soft_br()
                else:
                    pass

            elif self.bypass_tables:
                # write the table tags through as raw HTML
                if start:
                    self.soft_br()
                if tag in ["td", "th"]:
                    if start:
                        self.o('<{0}>\n\n'.format(tag))
                    else:
                        self.o('\n</{0}>'.format(tag))
                else:
                    if start:
                        self.o('<{0}>'.format(tag))
                    else:
                        self.o('</{0}>'.format(tag))

            else:
                if tag == "table":
                    if start:
                        self.table_start = True
                        if self.pad_tables:
                            self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
                            self.o("  \n")
                    else:
                        if self.pad_tables:
                            self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
                            self.o("  \n")
                if tag in ["td", "th"] and start:
                    # cells after the first one in a row get a separator
                    if self.split_next_td:
                        self.o("| ")
                    self.split_next_td = True

                if tag == "tr" and start:
                    self.td_count = 0
                if tag == "tr" and not start:
                    self.split_next_td = False
                    self.soft_br()
                if tag == "tr" and not start and self.table_start:
                    # Underline table header
                    self.o("|".join(["---"] * self.td_count))
                    self.soft_br()
                    self.table_start = False
                if tag in ["td", "th"] and start:
                    self.td_count += 1

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
                if self.mark_code:
                    self.out("\n[/code]")
            self.p()
674
675    # TODO: Add docstring for these one letter functions
676    def pbr(self):
677        "Pretty print has a line break"
678        if self.p_p == 0:
679            self.p_p = 1
680
681    def p(self):
682        "Set pretty print to 1 or 2 lines"
683        self.p_p = 1 if self.single_line_break else 2
684
685    def soft_br(self):
686        "Soft breaks"
687        self.pbr()
688        self.br_toggle = '  '
689
    def o(self, data, puredata=0, force=0):
        """
        Deal with indentation and whitespace

        Writes ``data`` through self.out, first emitting any pending line
        breaks (self.p_p), blockquote / <pre> indentation and a pending
        space. Nothing is written while self.quiet is non-zero.

        :param data: text to output
        :param puredata: when truthy and not inside <pre>, runs of
            whitespace are collapsed to a single space
        :param force: when truthy, empty data is still processed; the
            value 'end' additionally flushes the collected link
            references and abbreviation definitions
        """
        # collected even while quiet; handle_tag stores it as the key of
        # the abbreviation when </abbr> is seen
        if self.abbr_data is not None:
            self.abbr_data += data

        if not self.quiet:
            if self.google_doc:
                # prevent white space immediately after 'begin emphasis'
                # marks ('**' and '_')
                lstripped_data = data.lstrip()
                if self.drop_white_space and not (self.pre or self.code):
                    data = lstripped_data
                if lstripped_data != '':
                    self.drop_white_space = 0

            if puredata and not self.pre:
                # This is a very dangerous call ... it could mess up
                # all handling of &nbsp; when not handled properly
                # (see entityref)
                data = re.sub(r'\s+', r' ', data)
                if data and data[0] == ' ':
                    # remember the leading space; it is written later
                    # only if the output does not end with a newline
                    self.space = 1
                    data = data[1:]
            if not data and not force:
                return

            if self.startpre:
                # self.out(" :") #TODO: not output when already one there
                if not data.startswith("\n") and not data.startswith("\r\n"):
                    # <pre>stuff...
                    data = "\n" + data
                if self.mark_code:
                    self.out("\n[code]")
                    self.p_p = 0

            # prefix repeated after every newline inside a blockquote
            bq = (">" * self.blockquote)
            if not (force and data and data[0] == ">") and self.blockquote:
                bq += " "

            if self.pre:
                if not self.list:
                    bq += "    "
                # else: list content is already partially indented
                for i in range(len(self.list)):
                    bq += "    "
                data = data.replace("\n", "\n" + bq)

            if self.startpre:
                self.startpre = 0
                if self.list:
                    # use existing initial indentation
                    data = data.lstrip("\n")

            if self.start:
                # first output after a block start: drop pending breaks
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = 0

            if self.p_p:
                self.out((self.br_toggle + '\n' + bq) * self.p_p)
                self.space = 0
                self.br_toggle = ''

            if self.space:
                if not self.lastWasNL:
                    self.out(' ')
                self.space = 0

            # write out reference-style link definitions, either after a
            # paragraph (links_each_paragraph) or at the very end
            if self.a and ((self.p_p == 2 and self.links_each_paragraph) or
                           force == "end"):
                if force == "end":
                    self.out("\n")

                newa = []
                for link in self.a:
                    # only links whose text has already been written
                    if self.outcount > link['outcount']:
                        self.out("   [" + str(link['count']) + "]: " +
                                 urlparse.urljoin(self.baseurl, link['href']))
                        if 'title' in link:
                            self.out(" (" + link['title'] + ")")
                        self.out("\n")
                    else:
                        newa.append(link)

                # Don't need an extra line when nothing was done.
                if self.a != newa:
                    self.out("\n")

                self.a = newa

            if self.abbr_list and force == "end":
                for abbr, definition in self.abbr_list.items():
                    self.out("  *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.outcount += 1
795
796    def handle_data(self, data, entity_char=False):
797        if self.stressed:
798            data = data.strip()
799            self.stressed = False
800            self.preceding_stressed = True
801        elif (self.preceding_stressed
802              and re.match(r'[^\s.!?]', data[0])
803              and not hn(self.current_tag)
804              and self.current_tag not in ['a', 'code', 'pre']):
805            # should match a letter or common punctuation
806            data = ' ' + data
807            self.preceding_stressed = False
808
809        if self.style:
810            self.style_def.update(dumb_css_parser(data))
811
812        if self.maybe_automatic_link is not None:
813            href = self.maybe_automatic_link
814            if (href == data and self.absolute_url_matcher.match(href) and
815                    self.use_automatic_links):
816                self.o("<" + data + ">")
817                self.empty_link = False
818                return
819            else:
820                self.o("[")
821                self.maybe_automatic_link = None
822                self.empty_link = False
823
824        if not self.code and not self.pre and not entity_char:
825            data = escape_md_section(data, snob=self.escape_snob)
826        self.preceding_data = data
827        self.o(data, 1)
828
829    def unknown_decl(self, data):  # pragma: no cover
830        # TODO: what is this doing here?
831        pass
832
833    def charref(self, name):
834        if name[0] in ['x', 'X']:
835            c = int(name[1:], 16)
836        else:
837            c = int(name)
838
839        if not self.unicode_snob and c in unifiable_n.keys():
840            return unifiable_n[c]
841        else:
842            try:
843                return chr(c)
844            except ValueError:  # invalid unicode
845                return ''
846
847    def entityref(self, c):
848        if not self.unicode_snob and c in config.UNIFIABLE.keys():
849            return config.UNIFIABLE[c]
850        else:
851            try:
852                name2cp(c)
853            except KeyError:
854                return "&" + c + ';'
855            else:
856                if c == 'nbsp':
857                    return config.UNIFIABLE[c]
858                else:
859                    return chr(name2cp(c))
860
861    def replaceEntities(self, s):
862        s = s.group(1)
863        if s[0] == "#":
864            return self.charref(s[1:])
865        else:
866            return self.entityref(s)
867
868    def unescape(self, s):
869        return config.RE_UNESCAPE.sub(self.replaceEntities, s)
870
871    def google_nest_count(self, style):
872        """
873        Calculate the nesting count of google doc lists
874
875        :type style: dict
876
877        :rtype: int
878        """
879        nest_count = 0
880        if 'margin-left' in style:
881            nest_count = int(style['margin-left'][:-2]) \
882                // self.google_list_indent
883
884        return nest_count
885
886    def optwrap(self, text):
887        """
888        Wrap all paragraphs in the provided text.
889
890        :type text: str
891
892        :rtype: str
893        """
894        if not self.body_width:
895            return text
896
897        assert wrap, "Requires Python 2.3."
898        result = ''
899        newlines = 0
900        # I cannot think of a better solution for now.
901        # To avoid the non-wrap behaviour for entire paras
902        # because of the presence of a link in it
903        if not self.wrap_links:
904            self.inline_links = False
905        for para in text.split("\n"):
906            if len(para) > 0:
907                if not skipwrap(para, self.wrap_links):
908                    result += "\n".join(
909                        wrap(para, self.body_width, break_long_words=False)
910                    )
911                    if para.endswith('  '):
912                        result += "  \n"
913                        newlines = 1
914                    else:
915                        result += "\n\n"
916                        newlines = 2
917                else:
918                    # Warning for the tempted!!!
919                    # Be aware that obvious replacement of this with
920                    # line.isspace()
921                    # DOES NOT work! Explanations are welcome.
922                    if not config.RE_SPACE.match(para):
923                        result += para + "\n"
924                        newlines = 1
925            else:
926                if newlines < 2:
927                    result += "\n"
928                    newlines += 1
929        return result
930
931
def html2text(html, baseurl='', bodywidth=None):
    """
    Convert ``html`` with a fresh HTML2Text instance and return the text.

    ``bodywidth`` falls back to config.BODY_WIDTH when it is None.
    """
    width = config.BODY_WIDTH if bodywidth is None else bodywidth
    converter = HTML2Text(baseurl=baseurl, bodywidth=width)
    return converter.handle(html)
938
939
def unescape(s, unicode_snob=False):
    """
    Replace the character and entity references in ``s``.

    A fresh HTML2Text instance does the work, with its ``unicode_snob``
    option set from the argument.
    """
    converter = HTML2Text()
    converter.unicode_snob = unicode_snob
    return converter.unescape(s)
945
946
# Run as a script: hand over to the command line entry point.
if __name__ == "__main__":
    from html2text.cli import main

    main()
951