1#!/usr/bin/env python3
2#
3# html2text.py - converts HTML to text
4#
5# Wireshark - Network traffic analyzer
6# By Gerald Combs <gerald@wireshark.org>
7# Copyright 1998 Gerald Combs
8#
9# SPDX-License-Identifier: GPL-2.0-or-later
10
11from __future__ import unicode_literals
12
13__author__      = "Peter Wu <peter@lekensteyn.nl>"
14__copyright__   = "Copyright 2015, Peter Wu"
15__license__     = "GPL (v2 or later)"
16
17# TODO:
18#   multiple list indentation levels (modify bullets?)
19#   maybe allow for ascii output instead of utf-8?
20
21import sys
22from textwrap import TextWrapper
23try:
24    from HTMLParser import HTMLParser
25    from htmlentitydefs import name2codepoint
26except ImportError: # Python 3
27    from html.parser import HTMLParser
28    from html.entities import name2codepoint
29    unichr = chr # for html entity handling
30
class TextHTMLParser(HTMLParser):
    """Converts a HTML document to text.

    Markup is supplied through feed(). Text is collected per block element,
    word-wrapped to 72 columns (except inside "pre") and appended to an
    output buffer. Absolute links (those containing "://") are replaced by
    numbered footnote markers which are listed in a "References" section.
    Calling close() finishes the conversion and writes the UTF-8 encoded
    result to standard output.
    """
    def __init__(self):
        """Sets up the underlying parser and the conversion state."""
        try:
            # Python 3.4
            HTMLParser.__init__(self, convert_charrefs=True)
        except TypeError:
            # Parsers which do not accept the convert_charrefs keyword
            # report character/entity references through handle_charref()
            # and handle_entityref() instead.
            HTMLParser.__init__(self)
        # All text, concatenated
        self.output_buffer = ''
        # The current text block which is being constructed
        self.text_block = ''
        # Whether the previous element was terminated with whitespace
        self.need_space = False
        # Whether to prevent word-wrapping the contents (for "pre" tag)
        self.skip_wrap = False
        # track list items
        self.list_item_prefix = None
        self.ordered_list_index = None
        # State of the enclosing lists, saved on <ol>/<ul> and restored on
        # the matching end tag.
        self.stack_list_item_prefix = []
        self.stack_ordered_list_index = []
        self.list_indent_level = 0
        self.list_item_indent = ""
        # Indentation (for heading and paragraphs)
        self.indent_levels = [0, 0]
        # Don't dump CSS, scripts, etc.
        self.ignore_tags = ('head', 'style', 'script')
        self.ignore_level = 0
        # href footnotes.
        self.footnotes = []
        self.href = None

    def _wrap_text(self, text):
        """Wraps text, but additionally indent list items.

        The first line is indented by the current heading/paragraph
        indentation followed by the list item prefix (if any); continuation
        lines of a list item are indented by four additional spaces.
        Returns the wrapped lines joined by newlines.
        """
        initial_indent = indent = sum(self.indent_levels) * ' '
        if self.list_item_prefix:
            initial_indent += self.list_item_prefix
            indent += '    '
        wrapper = TextWrapper(
            width=72,
            initial_indent=initial_indent,
            subsequent_indent=indent,
            break_on_hyphens=False)
        return '\n'.join(wrapper.wrap(text))

    def _commit_block(self, newline='\n\n'):
        """Moves the pending text block to the output buffer.

        The block is wrapped unless wrapping is disabled (inside "pre") and
        is followed by *newline*. Nothing is written for an empty block. The
        pending-space flag is always cleared.
        """
        text = self.text_block
        if text:
            if not self.skip_wrap:
                text = self._wrap_text(text)
            self.output_buffer += text + newline
            self.text_block = ''
        self.need_space = False

    def handle_starttag(self, tag, attrs):
        """Updates the formatting state for an opening tag.

        *attrs* is the list of (name, value) pairs supplied by HTMLParser.
        """
        # end a block of text on <br>, but also flush list items which are not
        # terminated. Text pending in front of a (nested) list is flushed as
        # well, otherwise it would be written with the new list's prefix.
        if tag in ('br', 'li', 'ol', 'ul'):
            self._commit_block('\n')
        if tag == 'pre':
            self.skip_wrap = True
        if tag in ('ol', 'ul'):
            self.list_indent_level += 1
            self.list_item_indent = "   " * (self.list_indent_level - 1)
            self.stack_ordered_list_index.append(self.ordered_list_index)
            self.stack_list_item_prefix.append(self.list_item_prefix)
        # Following list items are numbered.
        if tag == 'ol':
            self.ordered_list_index = 1
        if tag == 'ul':
            # Items of a bullet list nested in an ordered list must not be
            # numbered; the outer counter is restored from the stack when
            # the bullet list ends.
            self.ordered_list_index = None
            self.list_item_prefix = self.list_item_indent + '  • '
        if tag == 'li' and self.ordered_list_index:
            self.list_item_prefix = self.list_item_indent + ' %d. ' % (self.ordered_list_index)
            self.ordered_list_index += 1
        if tag[0] == 'h' and len(tag) == 2 and '1' <= tag[1] <= '6':
            self.indent_levels = [int(tag[1]) - 1, 0]
        if tag == 'p':
            self.indent_levels[1] = 1
        if tag == 'a':
            # Start from a clean slate so that a link without an absolute
            # href never inherits the target of an earlier link.
            self.href = None
            for attr_name, attr_value in attrs:
                if attr_name == 'href':
                    # The value is None for a bare "href" attribute.
                    # Skip relative URLs and links.
                    if attr_value and '://' in attr_value:
                        self.href = attr_value
                    break
        if tag in self.ignore_tags:
            self.ignore_level += 1

    def handle_data(self, data):
        """Appends character data to the current text block.

        Data inside ignored elements is dropped, data inside "pre" is kept
        verbatim, and whitespace in all other data is folded.
        """
        if self.ignore_level > 0:
            return
        elif self.skip_wrap:
            block = data
        else:
            if self.href and data == self.href:
                # This is a self link. Don't create a footnote.
                self.href = None

            # For normal text, fold multiple whitespace and strip
            # leading and trailing spaces for the whole block (but
            # keep spaces in the middle).
            block = ''
            if data.strip() and data[:1].isspace():
                # Keep spaces in the middle
                self.need_space = True
            if self.need_space and data.strip() and self.text_block:
                block = ' '
            block += ' '.join(data.split())
            self.need_space = data[-1:].isspace()
        self.text_block += block

    def handle_endtag(self, tag):
        """Updates the formatting state for a closing tag."""
        block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6'
        #block_elements += ' dl dd dt'
        if tag in block_elements.split():
            self._commit_block()
        # Only unwind when a list is actually open; a stray end tag must not
        # abort the conversion.
        if tag in ('ol', 'ul') and self.stack_list_item_prefix:
            self.list_indent_level -= 1
            self.list_item_indent = "   " * (self.list_indent_level - 1)
            self.ordered_list_index = self.stack_ordered_list_index.pop()
            self.list_item_prefix = self.stack_list_item_prefix.pop()
        if tag == 'pre':
            self.skip_wrap = False
        if tag == 'a' and self.href:
            self.footnotes.append(self.href)
            self.text_block += '[{0}]'.format(len(self.footnotes))
            # The link is done; do not reuse its target for a later anchor.
            self.href = None
        if tag in self.ignore_tags and self.ignore_level > 0:
            # Never drop below zero, otherwise a stray end tag would make
            # the contents of a later ignored element visible.
            self.ignore_level -= 1

    def handle_charref(self, name):
        """Handles a numeric character reference.

        *name* is the text between "&#" and ";", either decimal ("65") or
        hexadecimal with a leading "x" ("x41"). References which cannot be
        converted to a character are emitted literally.
        """
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            text = '&#' + name + ';'
        self.handle_data(text)

    def handle_entityref(self, name):
        """Handles a named entity reference such as "amp".

        Unknown entity names are emitted literally instead of aborting the
        conversion.
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            text = '&' + name + ';'
        else:
            text = unichr(codepoint)
        self.handle_data(text)

    def close(self):
        """Finishes parsing and writes the converted text to stdout.

        Any pending text is flushed, the collected link targets are appended
        as a numbered "References" section and the complete output is
        written UTF-8 encoded to standard output.
        """
        HTMLParser.close(self)
        self._commit_block()
        # An unterminated "pre" must not disable wrapping and indentation
        # of the references below.
        self.skip_wrap = False

        if self.footnotes:
            self.list_item_prefix = None
            self.indent_levels = [1, 0]
            self.text_block = 'References'
            self._commit_block()
            self.indent_levels = [1, 1]
            for footnote_num, href in enumerate(self.footnotes, 1):
                self.text_block += '{0:>2}. {1}\n'.format(footnote_num, href)
                self._commit_block('\n')

        byte_output = self.output_buffer.encode('utf-8')
        if hasattr(sys.stdout, 'buffer'):
            # Python 3: bytes have to go to the underlying binary stream.
            sys.stdout.buffer.write(byte_output)
        else:
            sys.stdout.write(byte_output)
192
def main():
    """Converts the HTML file named on the command line to text.

    The input is read from the first command line argument, or from
    standard input when no argument is given or the argument is "-". It is
    decoded as UTF-8 and the converted text is written to standard output.
    """
    htmlparser = TextHTMLParser()
    arguments = sys.argv[1:]
    from_file = bool(arguments) and arguments[0] != '-'
    source = open(arguments[0], 'rb') if from_file else sys.stdin
    try:
        # Python 3 text streams expose the raw (byte) stream as "buffer";
        # use it so that the input is decoded here rather than by the stream.
        byte_stream = getattr(source, 'buffer', source)
        # Read the whole input as a Unicode string
        htmlparser.feed(byte_stream.read().decode('utf-8'))
    finally:
        # Standard input is left open, only a file we opened is closed.
        if from_file:
            source.close()
    htmlparser.close()
211
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    raise SystemExit(main())
214