#!/usr/bin/env python3
#
# html2text.py - converts HTML to text
#
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later

from __future__ import unicode_literals

__author__ = "Peter Wu <peter@lekensteyn.nl>"
__copyright__ = "Copyright 2015, Peter Wu"
__license__ = "GPL (v2 or later)"

# TODO:
# multiple list indentation levels (modify bullets?)
# maybe allow for ascii output instead of utf-8?

import sys
from textwrap import TextWrapper
try:
    from HTMLParser import HTMLParser
    from htmlentitydefs import name2codepoint
except ImportError:  # Python 3
    from html.parser import HTMLParser
    from html.entities import name2codepoint
    unichr = chr  # for html entity handling


class TextHTMLParser(HTMLParser):
    """Converts a HTML document to text.

    Feed the document with feed(); text is accumulated in output_buffer
    as blocks are completed.  close() flushes the last block, appends a
    "References" section listing the absolute link targets that were
    footnoted, and writes the UTF-8 encoded result to standard output.
    """

    def __init__(self):
        try:
            # Python 3.4 and later: let the parser decode character
            # references itself.
            HTMLParser.__init__(self, convert_charrefs=True)
        except TypeError:
            # Older parsers do not know the convert_charrefs keyword.
            HTMLParser.__init__(self)
        # All text, concatenated
        self.output_buffer = ''
        # The current text block which is being constructed
        self.text_block = ''
        # Whether the previous element was terminated with whitespace
        self.need_space = False
        # Whether to prevent word-wrapping the contents (for "pre" tag)
        self.skip_wrap = False
        # track list items
        self.list_item_prefix = None
        self.ordered_list_index = None
        self.stack_list_item_prefix = []
        self.stack_ordered_list_index = []
        self.list_indent_level = 0
        self.list_item_indent = ""
        # Indentation (for heading and paragraphs)
        self.indent_levels = [0, 0]
        # Don't dump CSS, scripts, etc.
        self.ignore_tags = ('head', 'style', 'script')
        self.ignore_level = 0
        # href footnotes.
        self.footnotes = []
        self.href = None

    def _wrap_text(self, text):
        """Wraps text, but additionally indent list items.

        The first line starts with the current indentation followed by
        the list item prefix (if any); continuation lines are indented
        by the same width so that they line up under the item text.
        Returns the wrapped lines joined with newlines.
        """
        initial_indent = indent = sum(self.indent_levels) * ' '
        if self.list_item_prefix:
            initial_indent += self.list_item_prefix
            # Match the prefix width rather than a fixed amount, so nested
            # list levels stay aligned as well.
            indent += ' ' * len(self.list_item_prefix)
        wrapper = TextWrapper(
            width=72,
            initial_indent=initial_indent,
            subsequent_indent=indent,
            break_on_hyphens=False,
        )
        return '\n'.join(wrapper.wrap(text))

    def _commit_block(self, newline='\n\n'):
        """Moves the pending text block to the output buffer.

        The block is word-wrapped unless wrapping is disabled (inside
        "pre"), and newline is appended after it.  Nothing is emitted for
        an empty block, but the pending state is reset in either case.
        """
        text = self.text_block
        if text:
            if not self.skip_wrap:
                text = self._wrap_text(text)
            self.output_buffer += text + newline
            self.text_block = ''
        self.need_space = False

    def handle_starttag(self, tag, attrs):
        """Updates the formatting state for an opening tag."""
        # End a block of text on <br>, and flush list items which are not
        # terminated.  Text pending when a list opens is flushed as well,
        # so that it is emitted with the prefix of the context it belongs
        # to rather than the prefix of the new list.
        if tag in ('br', 'li', 'ol', 'ul'):
            self._commit_block('\n')
        if tag == 'pre':
            self.skip_wrap = True
        if tag in ('ol', 'ul'):
            self.list_indent_level += 1
            self.list_item_indent = " " * (self.list_indent_level - 1)
            # Remember the state of the enclosing list; it is restored when
            # this list is closed.
            self.stack_ordered_list_index.append(self.ordered_list_index)
            self.stack_list_item_prefix.append(self.list_item_prefix)
        # Following list items are numbered.
        if tag == 'ol':
            self.ordered_list_index = 1
        if tag == 'ul':
            # A bullet list nested in a numbered list must not inherit the
            # numbering of its parent.
            self.ordered_list_index = None
            self.list_item_prefix = self.list_item_indent + ' • '
        if tag == 'li' and self.ordered_list_index:
            self.list_item_prefix = (
                self.list_item_indent + ' %d. ' % (self.ordered_list_index))
            self.ordered_list_index += 1
        if len(tag) == 2 and tag[0] == 'h' and '1' <= tag[1] <= '6':
            self.indent_levels = [int(tag[1]) - 1, 0]
        if tag == 'p':
            self.indent_levels[1] = 1
        if tag == 'a':
            # Start from a clean slate so that a link target left over from
            # an earlier anchor is never attributed to this one.
            self.href = None
            href = next(
                (value for name, value in attrs if name == 'href'), None)
            # Skip relative URLs and links.  The value is None for a bare
            # "href" attribute without a value.
            if href and '://' in href:
                self.href = href
        if tag in self.ignore_tags:
            self.ignore_level += 1

    def handle_data(self, data):
        """Appends character data to the pending text block.

        Data inside ignored elements is dropped and data inside "pre" is
        kept verbatim.  Otherwise runs of whitespace are folded into a
        single space.
        """
        if self.ignore_level > 0:
            return
        elif self.skip_wrap:
            block = data
        else:
            if self.href and data == self.href:
                # This is a self link. Don't create a footnote.
                self.href = None

            # For normal text, fold multiple whitespace and strip
            # leading and trailing spaces for the whole block (but
            # keep spaces in the middle).
            block = ''
            if data.strip() and data[:1].isspace():
                # Keep spaces in the middle
                self.need_space = True
            if self.need_space and data.strip() and self.text_block:
                block = ' '
            block += ' '.join(data.split())
            self.need_space = data[-1:].isspace()
        self.text_block += block

    def handle_endtag(self, tag):
        """Updates the formatting state for a closing tag."""
        block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6'
        #block_elements += ' dl dd dt'
        if tag in block_elements.split():
            self._commit_block()
        # Only unwind a list that was actually opened; a stray closing tag
        # must not pop from an empty stack.
        if tag in ('ol', 'ul') and self.list_indent_level > 0:
            self.list_indent_level -= 1
            self.list_item_indent = " " * (self.list_indent_level - 1)
            self.ordered_list_index = self.stack_ordered_list_index.pop()
            self.list_item_prefix = self.stack_list_item_prefix.pop()
        if tag == 'pre':
            self.skip_wrap = False
        if tag == 'a':
            if self.href:
                self.footnotes.append(self.href)
                self.text_block += '[{0}]'.format(len(self.footnotes))
            # The link target has been consumed (or there was none).
            self.href = None
        # Never let a stray closing tag push the counter below zero, or the
        # next ignored element would no longer be suppressed.
        if tag in self.ignore_tags and self.ignore_level > 0:
            self.ignore_level -= 1

    def handle_charref(self, name):
        """Handles a numeric character reference (decimal or hexadecimal).

        References that do not denote a valid character are passed through
        literally.
        """
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            text = '&#{0};'.format(name)
        self.handle_data(text)

    def handle_entityref(self, name):
        """Handles a named entity; unknown names are passed through."""
        try:
            text = unichr(name2codepoint[name])
        except KeyError:
            text = '&{0};'.format(name)
        self.handle_data(text)

    def close(self):
        """Finishes parsing and writes the result to standard output.

        Flushes the pending block, appends the list of footnoted link
        targets (if any) and writes the output buffer, encoded as UTF-8.
        """
        HTMLParser.close(self)
        self._commit_block()

        if self.footnotes:
            self.list_item_prefix = None
            self.indent_levels = [1, 0]
            self.text_block = 'References'
            self._commit_block()
            self.indent_levels = [1, 1]
            for footnote_num, href in enumerate(self.footnotes, 1):
                self.text_block += '{0:>2}. {1}\n'.format(footnote_num, href)
                # Commit every entry separately; wrapping folds embedded
                # newlines, so one block would merge all entries together.
                self._commit_block('\n')

        byte_output = self.output_buffer.encode('utf-8')
        if hasattr(sys.stdout, 'buffer'):
            # Python 3: write the bytes through the underlying byte stream.
            sys.stdout.buffer.write(byte_output)
        else:
            sys.stdout.write(byte_output)


def main():
    """Converts the HTML file named by the first argument to text.

    Standard input is read when no argument is given or when it is "-".
    The input is decoded as UTF-8 and the result is written to standard
    output.
    """
    htmlparser = TextHTMLParser()
    if len(sys.argv) > 1 and sys.argv[1] != '-':
        with open(sys.argv[1], 'rb') as f:
            raw = f.read()
    else:
        # Access raw (byte) buffer in Python 3 instead of decoded one
        raw = getattr(sys.stdin, 'buffer', sys.stdin).read()
    # Parse the input as a Unicode string
    htmlparser.feed(raw.decode('utf-8'))
    htmlparser.close()


if __name__ == '__main__':
    sys.exit(main())