"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""

from pysollib.formatter import AS_IS

from six.moves import html_parser


class HTMLParseError(RuntimeError):
    """Error raised when an HTML document can't be parsed."""


class HTMLParser(html_parser.HTMLParser):
    """This is the basic HTML parser class.

    It supports all entity names required by the XHTML 1.0 Recommendation.
    It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
    elements.

    Tags are dispatched by name: an opening tag ``<foo>`` calls
    ``start_foo(attrs)`` or, failing that, ``do_foo(attrs)``; a closing tag
    ``</foo>`` calls ``end_foo()``.  Tags without a handler go to
    unknown_starttag() / unknown_endtag().

    """

    # from six.moves.html_entities import entitydefs

    def __init__(self, formatter):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser.

        """
        html_parser.HTMLParser.__init__(self)
        self.formatter = formatter

    def error(self, message):
        """Report a parse failure by raising HTMLParseError(message)."""
        raise HTMLParseError(message)

    def reset(self):
        """Reset the base parser and all per-document state of this class."""
        html_parser.HTMLParser.reset(self)
        self.savedata = None    # text buffer while in save_bgn() mode
        self.isindex = 0        # set to 1 once an <isindex> tag is seen
        self.title = None       # text of the <title> element
        self.base = None        # href of the <base> element
        self.anchor = None      # href of the anchor currently open
        self.anchorlist = []    # every non-empty href seen so far
        self.nofill = 0         # nesting depth of preformatted regions
        self.list_stack = []    # [kind, label, counter] per open list

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    #     shouldn't need to be overridden

    def handle_data(self, data):
        """Buffer data while in savedata mode, else send it to the formatter.

        Outside savedata mode the text goes out as literal data when nofill
        is set and as flowing data otherwise.

        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    def handle_starttag(self, tag, attrs):
        """Dispatch an opening tag to start_<tag>, do_<tag> or the fallback."""
        method = getattr(self, 'start_' + tag, None)
        if method is None:
            method = getattr(self, 'do_' + tag, None)
        if method is None:
            self.unknown_starttag(tag, attrs)
            return
        method(attrs)

    def handle_endtag(self, tag):
        """Dispatch a closing tag to end_<tag> or to unknown_endtag()."""
        method = getattr(self, 'end_' + tag, None)
        if method is None:
            self.unknown_endtag(tag)
            return
        method()

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Begins saving character data in a buffer instead of sending it
        to the formatter object.

        Retrieve the stored data via the save_end() method.  Use of the
        save_bgn() / save_end() pair may not be nested.

        """
        self.savedata = ''

    def save_end(self):
        """Ends buffering character data and returns all data saved since
        the preceding call to the save_bgn() method.

        If the nofill flag is false, whitespace is collapsed to single
        spaces.  Calling this method without a preceding call to the
        save_bgn() method is an error: with nofill false the whitespace
        collapse fails on the missing buffer (AttributeError), and with
        nofill set None is returned.

        """
        data = self.savedata
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """This method is called at the start of an anchor region.

        The arguments correspond to the attributes of the <A> tag with
        the same names.  The default implementation maintains a list of
        hyperlinks (defined by the HREF attribute for <A> tags) within
        the document.  The list of hyperlinks is available as the data
        attribute anchorlist.

        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """This method is called at the end of an anchor region.

        The default implementation adds a textual footnote marker using an
        index into the list of hyperlinks created by the anchor_bgn() method.

        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """This method is called to handle images.

        The default implementation simply passes the alt value to the
        handle_data() method.

        """
        self.handle_data(alt)

    # --- Private helpers shared by several tag handlers

    def _heading_bgn(self, name):
        """Close the current paragraph and push the font for heading name."""
        self.formatter.end_paragraph(1)
        # NOTE(review): the font tuple is presumably (size, italic, bold,
        # teletype) as in the stdlib formatter module -- confirm against
        # pysollib.formatter.
        self.formatter.push_font((name, 0, 1, 0))

    def _heading_end(self):
        """Close the heading paragraph and restore the previous font."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def _begin_literal(self, tag):
        """Ask the base parser to treat everything up to </tag> as raw text.

        This class used to call setliteral(), an sgmllib.SGMLParser method
        that html.parser.HTMLParser does not provide, so every <xmp> or
        <listing> tag raised AttributeError.  The base parser's own raw-text
        switch is used instead when it is available; without it the element
        is still rendered preformatted, only nested markup stays active.

        NOTE(review): some Python versions discard raw text that is never
        terminated by its closing tag -- confirm for the supported versions.

        """
        set_cdata_mode = getattr(self, 'set_cdata_mode', None)
        if set_cdata_mode is not None:
            set_cdata_mode(tag)

    # --------- Top level elements

    def start_html(self, attrs):
        pass

    def end_html(self):
        pass

    def start_head(self, attrs):
        pass

    def end_head(self):
        pass

    def start_body(self, attrs):
        pass

    def end_body(self):
        pass

    # ------ Head elements

    def start_title(self, attrs):
        self.save_bgn()

    def end_title(self):
        self.title = self.save_end()

    def do_base(self, attrs):
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs):  # Deprecated
        pass

    # ------ Body elements

    # --- Headings

    def start_h1(self, attrs):
        self._heading_bgn('h1')

    def end_h1(self):
        self._heading_end()

    def start_h2(self, attrs):
        self._heading_bgn('h2')

    def end_h2(self):
        self._heading_end()

    def start_h3(self, attrs):
        self._heading_bgn('h3')

    def end_h3(self):
        self._heading_end()

    def start_h4(self, attrs):
        self._heading_bgn('h4')

    def end_h4(self):
        self._heading_end()

    def start_h5(self, attrs):
        self._heading_bgn('h5')

    def end_h5(self):
        self._heading_end()

    def start_h6(self, attrs):
        self._heading_bgn('h6')

    def end_h6(self):
        self._heading_end()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        # Clamp at zero so a stray closing tag cannot drive the depth
        # negative.
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self._begin_literal('xmp')

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self._begin_literal('listing')

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        # Only an outermost list is set off by a blank line.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emit the label of the next item of the innermost open list."""
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            label = top[1]
            # Advance the counter stored in the list's stack entry.
            top[2] = counter = top[2] + 1
        else:
            # <li> outside of any list: plain bullet, no numbering.
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list, taking the label format from TYPE if given.

        A one-character TYPE value gets a trailing '.' appended.  A TYPE
        attribute written without a value is ignored (it arrives as None
        and used to raise TypeError).

        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type' and v is not None:
                if len(v) == 1:
                    v += '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        self.ddpop()

    def do_dd(self, attrs):
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close a <dd> entry if one is innermost.

        bl is passed through to the formatter's end_paragraph().

        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs):
        self.start_i(attrs)

    def end_cite(self):
        self.end_i()

    def start_code(self, attrs):
        self.start_tt(attrs)

    def end_code(self):
        self.end_tt()

    def start_em(self, attrs):
        self.start_i(attrs)

    def end_em(self):
        self.end_i()

    def start_kbd(self, attrs):
        self.start_tt(attrs)

    def end_kbd(self):
        self.end_tt()

    def start_samp(self, attrs):
        self.start_tt(attrs)

    def end_samp(self):
        self.end_tt()

    def start_strong(self, attrs):
        self.start_b(attrs)

    def end_strong(self):
        self.end_b()

    def start_var(self, attrs):
        self.start_i(attrs)

    def end_var(self):
        self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))

    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))

    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Collect HREF, NAME and TYPE and hand them to anchor_bgn().

        Values are stripped of surrounding whitespace and TYPE is
        lower-cased.  An attribute written without a value arrives as None
        and is treated as an empty string (it used to raise AttributeError).

        """
        href = ''
        name = ''
        type_ = ''
        for attrname, value in attrs:
            value = (value or '').strip()
            if attrname == 'href':
                href = value
            if attrname == 'name':
                name = value
            if attrname == 'type':
                type_ = value.lower()
        self.anchor_bgn(href, name, type_)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the <img> attributes and pass them to handle_image().

        WIDTH and HEIGHT fall back to 0 when they are missing, valueless or
        not an integer.  ALT defaults to '(image)'; a valueless ALT becomes
        an empty string so that handle_data() never receives None.

        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            if attrname == 'alt':
                alt = '' if value is None else value
            if attrname == 'ismap':
                ismap = value
            if attrname == 'src':
                src = value
            if attrname == 'width':
                try:
                    width = int(value)
                except (TypeError, ValueError):
                    # TypeError: attribute given without a value (None).
                    pass
            if attrname == 'height':
                try:
                    height = int(value)
                except (TypeError, ValueError):
                    pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        # This used to call setnomoretags(), an sgmllib.SGMLParser method
        # that html.parser.HTMLParser does not provide, so <plaintext>
        # raised AttributeError.  The rest of the document is now rendered
        # preformatted; markup following the tag is still interpreted.
        self.start_pre(attrs)

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass


def test(args=None):
    """Parse an HTML file and render it through a formatter.

    args defaults to sys.argv[1:].  A leading '-s' selects the silent
    NullFormatter instead of a DumbWriter-backed AbstractFormatter.  The
    next argument names the file to read ('-' for standard input); it
    defaults to 'test.html'.  An unreadable file is reported and the
    process exits with status 1.

    """
    import sys
    import pysollib.formatter

    if not args:
        args = sys.argv[1:]

    silent = args and args[0] == '-s'
    if silent:
        del args[0]

    if args:
        fn = args[0]
    else:
        fn = 'test.html'

    if fn == '-':
        data = sys.stdin.read()
    else:
        try:
            with open(fn, 'rt') as fh:
                data = fh.read()
        except IOError as msg:
            print(fn, ":", msg)
            sys.exit(1)

    if silent:
        f = pysollib.formatter.NullFormatter()
    else:
        f = pysollib.formatter.AbstractFormatter(
            pysollib.formatter.DumbWriter()
        )

    p = HTMLParser(f)
    p.feed(data)
    p.close()


if __name__ == '__main__':
    test()